一个小爬虫

今天就写了个爬CVPR年会的爬虫,没别的,代码如下:

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
# -*- coding: utf-8 -*-
"""Crawl the CVPR open-access listing and download papers whose titles match a keyword."""
import re
import urllib.request
import os
import argparse

# Change the year in LISTING_URL (and the paths below) to crawl another edition.
BASE_URL = 'http://openaccess.thecvf.com/'
LISTING_URL = BASE_URL + 'CVPR2020.py'


def sanitize_filename(title):
    """Return *title* with characters unsafe in file names replaced by underscores."""
    for ch in (':', '"', '?', '/', '+', ' '):
        title = title.replace(ch, '_')
    return title


def main():
    """Fetch the listing page and download every keyword-matching paper PDF."""
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--keyword', type=str, default='detection')
    args = parser.parse_args()

    # Imported lazily so this module can be imported without requests installed.
    import requests

    # Fetch the listing page that holds every paper's title and PDF link.
    data = requests.get(LISTING_URL).text

    # PDF links appear as href="...pdf">pdf; titles live inside *_paper.html anchors.
    linklist = re.findall(r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)", data)
    namelist = re.findall(r"(?<=href=\").+?2020_paper.html\">.+?</a>", data)  # year appears here too

    num = len(linklist)
    localpath = './CVPR2020/{}/'.format(args.keyword)  # year appears here too
    if not os.path.exists(localpath):
        os.makedirs(localpath)

    # Compile the case-insensitive keyword pattern once, outside the loop.
    # NOTE(review): the keyword is interpreted as a regular expression, as in
    # the original; pass a plain word unless regex matching is intended.
    searchmodel = re.compile(r'{}'.format(args.keyword), re.IGNORECASE)

    # zip() also guards against the two lists differing in length, which the
    # original index-based loop would have turned into an IndexError.
    for cnt, (url, name_tag) in enumerate(zip(linklist, namelist)):
        # The paper title sits between the '>' and '<' of the anchor tag.
        filename = sanitize_filename(name_tag.split('<')[0].split('>')[1])

        # Skip papers whose title contains no word matching the keyword.
        if not any(searchmodel.search(word) for word in filename.split('_')):
            continue

        filepath = localpath + filename + '.pdf'
        if os.path.exists(filepath):
            # Bug fix: the original printed the literal '{}' placeholder here.
            print('file [{}.pdf] exist, skip downloading'.format(filename))
            continue

        print('[' + str(cnt) + "/" + str(num) + "] Downloading -> " + filepath)
        try:
            urllib.request.urlretrieve(BASE_URL + url, filepath)
        except Exception:
            # Best effort: report the failure and move on to the next paper.
            print('download failed: ' + filepath)

    print('finished')


if __name__ == '__main__':
    main()