Today I wrote a small crawler for the CVPR conference proceedings. Nothing fancy; the code is below.
```python
# -*- coding: utf-8 -*-
import re
import requests
import urllib.request
import os
import argparse

parser = argparse.ArgumentParser(description=__doc__)
parser.add_argument('--keyword', type=str, default='detection')  # keyword to match against paper titles
args = parser.parse_args()

# fetch the open-access index page
r = requests.get('http://openaccess.thecvf.com/CVPR2020.py')  # change the year here
data = r.text  # raw HTML of the index page

# extract the PDF links and the paper titles
linklist = re.findall(r"(?<=href=\").+?pdf(?=\">pdf)|(?<=href=\').+?pdf(?=\">pdf)", data)
namelist = re.findall(r"(?<=href=\").+?2020_paper.html\">.+?</a>", data)  # change the year here too

cnt = 0
num = len(linklist)

# local download directory
localpath = './CVPR2020/{}/'.format(args.keyword)  # and change the year here
if not os.path.exists(localpath):
    os.makedirs(localpath)

while cnt < num:
    url = linklist[cnt]  # download URL for this paper
    filename = namelist[cnt].split('<')[0].split('>')[1]  # paper title from the anchor text
    # replace characters that are unsafe in file names
    for ch in (':', '"', '?', '/', '+', ' '):
        filename = filename.replace(ch, '_')
    searchlist = filename.split('_')
    searchmodel = re.compile(args.keyword, re.IGNORECASE)
    # skip papers whose title does not contain the keyword
    if not any(searchmodel.search(word) for word in searchlist):
        cnt += 1
        continue
    filepath = localpath + filename + '.pdf'
    if os.path.exists(filepath):
        print('file [{}] exists, skip downloading'.format(filepath))
        cnt += 1
        continue
    print('[' + str(cnt) + '/' + str(num) + '] Downloading -> ' + filepath)
    try:
        urllib.request.urlretrieve('http://openaccess.thecvf.com/' + url, filepath)
    except Exception:
        print('download failed: ' + filepath)
    cnt += 1

print('finished')
```
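To use it, save the script and pass a keyword to filter titles, e.g. `python cvpr_crawler.py --keyword segmentation` (the file name is whatever you saved it as). One caveat: the lookbehind regexes above are brittle if the page markup ever changes. Below is a minimal sketch of the same link/title extraction done with BeautifulSoup instead; the `bs4` dependency and the `find_pdf_links` helper are my own additions, not part of the original script.

```python
# A sketch of the link/title extraction using BeautifulSoup instead of
# lookbehind regexes. Assumes `pip install beautifulsoup4`; the helper
# name find_pdf_links is hypothetical, not from the original script.
import requests
from bs4 import BeautifulSoup

def find_pdf_links(year=2020):
    page = requests.get('http://openaccess.thecvf.com/CVPR{}.py'.format(year))
    soup = BeautifulSoup(page.text, 'html.parser')
    links, titles = [], []
    for a in soup.find_all('a'):
        href = a.get('href', '')
        text = a.get_text(strip=True)
        if text == 'pdf' and href.endswith('.pdf'):
            links.append(href)    # relative link to the paper PDF
        elif href.endswith('_paper.html'):
            titles.append(text)   # anchor text is the paper title
    return links, titles
```

Under the same assumption the regex version already makes (each paper's title anchor and its "pdf" anchor appear in page order), the two lists should line up the same way as `linklist` and `namelist`, so the download loop can stay unchanged.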