# coding:utf-8
import os
import urllib.error
import urllib.request

import requests
from bs4 import BeautifulSoup
def main():
    """Download all ICCV 2017 paper PDFs from the CVF open-access site.

    Already-downloaded files in ``save_path`` are skipped so the script
    can be re-run to resume an interrupted download.
    """
    root_link = 'http://openaccess.thecvf.com/'
    save_path = './ICCV2017'  # pdf save path
    conference = 'ICCV'  # conference name
    year = 2017  # conference year
    # Create the target directory up front; the original crashed with
    # FileNotFoundError on a fresh checkout because os.listdir was called
    # on a directory that did not exist yet.
    os.makedirs(save_path, exist_ok=True)
    pdflist = os.listdir(save_path)  # filenames already present -> skipped
    from_page(root_link=root_link, conference=conference, year=year,
              save_path=save_path, pdflist=pdflist)
def from_page(root_link, conference, year, save_path, pdflist):
    """Scrape the conference index page and download every missing paper PDF.

    Parameters:
        root_link: base URL of the open-access site (trailing slash included).
        conference: conference name, e.g. 'ICCV'.
        year: conference year (int).
        save_path: directory where PDFs are written.
        pdflist: filenames already present in save_path; those are skipped.
    """
    # The open-access site serves its index pages as '<conf><year>.py'.
    url = root_link + conference + str(year) + '.py'
    print("download from:", url)  # fixed typo: was "download form:"
    r = requests.get(url)
    if r.status_code != 200:
        print("ERRORS occur !!!")
        # Original fell through and parsed the error page; bail out instead.
        return
    # Extract the paper-PDF urls from every <a href=...> on the page.
    soup = BeautifulSoup(r.text, "html5lib")
    links = []
    for link in soup.find_all('a'):
        new_link = link.get('href')
        if new_link is None:  # anchors without href
            continue
        file_name = new_link.split('/')[-1]
        if file_name in pdflist:  # already downloaded earlier
            continue
        if new_link.endswith('paper.pdf'):
            # hrefs are relative; prefix the site root.
            links.append(root_link + new_link)
    for key, link in enumerate(links):
        # Progress counter includes files skipped from previous runs.
        print(key + 1 + len(pdflist), '/', len(links) + len(pdflist))
        download_file(link, save_path)
def download_file(download_url, save_path):
    """Download a single file and save it under save_path.

    The saved filename is the last path component of ``download_url``.
    Network errors are reported to stdout and swallowed (best effort, so a
    single failed paper does not abort the whole batch).

    Parameters:
        download_url: URL of the file to fetch.
        save_path: existing directory to write the file into.
    """
    try:
        response = urllib.request.urlopen(download_url)
    # HTTPError/URLError live in urllib.error; the original caught them via
    # urllib.request, which only works through an undocumented re-export.
    # HTTPError is checked first because it subclasses URLError.
    except urllib.error.HTTPError as e:
        print(e.reason)
    except urllib.error.URLError as e:
        print(e.reason)
    else:
        file_name = download_url.split('/')[-1]
        save_name = os.path.join(save_path, file_name)
        # Context managers guarantee both the response and the output file
        # are closed even if the read/write fails mid-way.
        with response, open(save_name, 'wb') as out:
            out.write(response.read())
if __name__ == "__main__":
main()