swimmingwhale
3/29/2019 - 1:14 AM

Batch-download ICCV papers

# coding:utf-8
import os
import urllib.error
import urllib.request

import requests
from bs4 import BeautifulSoup


def main():
    """Download all ICCV 2017 papers into ./ICCV2017, skipping files already present."""
    root_link = 'http://openaccess.thecvf.com/'
    save_path = './ICCV2017'  # pdf save path
    conference = 'ICCV'  # conference name
    year = 2017  # conference year

    # Create the save directory on first run; os.listdir raises
    # FileNotFoundError if it does not exist yet.
    os.makedirs(save_path, exist_ok=True)
    pdflist = os.listdir(save_path)  # filenames already downloaded
    from_page(root_link=root_link, conference=conference, year=year, save_path=save_path, pdflist=pdflist)


def from_page(root_link, conference, year, save_path, pdflist):
    """Collect all *paper.pdf links from the conference index page and download them.

    Args:
        root_link: Base URL of the open-access site, e.g. 'http://openaccess.thecvf.com/'.
        conference: Conference name, e.g. 'ICCV'.
        year: Conference year as an int.
        save_path: Directory the PDFs are written into.
        pdflist: Filenames already present locally; those links are skipped.
    """
    # The index pages on openaccess.thecvf.com are served as e.g. 'ICCV2017.py'.
    url = root_link + conference + str(year) + '.py'

    print("download from:", url)

    # Fetch the index page.
    r = requests.get(url)
    if r.status_code != 200:
        # Abort: parsing an error page would yield no (or wrong) links.
        print("ERRORS occur !!!")
        return

    soup = BeautifulSoup(r.text, "html5lib")
    links = []
    for link in soup.find_all('a'):
        new_link = link.get('href')
        if new_link is None:
            continue
        file_name = new_link.split('/')[-1]
        if file_name in pdflist:
            continue  # already downloaded
        if new_link.endswith('paper.pdf'):
            # hrefs are site-relative; prepend the site root.
            links.append(root_link + new_link)

    for key, link in enumerate(links):
        # Progress counter includes the files that were already on disk.
        print(key + 1 + len(pdflist), '/', len(links) + len(pdflist))
        download_file(link, save_path)


def download_file(download_url, save_path):
    """Download one file and save it under save_path.

    The filename is the last '/'-separated segment of the URL. Network/HTTP
    failures are printed rather than raised so a batch run keeps going.
    """
    try:
        response = urllib.request.urlopen(download_url)
    # HTTPError/URLError are documented in urllib.error (urllib.request only
    # re-exports them as an implementation detail). HTTPError is a subclass
    # of URLError, so it must be caught first.
    except urllib.error.HTTPError as e:
        print(e.reason)
    except urllib.error.URLError as e:
        print(e.reason)
    else:
        file_name = download_url.split('/')[-1]
        save_name = os.path.join(save_path, file_name)
        # Context managers close both the response and the file even if
        # the read or write fails.
        with response, open(save_name, 'wb') as f:
            f.write(response.read())


if __name__ == "__main__":
    main()