dynames0098
12/26/2017 - 3:10 PM

scrawler_spyder_python

Python 网页爬虫 (Python web crawler)

# coding:utf-8
import bs4
import requests
import re
import time


def one_page(url, name, timelimitmk):
    """Scrape one event-list page and append its rows to ``<name>.txt``.

    The page is fetched with browser-like headers and parsed with
    BeautifulSoup. Rows are the ``<li>`` items of the *second*
    ``<ul class="list-main-eventset">`` on the page. For every row the
    fields are read from successive ``<i>`` elements and appended to the
    output file as one ``|``-separated, UTF-8 encoded line::

        date|company|hangye|didian|lunci|jine|rongzifang

    (The pinyin names presumably mean industry, location, funding round,
    amount and investors -- confirm against the site.)  Runs of whitespace
    inside the last field are collapsed to ``;``.

    Args:
        url: Full URL of the page to fetch.
        name: Output file stem; rows are appended to ``name + ".txt"``.
        timelimitmk: Cut-off as a ``time.mktime`` timestamp. Processing stops
            at the first row whose date (``%Y.%m.%d``) is earlier than this.

    Returns:
        ``-1`` as soon as a row older than ``timelimitmk`` is met (that row
        and the ones after it are not written), otherwise ``1`` once every
        row of the page has been written.

    Raises:
        IndexError: The page has fewer than two matching ``<ul>`` lists.
        AttributeError: An expected tag is missing from a row.
        ValueError: A row's date does not match ``%Y.%m.%d``.
        requests.exceptions.RequestException: The request fails or times out.
    """
    # Function-scope import: io.open is an encoding-aware open() on both
    # Python 2 and Python 3, so the text can stay unicode until it is written.
    import io

    # NOTE(review): advertising "br" relies on the installed requests/urllib3
    # being able to decode Brotli responses -- confirm for the target setup.
    head = {"Host": "www.itjuzi.com",
            "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
            "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
            "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
            "Accept-Encoding": "gzip, deflate, br",
            "Connection": "close",
            "Upgrade-Insecure-Requests": "1"}
    # Without a timeout a stalled connection would block the crawl forever.
    response = requests.get(url, headers=head, verify=True, timeout=30)
    html = bs4.BeautifulSoup(response.content, "html.parser")
    a = html.find_all("ul", attrs={"class": "list-main-eventset"})
    # Only the second matching list is read.
    lis = a[1].find_all("li")
    for lll in lis:
        # First <i>: the event date.
        cr = lll.find('i')
        time2 = cr.text.strip()
        # Third <i>: company name, then a <p> holding two links.
        cm = cr.find_next('i').find_next('i')
        company = cm.find("p").text.strip()
        first_link = cm.find("p").find_next("p").find("a")
        hangye = first_link.text.strip()
        didian = first_link.find_next("a").text.strip()
        # The following <i> elements hold the remaining columns, in order.
        cm = cm.find_next("i")
        lunci = cm.text.strip()
        cm = cm.find_next("i")
        jine = cm.text.strip()
        cm = cm.find_next("i")

        # Parse the date while it is still text: time.strptime rejects bytes
        # on Python 3.
        if timelimitmk > time.mktime(time.strptime(time2, '%Y.%m.%d')):
            return -1

        rongzifang = re.sub(r'\s+', ';', cm.text.strip())

        line = u"|".join(
            (time2, company, hangye, didian, lunci, jine, rongzifang)) + u"\n"
        # Append row by row so rows already scraped survive a later failure;
        # the context manager closes the file even if the write raises.
        with io.open(name + ".txt", 'a', encoding="utf-8") as f:
            f.write(line)
    return 1


if __name__ == "__main__":
    # Crawl settings: at most `maxpage` pages per category, and the cut-off
    # date that is handed to one_page() as a time.mktime timestamp.
    maxpage = 20
    timelimit = "2016-11-01"
    timelimitmk = time.mktime(time.strptime(timelimit, '%Y-%m-%d'))

    # Listing URLs (page number is appended) and the matching output file stems.
    urllist = ["https://www.itjuzi.com/investevents?page=", "https://www.itjuzi.com/merger?page=",
               "https://www.itjuzi.com/investevents/foreign?page=", "https://www.itjuzi.com/merger/foreign?page="]
    namelist = ["invest", "merger", "foreign_invest", "foreign_merger"]

    for base_url, label in zip(urllist, namelist):
        for page in range(1, maxpage + 1):
            status = one_page(base_url + str(page), label, timelimitmk)
            print("%sth page of %s is print" % (page, label))
            # Pause between requests, including after the final page of a category.
            time.sleep(10)
            # Any result other than 1 ends this category.
            if status != 1:
                break