# Python web crawler (网页爬虫) for itjuzi.com investment/merger listings
# coding:utf-8
import bs4
import requests
import re
import time
def one_page(url, name, timelimitmk):
    """Scrape one itjuzi.com listing page and append its rows to ``<name>.txt``.

    Each row is written as a '|'-separated line:
    date|company|industry|location|round|amount|investors.

    Args:
        url: Full URL of the listing page to fetch.
        name: Output file stem; rows are appended to ``name + ".txt"``.
        timelimitmk: Epoch-seconds cutoff (from ``time.mktime``); as soon as
            an entry dated before this cutoff is seen, scraping stops.

    Returns:
        1 if every entry on the page was written, -1 if an entry older than
        the cutoff was reached (signals the caller to stop paginating).
    """
    head = {
        "Host": "www.itjuzi.com",
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; WOW64; rv:48.0) Gecko/20100101 Firefox/48.0",
        "Accept": "text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8",
        "Accept-Language": "zh-CN,zh;q=0.8,en-US;q=0.5,en;q=0.3",
        "Accept-Encoding": "gzip, deflate, br",
        "Connection": "close",
        "Upgrade-Insecure-Requests": "1",
    }
    resp = requests.get(url, headers=head, verify=True)
    soup = bs4.BeautifulSoup(resp.content, "html.parser")
    uls = soup.find_all("ul", attrs={"class": "list-main-eventset"})
    # NOTE(review): index 1 assumes the second matching <ul> holds the event
    # rows on this page layout — confirm against the live markup.
    lis = uls[1].find_all("li")
    # Open the output file once for the whole page (the original re-opened it
    # per row) and let the context manager close it even on the early return.
    with open(name + ".txt", "a") as f:
        for li in lis:
            cr = li.find('i')
            time2 = cr.text.strip()
            # Walk sibling <i> elements: company block, then round, amount,
            # and finally the investors cell.
            cm = cr.find_next('i').find_next('i')
            company = cm.find("p").text.strip()
            hangye = cm.find("p").find_next("p").find("a").text.strip()
            didian = cm.find("p").find_next("p").find("a").find_next("a").text.strip()
            cm = cm.find_next("i")
            lunci = cm.text.strip()
            cm = cm.find_next("i")
            jine = cm.text.strip()
            cm = cm.find_next("i")
            # Entries are newest-first; stop at the first one past the cutoff.
            if timelimitmk > time.mktime(time.strptime(time2, '%Y.%m.%d')):
                return -1
            # Collapse whitespace between multiple investor names into ';'.
            # (Dropped the Python-2 ``.encode("utf-8")`` calls: on Python 3
            # they produce bytes and crash on bytes+str concatenation.)
            rongzifang = re.sub(r'\s+', ';', cm.text.strip())
            f.write(time2 + "|" + company + "|" + hangye + "|" + didian +
                    "|" + lunci + "|" + jine + "|" + rongzifang + "\n")
    return 1
if __name__ == "__main__":
    # Crawl up to `maxpage` pages of each category, stopping a category as
    # soon as one_page reports an entry older than `timelimit` (returns -1).
    maxpage = 20
    timelimit = "2016-11-01"
    urllist = ["https://www.itjuzi.com/investevents?page=", "https://www.itjuzi.com/merger?page=",
               "https://www.itjuzi.com/investevents/foreign?page=", "https://www.itjuzi.com/merger/foreign?page="]
    namelist = ["invest", "merger", "foreign_invest", "foreign_merger"]
    timelimitmk = time.mktime(time.strptime(timelimit, '%Y-%m-%d'))
    # Pair each base URL with its output file stem instead of indexing by j.
    for base_url, out_name in zip(urllist, namelist):
        page = 1
        status = 1
        while page <= maxpage and status == 1:
            status = one_page(base_url + str(page), out_name, timelimitmk)
            print("%sth page of %s is print" % (page, out_name))
            time.sleep(10)  # throttle requests to be polite to the server
            page += 1