# suibenzhi
# 3/16/2020 - 4:16 AM
#
# Crawler 1 (爬虫1): scrapes detail pages from jcpt.chengdu.gov.cn into a CSV.

#!/usr/bin/python

import csv
import re
import string
import urllib
import urllib.request

import chardet

from selenium import webdriver
from selenium.webdriver.common.keys import Keys

#import urllib2
#import cookielib
#from urllib.request import urlopen
#from bs4 import BeautifulSoup


def GetUrls(filename):
	"""Read a saved listing/menu HTML file and build absolute article URLs.

	Every ``href=... `` attribute found in the file is scanned for a
	path fragment that starts with ``/`` and ends in a digit; each such
	fragment is prefixed with the site root and collected.

	:param filename: path to the locally saved HTML listing file.
	:return: list of absolute ``http://jcpt.chengdu.gov.cn/...`` URLs.
	"""
	# Context manager guarantees the file is closed even on error;
	# the saved page is assumed to be UTF-8 (paths contain Chinese text).
	with open(filename, "r", encoding="utf-8") as fo:
		text = fo.read()

	# Greedy match up to a trailing space, mirroring how the page was saved.
	hrefs = re.findall(r"href=.* ", text)

	http_head = 'http://jcpt.chengdu.gov.cn'
	http_urls = []

	for href in hrefs:
		# Relative path: from the first '/' through the last digit.
		path = re.findall(r"/.*[0-9]", href)
		# Skip hrefs with no digit-bearing path (e.g. href="#") instead
		# of crashing with IndexError as the original did.
		if path:
			http_urls.append(http_head + path[0])

	return http_urls



def GetData(url, data):
	"""Scrape one detail page with Selenium/Chrome into *data* (in place).

	Fills ``data["title"]`` with the inner HTML of the page's first <h3>,
	and ``data["content"]`` with the markup of the ``searchbox2`` element
	from its first ``<table`` onward.

	:param url: absolute URL of the detail page.
	:param data: dict mutated in place; keys "title" and "content" are set.
	"""
	browser = webdriver.Chrome()
	try:
		browser.get(url)

		# The page body we want lives in the element class="searchbox2".
		box = browser.find_element_by_class_name("searchbox2")
		inner_html = box.get_attribute('innerHTML')

		title_el = browser.find_element_by_tag_name("h3")
		data["title"] = title_el.get_attribute('innerHTML')

		# Keep only the markup from the first <table> onward; if no table
		# is present keep everything (the original sliced from -1, which
		# silently kept only the last character).
		index = inner_html.find("<table")
		data["content"] = inner_html[index:] if index >= 0 else inner_html
	finally:
		# quit() closes the window AND terminates the chromedriver
		# process; close() alone leaks the driver when an exception
		# occurs (and the original never cleaned up on error at all).
		browser.quit()


def HandleData(data):
	data["content"] = data["content"].replace("\n","")
	data["content"] = data["content"].replace("</th>","</span></th>")
	data["content"] = data["content"].replace("</td>","</span></td>")

	#添加背景色
	data["content"] = data["content"].replace("bgcolor","style=\"background:#FFF\" bgcolor")

	index = data["content"].find("<th",0)
	
	data_len = len(data["content"])
	while(index >=0 ):

#		data["content"] = data["content"].replace(">", "><span style=\"color: #000000;\">",1);
		index = data["content"].find(">",index)

		data["content"] = data["content"][:index+1] + "<span style=\"color: #000000;\">" + data["content"][index+1:] 
		index = data["content"].find("<th",index)
		
	
	index = data["content"].find("<td",0)
	
	while(index >=0 ):

#		data["content"] = data["content"].replace(">", "><span style=\"color: #000000;\">",1);
		index = data["content"].find(">",index)

		data["content"] = data["content"][:index+1] + "<span style=\"color: #000000;\">" + data["content"][index+1:] 
		index = data["content"].find("<td",index)
#		print(index)
#	data["content"] = data["content"].strip('\n');
#	return;
	

def SaveToCSV(filename, datas):
	"""Append scraped records to a CSV file, writing a header row if absent.

	:param filename: path of the CSV file (created if missing).
	:param datas: list of dicts with "title" and "content" keys.
	"""
	# newline="" is required by the csv module; utf-8 handles the
	# Chinese titles/content. The original wrote encoded bytes to a
	# text-mode handle, which raises TypeError at runtime.
	with open(filename, "a+", encoding="utf-8", newline="") as fo:
		# Peek at the first line to decide whether the header exists.
		fo.seek(0)
		first_line = fo.readline()

		# In "a+" mode all writes append regardless of position; seek
		# back to EOF just to keep tell() consistent for readers.
		fo.seek(0, 2)

		# csv.writer quotes fields containing commas/quotes — the
		# hand-rolled ','.join the original used produced broken rows
		# for any HTML content (which routinely contains commas).
		writer = csv.writer(fo)

		if first_line.find("post_title") == -1:
			writer.writerow(["post_title", "post_content"])

		for data in datas:
			writer.writerow([data["title"], data["content"]])

		
if __name__ == "__main__":
	# Collect absolute article URLs from the locally saved menu page.
	http_urls = GetUrls("/Users/suibenzhi/MyFiles/临时文件夹/menu.txt")

	datas = []

	# NOTE: debug limit — only the first listing URL is scraped for now.
	for i in range(1):
		url = http_urls[i]

		record = {"title": "", "content": ""}
		GetData(url, record)
		HandleData(record)
		print("title:", record["title"])

		# Newest record first.
		datas.insert(0, record)

	SaveToCSV("/Users/suibenzhi/MyFiles/临时文件夹/test.csv", datas)
	print(url)
	print(datas)