#!/usr/bin/python
import string
import re
import urllib.request
import urllib
#import urllib2
#import cookielib
import re
import string
import chardet
#from urllib.request import urlopen
#from bs4 import BeautifulSoup
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
def GetUrls(filename, http_head='http://jcpt.chengdu.gov.cn'):
    """Extract absolute article URLs from a saved HTML/menu file.

    Parameters
    ----------
    filename : str
        Path to a text file containing ``href=...`` fragments.
    http_head : str
        Base URL prepended to every relative path found. The default keeps
        the original hard-coded host, so existing callers are unaffected.

    Returns
    -------
    list[str]
        One absolute URL per ``href`` match.
    """
    # 'with' guarantees the file is closed even if reading raises.
    with open(filename, "r") as fo:
        text = fo.read()
    http_urls = []
    # Greedy match: from 'href=' up to the last space on the line
    # (same pattern as the original scrape; '.' does not cross newlines).
    for href in re.findall(r"href=.* ", text):
        # Relative path: from the first '/' through the last digit (article id).
        path = re.findall(r"/.*[0-9]", href)
        http_urls.append(http_head + path[0])
    return http_urls
def GetData(url, data):
    """Fetch one article page with a Selenium Chrome browser and fill *data* in place.

    data["title"]   <- innerHTML of the page's first <h3> element.
    data["content"] <- innerHTML of the ".searchbox2" element, cut down to
                       everything from the first "<table" substring onward.

    NOTE(review): the find_element_by_* methods were removed in Selenium 4;
    confirm the installed Selenium version or migrate to find_element(By..., ...).
    """
    browser = webdriver.Chrome()
    browser.get(url)
    #page = browser.page_source
    #lis = browser.find_element_by_tag_name("table")
    lis = browser.find_element_by_class_name("searchbox2")
    InnerElement = lis.get_attribute('innerHTML')
    lis_title = browser.find_element_by_tag_name("h3")
    data["title"] = lis_title.get_attribute('innerHTML')
    #title = re.findall('>(.*)</h3>',InnerElement)
    #data = re.findall('</h3>(.*)</table>',InnerElement)
    # print(data["title"])
    # Keep everything from the first table onward; heading markup before it is dropped.
    index = InnerElement.find("<table");
    # print(index)
    # NOTE(review): if "<table" is absent, find() returns -1 and this keeps only
    # the last character — presumably every scraped page contains a table; verify.
    data["content"] = InnerElement[index:]
    # print(content)
    # print(data["content"])
    browser.close()
def HandleData(data):
    """Normalize the scraped table HTML in data["content"] (mutates *data* in place).

    Transformations, in order:
    - strip all newlines
    - close a color <span> just before every </th> and </td>
    - force a white background on any tag carrying a legacy bgcolor attribute
    - open a black-text <span> right after every <th ...> / <td ...> opening tag
    """
    content = data["content"].replace("\n", "")
    content = content.replace("</th>", "</span></th>")
    content = content.replace("</td>", "</span></td>")
    # Add an inline background style ahead of the legacy bgcolor attribute.
    content = content.replace("bgcolor", "style=\"background:#FFF\" bgcolor")
    # Original had two identical copy-pasted loops (one per tag) and an unused
    # data_len variable; both loops now share one helper.
    content = _open_span_after(content, "<th")
    content = _open_span_after(content, "<td")
    data["content"] = content


def _open_span_after(content, tag):
    """Insert an opening black-text <span> after each *tag* occurrence's closing '>'."""
    index = content.find(tag, 0)
    while index >= 0:
        index = content.find(">", index)
        content = content[:index + 1] + "<span style=\"color: #000000;\">" + content[index + 1:]
        index = content.find(tag, index)
    return content
def SaveToCSV(filename, datas):
    """Append each record in *datas* as a "title,content" row.

    Writes the "post_title,post_content" header first when the file's first
    line does not already contain "post_title".

    Parameters
    ----------
    filename : str
        Path to the CSV file (created if missing, appended otherwise).
    datas : list[dict]
        Records with "title" and "content" string values.

    NOTE(review): fields are joined with a bare comma and not CSV-escaped —
    commas/quotes inside title or content will break the file; switch to the
    csv module if downstream tooling needs strict CSV.
    """
    # Bug fix: the original encoded each row to bytes and wrote them to a
    # text-mode file, which raises TypeError; write str with explicit UTF-8.
    with open(filename, "a+", encoding="utf-8") as fo:
        fo.seek(0)
        first_line = fo.readline()
        # Only add the header when the first line does not already carry it.
        if first_line.find("post_title") == -1:
            fo.write("post_title,post_content\n")
        # In "a+" mode writes always land at end-of-file, matching the
        # original's seek-to-tell behavior.
        for data in datas:
            fo.write(data["title"] + ',' + data["content"] + '\n')
if __name__ == "__main__":
    # Collect article URLs from the saved menu page.
    http_urls = GetUrls("/Users/suibenzhi/MyFiles/临时文件夹/menu.txt")
    datas = []
    # Only the first URL is processed for now; widen the range to do more.
    for i in range(1):
        url = http_urls[i]
        record = {"title": "", "content": ""}
        GetData(url, record)
        HandleData(record)
        print("title:", record["title"])
        # Newest record goes to the front of the list.
        datas.insert(0, record)
    SaveToCSV("/Users/suibenzhi/MyFiles/临时文件夹/test.csv", datas)
    print(url)
    print(datas)