normal.py
# Scraping one-liners kept for quick reference. Each line is an independent
# fragment and relies on names (data, rownode, node, floor) that the
# surrounding scraper provides.
# Regex patterns are raw strings so "\d" and "\(" reach the regex engine
# verbatim instead of being flagged as invalid string escapes.

# Every <div> whose id matches "post_<digits>".
rownodes = data.find_all('div', id=re.compile(r'post_(\d+)'))
# Concatenated text of the first selected node, without surrounding whitespace.
content = node[0].xpath('string(.)').strip()
# NOTE(review): XPath contains() is a plain substring test, not a regex, so
# this only selects anchors whose name literally contains "floor_(\d+)" --
# confirm whether contains(@name, "floor_") was intended.
node = rownode.xpath(r'.//p[@class="post-timestamp"]/a[contains(@name,"floor_(\d+)")]')
# Pull the argument out of a write('...') call embedded in the text.
floor = re.search(r"write\('(.*)'\)", floor).group(1)
# Text nodes below the reply body, from either its <div> or <font> children.
node = rownode.xpath('.//div[@class="post_msg replyBody"]/div//text() | .//div[@class="post_msg replyBody"]/font//text()')
def parse_item(self, response):
    """Extract the id, name and description cells from an item page.

    The page URL is logged first, then each field is read from the
    ``<td>`` with the matching ``id`` attribute.

    Args:
        response: page response exposing ``url`` and ``xpath()``.

    Returns:
        dict: keys ``'id'``, ``'name'`` and ``'description'``, each holding
        whatever the selector's ``re()`` / ``extract()`` call returned.
    """
    self.log('Hi, this is an item page! %s' % response.url)
    # A bare scrapy.Item() declares no fields, so assigning item['id'] raises
    # KeyError. Scrapy accepts plain dicts as items, so build one instead.
    return {
        'id': response.xpath('//td[@id="item_id"]/text()').re(r'ID: (\d+)'),
        'name': response.xpath('//td[@id="item_name"]/text()').extract(),
        'description': response.xpath('//td[@id="item_description"]/text()').extract(),
    }
def parsePosterName(rownode):
    """Return the poster name found in one post row.

    The name is looked up with the selector ``div.authi a.xw1`` first and
    with ``td.pls div.pi`` as a fallback. When the element text carries the
    "user has been deleted" marker, the text in front of the marker is
    returned; otherwise the element's ``string`` attribute is returned.

    Raises:
        NameError: neither selector matched anything.
    """
    candidates = rownode.select('div.authi a.xw1')
    if not candidates:
        candidates = rownode.select('td.pls div.pi')
    if not candidates:
        raise NameError('Can not parse PosterName!')

    first = candidates[0]
    deleted = re.search('(.*)该用户已被删除', first.get_text())
    if deleted is None:
        return first.string
    return deleted.group(1)
def getExcel(data):
    """Write the scraped rows to a new ``<siteid>_<timestamp>.xlsx`` workbook.

    Row 0 of the sheet receives the column headings and row ``i + 1``
    receives ``data[i]``. Only the first ``len(headings)`` cells of each
    record are written. The heading row is emitted while the first record is
    processed, so an empty ``data`` produces a sheet without headings.

    Args:
        data: indexable sequence of records, each indexable by column number.
    """
    headings = ['siteid','subject','content','dateOfPost','floor','posterName','posterURL','posterID','threadURL','isTopicPost','pageNum']
    stamp = time.strftime('%Y-%m-%d %H-%M-%S', time.localtime())
    base_name = ''.join([str(theSiteid), '_', str(stamp)])
    workbook = wx.Workbook(base_name + '.xlsx')
    worksheet = workbook.add_worksheet()

    column_count = len(headings)
    for row_idx, record in enumerate(data):
        for col in range(column_count):
            if row_idx == 0:
                # Heading cell goes out just before the first data cell of the column.
                worksheet.write(0, col, headings[col])
            worksheet.write(row_idx + 1, col, record[col])
    workbook.close()
文件夹框打开方式
import win32ui
# Ask the user for an input file through the Win32 file dialog and load its
# lines into ``mainlists``.
clr = Color()
clr.print_green_text('Enter to Open File')
dlg = win32ui.CreateFileDialog(1)  # 1 selects the "Open file" dialog
dlg.SetOFNInitialDir('C:/')  # directory initially shown by the dialog
dlg.DoModal()
filename = dlg.GetPathName()
clr.print_green_text('Open File or directory: ' + filename)
# Alternative kept for reference: read a fixed file from the working directory.
# f = open(os.getcwd()+r'/indexCrawl.txt','rb')
if not filename:
    # Nothing was selected, so there is nothing to process.
    sys.exit(0)
# Lines are read as bytes. ``with`` closes the handle even if reading fails.
with open(filename, 'rb') as f:
    mainlists = f.readlines()
线程使用
#coding:utf-8
import threading
import time
# Split ``mainlists`` into at most ten slices and process every slice in its
# own thread via ``threadMain``.
list_thread = []
## realthreadnum=threadnum
# Despite its name, ``threadnum`` is the number of items handed to each
# thread. Integer ceiling division keeps it an int on every Python version,
# and the clamp to 1 stops range() failing with a zero step when
# ``mainlists`` is empty.
threadnum = max(1, -(-len(mainlists) // 10))
for i in range(0, len(mainlists), threadnum):
    list_thread.append(mainlists[i:i + threadnum])
threads = []
for i in list_thread:
    threads.append(threading.Thread(target=threadMain, args=(i,)))
print('============== start in threading ==============')
for t in threads:
    # Attribute form replaces the deprecated setDaemon().
    t.daemon = True
    t.start()
# Wait for every worker before carrying on.
for t in threads:
    t.join()
def action(arg):
    """Demo thread body: report the current thread's name and *arg*.

    Sleeps one second before and one second after printing.

    Args:
        arg: any value; it is formatted with ``%s``.
    """
    time.sleep(1)
    # print() calls with a single pre-formatted string behave the same on
    # Python 2 and 3; the original print statements were Python 2 only.
    print('sub thread start!the thread name is:%s ' % threading.current_thread().name)
    # Wrapping arg in a 1-tuple keeps the formatting correct when arg itself
    # is a tuple.
    print('the arg is:%s ' % (arg,))
    time.sleep(1)
# Start four daemon threads running action(i) and wait for all of them.
thread_list = []  # holds the worker threads
for i in range(4):  # range() works on Python 2 and 3; xrange is Python 2 only
    t = threading.Thread(target=action, args=(i,))
    t.daemon = True  # attribute form replaces the deprecated setDaemon()
    thread_list.append(t)
for t in thread_list:
    t.start()
for t in thread_list:
    t.join()
excel读取
# Load the values of column index 3 from the 'Thread' sheet of the workbook
# named by ``filename`` and report how many values were read.
wr = xlrd.open_workbook(filename).sheet_by_name('Thread').col_values(3)
print("".join(["ALL ", str(len(wr)), " threads!"]))
excel 插入数据
import xlwt
import xlrd
from xlutils.copy import copy
# Step 1: create a new .xls file containing only a bold red header row.
styleBoldRed = xlwt.easyxf('font: color-index red, bold on')
headerStyle = styleBoldRed
wb = xlwt.Workbook()
ws = wb.add_sheet(gConst['xls']['sheetName'])
ws.write(0, 0, "Header", headerStyle)
ws.write(0, 1, "CatalogNumber", headerStyle)
ws.write(0, 2, "PartNumber", headerStyle)
wb.save(gConst['xls']['fileName'])

# Step 2: re-open the existing .xls file and add a data row.
# xlutils' copy() is given the xlrd Book opened below; the earlier attempts
# that passed the file name directly are kept for reference:
#newWb = xlutils.copy(gConst['xls']['fileName'])
#newWb = copy(gConst['xls']['fileName'])
oldWb = xlrd.open_workbook(gConst['xls']['fileName'], formatting_info=True)
# print() calls replace the Python 2 print statements so the snippet runs on
# Python 3 as well.
print(oldWb)  # e.g. <xlrd.book.Book object at 0x000000000315C940>
newWb = copy(oldWb)
print(newWb)  # e.g. <xlwt.Workbook.Workbook object at 0x000000000315F470>
newWs = newWb.get_sheet(0)
newWs.write(1, 0, "value1")
newWs.write(1, 1, "value2")
newWs.write(1, 2, "value3")
print("write new values ok")
# Saving under the same name overwrites the file created in step 1.
newWb.save(gConst['xls']['fileName'])
print("save with same name ok")
MYSQL操作
class SelectMySQL(object):
    """Run a query against MySQL and optionally dump the result to a file.

    The connection settings (``host``, ``port``, ``user``, ``passwd``,
    ``db``) are read from module-level names.
    """

    def select_data(self, sql):
        """Execute *sql* and return the first column of every fetched row.

        Failures are reported on stdout instead of being raised (best
        effort), so the returned list may be empty or partial.

        Args:
            sql: statement passed verbatim to ``cursor.execute``. It must not
                be assembled from untrusted input.

        Returns:
            list: ``row[0]`` for each row returned by ``fetchall()``.
        """
        result = []
        # Pre-bind both handles so the cleanup below cannot hit an unbound
        # name when connect() or cursor() fails.
        conn = None
        cur = None
        try:
            conn = MySQLdb.connect(host=host,
                                   port=port,
                                   user=user,
                                   passwd=passwd,
                                   db=db,
                                   charset='utf8', )
            cur = conn.cursor()
            cur.execute(sql)
            alldata = cur.fetchall()
            for rec in alldata:
                result.append(rec[0])
        except Exception as e:
            # str + exception raises TypeError, so convert explicitly.
            print('Error msg: ' + str(e))
        finally:
            if cur is not None:
                cur.close()
            if conn is not None:
                conn.close()
        return result

    def get_result(self, sql, filename):
        """Run *sql*, write one result per line to *filename*, return the list.

        Args:
            sql: statement forwarded to :meth:`select_data`.
            filename: path of the text file to (over)write.

        Returns:
            list: the values produced by :meth:`select_data`.
        """
        print(sql)
        results = self.select_data(sql)
        print('The amount of datas: %d' % (len(results)))
        with open(filename, 'w') as f:
            for result in results:
                f.write(str(result) + '\n')
        print('Data write is over!')
        return results
tutorial
>>> # flatten a list using a listcomp with two 'for'
>>> vec = [[1,2,3], [4,5,6], [7,8,9]]
>>> [num for elem in vec for num in elem]
[1, 2, 3, 4, 5, 6, 7, 8, 9]
------
>>> from math import pi
>>> [str(round(pi, i)) for i in range(1, 6)]
['3.1', '3.14', '3.142', '3.1416', '3.14159']
------
# Sample matrix with 3 rows and 4 columns.
matrix = [
    [1, 2, 3, 4],
    [5, 6, 7, 8],
    [9, 10, 11, 12],
]
# Column count, taken from the first row.
print(len(matrix[0]))
# Transpose with a nested list comprehension: the outer loop walks the column
# positions of the first row, the inner one collects that column from every row.
print([[row[col] for row in matrix] for col, _ in enumerate(matrix[0])])
# Output:
# 4
# [[1, 5, 9], [2, 6, 10], [3, 7, 11], [4, 8, 12]]
继承顺序问题
class Init(object):
    """Root of the demo hierarchy: stores the constructor argument in ``val``."""
    def __init__(self, value):
        self.val = value
class Add2(Init):
    """Runs the next ``__init__`` in the MRO, then adds 2 to ``val``."""
    def __init__(self, val):
        # super() walks the MRO of type(self), so the class called here
        # depends on the concrete instance, not only on Add2's own base.
        super(Add2, self).__init__(val)
        self.val += 2
class Mul5(Init):
    """Runs the next ``__init__`` in the MRO, then multiplies ``val`` by 5."""
    def __init__(self, val):
        # For an instance whose MRO places Add2 after Mul5 (see Pro), this
        # call reaches Add2.__init__ rather than Init.__init__.
        super(Mul5, self).__init__(val)
        self.val *= 5
class Pro(Mul5, Add2):
    """Combines both mixins; MRO is Pro -> Mul5 -> Add2 -> Init -> object."""
    pass
class Incr(Pro):
    """Runs the initializer chain that follows Pro in the MRO, then adds 1."""
    # One-argument super(Pro) creates an *unbound* super object. Stored as a
    # class attribute it acts as a descriptor: reading ``self.csup`` binds it
    # to the instance, which is equivalent to super(Pro, self).
    csup = super(Pro)
    def __init__(self, val):
        # Lookup starts after Pro in type(self)'s MRO, i.e. at Mul5.__init__.
        self.csup.__init__(val)
        self.val += 1
# Call order: Incr -> Mul5 -> Add2 -> Init. Init stores 5, Add2 makes it 7,
# Mul5 makes it 35, Incr adds 1, so this prints 36.
p = Incr(5)
print(p.val)