ficapy
8/7/2015 - 4:17 AM

多线程请求

多线程请求

#!/usr/bin/env python
# -*- coding: utf-8 -*-
# Author: Ficapy
# Create: '15/8/6'

import time
import math
import requests
from concurrent.futures import ThreadPoolExecutor, as_completed
from xlrd import open_workbook

# Account credentials used for the login request below (placeholder values).
USERNAME = 'XXXXXXXXX'
PASSWORD = 'XXXXXXXXX'

# A single session is shared by the login and by every later query.
session = requests.Session()

# Browser-like headers attached to every request sent through the session.
headers = {
    'User-Agent': 'Mozilla/4.0 (compatible; MSIE 8.0; Windows NT 6.1; WOW64; Trident/4.0; SLCC2; .NET CLR 2.0.'
                  '50727; .NET CLR 3.5.30729; .NET CLR 3.0.30729; Media Center PC 6.0; .NET4.0C; .NET4.0E)',
    'Host': 'www.szceb.com',
    'Accept-Encoding': 'gzip, deflate, sdch',
    'Accept-Language': 'zh-CN,zh;q=0.8,en;q=0.6',
    'Cache-Control': 'no-cache',
    'Connection': 'keep-alive',
}

session.headers.update(headers)

# Collect the values of the second column of the first sheet, skipping the
# first row (presumably a header row -- confirm against the workbook).
raw_data = []
with open_workbook('1.xls') as book:
    sheet = book.sheet_by_index(0)
    row_number = sheet.nrows
    for row in range(1, row_number):
        raw_data.append(sheet.cell(row, 1).value)


# Simulate a login. The site has a flaw: it does not check that the captcha
# is valid, so a dummy value is submitted for it.
session.get('http://www.szceb.com/szceb/login.jsp')
loging_url = 'http://www.szceb.com/szceb/login.do?method=login'
login = session.post(loging_url, data={
    'loginSignal': 1,
    'username': USERNAME,
    'password': PASSWORD,
    'yanzheng': 1234,
})

# Explicit check rather than ``assert`` so that it still runs under
# ``python -O``, where assert statements are removed.
if '登陆后显示的主页面' not in login.text:
    raise RuntimeError('login failed: landing-page marker not found in the response')


def work(input_data, retry=3):
    """Query the filing status of one item number, retrying on request errors.

    Posts ``input_data`` as ``goodsRegCiqList.itemNo`` to the filing-status
    endpoint through the module-level ``session``.

    Args:
        input_data: Item number to look up.
        retry: Number of extra attempts made after the first one fails.
            Values below zero are treated as zero.

    Returns:
        ``input_data`` when the response page contains the "no data found"
        marker, otherwise ``True``.

    Raises:
        ValueError: Carrying ``input_data`` once every attempt has failed;
            the last request error is attached as ``__cause__``.
    """
    url = 'http://www.szceb.com/szceb/goodsFilingStatus.do?method=filingStatus'

    data = {
        'goodsRegCiqList.itemNo': input_data,
        'goodsRegCiqList.GName': '',
        'goodsRegCiq.ebcCode': '',
        'goodsRegCiq.ebpCode': '',
        'goodsRegCiqList.operType': '',
        'goodsRegCiqList.ciqStatus': '',
        'strStatDate': '',
        'strEndDate': '',
        'page': 1,
    }

    retries = max(retry, 0)
    for attempt in range(retries + 1):
        try:
            result = session.post(url, data=data, timeout=60)
            result.raise_for_status()
        except requests.RequestException as exc:
            # Out of retries: fail right away instead of sleeping and
            # announcing a retry that will never happen.
            if attempt >= retries:
                raise ValueError(input_data) from exc
            # Exponential backoff that grows with each failure: 1s, 2s, 4s, ...
            time.sleep(2 ** attempt)
            print('{}重试第{}次'.format(input_data, attempt + 1))
        else:
            if '没有找到任何数据' in result.text:
                return input_data
            return True


def main():
    """Look up every entry of ``raw_data`` concurrently and report the misses.

    Each entry is submitted to ``work`` on a pool of 10 threads. Entries for
    which ``work`` returns anything other than ``True`` are collected as
    "no data" items; entries whose lookup raises are reported on stdout and
    otherwise skipped. A summary and the sorted list of "no data" items are
    printed at the end.
    """
    non_data = []
    print('开始咯~~~')
    with ThreadPoolExecutor(max_workers=10) as executor:
        # Map each future back to the item it was submitted for, because
        # as_completed yields futures in completion order.
        future_to_item = {executor.submit(work, item): item for item in raw_data}
        for future in as_completed(future_to_item):
            item = future_to_item[future]
            try:
                result = future.result()
            except Exception as exc:
                print('%r 请求出错: %s' % (item, exc))
                continue
            if result is not True:
                non_data.append(item)
    print('共查询{}条数据, 没有任何数据的有{}条'.format(len(raw_data), len(non_data)))
    print('以下条目没有查询到数据:')
    # Workbook cell values are not guaranteed to be strings (xlrd returns
    # floats for numeric cells), and str.join rejects non-string items.
    print('\n'.join(sorted(str(item) for item in non_data)))


# Run the batch lookup only when the file is executed as a script.
if __name__ == '__main__':
    main()