"""Crawl Bing wallpapers from bing.ioliu.cn.

Author: zjdznl
Date: 2018-07-06 04:39
"""

import re
import os
import aiohttp
import asyncio
import requests
import time
from lxml import etree

# Pretend to be a desktop Chrome browser so the site serves normal pages.
headers = {
    'user-agent': 'Mozilla/5.0 (X11; Linux x86_64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.132 Safari/537.36'}
# Download errors collected by get_image_and_save(); reported at the end of main().
exception_list = []

async def fetch(url):
    """Download *url* with the crawler's browser headers and return the body as text."""
    async with aiohttp.ClientSession() as http:
        async with http.get(url, headers=headers) as reply:
            body = await reply.text()
            return body


async def get_html_url_list():
    """Return a generator yielding the URL of every listing page on the site.

    Fetches page 1 and reads the total page count from the pagination widget;
    falls back to 70 pages when the widget cannot be located.
    """
    base = "https://bing.ioliu.cn/?p={}"
    first_page = await fetch('https://bing.ioliu.cn/?p=1')
    tree = etree.HTML(first_page)
    # The pagination span holds text like "1 / 73"; the total is after the slash.
    span_text = tree.xpath('/html/body/div[@class="page"]/span/text()')
    total = int(str(span_text[0]).split('/')[-1]) if span_text else 70
    return (base.format(n) for n in range(1, total + 1))


async def get_image_and_save(session, url, save_dir='YOU_PATH'):
    """Stream the image at *url* to disk inside *save_dir*.

    The file name is taken from the last path segment of *url*. Any failure is
    recorded in the module-level ``exception_list`` instead of propagating, so
    one bad download does not abort the whole crawl; a partially written file
    is removed so the directory only ever holds complete images.

    :param session: an open aiohttp.ClientSession used for the request
    :param url: direct URL of the image to download
    :param save_dir: destination directory (defaults to the 'YOU_PATH'
        placeholder — replace it with a real path before running)
    """
    filename = os.path.join(save_dir, url.split('/')[-1])
    try:
        async with session.get(url, headers=headers) as resp:
            # Fail early on 4xx/5xx so an HTML error page is never saved as a .jpg.
            resp.raise_for_status()
            with open(filename, 'wb') as fd:
                while True:
                    chunk = await resp.content.read(1024)
                    if not chunk:
                        break
                    fd.write(chunk)
    except Exception as e:
        exception_list.append(e)
        # Drop any half-written file left behind by a mid-stream failure.
        if os.path.exists(filename):
            os.remove(filename)


async def get_all_images(image_url_list):
    """Download every image in *image_url_list*, at most 50 transfers in flight.

    The original code awaited each download inside the loop, which made the
    semaphore pointless and the downloads strictly sequential; scheduling one
    task per image and gathering them restores real concurrency.
    """
    sem = asyncio.Semaphore(50)

    async def _bounded_download(session, image_url):
        # The semaphore caps how many downloads run at once.
        async with sem:
            await get_image_and_save(session, image_url)

    # The image host uses a certificate aiohttp rejects; skip verification.
    conn = aiohttp.TCPConnector(verify_ssl=False)
    async with aiohttp.ClientSession(connector=conn) as session:
        await asyncio.gather(
            *(_bounded_download(session, url) for url in image_url_list))


def get_image_url_list(html_content_list):
    """Extract full-HD wallpaper URLs from the listing-page HTML.

    Each wallpaper URL appears three times in a page's markup, so every third
    match (starting at index 1) is kept, then its resolution suffix is swapped
    for ``1920x1080``.

    :param html_content_list: iterable of listing-page HTML strings
    :return: list of download URLs (the original returned a one-shot ``map``
        iterator, which was silently empty on any second pass)
    """
    pattern = re.compile(r"http://h1\.ioliu.*?jpg")  # compiled once, not per page
    matches = []
    for html in html_content_list:
        matches.extend(pattern.findall(html))
    # De-duplicate the triplicated markup: take one URL out of every three.
    picked = matches[1::3]
    # Replace whatever thumbnail resolution suffix is present with full HD.
    return ['_'.join(url.split('_')[:-1]) + '_1920x1080.jpg' for url in picked]


def main():
    """Crawl every listing page, then download all wallpapers concurrently.

    Prints total elapsed time and every download error collected during the run.
    """
    start_time = time.time()
    loop = asyncio.get_event_loop()

    # Discover how many listing pages exist, then fetch them all concurrently.
    url_list = loop.run_until_complete(get_html_url_list())
    html_content_tasks = [fetch(url) for url in url_list]
    html_content_list = loop.run_until_complete(asyncio.gather(*html_content_tasks))

    # Pull the image URLs out of the pages and download everything.
    # (The original wrapped this single coroutine in asyncio.gather for no gain.)
    image_url_list = get_image_url_list(html_content_list)
    loop.run_until_complete(get_all_images(image_url_list))

    cost_time = time.time() - start_time
    print('cost time:', round(cost_time, 2), 'seconds')

    # Report failures recorded by get_image_and_save().
    for e in exception_list:
        print(e)
    print("length :{}".format(len(exception_list)))

    loop.close()


# Script entry point: run the full crawl when executed directly.
if __name__ == '__main__':
    main()