# houlairen-1
# 9/30/2019 - 7:04 AM
#
# 一禅小和尚 语录 爬虫 (scraper for "Yi Chan the Little Monk" quotes)

#!/bin/python3

import requests
from lxml import etree

# Number of section positions queried on each page.
NUM_ITEM_OF_PAGE = 10
# Number of pages requested, counted from page 1.
NUM_PAGE = 8
# Separator printed after each section's text.
LINE = '--------------------------------------------------------------'


def get_content_of_page(etree_html, content_xpath_list):
    """Print the text found in each section position of one parsed page.

    For every position from 1 to ``NUM_ITEM_OF_PAGE`` the function builds the
    XPath of that section's text nodes, records the expression in
    ``content_xpath_list``, and prints each matching text fragment with all
    newlines and spaces removed. Fragments that become empty are skipped.
    The ``LINE`` separator is printed after every position, including
    positions that matched nothing.

    Args:
        etree_html: Parsed document exposing an ``xpath(expression)`` method.
        content_xpath_list: List that receives, in order, every XPath
            expression evaluated. It is mutated in place.
    """
    # Absolute, position-based path into the page markup; only the section
    # index varies, so the template is built once outside the loop.
    xpath_template = (
        '//*[@id="__layout"]/div/div[3]/div[1]/div[2]/div[2]'
        '/section[{}]/div[2]/a/span/span/span/text()'
    )

    # XPath positions are 1-based.
    for section_index in range(1, NUM_ITEM_OF_PAGE + 1):
        content_xpath = xpath_template.format(section_index)
        content_xpath_list.append(content_xpath)

        for fragment in etree_html.xpath(content_xpath):
            text = fragment.replace('\n', '').replace(' ', '')
            # Once newlines are stripped the text can never equal '\n', so
            # an emptiness check is the only filter required.
            if text:
                print(text)
        print(LINE)


def get_all_pages(content_xpath_list):
    """Download each page of the work and print the quotes it contains.

    Requests pages 1 through ``NUM_PAGE`` of the hard-coded work URL, parses
    every response body with ``etree.HTML`` and passes the resulting tree to
    ``get_content_of_page``.

    Args:
        content_xpath_list: List forwarded unchanged to
            ``get_content_of_page`` on every page.

    Raises:
        requests.exceptions.RequestException: If a request fails, including
            when the server does not answer within the timeout.
    """
    url_template = "https://www.juzikong.com/works/ac5e4867-5b49-4ea0-b8cc-c1777522ea5b?page={}"
    # requests applies no timeout unless one is given, so an unresponsive
    # server would otherwise block the script forever.
    timeout_seconds = 10

    # Page numbers in the query string start at 1.
    for page_number in range(1, NUM_PAGE + 1):
        response = requests.get(
            url_template.format(page_number),
            timeout=timeout_seconds,
        )
        get_content_of_page(
            etree_html=etree.HTML(response.text),
            content_xpath_list=content_xpath_list,
        )

if __name__ == "__main__":
    # get_all_pages needs a list to pass along to the per-page scraper; the
    # code shown here does not read it back after the run.
    content_xpath_list = []
    get_all_pages(content_xpath_list=content_xpath_list)