dmitriiweb
10/29/2017 - 6:18 PM

BeautifulSoup vs lxml

BeautifulSoup vs lxml

from datetime import datetime
import requests
from bs4 import BeautifulSoup as BSoup
from lxml import html


def get_html(
    url='https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States',
    timeout=30,
):
    """Download a page and return its body as decoded text.

    Args:
        url: Address to fetch. Defaults to the Wikipedia list of U.S. states
            and territories that the scraping functions in this file parse.
        timeout: Seconds passed to ``requests.get`` as its ``timeout``
            argument, so a stalled connection cannot hang the script forever.

    Returns:
        The response body as a ``str`` (``Response.text``).

    Raises:
        requests.HTTPError: If the server answers with a 4xx/5xx status.
        requests.RequestException: For connection problems or timeouts.
    """
    response = requests.get(url, timeout=timeout)
    # Fail here with a clear HTTP error rather than handing an error page to
    # the parsers, where it would surface as an unrelated IndexError.
    response.raise_for_status()
    return response.text


def bs_scraping(page_source, parser):
    """Extract rows from the first table of a page using BeautifulSoup.

    Args:
        page_source: HTML markup to parse.
        parser: Name of the parser backend handed to BeautifulSoup
            (the script passes 'lxml', 'html.parser' and 'html5lib').

    Returns:
        A list with one 7-item list per processed row, in the order
        [name, abbr, reps, water_km, land_km, total_km, population].
        ``name`` is the text of the row's first ``<th>``; the remaining
        values are the text of ``<td>`` cells picked by position.
    """
    soup = BSoup(page_source, parser)
    first_table = soup.find_all('table')[0]

    # <td> positions in output order:
    # abbr, reps, water_km, land_km, total_km, population.
    td_positions = (0, -1, -2, -4, -6, -8)

    records = []
    # The first two <tr> elements are skipped (presumably header rows).
    for tr in first_table.find_all('tr')[2:]:
        tds = tr.find_all('td')
        record = [tr.find('th').get_text()]
        record.extend(tds[pos].get_text() for pos in td_positions)
        records.append(record)
    return records


def lxml_scraping(page_source):
    """Extract rows from the first content table of a page using lxml.

    Args:
        page_source: HTML markup to parse.

    Returns:
        A list with one 7-item list per processed row, in the order
        [name, abbr, reps, water_km, land_km, total_km, population].
        ``name`` is the text content of the row's first ``<th>`` child; the
        remaining values are the text content of ``<td>`` children picked by
        position.

    Raises:
        IndexError: If the table is not found, or a processed row lacks a
            ``<th>`` child or has too few ``<td>`` children.
    """
    tree = html.fromstring(page_source)
    table = tree.xpath('//*[@id="mw-content-text"]/div/table[1]')[0]
    # Search descendants rather than direct children: when the markup wraps
    # rows in <tbody>/<thead>, ``findall('tr')`` matches nothing and the
    # function would silently return an empty list. A descendant search also
    # mirrors the recursive ``find_all('tr')`` used by ``bs_scraping``.
    rows = table.xpath('.//tr')
    data = []
    # The first two <tr> elements are skipped (presumably header rows).
    for row in rows[2:]:
        name = row.xpath('./th')[0].text_content()
        cells = row.xpath('./td')
        abbr = cells[0].text_content()
        reps = cells[-1].text_content()
        water_km = cells[-2].text_content()
        land_km = cells[-4].text_content()
        total_km = cells[-6].text_content()
        population = cells[-8].text_content()
        data.append([name, abbr, reps, water_km, land_km, total_km, population])
    return data


if __name__ == '__main__':
    # Imported here so this script block is self-contained.
    from datetime import timedelta
    from time import perf_counter

    # Fetch the page once so every parser is timed on identical input and
    # network latency is excluded from the measurements.
    page_source = get_html()

    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        # perf_counter() is a monotonic, high-resolution clock; unlike
        # datetime.now() it is unaffected by system clock adjustments.
        bs_start = perf_counter()
        bs_result = bs_scraping(page_source, parser)
        # Wrapped in timedelta so the printed value keeps the H:MM:SS.ffffff form.
        bs_finish = timedelta(seconds=perf_counter() - bs_start)
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    lxml_start = perf_counter()
    lxml_result = lxml_scraping(page_source)
    lxml_finish = timedelta(seconds=perf_counter() - lxml_start)
    print('lxml time:', lxml_finish)

    # Sample output recorded by the original author:
    #
    # BeautifulSoup lxml time: 0:00:00.328582
    # BeautifulSoup html.parser time: 0:00:00.484112
    # BeautifulSoup html5lib time: 0:00:01.028619
    #
    # lxml time: 0:00:00.038192