# BeautifulSoup vs. lxml: timing the same table scrape with each library
from datetime import datetime
import requests
from bs4 import BeautifulSoup as BSoup
from lxml import html
def get_html(url=None, timeout=30):
    """Download a web page and return its body as text.

    Args:
        url: Address to fetch. When omitted, the Wikipedia article
            "List of states and territories of the United States" is
            used, so existing zero-argument calls behave as before.
        timeout: Seconds to wait for the server before giving up.
            ``requests`` applies no timeout on its own, so without this
            a stalled connection would block the script indefinitely.

    Returns:
        The decoded response body (``Response.text``).

    Raises:
        requests.exceptions.RequestException: If the connection fails,
            the timeout expires, or the server answers with a 4xx/5xx
            status (raised by ``raise_for_status``).
    """
    if url is None:
        url = 'https://en.wikipedia.org/wiki/List_of_states_and_territories_of_the_United_States'
    response = requests.get(url, timeout=timeout)
    # Fail here rather than handing an error page to the scrapers, where
    # it would surface later as an unrelated indexing error.
    response.raise_for_status()
    return response.text
def bs_scraping(page_source, parser):
    """Extract rows from the first table of a page using BeautifulSoup.

    Args:
        page_source: HTML markup to parse.
        parser: Name of the parser backend passed straight through to
            BeautifulSoup (the script calls this with 'lxml',
            'html.parser' and 'html5lib').

    Returns:
        A list with one 7-item list of strings per table row, in the order
        [name, abbr, reps, water_km, land_km, total_km, population].
        ``name`` is the text of the row's first <th>; ``abbr`` is the first
        <td>; the remaining five are the <td> cells at positions -1, -2,
        -4, -6 and -8 counted from the end of the row. Text is returned
        exactly as ``get_text()`` yields it (no stripping).

    Raises:
        IndexError: If the page has no table, or a row has too few cells.
        AttributeError: If a row has no <th> element.
    """
    soup = BSoup(page_source, parser)
    first_table = soup.find_all('table')[0]

    # Cell positions, counted from the right-hand end of each row, for
    # reps, water_km, land_km, total_km and population respectively.
    from_the_end = (-1, -2, -4, -6, -8)

    records = []
    # The first two rows are skipped (presumably header rows - confirm
    # against the page layout).
    for tr in first_table.find_all('tr')[2:]:
        tds = tr.find_all('td')
        record = [tr.find('th').get_text(), tds[0].get_text()]
        record.extend(tds[pos].get_text() for pos in from_the_end)
        records.append(record)
    return records
def lxml_scraping(page_source):
    """Extract rows from the first content table of a page using lxml.

    Args:
        page_source: HTML markup to parse.

    Returns:
        A list with one 7-item list of strings per table row, in the order
        [name, abbr, reps, water_km, land_km, total_km, population].
        ``name`` is the text of the row's first <th> child; ``abbr`` is the
        first <td> child; the remaining five are the <td> children at
        positions -1, -2, -4, -6 and -8 counted from the end of the row.
        Text is returned exactly as ``text_content()`` yields it.

    Raises:
        IndexError: If the XPath matches no table, a row has no <th>
            child, or a row has too few <td> children.
    """
    document = html.fromstring(page_source)
    # First match of: a <table> that is the first table child of a <div>
    # directly under the element whose id is "mw-content-text".
    matches = document.xpath('//*[@id="mw-content-text"]/div/table[1]')
    first_table = matches[0]

    # Cell positions, counted from the right-hand end of each row, for
    # reps, water_km, land_km, total_km and population respectively.
    from_the_end = (-1, -2, -4, -6, -8)

    records = []
    # findall('tr') only matches <tr> elements that are direct children of
    # the table element.
    # NOTE(review): if the parsed markup wraps rows in <tbody>, this finds
    # no rows and the function returns [] - confirm against the live page.
    # The first two rows are skipped (presumably header rows - confirm).
    for tr in first_table.findall('tr')[2:]:
        heading = tr.xpath('./th')[0].text_content()
        tds = tr.xpath('./td')
        record = [heading, tds[0].text_content()]
        record.extend(tds[pos].text_content() for pos in from_the_end)
        records.append(record)
    return records
if __name__ == '__main__':
    # Benchmark driver: download the page once, then time each scraper on
    # the same markup so only parsing/extraction cost is compared.
    #
    # Imported here so this block carries its own dependencies.
    from datetime import timedelta
    from time import perf_counter

    page_source = get_html()

    # Intervals are measured with perf_counter(), a monotonic
    # high-resolution clock intended for timing; datetime.now() reads the
    # wall clock, which can jump if the system time is adjusted mid-run.
    # Each interval is wrapped in a timedelta so the printed value keeps
    # the H:MM:SS.ffffff format.
    bs_parsers = ['lxml', 'html.parser', 'html5lib']
    for parser in bs_parsers:
        bs_start = perf_counter()
        bs_result = bs_scraping(page_source, parser)
        bs_finish = timedelta(seconds=perf_counter() - bs_start)
        print('BeautifulSoup {} time: {}'.format(parser, bs_finish))

    # Plain lxml is timed once, after all BeautifulSoup backends.
    lxml_start = perf_counter()
    lxml_result = lxml_scraping(page_source)
    lxml_finish = timedelta(seconds=perf_counter() - lxml_start)
    print('lxml time:', lxml_finish)
# BeautifulSoup lxml time: 0:00:00.328582
# BeautifulSoup html.parser time: 0:00:00.484112
# BeautifulSoup html5lib time: 0:00:01.028619
#
# lxml time: 0:00:00.038192