# Screen Scraping with BeautifulSoup for Python. http://www.crummy.com/software/BeautifulSoup/bs4/doc/
import re

from bs4 import BeautifulSoup
# Parse the markup. `html_doc` is a placeholder for a string holding the HTML
# to scrape; it is not defined in these notes.
soup = BeautifulSoup(html_doc, 'html.parser')
print(soup.prettify())  # pretty-print the parsed tree

# Navigate to a tag by name. `kind_of_tag` is a placeholder for a real tag
# name (a, p, title, ...).
tag = soup.kind_of_tag  # shorthand for soup.find('kind_of_tag'): first match, or None
tag.name  # name of the tag
tag.string  # text of the tag; None when the tag has more than one child (use tag.get_text())
tag.attrs  # attributes of the tag as a dictionary
tag['attribute_name']  # access to an attribute (raises KeyError when absent)
tag.get('attribute_name')  # same, but returns None when the attribute is absent
# NOTE: tag.attribute_name is NOT attribute access. It is shorthand for
# tag.find('attribute_name'), i.e. it looks for a child *tag* with that name.

# search for tags
soup.find_all('tag-name')  # find all 'tag-name' tags and return a list of them
for link in soup.find_all('a'):
    print(link.get('href'))

# search for tags (with attributes of them)
# Any argument that's not recognized will be turned into a filter on one of a tag's attributes
soup.find(id='my_id')
soup.find(id=True)  # any tag that has an id attribute, whatever its value
soup.find_all(href=re.compile(r'\.es$'))  # href ends with '.es' (dot escaped, raw string)
# Some attributes, like the data-* attributes in HTML 5, have names that can't be
# used as the names of keyword arguments; pass them through attrs instead
soup.find_all(attrs={'data-foo': 'value'})
soup.find_all('a', attrs={'class': 'sister'})
# `class` is a reserved word in Python, so the keyword is spelled class_
soup.find_all(class_=re.compile('title'))  # search by CSS class: <p class="title"><b>The Dormouse's story</b></p>
soup.find_all('a', class_='sister')  # <a class="sister" href="http://example.com/elsie" id="link1">Elsie</a>

# search for strings instead of tags
# (`string=` replaces the older `text=` keyword, renamed in Beautiful Soup 4.4.0)
soup.find_all(string='Elsie')  # returns the matching strings themselves
soup.find_all('a', string='Elsie')  # returns the <a> tags whose .string is 'Elsie'