akash-coded
3/12/2020 - 1:52 PM

Web Scraping with Python

Text extraction from websites, also called web mining, is a major area of interest in today's connected world. This post walks through extracting text content from web pages with Python tools. Some of the tools covered take a focused approach to main-text extraction, while others are more naive but guarantee no omission of content.

The tools used are:

  • BoilerPy3
  • Readability with Beautiful Soup
  • Beautiful Soup with Google and Requests

1. Using BoilerPy3, a native Python port of Christian Kohlschütter's Boilerpipe, which automatically extracts the main textual content of a webpage based on shallow text features. It gives us the page title and its contents.

!pip install boilerpy3
from boilerpy3 import extractors

# CanolaExtractor is a general-purpose full-text extractor
extractor = extractors.CanolaExtractor()

# Fetch the page and parse it into a document with a title and main content
doc = extractor.get_doc_from_url('https://www.newsrain.in/petrol-diesel-prices/Odisha')
page_title = doc.title
page_contents = doc.content

print(page_title, end="\n\n")
print(page_contents)
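
BoilerPy3 ships several extraction strategies besides CanolaExtractor; for news-style pages, ArticleExtractor is usually a better default, while KeepEverythingExtractor keeps all text when completeness matters more than precision. A minimal sketch with the same URL (assuming the page is article-like):

from boilerpy3 import extractors

article_extractor = extractors.ArticleExtractor()
# get_content_from_url returns just the extracted text, skipping the Document object
content = article_extractor.get_content_from_url('https://www.newsrain.in/petrol-diesel-prices/Odisha')
print(content)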

2. Using readability-lxml alongside requests and bs4 to extract and clean the main page content (SSL verification warnings suppressed)

!pip install readability-lxml
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from readability import Document
from bs4 import BeautifulSoup
from bs4.element import Comment

requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

page_links = ["https://www.newsrain.in/petrol-diesel-prices/Odisha"]

def tag_visible(element):
    # Tags whose text content should not be treated as visible page text
    blacklist = ['style', 'label', '[document]', 'embed',
                 'noscript', 'header', 'html', 'iframe',
                 'meta', 'title', 'aside', 'footer',
                 'form', 'nav', 'head', 'link',
                 'br', 'input', 'script', 'figure']
    if element.parent.name in blacklist:
        return False
    if isinstance(element, Comment):
        return False
    return True


def text_from_html(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        return None
    if soup.body is None:  # Guard against pages without a <body>
        return None
    texts = soup.body.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return "\n".join(t.strip() for t in visible_texts)

all_retrieved_contents = []

for page_link in page_links:
    try:
        response = requests.get(page_link, verify=False)
    except requests.exceptions.RequestException:
        continue

    if response.status_code != 200:
        continue

    html_page = response.text
    doc = Document(html_page)
    cleaned_html_page = doc.summary()  # Readability-extracted main content
    print(doc.title(), end="\n\n")
    page_text = text_from_html(cleaned_html_page)
    if page_text is not None:
        all_retrieved_contents.append(page_text)

corpus = "\n".join(all_retrieved_contents)
print(corpus)
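
readability's Document exposes a little more than title() and summary(); a minimal sketch of the documented readability-lxml interface on a raw HTML string:

from readability import Document

html = "<html><head><title>Demo | Example Site</title></head><body><p>Hello world</p></body></html>"
doc = Document(html)

print(doc.title())        # Full <title> text
print(doc.short_title())  # Title with trailing site-name noise stripped
# html_partial=True returns the cleaned fragment without a wrapping <html> element
print(doc.summary(html_partial=True))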

3. Using the google library (googlesearch module) for link lookup, the requests package for HTML page retrieval, and bs4 for scraping page content (SSL verification warnings suppressed)

from googlesearch import search
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning 
from bs4 import BeautifulSoup
from bs4.element import Comment
import numpy as np

# Disable displaying SSL verification warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)



# Utility function to pick a random user-agent from a local file of UA strings
def get_random_ua():
    random_ua = None
    ua_file = 'ua_file.txt'
    try:
        with open(ua_file) as f:
            lines = f.read().splitlines()
        if lines:
            random_ua = lines[np.random.randint(len(lines))]
    except Exception:
        pass
    return random_ua
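
# Note (assumption): ua_file.txt is expected to hold one user-agent string
# per line, e.g.:
#   Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/80.0.3987.132 Safari/537.36
#   Mozilla/5.0 (X11; Linux x86_64; rv:68.0) Gecko/20100101 Firefox/68.0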


# Utility function to pick a random delay (in seconds) between requests
def get_random_delay():
    return round(np.random.uniform(2, 3), 4)

# Get top result URLs from Google for a query
def google_search(query, num_results=None):
    results = iter(())  # Empty iterator fallback if the search fails
    try:
        results = search(
            query, tld='co.in', lang='en',
            start=0, stop=num_results,
            pause=get_random_delay(),
            user_agent=get_random_ua()
        )
    except Exception:
        pass
    return results
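
# Example usage (illustrative query; search returns a lazy generator of URLs):
#   for url in google_search("petrol diesel prices Odisha", num_results=3):
#       print(url)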



# Helper function to download the HTML page of a site
def download_site(url, session):
    html_page = None
    headers = {'user-agent': get_random_ua()}
    try:
        with session.get(url, headers=headers,
                         timeout=3.5, verify=False) as response:
            if response.status_code == 200:
                html_page = response.text
    except requests.exceptions.RequestException:
        pass
    return html_page

# Retrieve html responses from all sites 
def download_all_sites(sites):
    all_html_pages = []
    with requests.Session() as session:
        for url in sites:
            html_page = download_site(url, session)
            if html_page is not None:
                all_html_pages.append(html_page)
    return all_html_pages
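
# Optional sketch: the same downloads fanned out over a thread pool.
# This is an illustrative variant (ThreadPool and the worker count are
# assumptions, not part of the sequential flow above).
from multiprocessing.pool import ThreadPool

def download_all_sites_parallel(sites, workers=4):
    with requests.Session() as session, ThreadPool(workers) as pool:
        pages = pool.map(lambda url: download_site(url, session), sites)
    return [page for page in pages if page is not None]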



# Helper function to extract text from the web page's source code
def text_from_html(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
    except Exception:
        return None
    if soup.body is None:  # Guard against pages without a <body>
        return None
    def tag_visible(element):  # Helper function to filter out non-content HTML tags
        blacklist = ['style', 'label', '[document]', 'embed', 'img', 'object',
                     'noscript', 'header', 'html', 'iframe', 'audio', 'picture',
                     'meta', 'title', 'aside', 'footer', 'svg', 'base', 'figure',
                     'form', 'nav', 'head', 'link', 'button', 'source', 'canvas',
                     'br', 'input', 'script', 'wbr', 'video', 'param', 'hr']
        if element.parent.name in blacklist:
            return False
        if isinstance(element, Comment):
            return False
        return True
    texts = soup.body.find_all(string=True)
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)

# Extract textual content from all pages
def extract_text(html_pages):
    textual_contents = []
    for page_html in html_pages:
        page_text = text_from_html(page_html)
        if page_text is not None:
            textual_contents.append(page_text)
    return textual_contents



# Get a list of relevant text documents for the input query
def fetch_text_results(query):
    text_results = []
    sites = google_search(query, num_results=5)  # Obtain the top 5 result URLs
    html_pages = download_all_sites(sites)  # Get HTML from URLs
    if html_pages:
        text_results = extract_text(html_pages) # Extract texts from HTML
    return text_results

# Get an agglomerated string from multiple relevant documents for a query
def fetch_merged_results(query):
    results_list = fetch_text_results(query)
    results_string = "\n".join(results_list)  # Join the list into a string
    return results_string



query_string = "Who is the Prime Minister of India"
results = fetch_merged_results(query_string)
if not results:
    print("NA")
else:
    print(results)