Text extraction from websites, or Web Mining, is a major area of interest in today's connected world. This post walks through extracting text content from web pages with Python tools. Some of the tools covered here take a focused approach, isolating the main text of a page, while others are more naive but guarantee that no content is omitted.
The tools used are boilerpy3, readability-lxml (with BeautifulSoup), and googlesearch (with requests and BeautifulSoup).
!pip install boilerpy3
from boilerpy3 import extractors

# CanolaExtractor is a general-purpose full-text extractor
extractor = extractors.CanolaExtractor()

# Fetch the page and extract its title and main content in one step
doc = extractor.get_doc_from_url('https://www.newsrain.in/petrol-diesel-prices/Odisha')
page_title = doc.title
page_contents = doc.content

print(page_title, end="\n\n")
print(page_contents)
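boilerpy3 ships several other extractors as well. For example, ArticleExtractor is tuned for news-style pages, and every extractor can also run on HTML that has already been downloaded. A minimal sketch (the HTML string below is a made-up placeholder):

from boilerpy3 import extractors

# ArticleExtractor targets news-article pages; get_content works on raw HTML
article_extractor = extractors.ArticleExtractor()

html = "<html><body><h1>Title</h1><p>Some article text...</p></body></html>"  # placeholder HTML
content = article_extractor.get_content(html)
print(content)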
!pip install readability-lxml
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from readability import Document
from bs4 import BeautifulSoup
from bs4.element import Comment

# Disable displaying SSL verification warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)

# URLs of the pages to extract text from
page_links = ["https://www.newsrain.in/petrol-diesel-prices/Odisha"]
# Helper function to filter out text nodes under non-content HTML tags
def tag_visible(element):
    blacklist = ['style', 'label', '[document]', 'embed',
                 'noscript', 'header', 'html', 'iframe',
                 'meta', 'title', 'aside', 'footer',
                 'form', 'nav', 'head', 'link',
                 'br', 'input', 'script', 'figure']
    if element.parent.name in blacklist:
        return False
    if isinstance(element, Comment):
        return False
    return True
# Helper function to extract the visible text from a page's source code
def text_from_html(html):
    try:
        soup = BeautifulSoup(html, 'lxml')
        texts = soup.body.find_all(string=True)  # AttributeError if there is no <body>
    except AttributeError:
        return None
    visible_texts = filter(tag_visible, texts)
    return "\n".join(t.strip() for t in visible_texts)
all_retrieved_contents = []
for page_link in page_links:
    try:
        response = requests.get(page_link, verify=False)
    except requests.exceptions.RequestException:
        continue
    if response.status_code != 200:
        continue
    html_page = response.text
    # readability's Document isolates the main article HTML
    doc = Document(html_page)
    cleaned_html_page = doc.summary()
    print(doc.title(), end="\n\n")
    page_text = text_from_html(cleaned_html_page)
    if page_text is not None:
        all_retrieved_contents.append(page_text)
corpus = "\n".join(all_retrieved_contents)
print(corpus)
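readability's Document offers a couple of further conveniences worth knowing about. The sketch below shows short_title(), which strips site-name cruft from the page title, and the html_partial flag of summary(), which returns just the extracted fragment without a wrapping document; take it as an illustration rather than a full API tour:

import requests
from readability import Document

# Reusing the example URL from above
response = requests.get("https://www.newsrain.in/petrol-diesel-prices/Odisha", verify=False)
doc = Document(response.text)

print(doc.short_title())                   # title with site-name cruft stripped
fragment = doc.summary(html_partial=True)  # extracted content without an <html> wrapper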
!pip install google  # the 'google' package provides the googlesearch module used below
from googlesearch import search
import requests
from requests.packages.urllib3.exceptions import InsecureRequestWarning
from bs4 import BeautifulSoup
from bs4.element import Comment
import numpy as np

# Disable displaying SSL verification warnings
requests.packages.urllib3.disable_warnings(InsecureRequestWarning)
# Utility function to pick a random user-agent from a local file
def get_random_ua():
    random_ua = None
    ua_file = 'ua_file.txt'
    try:
        with open(ua_file) as f:
            lines = [line.strip() for line in f if line.strip()]
        if lines:
            prng = np.random.RandomState()
            random_ua = lines[prng.randint(len(lines))]
    except Exception:
        pass
    finally:
        return random_ua
# Utility function to pick a random delay
def get_random_delay():
    delay = 2.0
    try:
        random_num = np.random.uniform(2, 3)
        delay = round(random_num, 4)
    except Exception:
        pass
    finally:
        return delay
# Get top result URLs from Google for a query
def google_search(query, num_results=None):
    def empty():  # Empty generator, returned when the search fails
        yield from ()
    results = empty()
    try:
        results = search(
            query, tld='co.in', lang='en',
            start=0, stop=num_results,
            pause=get_random_delay(),
            user_agent=get_random_ua()
        )
    except Exception:
        pass
    finally:
        return results
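As a quick sanity check of the search helper (this performs a live query, needs network access, and may be rate-limited by Google, so the URLs will vary):

sample_urls = google_search("current petrol price in Odisha", num_results=3)
for url in sample_urls:
    print(url)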
# Helper function to download the HTML page of a site
def download_site(url, session):
    html_page = None
    user_agent = get_random_ua()
    headers = {
        'user-agent': user_agent,
    }
    try:
        with session.get(url, headers=headers,
                         timeout=3.5, verify=False) as response:
            if response.status_code == 200:
                html_page = response.text
    except requests.exceptions.RequestException:
        pass
    finally:
        return html_page
# Retrieve HTML responses from all sites
def download_all_sites(sites):
    all_html_pages = []
    with requests.Session() as session:
        for url in sites:
            html_page = download_site(url, session)
            if html_page is not None:
                all_html_pages.append(html_page)
    return all_html_pages
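Fetching pages one at a time is the slowest part of the pipeline. One possible variation, not part of the original code, is to download them concurrently with a thread pool; the sketch below reuses download_site and gives each worker thread its own Session, since requests sessions are not guaranteed to be thread-safe:

from concurrent.futures import ThreadPoolExecutor
import threading

import requests

# One Session per worker thread
thread_local = threading.local()

def get_session():
    if not hasattr(thread_local, "session"):
        thread_local.session = requests.Session()
    return thread_local.session

# Hypothetical threaded variant of download_all_sites
def download_all_sites_threaded(sites, max_workers=5):
    def fetch(url):
        return download_site(url, get_session())
    with ThreadPoolExecutor(max_workers=max_workers) as executor:
        pages = list(executor.map(fetch, sites))
    return [page for page in pages if page is not None]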
# Helper function to extract text from the web page's source code
def text_from_html(html):
    def tag_visible(element):  # Helper function to filter out futile HTML tags
        blacklist = ['style', 'label', '[document]', 'embed', 'img', 'object',
                     'noscript', 'header', 'html', 'iframe', 'audio', 'picture',
                     'meta', 'title', 'aside', 'footer', 'svg', 'base', 'figure',
                     'form', 'nav', 'head', 'link', 'button', 'source', 'canvas',
                     'br', 'input', 'script', 'wbr', 'video', 'param', 'hr']
        if element.parent.name in blacklist:
            return False
        if isinstance(element, Comment):
            return False
        return True
    try:
        soup = BeautifulSoup(html, 'lxml')
        texts = soup.body.find_all(string=True)  # AttributeError if there is no <body>
    except AttributeError:
        return None
    visible_texts = filter(tag_visible, texts)
    return " ".join(t.strip() for t in visible_texts)
# Extract textual content from all pages
def extract_text(html_pages):
    textual_contents = []
    for page_html in html_pages:
        page_text = text_from_html(page_html)
        if page_text is not None:
            textual_contents.append(page_text)
    return textual_contents
# Get a list of relevant text documents for the input query
def fetch_text_results(query):
    text_results = []
    sites = google_search(query, num_results=5)  # Obtain the top 5 URLs
    html_pages = download_all_sites(sites)       # Get HTML from the URLs
    if html_pages:
        text_results = extract_text(html_pages)  # Extract text from the HTML
    return text_results
# Get an agglomerated string from multiple relevant documents for a query
def fetch_merged_results(query):
    results_list = fetch_text_results(query)
    results_string = "\n".join(results_list)  # Join the list into a single string
    return results_string

query_string = "Who is the Prime Minister of India"
results = fetch_merged_results(query_string)
if not results:
    print("NA")
else:
    print(results)
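If the merged text is needed later, say for indexing or NLP preprocessing, an optional final step is to persist it to disk. The file name here is just an illustrative choice:

# Save the merged corpus for later processing (file name is illustrative)
if results:
    with open("query_corpus.txt", "w", encoding="utf-8") as f:
        f.write(results)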