# Scrapes URL, title and keywords from a nested sitemap into Elasticsearch.
import time
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch
# Connect to the Elasticsearch cluster used for indexing scraped pages.
es_client = Elasticsearch(['http://search-domain:9200'])
# Reset the target index: delete any stale copy first (ignore 404 when it
# doesn't exist yet), THEN create it fresh (ignore 400 if it already exists).
# NOTE: the original code created first and deleted second, which destroyed
# the freshly created index before any documents were ingested; the variable
# names were also swapped to match the wrong calls.
drop_index = es_client.indices.delete(index='myindex', ignore=[400, 404])
create_index = es_client.indices.create(index='myindex', ignore=400)
def urlparser(title, url):
    """Scrape one page and index it into Elasticsearch.

    title: URL of the page to fetch (name kept for backward compatibility
           with existing positional callers; the value is used as the
           request target).
    url:   URL stored in the indexed document (callers pass the same value).

    Indexes a document with date, page title, meta keywords and URL into
    'myindex', prints the Elasticsearch response, then sleeps briefly to
    throttle the crawl.
    """
    # scrape title (timeout so a dead host cannot hang the crawl forever)
    page = requests.get(title, timeout=30).content
    soup = BeautifulSoup(page, 'lxml')
    # Pages without a <title> tag would raise AttributeError on .string;
    # fall back to an empty title instead of crashing the whole crawl.
    title_name = soup.title.string if soup.title else ''
    # scrape tags: meta keywords are optional and comma-separated
    desc = soup.findAll(attrs={"name": "keywords"})
    tag_names = desc[0]['content'].split(',') if desc else []
    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url,
    }
    # ingest payload into elasticsearch
    res = es_client.index(index="myindex", doc_type="docs", body=doc)
    print(res)
    # be polite to the scraped site and the cluster
    time.sleep(1.5)
# Fetch the top-level sitemap index and collect the sub-sitemap URLs.
# (timeout added so an unresponsive host cannot hang the script forever)
sitemap_feed = 'http://www.domain.com/sitemap.xml'
page = requests.get(sitemap_feed, timeout=30)
sitemap_index = BeautifulSoup(page.content, 'html.parser')
urls = [element.text for element in sitemap_index.findAll('loc')]

# Each <loc> in the index is itself a sitemap; scrape every page it lists.
for sub_sitemap_feed in urls:
    sub_page = requests.get(sub_sitemap_feed, timeout=30)
    sub_sitemap_index = BeautifulSoup(sub_page.content, 'html.parser')
    sub_urls = [element.text for element in sub_sitemap_index.findAll('loc')]
    for page_url in sub_urls:
        # Same URL is both the fetch target and the stored document URL.
        urlparser(page_url, page_url)