"""
ruanbekker
2017-04-30 13:42

Scrapes the URL, title and keywords from every page of a nested sitemap
and indexes the results into Elasticsearch.
"""

import time
import requests
from bs4 import BeautifulSoup
from elasticsearch import Elasticsearch

# Elasticsearch client pointed at the search cluster.
es_client = Elasticsearch(['http://search-domain:9200'])

# Recreate the target index from scratch. The original had the variable
# names swapped (the create call was bound to drop_index and vice versa)
# and ran create before delete, which immediately removed the index it had
# just created. Correct order: delete any stale index first, then create.
drop_index = es_client.indices.delete(index='myindex', ignore=[400, 404])
create_index = es_client.indices.create(index='myindex', ignore=400)

def urlparser(title, url):
    """Fetch a page, scrape its <title> and meta keywords, and index the
    result into Elasticsearch.

    :param title: URL of the page to fetch and scrape (callers in this
                  script pass the same value for both arguments).
    :param url: URL stored verbatim in the indexed document.

    Side effects: performs an HTTP GET, writes one document to the
    'myindex' index, prints the indexing response, and sleeps 1.5s to
    throttle requests against the scraped site.
    """
    # scrape title
    page = requests.get(title).content
    soup = BeautifulSoup(page, 'lxml')
    title_name = soup.title.string

    # scrape tags: the keywords meta tag is optional, so default to an
    # empty list when it is absent
    keyword_tags = soup.findAll(attrs={"name": "keywords"})
    tag_names = keyword_tags[0]['content'].split(',') if keyword_tags else []

    # payload for elasticsearch
    doc = {
        'date': time.strftime("%Y-%m-%d"),
        'title': title_name,
        'tags': tag_names,
        'url': url
    }

    # ingest payload into elasticsearch
    res = es_client.index(index="myindex", doc_type="docs", body=doc)
    print(res)
    # be polite to the remote server between requests
    time.sleep(1.5)

# The top-level sitemap is an index of sub-sitemaps: pull every <loc>
# entry from it, then walk each sub-sitemap and hand every page URL to
# urlparser for scraping and indexing.
sitemap_feed = 'http://www.domain.com/sitemap.xml'
page = requests.get(sitemap_feed)
sitemap_index = BeautifulSoup(page.content, 'html.parser')
urls = [element.text for element in sitemap_index.findAll('loc')]

for sub_sitemap_url in urls:
    sub_page = requests.get(sub_sitemap_url)
    sub_sitemap = BeautifulSoup(sub_page.content, 'html.parser')
    page_urls = [element.text for element in sub_sitemap.findAll('loc')]
    for page_url in page_urls:
        # urlparser fetches the page itself, so it receives the URL twice:
        # once as the address to scrape and once as the value to store
        urlparser(page_url, page_url)