# Create an RSS feed out of schwaebische.de obituary images
#!/usr/bin/python
#
# Create an RSS feed out of schwaebische.de orbituaries images
#
import os, sys, time, datetime, urllib2, re
from BeautifulSoup import BeautifulSoup
# Toggle verbose progress output while scraping
debug = False
# Present a browser-like User-Agent so the site serves its normal pages
headers = {'User-Agent': 'Mozilla/5.0'}
# Search for notices published since yesterday; the site expects dd-mm-YYYY
yesterday = datetime.date.today() - datetime.timedelta(1)
searchdate = yesterday.strftime("%d-%m-%Y")
# 14 = Ravensburg
def grab_page(page=1):
    """Fetch one result page of the obituary search for `searchdate`.

    page -- 1-based result-page number within the search results.
    Returns the raw HTML of that page as a string.
    """
    # 14 in the URL path is the region filter (Ravensburg)
    url = "http://trauer.schwaebische.de/Anzeige-suchen/_/_/_/14/%s/_/%s" % (searchdate, page)
    print(url)
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection; the original leaked it on every call
        response.close()
# Fetch the first result page and find out how many pages exist in total.
page = grab_page(1)
soup = BeautifulSoup(''.join(page))
# Word index 3 of the pager label presumably holds the total page count
# (e.g. "... von N ...") -- TODO confirm against the live markup
number_of_pages = int(soup.findAll('span', {"class": "lbl-counter-by-pager"})[0].text.split(" ")[3])
if debug:
    print(number_of_pages)
orbituaries = []
images = []
# Page 1 is already in hand; pull down any remaining result pages.
# range(2, 2) is empty, so a single-page result needs no special case.
pages = [page]
for pageno in range(2, number_of_pages + 1):
    pages.append(grab_page(pageno))
# Collect the href of every obituary teaser link on every result page
for page in pages:
    soup = BeautifulSoup(''.join(page))
    for link in soup.findAll('a', {"class": "hyperLinkSearchItemTitle nounderline"}):
        orbituaries.append(link['href'])
if debug:
    print(orbituaries)
############
# The actual image seems to be dynamically loaded by a XMLHttpRequest POST request to
# http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx
# with request body
# themesiteDomainName=Firstname-Lastname
# However, the following also works: GET
# http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=Firstname-Lastname
def grab_orbituary(namestr):
    """Fetch the microsite start page of a single obituary.

    namestr -- the Firstname-Lastname slug taken from the search-result URL.
    Returns the raw HTML of the microsite page as a string.
    """
    url = "http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=%s" % (namestr)
    print(url)
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection; the original leaked it on every call
        response.close()
# Visit each obituary microsite and pull out its scan-image URL
for orbituary in orbituaries:
    # Path segment 4 of the search-result link is the Firstname-Lastname slug
    namestr = orbituary.split("/")[4]
    if debug:
        print(namestr)
    page = grab_orbituary(namestr)
    soup = BeautifulSoup(''.join(page))
    for anchor in soup.findAll('a', {"class": "std-hyperlink obitoryTooltipOverview"}):
        # NOTE(review): attrs[3][1] is the value of the tag's 4th attribute
        # (BeautifulSoup 3 keeps attrs as name/value pairs); the image URL
        # presumably sits inside a single-quoted string there -- fragile,
        # verify against the live markup before changing
        image = anchor.attrs[3][1].split("'")[3]
        if debug:
            print(image)
        images.append(image)
if debug:
    print(images)
from xml.sax.saxutils import escape

# Assemble a minimal RSS 2.0 document listing every image found above
# and write it next to the script.
datestr = yesterday.strftime("%d.%m.%Y")
rss = """<rss version="2.0">
<channel>
<title>Traueranzeigen seit %s</title>
<link>http://trauer.schwaebische.de/</link>
<description>Traueranzeigen seit %s</description>
""" % (datestr, datestr)
for image in images:
    # Escape &, < and > so URLs with query strings stay well-formed XML
    # (the original interpolated the raw URL and could emit invalid RSS)
    safe_image = escape(image)
    rss = rss + """
<item>
<title>Traueranzeige</title>
<link>%s</link>
<guid>%s</guid>
</item>
""" % (safe_image, safe_image)
rss = rss + """
</channel>
</rss>"""
with open("todesanzeigen.rss", "w") as myfile:
    myfile.write(rss)