# Create an RSS feed out of schwaebische.de obituary images
#!/usr/bin/python
#
# Create an RSS feed out of schwaebische.de orbituaries images
#
import os, sys, time, datetime, urllib2, re
from BeautifulSoup import BeautifulSoup
# Toggle verbose progress output while scraping
debug = False
# Present a browser-like User-Agent so the site serves its normal pages
headers = {'User-Agent': 'Mozilla/5.0'}
# Search for notices published since yesterday; the site expects dd-mm-YYYY
yesterday = datetime.date.today() - datetime.timedelta(1)
searchdate = yesterday.strftime("%d-%m-%Y")
# 14 = Ravensburg
def grab_page(page=1):
    """Fetch one result page of the obituary search for `searchdate`.

    page -- 1-based result-page number within the search results.
    Returns the raw HTML of that page as a string.
    """
    # 14 in the URL path is the region filter (Ravensburg)
    url = "http://trauer.schwaebische.de/Anzeige-suchen/_/_/_/14/%s/_/%s" % (searchdate, page)
    print(url)
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection; the original leaked it on every call
        response.close()
# Fetch the first result page and find out how many pages exist in total.
page = grab_page(1)
soup = BeautifulSoup(''.join(page))
# Word index 3 of the pager label presumably holds the total page count
# (e.g. "... von N ...") -- TODO confirm against the live markup
number_of_pages = int(soup.findAll('span', {"class": "lbl-counter-by-pager"})[0].text.split(" ")[3])
if debug:
    print(number_of_pages)
orbituaries = []
images = []
# Page 1 is already in hand; pull down any remaining result pages.
# range(2, 2) is empty, so a single-page result needs no special case.
pages = [page]
for pageno in range(2, number_of_pages + 1):
    pages.append(grab_page(pageno))
# Collect the href of every obituary teaser link on every result page
for page in pages:
    soup = BeautifulSoup(''.join(page))
    for link in soup.findAll('a', {"class": "hyperLinkSearchItemTitle nounderline"}):
        orbituaries.append(link['href'])
if debug:
    print(orbituaries)
############
# The actual image seems to be dynamically loaded by a XMLHttpRequest POST request to
# http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx
# with request body
# themesiteDomainName=Firstname-Lastname
# However, the following also works: GET
# http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=Firstname-Lastname
def grab_orbituary(namestr):
    """Fetch the microsite start page of a single obituary.

    namestr -- the Firstname-Lastname slug taken from the search-result URL.
    Returns the raw HTML of the microsite page as a string.
    """
    url = "http://trauer.schwaebische.de/Microsite/Views/Apps/Startpage/Startpage.aspx?themesiteDomainName=%s" % (namestr)
    print(url)
    req = urllib2.Request(url, None, headers)
    response = urllib2.urlopen(req)
    try:
        return response.read()
    finally:
        # Always release the connection; the original leaked it on every call
        response.close()
# Visit each obituary microsite and pull out its scan-image URL
for orbituary in orbituaries:
    # Path segment 4 of the search-result link is the Firstname-Lastname slug
    namestr = orbituary.split("/")[4]
    if debug:
        print(namestr)
    page = grab_orbituary(namestr)
    soup = BeautifulSoup(''.join(page))
    for anchor in soup.findAll('a', {"class": "std-hyperlink obitoryTooltipOverview"}):
        # NOTE(review): attrs[3][1] is the value of the tag's 4th attribute
        # (BeautifulSoup 3 keeps attrs as name/value pairs); the image URL
        # presumably sits inside a single-quoted string there -- fragile,
        # verify against the live markup before changing
        image = anchor.attrs[3][1].split("'")[3]
        if debug:
            print(image)
        images.append(image)
if debug:
    print(images)
from xml.sax.saxutils import escape

# Assemble a minimal RSS 2.0 document listing every image found above
# and write it next to the script.
datestr = yesterday.strftime("%d.%m.%Y")
rss = """<rss version="2.0">
<channel>
<title>Traueranzeigen seit %s</title>
<link>http://trauer.schwaebische.de/</link>
<description>Traueranzeigen seit %s</description>
""" % (datestr, datestr)
for image in images:
    # Escape &, < and > so URLs with query strings stay well-formed XML
    # (the original interpolated the raw URL and could emit invalid RSS)
    safe_image = escape(image)
    rss = rss + """
<item>
<title>Traueranzeige</title>
<link>%s</link>
<guid>%s</guid>
</item>
""" % (safe_image, safe_image)
rss = rss + """
</channel>
</rss>"""
with open("todesanzeigen.rss", "w") as myfile:
    myfile.write(rss)