import re
import json
import urllib2
import re
import uuid
import PyRSS2Gen
from datetime import datetime
# Greedy, dot-matches-newline pattern: a single match spans from the first
# "<item>" through the last "</item>", i.e. the whole run of feed items.
items_re = re.compile(r'<item>.+</item>', re.DOTALL)
def get_now():
    """Return today's local date as a zero-padded ``YYYY-MM-DD`` string.

    The result is used as a lookup key into the history dict, whose keys are
    the ``datetime="..."`` attribute values scraped from the episode page
    (e.g. ``2012-11-16``).  Month and day therefore have to be zero-padded;
    building the string from ``str(month)``/``str(day)`` would yield
    ``2012-1-5`` and never match on single-digit months or days.
    """
    return datetime.now().strftime('%Y-%m-%d')
# Input/output files, opened relative to the current working directory.
simpletv_config = 'simpletv.config'  # one show-page URL per non-blank line
all_history = 'all_history.json'     # scraped episodes: {date: {show: {...}}}
simpletv_xml = 'simpletv.xml'        # generated RSS feed

# Show page header: first isolate the "overview-top" table cell, then take
# the text that follows the opening <h1 ...> tag inside it (group 1).
header_pattern1 = r'<td\s*id="overview-top">.+?</td>'
header_pattern2 = r'<h1.+?>(.+?)<'
header_re1 = re.compile(header_pattern1, re.S)
header_re2 = re.compile(header_pattern2, re.S)

# Episode details:
#   pattern1 - first anchor after a <div class="odd">: group 1 is the anchor's
#              attribute text, group 2 its link text, group 3 the
#              parenthesised text that follows the anchor.
#   pattern2 - the href value inside an anchor's attribute text.
#   pattern3 - the datetime attribute of the <time> element that directly
#              follows an <h4> element.
detail_pattern1 = r'''<div\s*class="odd">.*?<a([^>]+)>([^<]+?)</a>\s*(\(.+?\))'''
detail_pattern2 = r'href="(.+?)"'
detail_pattern3 = r'<h4[^>]*?>[^<]*?</h4>\s*<time.*?datetime="(.+?)"\s*>'
detail_re1 = re.compile(detail_pattern1, re.S)
detail_re2 = re.compile(detail_pattern2, re.S)
detail_re3 = re.compile(detail_pattern3, re.S)

# User-Agent header sent with every request.  The parentheses are required:
# without them the second literal is a separate, discarded expression
# statement and the header value is cut off after "rv:10.0.2) ".
agent = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.2) '
         'Gecko/20100101 Firefox/10.0.2')

# Static RSS envelope; %s in the header receives the lastBuildDate string.
rss_header = '''<?xml version="1.0" encoding="iso-8859-1"?>
<rss version="2.0"><channel>
<title>simpletv</title><link>http://www.dabin.info</link>
<description>self tv</description>
<lastBuildDate>%s</lastBuildDate>'''
rss_foot = '''</channel></rss>'''
def _read_config_urls():
    """Return the stripped, non-blank lines of the config file.

    Each line is later used as a show-page URL.  A missing or unreadable
    config file yields an empty list.
    """
    try:
        with open(simpletv_config, 'rb') as config_file:
            stripped = [line.strip() for line in config_file]
    except EnvironmentError:
        return []
    return [line for line in stripped if line]


def _load_history():
    """Load the history dict from disk; return ``{}`` when unavailable.

    A missing file, invalid JSON, or a JSON document that is not an object
    all result in an empty history.
    """
    try:
        with open(all_history, 'rb') as history_file:
            history = json.load(history_file)
    except (EnvironmentError, ValueError):
        return {}
    return history if isinstance(history, dict) else {}


def _fetch(url):
    """Download *url* with the configured User-agent and return the body."""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', agent)]
    response = opener.open(url, timeout=15)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()


def _scrape_show(url):
    """Scrape a show page and the episode page it links to.

    Returns ``(header, date, num, name, link)``, or ``None`` when any of the
    five values could not be extracted.  Network errors propagate to the
    caller.  The ``print`` calls are diagnostic traces.
    """
    page = _fetch(url)

    header = None
    overview = header_re1.search(page)
    if overview is not None:
        title = header_re2.search(overview.group())
        if title is not None:
            header = title.group(1).strip()
            print(header)

    episode = detail_re1.search(page)
    print('o1: %s' % (episode is None))
    if episode is None:
        return None
    name = episode.group(2)
    num = episode.group(3)

    href = detail_re2.search(episode.group(1))
    print('o2 is None  %s' % (href,))
    if href is None:
        return None
    link = 'http://www.imdb.com' + href.group(1)

    dated = detail_re3.search(_fetch(link))
    print('_date: %s' % (dated is None))
    if header is None or dated is None:
        return None
    return header, dated.group(1).strip(), num, name, link


def _collect_new_items(history, today):
    """Return ``<item>`` XML for today's entries not yet marked as used.

    Every entry under ``history[today]`` that has no ``'used'`` value is
    flagged with ``'used' = '1'`` (mutating *history*) and rendered through
    PyRSS2Gen; only the ``<item>...</item>`` part of the rendered XML is kept.
    """
    new_items = []
    if today not in history:
        return new_items
    print('found')
    for show in history[today]:
        episode = history[today][show]
        if episode.get('used') is not None:
            continue
        episode['used'] = '1'
        rss_item = PyRSS2Gen.RSSItem(
            # 'num' looks like "(#4.12)"; the slice drops the leading two
            # characters and the trailing one.
            title=show + ' ' + episode['num'][2:-1],
            description=episode['name'],
            link=episode['link'],
            guid=PyRSS2Gen.Guid(str(uuid.uuid1()), 0),
            # PyRSS2Gen emits dates with a literal "GMT" suffix, so it must
            # be given UTC rather than local time.
            pubDate=datetime.utcnow())
        item = items_re.search(rss_item.to_xml())
        if item is not None:
            new_items.append(item.group())
        else:
            print('error get find item')
    return new_items


def _write_feed(new_items):
    """Rewrite the feed file with *new_items* placed before existing items."""
    try:
        with open(simpletv_xml, 'rb') as feed_file:
            existing = feed_file.read()
    except EnvironmentError:
        # No previous feed (or unreadable): start from an empty item list.
        existing = ''
    previous = items_re.search(existing)
    old_items = previous.group() if previous is not None else ''
    header = rss_header % PyRSS2Gen._format_date(datetime.utcnow())
    try:
        with open(simpletv_xml, 'w+b') as feed_file:
            feed_file.write(header + ''.join(new_items) + old_items + rss_foot)
    except EnvironmentError as exc:
        print('could not write %s: %s' % (simpletv_xml, exc))


def _save_history(history):
    """Write *history* back to the history file as indented JSON."""
    try:
        with open(all_history, 'w+b') as history_file:
            json.dump(history, history_file, indent=4)
    except (EnvironmentError, TypeError, ValueError) as exc:
        print('could not write %s: %s' % (all_history, exc))


def main():
    """Scrape every configured show and publish today's episodes as RSS.

    Steps:
      1. Read show URLs from the config file and load the stored history.
      2. For each URL, scrape the show title and the linked episode's
         name, number, link and date, and record them in the history under
         ``history[date][title]``.  A URL that fails is reported and
         skipped so one bad page does not abort the run.
      3. Turn today's not-yet-used history entries into RSS items and, if
         there are any, prepend them to the feed file.
      4. Save the history.
    """
    config_urls = _read_config_urls()
    history = _load_history()

    for url in config_urls:
        try:
            scraped = _scrape_show(url)
            if scraped is None:
                continue
            header, date, num, name, link = scraped
            entry = history.setdefault(date, {}).setdefault(header, {})
            entry['num'] = num
            entry['name'] = name
            entry['link'] = link
        except Exception as exc:
            # Best-effort per URL, but unlike a bare "except:" this lets
            # KeyboardInterrupt/SystemExit through and reports the failure.
            print('skipping %s: %s' % (url, exc))
            continue

    now_time = get_now()
    print('now_time is %s' % now_time)
    print(history)
    print(now_time)

    new_rss_items = _collect_new_items(history, now_time)
    if new_rss_items:
        _write_feed(new_rss_items)
    _save_history(history)
# Run the scraper only when this file is executed as a script; importing the
# module must not trigger any network or file activity.
if __name__ == '__main__':
    main()
http://www.imdb.com/title/tt0096697/
http://www.imdb.com/title/tt1830617/
http://www.imdb.com/title/tt0121955/
http://www.imdb.com/title/tt0944947/
http://www.imdb.com/title/tt1592154/
PyRSS2Gen==1.1