# wynemo
# 11/4/2012 - 9:49 AM
#
# requirements.txt

import re                                                                          
import json
import urllib2                                                                     
import re
import uuid
import PyRSS2Gen
from datetime import datetime

# Captures the whole run of RSS items in a document: everything from the first
# "<item>" to the last "</item>" (greedy, with "." also matching newlines).
items_re = re.compile('<item>.+</item>', flags=re.DOTALL)

def get_now():
    """Return today's local date as a zero-padded ``YYYY-MM-DD`` string.

    ``main`` uses the result as a lookup key into the history dict, whose keys
    are the dates scraped from ``<time datetime="...">`` attributes.  Those
    are expected to be ISO-style (e.g. ``2012-11-04``), so month and day must
    be zero-padded; building the string from ``str(date.month)`` produced
    ``2012-11-4``, which could never match such a key.
    """
    return datetime.now().strftime('%Y-%m-%d')

                                                                               
# Working files, opened relative to the current directory.
simpletv_config, all_history, simpletv_xml = (
    'simpletv.config', 'all_history.json', 'simpletv.xml')

# Show title: first isolate the "overview-top" table cell, then take the text
# that follows the opening <h1 ...> tag inside it.
header_pattern1 = r'<td\s*id="overview-top">.+?</td>'
header_re1 = re.compile(header_pattern1, re.DOTALL)
header_pattern2 = r'<h1.+?>(.+?)<'
header_re2 = re.compile(header_pattern2, re.DOTALL)

# Episode entry: groups are (attributes of the <a> tag, link text,
# parenthesised text following the link).
detail_pattern1 = r'<div\s*class="odd">.*?<a([^>]+)>([^<]+?)</a>\s*(\(.+?\))'
detail_re1 = re.compile(detail_pattern1, re.DOTALL)

# Pulls the href value out of the <a> attributes captured above.
detail_pattern2 = r'href="(.+?)"'
detail_re2 = re.compile(detail_pattern2, re.DOTALL)

# Value of the datetime attribute of the <time> tag that follows an <h4>.
detail_pattern3 = r'<h4[^>]*?>[^<]*?</h4>\s*<time.*?datetime="(.+?)"\s*>'
detail_re3 = re.compile(detail_pattern3, re.DOTALL)

# User-Agent header value sent with every HTTP request.  The two literals are
# wrapped in parentheses so they are joined into a single string; without
# them the second literal was a separate, discarded expression statement and
# the header ended after "rv:10.0.2) ".
agent = ('Mozilla/5.0 (X11; Ubuntu; Linux x86_64; rv:10.0.2) '
         'Gecko/20100101 Firefox/10.0.2')

# Opening part of the generated feed document.  The single %s placeholder
# receives the text of the <lastBuildDate> element.
rss_header = '\n'.join([
    '<?xml version="1.0" encoding="iso-8859-1"?>',
    '<rss version="2.0"><channel>',
    '<title>simpletv</title><link>http://www.dabin.info</link>',
    '<description>self tv</description>',
    '<lastBuildDate>%s</lastBuildDate>',
])

# Closing tags for the <channel> and <rss> elements opened by rss_header.
rss_foot = '</channel></rss>'

                                                                                   
def _read_config(path):
    """Return the stripped, non-blank lines of *path* (one page URL per line).

    An unreadable file is reported and treated as an empty configuration.
    """
    try:
        with open(path, 'rb') as f:
            return [line.strip() for line in f if line.strip()]
    except IOError as e:
        print('cannot read config %s: %s' % (path, e))
        return []


def _load_history(path):
    """Load the history dict from the JSON file *path*.

    Returns an empty dict when the file is missing, unreadable, not valid
    JSON, or does not contain a JSON object.
    """
    try:
        with open(path, 'rb') as f:
            history = json.load(f)
    except (IOError, ValueError) as e:
        print('cannot load history %s: %s' % (path, e))
        return {}
    return history if isinstance(history, dict) else {}


def _fetch(url):
    """Download *url* using the module's User-Agent and return the body."""
    opener = urllib2.build_opener()
    opener.addheaders = [('User-agent', agent)]
    response = opener.open(url, timeout=15)
    try:
        return response.read()
    finally:
        # Always release the connection, even if read() fails.
        response.close()


def _scrape_episode(url):
    """Scrape one show page and the episode page it links to.

    Returns a ``(header, date, num, name, link)`` tuple, or ``None`` when any
    of the pieces cannot be found.  Network errors propagate to the caller.
    """
    page = _fetch(url)

    cell = header_re1.search(page)
    title = header_re2.search(cell.group()) if cell is not None else None
    if title is None:
        return None
    header = title.group(1).strip()
    print(header)

    episode = detail_re1.search(page)
    if episode is None:
        return None
    anchor_attrs, name, num = episode.groups()

    href = detail_re2.search(anchor_attrs)
    if href is None:
        return None
    link = 'http://www.imdb.com' + href.group(1)

    released = detail_re3.search(_fetch(link))
    if released is None:
        return None
    return header, released.group(1).strip(), num, name, link


def _pending_items(entries):
    """Build RSS <item> snippets for the entries not yet flagged as used.

    Returns a list of ``(entry, item_xml)`` pairs.  Entries are NOT flagged
    here; the caller flags them once the feed has actually been written.
    """
    pending = []
    # PyRSS2Gen renders datetimes with a fixed "GMT" suffix, so pass UTC.
    stamp = datetime.utcnow()
    for show, entry in entries.items():
        if entry.get('used') is not None:
            continue
        rss_item = PyRSS2Gen.RSSItem(
            # num looks like "(#4.5)"; [2:-1] drops the leading two characters
            # and the closing parenthesis.
            title=show + ' ' + entry['num'][2:-1],
            description=entry['name'],
            link=entry['link'],
            guid=PyRSS2Gen.Guid(str(uuid.uuid1()), 0),
            pubDate=stamp)
        # to_xml() yields a full document; keep only the <item> element.
        found = items_re.search(rss_item.to_xml())
        if found is None:
            print('could not extract <item> from generated XML for %s' % show)
            continue
        pending.append((entry, found.group()))
    return pending


def _write_feed(path, new_items):
    """Rewrite the feed at *path* with *new_items* ahead of the old items.

    Returns True when the file was written, False otherwise.
    """
    old_xml = ''
    try:
        with open(path, 'rb') as f:
            old_xml = f.read()
    except IOError:
        # No readable previous feed: start with no old items.
        pass
    existing = items_re.search(old_xml)
    old_items = existing.group() if existing is not None else ''

    # UTC for the same reason as in _pending_items: the text is labelled GMT.
    head = rss_header % PyRSS2Gen._format_date(datetime.utcnow())
    try:
        with open(path, 'w+b') as f:
            f.write(head + ''.join(new_items) + old_items + rss_foot)
    except IOError as e:
        print('cannot write feed %s: %s' % (path, e))
        return False
    return True


def _save_history(path, history):
    """Write *history* to *path* as indented JSON, reporting any failure."""
    try:
        # Serialise before opening so a serialisation error cannot leave a
        # truncated history file behind.
        text = json.dumps(history, indent=4)
        with open(path, 'w+b') as f:
            f.write(text)
    except (IOError, TypeError, ValueError) as e:
        print('cannot save history %s: %s' % (path, e))


def main():
    """Update the episode history and append today's episodes to the feed.

    Steps:
      1. Read page URLs from ``simpletv_config`` and the history dict from
         ``all_history`` (both optional; missing files mean "empty").
      2. Scrape each URL and record the episode under
         ``history[date][header]`` unless that show already has an entry for
         that date.
      3. For today's date (``get_now()``), turn every entry without a
         ``'used'`` flag into an RSS item, write them in front of the
         existing items in ``simpletv_xml`` and flag them as used.
      4. Save the history back to ``all_history``.

    Failures for one URL are reported and do not stop the other URLs.
    """
    history = _load_history(all_history)

    for url in _read_config(simpletv_config):
        try:
            episode = _scrape_episode(url)
        except Exception as e:
            # Best effort per URL: report and move on.  Unlike a bare
            # "except:", this still lets Ctrl-C / SystemExit through.
            print('failed to process %s: %s' % (url, e))
            continue
        if episode is None:
            print('no complete episode data for %s' % url)
            continue

        header, date, num, name, link = episode
        shows = history.setdefault(date, {})
        if header not in shows:
            shows[header] = {'num': num, 'name': name, 'link': link}

    now_time = get_now()
    print('now_time is %s' % now_time)

    pending = _pending_items(history.get(now_time, {}))
    if pending:
        print('found')
        if _write_feed(simpletv_xml, [xml for _, xml in pending]):
            # Flag only after a successful write so that a failed write is
            # retried on the next run instead of silently dropping the items.
            for entry, _ in pending:
                entry['used'] = '1'

    _save_history(all_history, history)
                                                                                   
# Run the update only when this file is executed as a script, not on import.
if __name__ == '__main__':
    main()
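# The lines below are not Python; they appear to be sample contents of the
# simpletv.config file read by main() (one show page URL per line):
# http://www.imdb.com/title/tt0096697/
# http://www.imdb.com/title/tt1830617/
# http://www.imdb.com/title/tt0121955/
# http://www.imdb.com/title/tt0944947/
# http://www.imdb.com/title/tt1592154/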