xuanyuanaosheng
3/8/2014 - 3:15 AM

demo script for douban.com

demo script for douban.com

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
# vim: set et sw=4 ts=4 sts=4 ff=unix fenc=utf8:
# Created on 2014-02-28 00:15:53

from libs.pprint import pprint 
from libs.base_handler import *

class Handler(BaseHandler):
    '''
    Sample handler that walks movie.douban.com.

    Starts at the site front page, follows every link that points at a
    "subject" page, and extracts a dict of movie metadata from each of
    those pages. Extracted dicts are printed by on_result.
    '''

    def on_start(self):
        '''Entry point: request the front page and parse it with index_page.'''
        self.crawl('http://movie.douban.com/', callback=self.index_page)

    def index_page(self, response):
        '''Follow every subject link found on the front page.'''
        self._crawl_subject_links(response)

    def detail_page(self, response):
        '''
        Follow further subject links and extract this page's metadata.

        Returns a dict with the keys "title", "directedBy", "genre",
        "编剧", "主演", "制片国家 " and "集数 ". The keys keep their original
        spelling (including the trailing space in the last two) so that
        previously stored results stay compatible.

        "制片国家 " is the raw text following its label, or None when the
        label is not on the page. "集数 " is that text converted to int, or
        None when the label is missing or the text is not a number, so a
        page without an episode count no longer aborts the whole
        extraction with an IndexError.
        '''
        self._crawl_subject_links(response)

        episodes_text = self._first_tail(
            response, u'#info .pl:contains("集数")')
        try:
            episodes = int(episodes_text)
        except (TypeError, ValueError):
            # TypeError: label or tail text missing (None);
            # ValueError: tail text present but not numeric.
            episodes = None

        return {
            "title": response.doc('*[property="v:itemreviewed"]').text(),
            "directedBy": [x.text() for x in response.doc('*[rel="v:directedBy"]').items()],
            "genre": [x.text() for x in response.doc('*[property="v:genre"]').items()],
            "编剧": [x.text() for x in response.doc(u'#info .pl:contains("编剧") ~ a').items()],
            "主演": [x.text() for x in response.doc('*[rel="v:starring"]').items()],
            "制片国家 ": self._first_tail(
                response, u'#info .pl:contains("制片国家")'),
            "集数 ": episodes,
        }

    def on_result(self, result):
        '''Print a non-empty result with pprint; falsy results are ignored.'''
        if not result:
            return
        pprint(result)

    def _crawl_subject_links(self, response):
        '''Pass the href of every subject link in response to self.crawl.'''
        subject_links = response.doc(
            'a[href^="http://movie.douban.com/subject"]')
        self.crawl([x.attr.href for x in subject_links.items()],
                   callback=self.detail_page)

    def _first_tail(self, response, selector):
        '''
        Return the .tail of the first element matching selector, or None.

        NOTE(review): response.doc(...) is presumably a PyQuery object whose
        items are lxml elements, so .tail would be the text that follows
        the element's closing tag -- confirm against the framework.
        '''
        try:
            return response.doc(selector)[0].tail
        except IndexError:
            # No element matched the selector on this page.
            return None