masakichi
6/15/2014 - 9:51 AM

javpop.py

#!/usr/bin/env python
# -*- encoding: utf-8 -*-
 
from libs.pprint import pprint 
from libs.base_handler import *
import json
 
class Handler(BaseHandler):
    '''
    Crawl the "uncensored" category listing of javpop.com and append one
    JSON line per post to OUTPUT_PATH.

    Callback chain: on_start -> index_page -> page_page -> detail_page,
    with on_result receiving whatever detail_page returns.
    '''
    # Entry point fetched by on_start.
    START_URL = 'http://javpop.com/'
    # Listing pages are addressed as <prefix><page number>.
    LISTING_URL_PREFIX = 'http://javpop.com/category/uncensored/page/'
    # Highest listing page number requested (pages 1..257 inclusive).
    LAST_LISTING_PAGE = 257
    # File that on_result appends to, one JSON document per line.
    OUTPUT_PATH = "data/test.txt"

    def on_start(self):
        '''Schedule the front page; its response is handed to index_page.'''
        self.crawl(self.START_URL, callback=self.index_page)

    def index_page(self, response):
        '''
        Schedule every listing page from 1 to LAST_LISTING_PAGE.

        The response itself is not inspected; the listing URLs are built
        purely from LISTING_URL_PREFIX and the page number.
        '''
        listing_urls = [self.LISTING_URL_PREFIX + str(page)
                        for page in range(1, self.LAST_LISTING_PAGE + 1)]
        self.crawl(listing_urls, callback=self.page_page)

    def page_page(self, response):
        '''Schedule the post links found on one listing page.'''
        # Only anchors that are the first child of their parent and whose
        # href starts with the site's "201x" prefix and ends in "html".
        post_links = response.doc(
            'div.entry li a[href^="http://javpop.com/201"][href$="html"]:first-child')
        self.crawl([link.attr.href for link in post_links.items()],
                   callback=self.detail_page)

    def detail_page(self, response):
        '''
        Extract the fields of a single post page.

        Returns a dict with the page title, its URL, the image sources of
        the poster and screenshot paragraphs, and the text of the tag links.
        '''
        doc = response.doc
        return {
            "title": doc('h1').text(),
            "url": response.url,
            "poster": [img.attr.src for img in doc('p.poster > img').items()],
            "screenshot": [img.attr.src for img in doc('p.screenshot > img').items()],
            "tags": [tag.text() for tag in doc('div.post-meta-b a').items()],
        }

    def on_result(self, result):
        '''
        Pretty-print a non-empty result and append it to OUTPUT_PATH as JSON.

        Falsy results (callbacks that return nothing) are ignored.
        NOTE(review): this override does not call the base class
        implementation -- confirm that skipping it is intended.
        '''
        if not result:
            return
        pprint(result)

        # Local import keeps this block self-contained.
        import os

        # Make sure the target directory exists; opening the file for append
        # fails outright when it is missing.
        out_dir = os.path.dirname(self.OUTPUT_PATH)
        if out_dir:
            try:
                os.makedirs(out_dir)
            except OSError:
                # Already present (possibly created concurrently) is fine;
                # anything else is a real error.
                if not os.path.isdir(out_dir):
                    raise

        with open(self.OUTPUT_PATH, "a") as myfile:
            myfile.write(json.dumps(result) + '\n')