#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from libs.pprint import pprint
from libs.base_handler import *
import json
class Handler(BaseHandler):
    '''
    Crawl handler that walks the javpop.com "uncensored" category.

    Flow, as wired through the ``callback`` arguments below:
    on_start -> index_page -> page_page -> detail_page -> on_result.

    The URLs, page range, link selector and output path are class
    attributes so a subclass can override them without touching the
    methods.
    '''

    # Root page requested first by on_start.
    START_URL = 'http://javpop.com/'
    # Prefix of the paginated listing; the page number is appended to it.
    LISTING_URL_PREFIX = 'http://javpop.com/category/uncensored/page/'
    # Inclusive range of listing pages to schedule (257 pages in total).
    FIRST_PAGE = 1
    LAST_PAGE = 257
    # First-child anchors inside ``div.entry li`` whose href starts with
    # "http://javpop.com/201" and ends with "html".
    POST_LINK_SELECTOR = ('div.entry li '
                          'a[href^="http://javpop.com/201"][href$="html"]'
                          ':first-child')
    # File that on_result appends one JSON document per line to.
    RESULT_PATH = 'data/test.txt'

    def on_start(self):
        '''Schedule the site root; its response is handled by index_page.'''
        self.crawl(self.START_URL, callback=self.index_page)

    def index_page(self, response):
        '''
        Schedule every listing page from FIRST_PAGE to LAST_PAGE inclusive.

        ``response`` is not inspected: the page URLs are generated from the
        class constants, not scraped from the root page.
        '''
        listing_urls = [self.LISTING_URL_PREFIX + str(page)
                        for page in range(self.FIRST_PAGE, self.LAST_PAGE + 1)]
        self.crawl(listing_urls, callback=self.page_page)

    def page_page(self, response):
        '''
        Collect the post links on one listing page and schedule them.

        Links are the ``href`` values of the elements matched by
        POST_LINK_SELECTOR; each is handled by detail_page.
        '''
        post_urls = [anchor.attr.href
                     for anchor in response.doc(self.POST_LINK_SELECTOR).items()]
        self.crawl(post_urls, callback=self.detail_page)

    def detail_page(self, response):
        '''
        Extract the fields of a single post page.

        Returns a dict with the text of ``h1`` ("title"), the response URL
        ("url"), the ``src`` of images directly under ``p.poster`` and
        ``p.screenshot`` ("poster", "screenshot"), and the text of every
        anchor under ``div.post-meta-b`` ("tags").
        '''
        doc = response.doc
        return {
            "title": doc('h1').text(),
            "url": response.url,
            "poster": [img.attr.src for img in doc('p.poster > img').items()],
            "screenshot": [img.attr.src
                           for img in doc('p.screenshot > img').items()],
            "tags": [link.text() for link in doc('div.post-meta-b a').items()],
        }

    def on_result(self, result):
        '''
        Pretty-print a result and append it to RESULT_PATH as one JSON line.

        Falsy results (``None``, empty dict) are ignored. The directory
        holding RESULT_PATH is created when missing, so the first write
        does not fail on a fresh checkout.

        NOTE(review): this overrides the base-class on_result without
        calling it -- confirm BaseHandler.on_result has no behaviour that
        must still run.
        '''
        # Local imports keep this block self-contained; the module header
        # does not import them.
        import errno
        import os

        if not result:
            return
        pprint(result)

        out_dir = os.path.dirname(self.RESULT_PATH)
        if out_dir:
            try:
                os.makedirs(out_dir)
            except OSError as exc:
                # An already-existing directory is fine (possibly created by
                # a concurrent worker); anything else is a real error.
                if exc.errno != errno.EEXIST or not os.path.isdir(out_dir):
                    raise

        with open(self.RESULT_PATH, "a") as out_file:
            out_file.write(json.dumps(result) + '\n')