# iamsk
# 9/2/2014 - 12:15 PM
#
# doubangroupapi

import requests

from db import db

"""
The Douban Group API, which is not listed on http://developers.douban.com/wiki/?title=api_v2

Base url: https://api.douban.com/v2

Group info: /group/:id
Group topics: /group/:id/topics
Group comments: /group/topic/:id/comments

REF: http://www.douban.com/group/topic/33507002/
"""

# URL template for a group's topic listing; the %s placeholder is filled
# with the group id before each request.
base_url = 'http://api.douban.com/v2/group/%s/topics'
# Page size: sent as the `count` query parameter and used as the offset
# step between consecutive pages.
PER_PAGE_COUNT = 100


def real_fetch(group_id, start=0, timeout=30):
    """Fetch one page of a group's topics and store each one in ``db.topic``.

    Sends a GET request to the group's topics endpoint asking for
    ``PER_PAGE_COUNT`` topics beginning at offset ``start``, then inserts
    the title, content, URL and creation time of every returned topic.

    Args:
        group_id: Group identifier substituted into ``base_url``.
        start: Offset of the first topic to request.
        timeout: Seconds to wait for the server before giving up. Without
            a timeout ``requests`` can block indefinitely on a stalled
            connection.

    Raises:
        requests.exceptions.Timeout: The server did not respond in time.
        requests.exceptions.HTTPError: The server answered with a 4xx/5xx
            status code.
        ValueError: The response body is not valid JSON.
        KeyError: The JSON payload lacks ``topics`` or a topic lacks one
            of the stored fields.
    """
    headers = {
        'Host': 'api.douban.com',
        'Referer': 'api.douban.com',
        'Cookie': '',
        'User-Agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_9_4) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/36.0.1985.143 Safari/537.36',
        'Connection': 'keep-alive',
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Language': 'zh-cn,zh;q=0.5',
        'Accept-Charset': 'GB2312,utf-8;q=0.7,*;q=0.7',
    }
    params = {'start': start, 'count': PER_PAGE_COUNT}
    url = base_url % group_id
    r = requests.get(url, params=params, headers=headers, timeout=timeout)
    # Fail with a clear HTTP error instead of an opaque KeyError on
    # 'topics' when the API rejects the request.
    r.raise_for_status()

    ret = r.json()
    for topic in ret['topics']:
        db.topic.insert(
            group_id=group_id,
            title=topic['title'],
            content=topic['content'],
            url=topic['alt'],
            created=topic['created'],
        )
        # NOTE(review): flush is issued once per inserted topic; presumably
        # this pushes each row out immediately -- confirm db.flush()
        # semantics before batching it after the loop.
        db.flush()


def fetch(group_id, pages=10):
    """Fetch the first ``pages`` pages of topics for ``group_id``.

    Prints each zero-based page index, then calls ``real_fetch`` with the
    matching offset (page index times ``PER_PAGE_COUNT``).

    Args:
        group_id: Group identifier passed through to ``real_fetch``.
        pages: Number of consecutive pages to request, starting at
            offset 0.
    """
    for page in range(pages):
        # A parenthesised single-argument print produces the same output
        # on Python 2 and is valid syntax on Python 3.
        print(page)
        real_fetch(group_id, page * PER_PAGE_COUNT)

# Groups whose topics are fetched when this file is run as a script.
group_ids = ['zhufang', 'xiaotanzi']

if __name__ == '__main__':
    for group_id in group_ids:
        # A parenthesised single-argument print produces the same output
        # on Python 2 and is valid syntax on Python 3.
        print(group_id)
        fetch(group_id)