navigaid
10/4/2016 - 6:51 PM

A 'Crawler' for http://msdn.itellyou.cn/

A 'Crawler' for http://msdn.itellyou.cn/

#!/usr/bin/env python
# coding:utf-8

"""
A 'Crawler' for http://msdn.itellyou.cn/
"""

import re
import urllib
import urllib2
import json
import sqlite3

# Single module-wide SQLite connection shared by create_db() and the
# crawl loop; the database lives in the file 'msdn.db'.
dbconn = sqlite3.connect('msdn.db')


def do_post(url, params):
    """
    POST ``params`` (form-encoded) to ``url`` and return the reply as text.

    url    -- address to send the request to
    params -- mapping or sequence of pairs accepted by urllib.urlencode

    Returns the response body decoded as UTF-8.  Exceptions raised by
    urllib2.urlopen, read() or the decode are not caught here.
    """
    payload = urllib.urlencode(params)
    # Supplying a data argument is what makes urllib2 send a POST.
    response = urllib2.urlopen(url, payload)
    try:
        raw = response.read()
    finally:
        # Release the connection even if read() fails; previously the
        # response stayed open until it was garbage collected.
        response.close()
    return raw.decode('utf-8')


def get_menus():
    """
    Download the site's front page and extract the top-level menus.

    Returns a list of (<id>, <name>) tuples, one for every
    ``data-target="#collapse_<id>"><name></a>`` fragment found in the
    page; the list is empty when nothing matches.
    """
    page = urllib.urlopen('http://msdn.itellyou.cn/')
    try:
        src = page.read().decode('utf-8')
    finally:
        # Close the response explicitly instead of leaving it to the
        # garbage collector.
        page.close()
    pat = r'data-target="#collapse_(.*?)">(.*?)</a>'
    return re.findall(pat, src)


def get_sub_menus(mid):
    """
    Ask Category/Index for the sub-menus of the menu with id ``mid``.

    Returns the decoded JSON reply, described by the original author as
    ({'id':<id>,'name':<name>})+
    """
    reply = do_post('http://msdn.itellyou.cn/Category/Index', {'id': mid})
    sub_menus = json.loads(reply)
    return sub_menus


def get_lang_list(sid):
    """
    Ask Category/GetLang for the languages of the sub-menu ``sid``.

    The reply is documented as
    {"status":true,"result":({'id':<id>,'lang':<lang>})+}
    and only its 'result' member is returned.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/GetLang'
    reply = json.loads(do_post(endpoint, {'id': sid}))
    return reply['result']


def get_iso_list(sid, lid):
    """
    Ask Category/GetList for the ISOs of sub-menu ``sid`` in language ``lid``.

    The reply is documented as
    {"status":true,"result":({'id':<id>,'name':<name>,'post':<date>,'url':<url>})+}
    and only its 'result' member is returned.
    """
    form = {'id': sid, 'lang': lid, 'filter': 'true'}
    body = do_post('http://msdn.itellyou.cn/Category/GetList', form)
    payload = json.loads(body)
    return payload['result']


def get_iso(iid):
    """
    Ask Category/GetProduct for the details of the ISO with id ``iid``.

    Returns the 'result' member of the decoded JSON reply.  The original
    author documents its fields as
    {'DownLoad':<url>,'FileName':<fname>,'PostDateString':<2008-09-08>,
     'SHA1':<SHA1>,'size':<size>}
    NOTE(review): the caller in this file indexes the value by these keys,
    so it is presumably a single object rather than a list -- confirm.
    """
    endpoint = 'http://msdn.itellyou.cn/Category/GetProduct'
    reply = json.loads(do_post(endpoint, {'id': iid}))
    return reply['result']


def create_db():
    """
    (Re)create the ``iso`` table in the module-level ``dbconn`` database.

    An existing ``iso`` table is dropped first, so rows stored by an
    earlier run are discarded.  Returns nothing.
    """
    c = dbconn.cursor()
    try:
        # SQLite only treats a column as the auto-assigned rowid when it is
        # declared exactly INTEGER PRIMARY KEY.  The former declaration
        # "INTEGER AUTO INCREMENT PRIMARY KEY" was parsed as a column of
        # type "INTEGER AUTO INCREMENT", which left id NULL on every insert.
        c.executescript("""
        DROP TABLE IF EXISTS iso;
        CREATE TABLE iso(
            id      INTEGER PRIMARY KEY AUTOINCREMENT,
            name    TEXT,
            fname   TEXT,
            cate    TEXT,
            product TEXT,
            lang    TEXT,
            url     TEXT,
            sha1    TEXT,
            size    TEXT,
            date    TEXT
        );
        """)
        dbconn.commit()
    finally:
        # Release the cursor even when the script fails.
        c.close()


if __name__ == '__main__':
    # Walk menu -> sub-menu -> language -> ISO and store one row per ISO
    # in the freshly created iso table.
    create_db()
    c = dbconn.cursor()
    try:
        for menu in get_menus():
            # menu is an (<id>, <name>) tuple.
            for sub in get_sub_menus(menu[0]):
                for lang in get_lang_list(sub['id']):
                    for iso in get_iso_list(sub['id'], lang['id']):
                        detail = get_iso(iso['id'])
                        # Parenthesised single-argument form: prints the
                        # same under Python 2 and is valid Python 3 syntax.
                        print(detail['FileName'])
                        c.execute(
                            'INSERT INTO iso(name, fname, cate, product, lang,'
                            'url, sha1, size, date)'
                            'VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?)',
                            (iso['name'], detail['FileName'],
                             menu[1], sub['name'],
                             lang['lang'], detail['DownLoad'],
                             detail['SHA1'], detail['size'],
                             detail['PostDateString'])
                        )
        # Rows are committed only after the whole crawl succeeded.
        dbconn.commit()
    finally:
        # Always release the cursor and the connection, including when a
        # request or an insert raised part-way through the crawl.
        c.close()
        dbconn.close()