liuyenting
5/19/2016 - 4:45 PM

Dictionary Crawler

Dictionary Crawler

#!/usr/bin/env python3

import requests, json, sys, random
from html.parser import HTMLParser
from time import sleep
from tqdm import tqdm

import cProfile

# CGI endpoint that main() sends one HTTP POST to per result page.
url = 'http://words.sinica.edu.tw/ftms-bin/scripts/words_scripts/zizi1.pl'

# Step between consecutive 'count' offsets posted by main();
# presumably the number of entries the server returns per page -- confirm.
single_page = 20

# Keys of a parsed entry. table_parser.handle_data indexes this list with
# (cell number - 1) for cells 2..6, so 'id' (index 0) is never filled in.
dict_tags = ['id', 'word', 'phonetic', 'pinyin', 'pos', 'frequency']
# Tags whose closing form makes table_parser.handle_endtag stop capturing text.
html_tags = ['table', 'tr', 'th', 'td']

def main(start, end):
    """Crawl result pages ``start`` through ``end`` (inclusive) into a file.

    For every page index an HTTP POST is sent to ``url`` whose ``count``
    form field is ``page index * single_page``.  The response HTML is fed
    to a ``table_parser``, which writes each parsed entry through its
    ``writer`` attribute into ``dict-<start>-<end>.json``.

    Args:
        start: index of the first page to fetch.
        end: index of the last page to fetch (inclusive).

    Raises:
        ValueError: if the server answers with a status code other than 200.
    """
    parser = table_parser()
    # The parser keeps its state in plain attributes; (re)initialise them.
    parser.can_parse = False
    parser.counter = 0
    parser.buffer = {}
    parser.writer = None

    filename = 'dict-{}-{}.json'.format(start, end)
    # 'count' is overwritten for every request below.
    # NOTE(review): 'amount' is presumably the total number of entries the
    # server expects to be told about -- confirm.
    form = {'count': 0, 'amount': 127274}

    # Entries are dumped with ensure_ascii=False, so the file must be able
    # to hold arbitrary Unicode regardless of the platform's default codec.
    with open(filename, 'w', encoding='utf-8') as fp:
        parser.writer = fp

        id_start = start * single_page
        id_end = (end + 1) * single_page
        for pos in tqdm(range(id_start, id_end, single_page)):
            form['count'] = pos
            res = requests.post(url, data=form)
            if res.status_code != 200:
                raise ValueError(
                    'HTTP request failed, {}'.format(res.reason))

            # Drop the header-declared encoding so that requests guesses
            # the charset from the body when decoding res.text.
            res.encoding = None
            parser.feed(res.text)
            # Push finished pages to disk so an aborted crawl keeps them.
            fp.flush()

# subclass to override the handler methods
class table_parser(HTMLParser):
    """Extract dictionary entries from the rows of an HTML table.

    State attributes (also assignable from outside after construction):
      can_parse -- True while text inside the current row may be captured
      counter   -- number of <td> tags seen since the last <tr>
      buffer    -- dict holding the fields of the entry being assembled
      writer    -- writable text file object that finished entries go to
    """

    def __init__(self, *args, **kwargs):
        """Create the parser with capturing off and no writer attached."""
        super().__init__(*args, **kwargs)
        self.can_parse = False
        self.counter = 0
        self.buffer = {}
        self.writer = None

    def handle_starttag(self, tag, attrs):
        """Track the row/cell position from opening tags."""
        if tag == 'tr':
            # A new row starts: enable capturing and restart cell counting.
            # The buffer is deliberately kept; its keys get overwritten.
            self.can_parse = True
            self.counter = 0
        elif tag == 'td':
            self.counter = self.counter + 1
        elif tag == 'th':
            # ignore table headers
            self.can_parse = False

    def handle_endtag(self, tag):
        """Stop capturing at any closing table/tr/th/td tag."""
        # NOTE(review): capturing is only re-enabled by the next <tr>, so
        # cells are captured only when the page omits closing cell tags --
        # presumably the case for the crawled page; confirm.
        if tag in html_tags:
            self.can_parse = False

    def handle_data(self, data):
        """Store the text of cells 2..6 and emit the entry at cell 6.

        Raises:
            ValueError: if text is seen after more than six cells in a row.
        """
        if not self.can_parse:
            return

        if self.counter > 6:
            raise ValueError('Invalid counter position')

        if 2 <= self.counter <= 6:
            # strip the spaces and newline around the cell text
            self.buffer[dict_tags[self.counter - 1]] = data.strip(' \n')

        if self.counter == 6:
            # Last field reached: write the entry followed by ',\n'.
            json.dump(self.buffer, self.writer,
                      indent=4, ensure_ascii=False)
            self.writer.write(',\n')

if __name__ == '__main__':
    # Usage: <script> START END -- both page indices are parsed as integers
    # and handed to main() unchanged.
    start, end = int(sys.argv[1]), int(sys.argv[2])
    main(start, end)