Dictionary Crawler
#!/usr/bin/env python3
import requests, json, sys, random
from html.parser import HTMLParser
from time import sleep
from tqdm import tqdm
import cProfile
# Endpoint that main() sends its HTTP POST requests to.
url = 'http://words.sinica.edu.tw/ftms-bin/scripts/words_scripts/zizi1.pl'
# Step between consecutive 'count' offsets requested by main();
# presumably the number of entries the server returns per page -- TODO confirm.
single_page = 20
# Keys of a parsed entry; table_parser.handle_data indexes this list with
# (column counter - 1) for columns 2..6, so 'id' is never stored.
dict_tags = ['id', 'word', 'phonetic', 'pinyin', 'pos', 'frequency']
# Tags whose closing tag makes table_parser stop capturing text.
html_tags = ['table', 'tr', 'th', 'td']
def main(start, end):
    """Crawl result pages ``start`` through ``end`` (inclusive) into a file.

    For every page an HTTP POST is sent to ``url`` with the form field
    ``count`` set to the page index multiplied by ``single_page``.  The
    response HTML is fed to a ``table_parser`` whose ``writer`` is the output
    file ``dict-<start>-<end>.json`` in the current working directory.

    Args:
        start: Index of the first page to fetch.
        end: Index of the last page to fetch (inclusive).

    Raises:
        ValueError: If the server answers with a status code other than 200.
    """
    parser = table_parser()
    # Indicators the parser uses while walking the table.
    parser.can_parse = False
    parser.counter = 0
    # Parser storage: the entry being assembled and the output stream.
    parser.buffer = {}
    parser.writer = None

    filename = 'dict-{}-{}.json'.format(start, end)
    # Form fields of the POST; 'count' is overwritten for every page.
    # NOTE(review): 'amount' looks like the total number of entries on the
    # server -- confirm against the site.
    data = {'count': 0, 'amount': 127274}

    # Entries are dumped with ensure_ascii=False, so the file must be opened
    # with an explicit UTF-8 encoding instead of the locale default.
    with open(filename, 'w', encoding='utf-8') as fp:
        parser.writer = fp
        id_start = start * single_page
        id_end = (end + 1) * single_page
        for pos in tqdm(range(id_start, id_end, single_page)):
            data['count'] = pos
            res = requests.post(url, data=data)
            if res.status_code != 200:
                raise ValueError(
                    'HTTP request failed, {}'.format(res.reason))
            # With encoding unset, requests guesses the charset from the
            # response body when decoding ``res.text``.
            res.encoding = None
            parser.feed(res.text)
            # Push each finished page to disk so an interrupted crawl keeps
            # the pages fetched so far.
            fp.flush()
# subclass to override the handler methods
class table_parser(HTMLParser):
    """HTML parser that turns table rows into JSON objects on ``writer``.

    State attributes (callers may overwrite them after construction):
        can_parse: True while cell text should be captured.
        counter: Number of ``<td>`` tags seen since the last ``<tr>``.
        buffer: Dict holding the entry currently being assembled.
        writer: File-like object that receives the serialized entries.
    """

    def __init__(self, writer=None, **kwargs):
        """Initialise the parser state.

        Args:
            writer: Optional file-like object with a ``write`` method.
            **kwargs: Forwarded unchanged to ``HTMLParser.__init__``.
        """
        super().__init__(**kwargs)
        self.can_parse = False
        self.counter = 0
        self.buffer = {}
        self.writer = writer

    def handle_starttag(self, tag, attrs):
        """Track row/cell boundaries: ``tr`` starts a row, ``td`` a column."""
        if tag == 'tr':
            self.can_parse = True
            # A new row restarts the column count.
            # NOTE(review): the buffer is not cleared per row; keys are only
            # overwritten, so a cell without text would keep the previous
            # row's value -- confirm this is intended.
            self.counter = 0
        elif tag == 'td':
            self.counter += 1
        elif tag == 'th':
            # Header cells are not data; stop capturing.
            self.can_parse = False

    def handle_endtag(self, tag):
        """Stop capturing text when any table-related tag is closed."""
        if tag in html_tags:
            self.can_parse = False

    def handle_data(self, data):
        """Store cell text for columns 2..6 and emit the entry at column 6.

        Raises:
            ValueError: If text is seen while capturing beyond column 6.
        """
        if not self.can_parse:
            return
        if 2 <= self.counter <= 6:
            # Strip surrounding spaces and newlines from the cell text.
            self.buffer[dict_tags[self.counter - 1]] = data.strip(' \n')
        elif self.counter > 6:
            raise ValueError('Invalid counter position')
        if self.counter == 6:
            # Last column reached: write the parsed entry to the output.
            json.dump(self.buffer, self.writer,
                      indent=4, ensure_ascii=False)
            self.writer.write(',\n')
if __name__ == '__main__':
    # Command line: <script> START END -- both are parsed as integers and
    # passed to main() as the first and last page index.
    if len(sys.argv) < 3:
        # Exit with a usage hint instead of an IndexError traceback.
        sys.exit('usage: {} START END'.format(sys.argv[0]))
    start = int(sys.argv[1])
    end = int(sys.argv[2])
    main(start, end)