bkmeneguello
1/3/2016 - 12:26 PM

delicious-parser.py

#!/usr/bin/python

import json
from HTMLParser import HTMLParser

class MyHTMLParser(HTMLParser):
  """Turn <dt>/<a>/<dd> bookmark markup into one JSON object per line.

  Each bookmark is expected as ``<dt><a href=... tags=...>title</a>``,
  optionally followed by ``<dd>description``. For every bookmark a line of
  the form ``{"name": ..., "tags": ["url:<href>", <tag>, ...], "info": ...}``
  is printed to stdout; ``"info"`` is omitted when there is no description.

  An entry is emitted when the next ``<dt>`` starts, when ``</dl>`` is seen,
  or when ``close()`` is called, whichever comes first.
  """

  def __init__(self):
    HTMLParser.__init__(self)
    self._reset_entry()

  def _reset_entry(self):
    """Forget the entry being collected and clear all position flags."""
    self.on_dt = False
    self.on_dd = False
    self.on_a = False
    self.node = None   # link text of the current bookmark
    self.tags = None   # raw comma-separated "tags" attribute
    self.url = None    # "href" attribute
    self.info = None   # text collected inside <dd>

  def handle_starttag(self, tag, attrs):
    """Track <dt>/<a>/<dd> and capture the anchor's href and tags."""
    attrs = dict(attrs)
    if tag == 'dt':
      # A new <dt> always ends the previous entry. This must not depend on
      # on_dt, because <dd> clears that flag and entries with a description
      # would otherwise never be emitted.
      self.flush_node()
      self.on_dt = True
    if tag == 'a':
      # .get(): an anchor without href/tags must not abort the whole run.
      self.url = attrs.get('href')
      self.tags = attrs.get('tags')
      self.on_a = True
    if tag == 'dd':
      self.on_dt = False
      self.on_dd = True

  def handle_endtag(self, tag):
    """Leave anchor text mode on </a>; emit the last entry on </dl>."""
    if tag == 'a':
      self.on_a = False
    if tag == 'dl':
      self.flush_node()

  def handle_data(self, data):
    """Append text to the title (inside <a>) and/or description (after <dd>).

    Text is accumulated rather than overwritten because the parser may
    deliver one run of text in several calls.
    NOTE(review): entity/character references that the parser reports
    through separate callbacks are still not captured here.
    """
    if self.on_a:
      self.node = (self.node or '') + data
    if self.on_dd:
      self.info = (self.info or '') + data

  def close(self):
    """Finish parsing and emit an entry that was still pending."""
    HTMLParser.close(self)
    self.flush_node()

  def flush_node(self):
    """Print the collected entry as a JSON object, then reset the state.

    Does nothing except reset when no anchor has been seen since the last
    flush, so it is safe to call on empty or nested lists.
    """
    if self.node is not None or self.url is not None:
      tags = []
      if self.url is not None:
        tags.append('url:%s' % self.url)
      if self.tags:
        # Skip empty items so tags="" or "a,,b" yield no "" tag.
        tags.extend(t for t in self.tags.split(',') if t)

      # json.dumps escapes quotes, backslashes and control characters; the
      # pieces are joined by hand only to keep the name/tags/info key order.
      fields = [
        '"name": %s' % json.dumps(self.node),
        '"tags": %s' % json.dumps(tags),
      ]
      info = self.info.strip() if self.info else ''
      if info:
        fields.append('"info": %s' % json.dumps(info))
      # Single-argument print(...) is valid as a Python 2 statement and as a
      # Python 3 function call.
      print('{%s}' % ', '.join(fields))
    self._reset_entry()

if __name__ == '__main__':
  import sys

  # An optional first command-line argument overrides the input file; with
  # no argument the original hard-coded location is used.
  path = sys.argv[1] if len(sys.argv) > 1 else '/home/bruno/Desktop/delicious.html'

  parser = MyHTMLParser()
  with open(path, 'r') as f:
    parser.feed(f.read())
  # close() tells the parser the input is complete so that anything it was
  # still buffering gets processed.
  parser.close()