ishideo
11/9/2016 - 5:29 AM

get_bhatena.py

#!/usr/bin/env python


from BeautifulSoup import BeautifulStoneSoup
import sys, urllib2, re, pprint, codecs
import inspect, doctest
from pprint import pprint

# Replace sys.stdout with a codecs stream writer that encodes everything
# written to it as UTF-8, so the unicode strings printed below can be emitted.
sys.stdout = codecs.getwriter('utf-8')(sys.stdout)

def main():
  """Call print_date() for every id listed in ./list.txt.

  Each line of the file is split on tabs and the first field is used as
  the id. Surrounding whitespace (including the trailing newline of a line
  that contains no tab) is stripped, and lines whose first field is empty
  are skipped instead of being sent to print_date().
  """
  path = './list.txt'
  # The with-block closes the file even if print_date() raises.
  with open(path, 'r') as handle:
    for line in handle:
      # Lines read from a file keep their newline, so a truthiness test on
      # the raw line never detects a blank one; test the cleaned field.
      user_id = line.split('\t')[0].strip()
      if not user_id:
        continue
      print_date(user_id)

def print_date(id):
  """Print one tab-separated summary line for the feed of *id*.

  Downloads ``http://b.hatena.ne.jp/<id>/rss`` and writes
  ``id<TAB>channel title<TAB>totalResults<TAB>first dc:date`` to stdout.

  The function is best-effort: when the feed cannot be downloaded or one
  of the fields is missing, nothing is written to stdout for that id and a
  short note is written to stderr instead.

  id -- identifier inserted verbatim into the feed URL.
  """
  def skip(reason):
    # Report on stderr so the tab-separated stdout stream stays clean.
    sys.stderr.write('skipped %s: %s\n' % (id, reason))

  url = 'http://b.hatena.ne.jp/' + id + '/rss'
  try:
    response = urllib2.urlopen(url)
    try:
      get_xml = response.read()
    finally:
      response.close()
  except Exception:
    # Still tolerant of any download failure, but unlike a bare "except:"
    # this lets KeyboardInterrupt and SystemExit propagate.
    skip('could not fetch ' + url)
    return

  # The total is taken from the raw XML text with a regex rather than from
  # the parsed tree.
  found = re.search(
      r"<opensearch:totalResults>(.*?)</opensearch:totalResults>", get_xml)
  if found is None:
    skip('no opensearch:totalResults element')
    return
  total = found.group(1)

  soup = BeautifulStoneSoup(get_xml, convertEntities=BeautifulStoneSoup.XML_ENTITIES)
  channel = soup.channel
  titles = channel('title') if channel is not None else []
  dates = soup('dc:date')
  if not titles or not dates:
    skip('channel title or dc:date element missing')
    return

  title = titles[0].string
  date = dates[0].string
  if title is None or date is None:
    skip('channel title or dc:date has no text')
    return

  try:
    row = id + "\t" + title + "\t" + total + "\t" + date
  except UnicodeError:
    # Mixing non-ASCII byte strings with unicode text fails on implicit
    # decoding; skip the row rather than abort the whole run.
    skip('fields could not be combined into one line')
    return
  print(row)

# Run main() only when this file is executed as a script, not when imported.
if __name__ == '__main__':
  main()