#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
from bs4 import BeautifulSoup
import mechanize, re, time, json, codecs
class LodgeArticles:
    """Log in to a remote articles service, scrape its article table,
    and export the rows to ``lodge_articles.txt`` as JSON.

    All work is driven from ``__init__``: it configures the browser,
    performs HTTP basic auth against the login URL, then runs the
    scrape-and-export pipeline via ``scraping()``.
    """

    def __init__(self):
        # Browser that follows redirects/meta-refreshes and sends a
        # Referer header, but deliberately ignores robots.txt.
        self.mech = mechanize.Browser()
        self.mech.set_handle_equiv(True)
        self.mech.set_handle_redirect(True)
        self.mech.set_handle_referer(True)
        self.mech.set_handle_robots(False)
        self.mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        # Spoof a desktop User-Agent; some sites reject the default one.
        self.mech.addheaders = [('User-agent',
                                 ('Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3)'
                                  ' Gecko/20041001 Firefox/0.10.1'))]
        loginurl = 'http://xxx.xxx.xxx.xxx:8080/'
        # HTTP basic-auth credentials for the login URL (placeholders).
        self.mech.add_password(loginurl, 'id', 'password')
        self.mech.open(loginurl)
        self.scraping()

    def mechanize(self):
        """Submit the first login form and fetch the articles listing.

        Returns:
            The articles page HTML decoded as a unicode string.

        NOTE(review): this method name shadows the ``mechanize`` module;
        kept unchanged for backward compatibility with existing callers.
        """
        mech = self.mech
        mech.select_form(nr=0)
        mech['user[email]'] = 'xxx@xxxx.xxx'
        mech['user[password]'] = 'xxxxxxxx'
        mech.submit()
        # NOTE(review): hard-coded IP differs from ``loginurl`` above —
        # confirm both point at the same host.
        mech.open('http://52.198.111.63:8080/articles')
        return mech.response().read().decode('utf-8')

    def scraping(self):
        """Parse the article table into [title, url, date] rows.

        Exports the rows via ``encode_json`` as a side effect and
        returns the list of rows.
        """
        # Explicit parser makes results deterministic across installs
        # and silences bs4's "no parser specified" warning.
        soup = BeautifulSoup(self.mechanize(), 'html.parser')
        articles = []
        for row in soup.tbody.findAll('tr'):
            cells = row.findAll('td')  # query each row once, not three times
            articles.append([cells[0].get_text().strip(),   # title
                             cells[0].a.get('href'),        # article URL
                             cells[4].get_text()])          # date column
        self.encode_json(articles)
        return articles

    def encode_json(self, articles):
        """Write *articles* to ``lodge_articles.txt`` as UTF-8 JSON.

        Args:
            articles: list of ``[title, url, date]`` rows.
        Returns:
            The same payload serialized to a JSON string.
        """
        payload = {"headers": ["title", "url", "date", "error"],
                   "rows": articles}
        # ``with`` guarantees the handle is closed (the original leaked it);
        # ``payload`` avoids shadowing the ``hash`` builtin.
        with codecs.open('lodge_articles.txt', 'w', 'utf-8') as f:
            json.dump(payload, f, ensure_ascii=False)
        return json.dumps(payload, ensure_ascii=False)
if __name__ == '__main__':
    # Constructing the scraper runs the whole login/scrape/export
    # pipeline from __init__; the instance itself is not needed.
    LodgeArticles()