ishideo
10/16/2017 - 6:25 AM

lodge_articles.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from __future__ import print_function
from bs4 import BeautifulSoup
import mechanize, re, time, json, codecs

class LodgeArticles:
    """Scrape the article table of a login-protected site and export it as JSON.

    Instantiating the class runs the whole pipeline: it configures a
    ``mechanize`` browser, opens the login URL and then calls
    :meth:`scraping`, which also writes ``lodge_articles.txt``.
    """

    def __init__(self):
        """Configure the browser, open the login page and run the scrape."""
        self.mech = mechanize.Browser()
        self.mech.set_handle_equiv(True)
        self.mech.set_handle_redirect(True)
        self.mech.set_handle_referer(True)
        # robots.txt handling is switched off for this browser.
        self.mech.set_handle_robots(False)
        self.mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=1)
        self.mech.addheaders = [('User-agent',
                                ('Mozilla/5.0 (Windows; U; Windows NT 5.1; rv:1.7.3)'
                                ' Gecko/20041001 Firefox/0.10.1'))]
        # NOTE(review): URL and credentials are hard-coded placeholders;
        # consider loading them from configuration instead of the source.
        loginurl = 'http://xxx.xxx.xxx.xxx:8080/'
        self.mech.add_password(loginurl, 'id', 'password')
        self.mech.open(loginurl)
        self.scraping()

    def mechanize(self):
        """Submit the login form and return the articles page as text.

        Fills the first form of the currently loaded page with the e-mail
        and password, submits it, then opens the articles listing.

        Returns:
            The body of the articles page decoded as UTF-8.
        """
        mech = self.mech
        mech.select_form(nr=0)  # the first form on the page
        mech['user[email]'] = 'xxx@xxxx.xxx'
        mech['user[password]'] = 'xxxxxxxx'
        mech.submit()
        # NOTE(review): this host is hard-coded separately from the login
        # URL used in __init__ -- confirm both point at the same server.
        mech.open('http://52.198.111.63:8080/articles')
        return mech.response().read().decode('utf-8')

    def scraping(self):
        """Parse the articles page, export the rows and return them.

        For every ``<tr>`` inside the first ``<tbody>`` a row is built from
        the stripped text of the first cell, the ``href`` of the link in
        that first cell, and the text of the fifth cell.

        Returns:
            A list of ``[title, url, date]`` lists, one per table row.
        """
        soup = BeautifulSoup(self.mechanize())
        articles = []
        for row in soup.tbody.findAll('tr'):
            # Look the cells up once per row instead of once per column.
            cells = row.findAll('td')
            title_cell = cells[0]
            articles.append([title_cell.get_text().strip(),
                             title_cell.a.get('href'),
                             cells[4].get_text()])
        # A real list (not a lazy ``map`` object) keeps this serializable
        # on every Python version.
        self.encode_json(articles)
        return articles

    def encode_json(self, articles):
        """Write *articles* to ``lodge_articles.txt`` as JSON and return the text.

        Args:
            articles: The rows to store under the ``"rows"`` key.

        Returns:
            The JSON document that was written, with non-ASCII characters
            left unescaped.
        """
        # NOTE(review): four headers are declared but scraping() produces
        # three values per row ("error" has no column) -- confirm intent.
        payload = {"headers": ["title", "url", "date", "error"], "rows": articles}
        serialized = json.dumps(payload, ensure_ascii=False)
        # The context manager guarantees the file is flushed and closed,
        # even if the write fails.
        with codecs.open('lodge_articles.txt', 'w', 'utf-8') as out_file:
            out_file.write(serialized)
        return serialized


if __name__ == '__main__':
    # Constructing the object is the whole program: __init__ opens the
    # login URL and calls scraping(), which writes lodge_articles.txt.
    LodgeArticles()