# ishideo
# 10/16/2017 - 6:30 AM
#
# knowledge_articles.py

#!/usr/bin/env python2
# -*- coding: utf-8 -*-

from __future__ import unicode_literals
from __future__ import print_function
from bs4 import BeautifulSoup
import mechanize, re, time, json, codecs

class KnowledgeArticles:
    """Scrape the knowledge-article list from a (redacted) site.

    On construction, logs in with the (redacted) credentials, fetches the
    article list page, and dumps the scraped rows to
    ``knowledge_articles.txt`` as JSON.
    """

    def __init__(self):
        # Configure a mechanize browser that looks like a desktop Firefox:
        # follow redirects/refresh, send Referer, ignore robots.txt.
        self.mech = mechanize.Browser()
        self.mech.set_handle_equiv(True)
        self.mech.set_handle_redirect(True)
        self.mech.set_handle_referer(True)
        self.mech.set_handle_robots(False)
        self.mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=3)
        self.mech.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]

        # Log in (all URLs/credentials redacted in SOURCE) and run the scrape.
        self.mech.open('https://xxxx.xxx')
        self.mech.select_form(name='login')
        self.mech['username'] = 'xxx@xxxx.xxxx'
        self.mech['password'] = 'xxxxxxxx'
        self.mech.submit(type='submit')
        self.scraping()

    def mechanize(self):
        """Fetch the article list page and return its body as unicode.

        The body is read and decoded exactly once; the original called
        ``response().read()`` twice, decoding the full page a second time
        (and, depending on the HTTP stack, a second read may yield b'').
        """
        self.mech.open('https://xxxx.xxx/open.knowledge/list')
        body = self.mech.response().read().decode('utf-8')
        print(body)
        return body

    def scraping(self):
        """Parse the list page into [title, url, date] rows.

        Returns the list of rows and, as a side effect, writes them to
        knowledge_articles.txt via :meth:`encode_json`.
        """
        # Explicit 'html.parser' pins the stdlib parser: no bs4 warning,
        # and results don't depend on which third-party parsers are installed.
        soup = BeautifulSoup(self.mechanize(), 'html.parser')
        tr_items = soup.tbody.findAll('tr')
        articles = []
        for row in tr_items:
            cells = row.findAll('td')  # hoisted: original queried the row 3x
            articles.append([
                cells[0].get_text().strip(),  # title
                cells[0].a.get('href'),       # url
                cells[4].get_text(),          # date
            ])
        # A real list (not a lazy map object) also keeps this correct
        # under Python 3, where map() would break json serialization.
        self.encode_json(articles)
        return articles

    def encode_json(self, articles):
        """Write *articles* to knowledge_articles.txt as a JSON document.

        articles: list of [title, url, date] rows.
        Returns the JSON document as a unicode string.
        """
        # 'payload' instead of 'hash': don't shadow the builtin.
        payload = {"headers": ["title", "url", "date", "error"], "rows": articles}
        # Serialize once and reuse; the original dumped the payload twice.
        text = json.dumps(payload, ensure_ascii=False)
        # codecs.open keeps the output UTF-8 on Python 2 as well; 'with'
        # guarantees the handle is closed (the original leaked it).
        with codecs.open('knowledge_articles.txt', 'w', 'utf-8') as f:
            f.write(text)
        return text

if __name__ == '__main__':
    # Running as a script performs the full login-and-scrape cycle;
    # the constructor does all the work.
    scraper = KnowledgeArticles()