#!/usr/bin/env python2
# -*- coding: utf-8 -*-
from __future__ import unicode_literals
from __future__ import print_function
from bs4 import BeautifulSoup
import mechanize, re, time, json, codecs
class KnowledgeArticles:
    """Scrape the knowledge-article list from a (placeholder) site.

    Construction performs the whole workflow as a side effect: log in via
    mechanize, fetch the listing page, extract ``[title, url, date]`` per
    table row, and dump the result as JSON to ``knowledge_articles.txt``.
    """
    def __init__(self):
        # Configure a browser that behaves like a normal Firefox client;
        # robots.txt handling is disabled deliberately.
        self.mech = mechanize.Browser()
        self.mech.set_handle_equiv(True)
        self.mech.set_handle_redirect(True)
        self.mech.set_handle_referer(True)
        self.mech.set_handle_robots(False)
        self.mech.set_handle_refresh(mechanize._http.HTTPRefreshProcessor(), max_time=3)
        self.mech.addheaders = [('User-agent', 'Mozilla/5.0 (X11; U; Linux i686; en-US; rv:1.9.0.1) Gecko/2008071615 Fedora/3.0.1-1.fc9 Firefox/3.0.1')]
        # SECURITY: credentials are hard-coded (placeholders here) -- move
        # them to environment variables or a config file before real use.
        self.mech.open('https://xxxx.xxx')
        self.mech.select_form(name='login')
        self.mech['username'] = 'xxx@xxxx.xxxx'
        self.mech['password'] = 'xxxxxxxx'
        self.mech.submit(type='submit')
        self.scraping()

    def mechanize(self):
        """Fetch the article-list page and return its body as unicode text.

        BUG FIX: the original called ``response().read()`` twice -- once for
        the debug print and once for the return value. The response body is
        a stream that can only be consumed once, so the caller received an
        empty string. Read once, then reuse.
        """
        self.mech.open('https://xxxx.xxx/open.knowledge/list')
        html = self.mech.response().read().decode('utf-8')
        print(html)
        return html

    def scraping(self):
        """Extract ``[title, url, date]`` triples from the listing table.

        Also writes the result to disk via :meth:`encode_json`.
        NOTE(review): BeautifulSoup is called without an explicit parser to
        preserve the original behavior (whichever parser bs4 auto-selects).
        """
        soup = BeautifulSoup(self.mechanize())
        articles = []
        for row in soup.tbody.findAll('tr'):
            # Hoisted: the original re-ran findAll('td') three times per row.
            cells = row.findAll('td')
            articles.append([cells[0].get_text().strip(),
                             cells[0].a.get('href'),
                             cells[4].get_text()])
        self.encode_json(articles)
        return articles

    def encode_json(self, articles):
        """Serialize *articles* to ``knowledge_articles.txt`` (UTF-8 JSON).

        Returns the JSON text as well. Fixes from the original: the file
        handle is now closed via ``with`` (it previously leaked), and the
        builtin ``hash`` is no longer shadowed.
        """
        payload = {"headers": ["title", "url", "date", "error"],
                   "rows": articles}
        with codecs.open('knowledge_articles.txt', 'w', 'utf-8') as f:
            json.dump(payload, f, ensure_ascii=False)
        return json.dumps(payload, ensure_ascii=False)
if __name__ == '__main__':
    # Instantiation alone runs the full workflow (login, scrape, write JSON)
    # as a side effect of __init__ -- no further calls are needed.
    KnowledgeArticles()