zenwalker
9/4/2015 - 1:15 PM

habraproxy.py

# -*- coding: utf-8 -*-
from SimpleHTTPServer import SimpleHTTPRequestHandler
from SocketServer import TCPServer
from StringIO import StringIO
from lxml import html
import webbrowser
import urllib2
import gzip
import re

# Local TCP port the proxy listens on; also substituted into rewritten links.
PORT = 8232
# Upstream host whose pages are fetched and whose links are redirected
# back to the local proxy.
WEBSITE = 'habrahabr.ru'


class Proxy(SimpleHTTPRequestHandler):
    """Request handler that relays GET requests to WEBSITE.

    HTML responses are passed through process_content() before being
    returned to the client; every other content type is relayed untouched.
    """

    def do_GET(self):
        """Fetch self.path from WEBSITE and send the result to the client.

        Upstream HTTP errors (404, 500, ...) are relayed with their original
        status and body.  If the upstream cannot be reached at all, a
        502 Bad Gateway is sent instead.
        """
        try:
            response = urllib2.urlopen('http://' + WEBSITE + self.path)
        except urllib2.HTTPError as error:
            # HTTPError is itself a file-like response carrying the error
            # page, so it can be relayed like a successful response.
            response = error
        except urllib2.URLError as error:
            self.send_error(502, str(error.reason))
            return

        try:
            status = response.getcode()
            info = response.info()
            content = response.read()
        finally:
            response.close()

        if info.get('Content-Encoding') == 'gzip':
            content = gzip.GzipFile(fileobj=StringIO(content)).read()

        # The header may be absent; treat that as "not HTML".
        content_type = info.get('Content-Type') or ''
        if 'text/html' in content_type:
            content = process_content(content)

        # A status line and headers are required for a valid HTTP response.
        # Content-Encoding is deliberately not forwarded because the body
        # has already been decompressed above.
        self.send_response(status)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(content)))
        self.end_headers()
        self.wfile.write(content)


def process_content(content):
    """Decorate six-character words with a trademark sign and localise links.

    The HTML in *content* is parsed with lxml.  Inside <body>, every run of
    exactly six word characters gets a trademark sign appended, except for
    text located inside <pre>, <code>, <script> or <style> elements.  The
    href of every <a> element has '//WEBSITE' replaced by
    '//localhost:PORT' so navigation stays on the proxy.

    Returns the serialised HTML.  Empty or whitespace-only input is
    returned unchanged.
    """
    # lxml refuses to parse an empty document; there is nothing to rewrite.
    if not content or not content.strip():
        return content

    word_regex = re.compile(r'(\b\w{6}\b)', flags=re.UNICODE)
    ignore_tags = ('pre', 'code', 'script', 'style')
    tree = html.fromstring(content)

    def add_tm(text):
        # u'\u2122' is the trademark sign; spelling it as an escape keeps the
        # literal independent of the source file's encoding declaration.
        return word_regex.sub(u'\\1\u2122', unicode(text))

    def fix_link(text):
        return text.replace('//%s' % WEBSITE, '//localhost:' + str(PORT))

    def is_ignored(node):
        # True when *node* is, or is nested inside, one of the ignored tags.
        if node is None:
            return False
        if node.tag in ignore_tags:
            return True
        return any(parent.tag in ignore_tags
                   for parent in node.iterancestors())

    for node in tree.xpath('.//body/descendant-or-self::*'):
        if node.text and not is_ignored(node):
            node.text = add_tm(node.text)
        # The tail is the text *after* the closing tag, so it belongs to the
        # parent's content: whether to rewrite it depends on the parent,
        # not on the node itself.
        if node.tail and not is_ignored(node.getparent()):
            node.tail = add_tm(node.tail)

        if node.tag == 'a' and 'href' in node.attrib:
            node.attrib['href'] = fix_link(node.attrib['href'])

    return html.tostring(tree)


def main():
    """Start the proxy on PORT and open it in the default web browser.

    The listening socket is created before the browser is launched, so a
    bind failure (for example, the port already being in use) surfaces
    immediately instead of leaving a browser tab pointing at nothing.
    Requests arriving before serve_forever() starts wait in the listen
    backlog.  Ctrl-C stops the server and releases the socket.
    """
    server = TCPServer(('', PORT), Proxy)
    webbrowser.open('http://localhost:%d' % PORT)

    try:
        server.serve_forever()
    except KeyboardInterrupt:
        # Normal way to stop a foreground server; exit quietly.
        pass
    finally:
        server.server_close()


# Run the proxy only when executed as a script, not when imported.
if __name__ == '__main__':
    main()