# -*- coding: utf-8 -*-
from SimpleHTTPServer import SimpleHTTPRequestHandler
from SocketServer import TCPServer
from StringIO import StringIO
from lxml import html
import webbrowser
import urllib2
import gzip
import re
# Local TCP port the proxy server listens on (also used when rewriting links).
PORT = 8232
# Host name of the upstream site whose pages are fetched and rewritten.
WEBSITE = "habrahabr.ru"
class Proxy(SimpleHTTPRequestHandler):
    """Request handler that mirrors WEBSITE and rewrites its HTML pages.

    Every GET is forwarded to ``http://WEBSITE<path>``. HTML responses are
    passed through ``process_content`` before being sent to the client;
    any other content type is relayed unchanged.

    NOTE(review): ``do_HEAD`` is still inherited from
    ``SimpleHTTPRequestHandler``, which answers from the local working
    directory rather than from WEBSITE -- confirm whether that is intended.
    """

    def do_GET(self):
        """Fetch ``self.path`` from WEBSITE and relay it to the client."""
        try:
            response = urllib2.urlopen('http://' + WEBSITE + self.path)
        except urllib2.HTTPError as error:
            # HTTPError doubles as a response object (status, headers, body),
            # so upstream 4xx/5xx pages are relayed like any other response.
            response = error
        except urllib2.URLError as error:
            # The upstream host could not be reached at all.
            self.send_error(502, 'Upstream request failed: %s' % error.reason)
            return

        try:
            status = response.getcode()
            info = response.info()
            content = response.read()
        finally:
            response.close()

        if info.get('Content-Encoding') == 'gzip':
            # The body is decompressed here, so Content-Encoding is
            # deliberately not forwarded to the client below.
            content = gzip.GzipFile(fileobj=StringIO(content)).read()

        # A missing Content-Type header must not break the substring test.
        content_type = info.get('Content-Type') or ''
        if 'text/html' in content_type:
            content = process_content(content)

        # Emit a complete HTTP response: status line, headers, then the body.
        self.send_response(status)
        if content_type:
            self.send_header('Content-Type', content_type)
        self.send_header('Content-Length', str(len(content)))
        self.end_headers()
        self.wfile.write(content)
def process_content(content):
    """Decorate an HTML page and redirect its links to the local proxy.

    Parses *content* with lxml, appends a trademark sign to every
    six-character word (``\\w{6}`` between word boundaries) found in the
    text of ``<body>``, and rewrites ``href`` attributes of ``<a>`` elements
    so that ``//WEBSITE`` becomes ``//localhost:PORT``.

    Text located inside ``pre``, ``code``, ``script`` or ``style`` elements,
    at any nesting depth, is left untouched.

    Returns the page serialized by ``lxml.html.tostring``. Empty or
    whitespace-only input is returned unchanged.
    """
    if not content.strip():
        # lxml refuses to parse an empty document; there is nothing to do.
        return content

    word_regex = re.compile(r'(\b\w{6}\b)', flags=re.UNICODE)
    ignore_tags = frozenset(['pre', 'code', 'script', 'style'])
    tree = html.fromstring(content)

    def add_tm(text):
        # u'\u2122' is the trademark sign; the escaped form keeps this
        # literal independent of the source file's encoding declaration.
        return word_regex.sub(u'\\1\u2122', unicode(text))

    def fix_link(text):
        # NOTE(review): the scheme is kept as is, so an explicit
        # ``https://WEBSITE`` link becomes ``https://localhost:PORT``.
        return text.replace('//%s' % WEBSITE, '//localhost:' + str(PORT))

    def has_ignored_ancestor(node):
        return any(parent.tag in ignore_tags
                   for parent in node.iterancestors())

    for node in tree.xpath('.//body/descendant-or-self::*'):
        enclosed = has_ignored_ancestor(node)
        # ``text`` lives inside the node: skip it when the node itself or
        # any element around it is ignored.
        if node.text and not enclosed and node.tag not in ignore_tags:
            node.text = add_tm(node.text)
        # ``tail`` follows the node's closing tag, so it belongs to the
        # parent: only the ancestors decide whether it is skipped.
        if node.tail and not enclosed:
            node.tail = add_tm(node.tail)
        if node.tag == 'a' and 'href' in node.attrib:
            node.attrib['href'] = fix_link(node.attrib['href'])
    return html.tostring(tree)
def main():
    """Start the proxy on PORT and open it in the default web browser.

    Blocks in ``serve_forever`` until the process is interrupted; the
    listening socket is closed on the way out.
    """
    # Constructing TCPServer binds and starts listening, so doing it before
    # launching the browser guarantees the first request is not refused.
    server = TCPServer(('', PORT), Proxy)
    try:
        webbrowser.open('http://localhost:%d' % PORT)
        server.serve_forever()
    finally:
        server.server_close()
# Script entry point: launch the proxy only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()