iamsk
8/8/2015 - 9:10 AM

capture_online_website_for_preview.py

# -*- coding: utf-8 -*-

import os
import slugify
from selenium import webdriver

DIR = os.path.dirname(os.path.abspath(__file__))


def capture_with_proxy(url, proxy_url, username, password):
    """Save a 400x300 PNG screenshot of url, fetched through an authenticated proxy."""
    # PhantomJS expects the proxy address without a scheme prefix
    if proxy_url.startswith('http://'):
        proxy_url = proxy_url[7:]
    service_args = [
        '--proxy=%s' % proxy_url,
        '--proxy-auth=%s:%s' % (username, password),
    ]
    service_log_path = 'capture.log'
    browser = webdriver.PhantomJS(service_args=service_args,
                                  service_log_path=service_log_path)
    browser.set_window_size(400, 300)
    max_wait = 30  # seconds, applied to both page load and script execution
    browser.set_page_load_timeout(max_wait)
    browser.set_script_timeout(max_wait)
    # build a filesystem-safe filename from the url (decode bytes for slugify)
    file_path = slugify.slugify(url.decode('utf-8')) + '.png'
    file_path = os.path.join(DIR, file_path)
    try:
        browser.get(url)
        browser.save_screenshot(file_path)
    except Exception as e:
        # log and skip pages that fail to load or render in time
        print 'failed to capture %s: %s' % (url, e)
    finally:
        # quit(), unlike close(), also shuts down the PhantomJS process
        browser.quit()


if __name__ == "__main__":
    # fill in your proxy details here
    proxy_url = ''
    username = ''
    password = ''
    count = 0
    # blogs.txt lists one blog url per line
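    # for reference, it might look like this (hypothetical urls):
    #
    #   http://blog.example.com/
    #   http://words.example.org/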
    with open(os.path.join(DIR, 'blogs.txt')) as f:
        blogs = f.readlines()
        for blog in blogs[99:]:  # resume from the 100th url
            blog = blog.strip()  # drop the trailing newline from readlines()
            count += 1
            print count, blog
            capture_with_proxy(blog, proxy_url, username, password)
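
Usage note: with the selenium and slugify packages installed and a phantomjs binary on your PATH (webdriver.PhantomJS also accepts an executable_path argument if it lives elsewhere), run:

    python capture_online_website_for_preview.py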