# Author: timbroder
# Posted: 2/6/2013 - 10:12 PM
#
# images_for_seth.py

from bs4 import BeautifulSoup
import urllib2, urllib
import sys
import imghdr
import os

# Download the original-size version of every image listed under
# sethwhitton's "popular / all time" pages on imgspark.com.
#
# For each listing page: follow every thumbnail to its preview page, find the
# "original size" link there, download it into the current directory, and give
# the file an extension based on its detected image type.  Pagination follows
# the "next-page-link" anchor until a page no longer has one.
root = "http://www.imgspark.com"
base = "%s/image/popular/sethwhitton/alltime/" % root

page_url = base
while page_url:
    page = urllib2.urlopen(page_url).read()
    soup = BeautifulSoup(page)
    print("pulling from %s" % page_url)

    for thumb in soup.findAll('img', {"class": "spark_image"}):
        # The thumbnail <img> is wrapped in the link to its preview page.
        preview_url = "%s%s" % (root, thumb.parent['href'])
        preview_page = urllib2.urlopen(preview_url).read()
        preview_soup = BeautifulSoup(preview_page)

        size_block = preview_soup.find('p', {"class": "original_size"})
        size_link = size_block.find('a') if size_block is not None else None
        if size_link is None:
            # Some previews may not expose an original; skip instead of
            # aborting the whole run.
            print("skipping %s (no original-size link)" % preview_url)
            continue

        orig_size_url = size_link['href']
        # The second-to-last path segment is used as the local file name.
        # NOTE(review): presumably this segment is the image id -- confirm
        # against the site's URL layout.
        image_name = orig_size_url.split('/')[-2]
        urllib.urlretrieve("%s%s" % (root, orig_size_url), image_name)

        # imghdr.what() opens the path itself in binary mode and closes it,
        # so no file handle has to be managed here.
        file_type = imghdr.what(image_name)
        if file_type is None:
            # Unrecognised format: keep the bare name rather than creating
            # a misleading "<name>.None" file.
            new_file = image_name
        else:
            new_file = "%s.%s" % (image_name, file_type)
            if os.path.isfile(new_file):
                # Already fetched on an earlier run; drop the duplicate
                # download so extensionless copies do not pile up.
                os.remove(image_name)
            else:
                os.rename(image_name, new_file)
        print(new_file)

    # Stop cleanly on the last page, where there is no "next" anchor.
    next_page = soup.find('a', {"id": "next-page-link"})
    next_href = next_page.get('href') if next_page is not None else None
    page_url = "%s%s" % (root, next_href) if next_href else None