flyte
12/16/2013 - 1:22 AM

Recursively crawls a website with mechanize and collects links to every page it can reach. An optional domain limit skips links outside the given domain, and the collected URLs are written to a JSON file named after the site's hostname.

import mechanize
import urllib2
import urlparse
import argparse
import json

p = argparse.ArgumentParser(description="Crawl a site and collect links to every page found.")
p.add_argument("-s", "--site", required=True, help="starting URL, e.g. http://example.com")
p.add_argument("-d", "--domain_limit", help="only follow links within this domain, e.g. example.com")

def get_all_links(br, links, visited=None, recursion=0, domain_limit=None):
	# Avoid the mutable-default-argument pitfall: a default set() would be
	# shared between separate top-level calls.
	if visited is None:
		visited = set()
	if recursion:
		print "***** RECURSION %d *****" % recursion
	new_links = set()
	for link in links:
		if link not in visited:
			if domain_limit:
				# Compare only the registered domain (the last two labels of the
				# hostname), so subdomains of domain_limit still pass the filter.
				link_parsed = urlparse.urlparse(link)
				dom = ".".join(link_parsed.netloc.split(".")[-2:])
				if dom != domain_limit:
					print "Skipping %s because it's not in the %s domain" % (link, domain_limit)
					continue
			print "Getting page: %s" % link
			visited.add(link)
			try:
				br.open(link)
				if not br.viewing_html():
					# Skip non-HTML responses (images, PDFs, etc.).
					continue
			except urllib2.HTTPError, e:
				if e.getcode() == 403:
					print "Skipping %s because it's in robots.txt" % link
				else:
					print "Skipping %s (HTTP %d)" % (link, e.getcode())
				continue
			except urllib2.URLError, e:
				print "URLError: %s" % e
				continue
			for l in br.links():
				if l.absolute_url not in links and l.absolute_url not in new_links:
					new_links.add(l.absolute_url)
	if new_links:
		# Recurse on the newly discovered links, treating everything seen so
		# far as already visited, and merge the results.
		recursion += 1
		links = links.union(get_all_links(br, new_links, links.union(visited), recursion, domain_limit))
	return links

if __name__ == "__main__":
	args = p.parse_args()
	br = mechanize.Browser()  # obeys robots.txt by default; disallowed URLs raise HTTP 403
	links = set()
	try:
		links = get_all_links(br, set([args.site]), domain_limit=args.domain_limit)
	except Exception, e:
		print e
	if links:
		print "Found %d links!" % len(links)
		# Save the collected URLs (sorted for stable output) to <hostname>.json.
		url = urlparse.urlparse(args.site)
		with open("%s.json" % url.netloc, "w") as f:
			json.dump(sorted(links), f, indent=2)
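
For reference, here is a minimal usage sketch. It assumes the snippet above is saved as crawl.py and uses example.com as a placeholder site; both names are illustrative and not part of the original post.

# Run from the command line (hypothetical filename crawl.py):
#   python crawl.py -s http://example.com -d example.com
# This crawls http://example.com, skips links outside the example.com
# domain, and writes the collected URLs to example.com.json.

# The same crawl can also be driven from another Python 2 script:
import mechanize
from crawl import get_all_links  # assumes the code above lives in crawl.py

br = mechanize.Browser()  # obeys robots.txt by default
links = get_all_links(br, set(["http://example.com/"]), domain_limit="example.com")
print "Found %d links" % len(links)

Because the mechanize.Browser instance is passed in, any headers, cookie handling, or robots.txt behaviour configured on it applies to every request in the crawl.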