jcxia43
4/26/2012 - 11:34 AM

Simple web crawler

import urllib

#this is just a very simple web crawler; it cannot actually do
#what a real web crawler does :)

#get the next link on the page; here page is the HTML source
#as a string (only double-quoted hrefs are handled)
def get_next_link(page):
    start_pos = page.find("<a href=")
    if start_pos == -1:
        return None, 0
    start_pos = page.find('"', start_pos)
    if start_pos == -1: #an <a href= with no quote after it
        return None, 0
    end_pos = page.find('"', start_pos + 1)
    url = page[start_pos + 1:end_pos]
    return url, end_pos
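
#to illustrate the return value (this sample HTML is made up):
#  get_next_link('<p>Here is <a href="http://example.com">a link</a>.</p>')
#  returns ('http://example.com', 38): the URL plus the index of its
#  closing quote, so a caller knows where to resume scanning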

#get all links on one page
def get_all_link(page):
    links = []
    while True:
        url, end_pos = get_next_link(page)
        if url is not None:
            links.append(url)
            page = page[end_pos:] #keep scanning from where the last link ended
        else:
            break
    return links
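
#for example (again a made-up snippet):
#  get_all_link('x <a href="http://a.com">a</a> y <a href="/b">b</a>')
#  returns ['http://a.com', '/b']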

#union two lists: append to a every element of b that a lacks
#(a is modified in place; nothing is returned)
def union(a, b):
    for element in b:
        if element not in a:
            a.append(element)
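
#for example:
#  a = [1, 2]
#  union(a, [2, 3])
#  a is now [1, 2, 3]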

#crawl from a seed URL, collecting every link that is directly or
#indirectly reachable from the seed page
def crawl(seed):
    tocrawl = [seed] #pages not yet crawled
    crawled = [] #pages already crawled
    last_url = ''
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page not in crawled:
            url = str(page)
            if url.startswith('/'):
                url = last_url + url #crudely turn a relative path into an absolute one
            if url.find('http') != -1:
                last_url = url
                try:
                    file = urllib.urlopen(url)
                    content = file.read()
                    file.close()
                except: #skip pages that fail to load
                    content = ''
                union(tocrawl, get_all_link(content))
                crawled.append(page)
                print url
                

crawl('http://www.google.com')
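
The scanner above only understands double-quoted hrefs, so it will miss or mangle links in messier HTML. A more robust extractor (just a sketch; LinkParser and get_all_link_parsed are names I made up, not part of the crawler above) can be built on the HTMLParser module from the Python 2 standard library:

from HTMLParser import HTMLParser

class LinkParser(HTMLParser):
    #collect the href attribute of every <a> tag, whatever the quoting style
    def __init__(self):
        HTMLParser.__init__(self)
        self.links = []
    def handle_starttag(self, tag, attrs):
        if tag == 'a':
            for name, value in attrs:
                if name == 'href':
                    self.links.append(value)

#drop-in replacement for get_all_link, assuming well-formed-enough HTML
def get_all_link_parsed(page):
    parser = LinkParser()
    parser.feed(page)
    return parser.links

Note that feed can still raise HTMLParseError on badly broken markup, so a real crawler would wrap this in a try/except as well.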