Simple web crawler
import urllib.request
# This is just a very simple web crawler; it cannot actually do
# everything a real web crawler does :)
# Get the next link on the page; here 'page' is the HTML source of the
# page, passed in as a string.
def get_next_link(page):
    start_pos = page.find("<a href=")
    if start_pos == -1:
        return None, 0
    start_pos = page.find('"', start_pos)
    end_pos = page.find('"', start_pos + 1)
    url = page[start_pos + 1:end_pos]
    return url, end_pos
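
# A quick usage sketch (not part of the original crawler): the HTML string is
# made up purely to show what get_next_link returns.
print(get_next_link('junk <a href="http://example.com">hi</a>'))
# -> ('http://example.com', 32), where 32 is the index of the closing quote
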
# Get all the links on one page.
def get_all_link(page):
    crawl = []
    while True:
        url, end_pos = get_next_link(page)
        if url is not None:
            crawl.append(url)
            page = page[end_pos:]   # the NEW page starts at the closing quote of the last link
        else:
            break
    return crawl
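
# A quick usage sketch (not part of the original crawler): both anchors in this
# made-up string are picked up, including the relative "/b" link.
print(get_all_link('x <a href="http://a.com">A</a> y <a href="/b">B</a>'))
# -> ['http://a.com', '/b']
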
# Union two lists: merge every element of b into a, in place, skipping duplicates.
def union(a, b):
    for element in b:
        if element not in a:
            a.append(element)
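
# A quick usage sketch (not part of the original crawler): union() modifies the
# first list in place and returns nothing, so the result is read from `a`.
a = ['http://a.com']
union(a, ['http://a.com', '/b'])
print(a)   # -> ['http://a.com', '/b']
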
# Crawl from a seed URL and visit every page that is directly or indirectly
# linked from the seed page.
def crawl(seed):
    tocrawl = [seed]   # tocrawl holds the pages that are yet to be crawled
    crawled = []       # pages that have already been crawled
    last_url = ''
    while len(tocrawl) > 0:
        page = tocrawl.pop()
        if page not in crawled:
            url = str(page)
            if url.startswith('/'):
                url = last_url + url   # turn a relative path into an absolute one
            if url.find('http') != -1:
                last_url = page
            try:
                file = urllib.request.urlopen(url)
                content = file.read().decode('utf-8', errors='ignore')
                file.close()
            except Exception:
                content = ''
            union(tocrawl, get_all_link(content))
            crawled.append(page)
            print(url)
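
# Note: tocrawl.pop() takes pages from the end of the list, so the crawl runs
# roughly depth-first, and nothing limits how many pages it will fetch; a big
# seed like the one below can keep going for a very long time. (One quick way
# to keep a test run short would be to break out of the loop once len(crawled)
# reaches a small cap.)
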
crawl('http://www.google.com')