# Pasted from a gist by donfanning, 8/15/2018 - 12:21 PM
# Original file: crawler.rb
#!/usr/local/bin/ruby

# crawler.rb
# by: Jason Larsen
# a generic web crawler that allows the user to do whatever they want by passing blocks
# @version 0.7
# 14 Dec 2009
# 0.6 things seem to be working well
# 0.7 modified so that URLs being added to the queue have their fragments truncated;
# this should save a lot of work

require 'rubygems'
require 'net/http'
require 'nokogiri'
require 'open-uri'
require 'fileutils'

class Crawler
    attr_accessor :verbose, :mode
    attr_reader :visited, :errors, :base_url, :time

    # url: the starting point; crawling is restricted to URLs that live
    # under this URL's directory (see #scope?).
    def initialize url
        @base_url = URI::parse url
        @base_url.normalize!
        @visited = []            # URIs already fetched
        @errors  = Hash.new(0)   # error message => occurrence count (old code stored the first hit as 0)
        @queue   = [@base_url]   # URIs waiting to be fetched
        @pages   = 0             # pages successfully cached
    end

    # Breadth-first crawl starting from base_url. Each page is parsed,
    # its in-scope links are rewritten to point at the local mirror, and
    # the page is saved under "<host>/<path>" on disk. Sets @time to the
    # elapsed crawl duration in seconds.
    def crawl
        start_time = Time::now
        until @queue.empty?
            url = @queue.shift
            puts "Crawling #{url}" if verbose

            begin
                # open-uri extends URI::HTTP with #open; Kernel#open on a URL
                # string is deprecated (removed in Ruby 3).
                src = Nokogiri::HTML::parse(url.open)
            rescue StandardError => e # was `rescue Exception`, which also swallows signals/exit
                puts e if verbose
                @errors[e.to_s] += 1
                next # try the next url
            end

            src.search('a').each do |link|
                begin
                    href = URI::parse link['href']
                    href.normalize!
                    href = url + href if href.relative?
                rescue StandardError
                    next # unparsable href -- skip this link
                end

                # skip any non-http links e.g. mailto, etc.
                next unless href.scheme == 'http'
                # add it to the queue if we need to
                enqueue href

                # rewrite in-scope links so the saved copy references the
                # local file names produced by save_page below
                if scope? href
                    if href.path.end_with? '/'
                        link['href'] += 'index.html'
                    else
                        link['href'] = File.basename(href.path).gsub(/[.].*/, '.html')
                    end
                end
            end

            just_visited url
            save_page url, src
        end
        @time = Time::now - start_time
    end

    # Human-readable summary of the crawl: page count, elapsed time and a
    # tally of every error message encountered.
    def report
        error_report = @errors.map { |error, count| "\t#{error}: #{count}\n" }.join
        "Pages cached: #{@pages}\nTime elapsed: #{@time} seconds\nErrors:\n" + error_report
    end

    private

    # Queue url for crawling if it is in scope and has not been seen.
    def enqueue url
        url.fragment = nil # by getting rid of the fragments we save a lot of work
        return unless scope?(url) && !visited?(url) && !queued?(url)
        puts "...Queuing #{url}..." if verbose
        @queue.push url
    end

    # Record url as fetched. A dup is stored defensively: the old code
    # mutated the very URI object held in @visited while computing the
    # save path, so visited? missed it and directory URLs were re-crawled
    # forever (the "weird error" the debug prints were chasing).
    def just_visited url
        @pages += 1
        @visited.push url.dup
    end

    # Write the rewritten document to "<host>/<path>", mirroring the site
    # layout on disk; directory URLs are saved as .../index.html. Works on
    # a copy of url so the caller's URI object is left untouched.
    def save_page url, src
        target = url.dup
        target.path = target.path + 'index.html' if target.path.end_with? '/'
        file_name = File.basename(target.path).gsub(/[.].*/, '') + '.html'
        dirs = File.dirname(target.host + target.path)
        # make the directories
        FileUtils.makedirs(dirs) unless File.directory?(dirs)
        path = File.join(dirs, file_name)
        # block form guarantees the handle is closed even if the write raises
        File.open(path, 'w') { |file| file.puts src.inner_html }
        puts "Saved as #{path}"
    end

    # In scope = same scheme AND same host AND path under the base URL's
    # directory. (The old version only rejected URLs where BOTH scheme and
    # host differed, letting foreign hosts with matching paths through.)
    def scope? url
        return false unless url.scheme == @base_url.scheme && url.host == @base_url.host
        url.path.start_with? @base_url.path[0..@base_url.path.rindex('/')]
    end

    def visited? url
        @visited.include? url
    end

    def queued? url
        @queue.include? url
    end
end

# Kick off a verbose crawl. The start URL may be overridden on the command
# line; with no arguments the original hard-coded catalog URL is used.
start_url = ARGV.first || 'http://saas.byu.edu/catalog/2005-2006ucat/'
crawler = Crawler.new start_url
crawler.verbose = true
crawler.crawl
puts crawler.report