# donfanning
# 8/15/2018 - 12:11 PM
#
# Hacky crawler using Mechanize

#!/usr/bin/env ruby

require 'uri'
require 'nokogiri'
require 'mechanize'
require 'logger'

trap('INT') { @crawler.report; exit }

# Recursively crawls a site starting from a seed URL, de-duplicating pages
# by a simplified address (scheme + host + path) and tallying failures by
# exception class. Progress is printed to stdout; Ctrl-C friendly via #report.
class Crawler
  attr_reader :url, :failures, :pages_crawled

  # url - the seed URL (String) to start crawling from.
  def initialize(url)
    @url = url
    # Block form gives each key its OWN array. Hash.new([]) shares a single
    # default array across keys, which only worked here by accident because
    # `+=` reassigns; `<<` against that form silently corrupts the default.
    @failures = Hash.new { |hash, key| hash[key] = [] }
  end

  # Print a summary: hit count, total error count, then each error class
  # with the URLs that triggered it.
  def report
    puts
    puts "Successful hits: #{pages_crawled.length}"
    puts "Errors: #{failures.values.sum(&:length)}"
    puts

    # Side-effect-only iteration: `each`, not `map`.
    failures.each do |error, urls|
      puts error
      puts urls.map { |url| "  #{url}" }
    end

    puts
  end

  # Entry point: start crawling from the seed URL.
  def run
    crawl_page(url)
  end

  # Lazily-initialized list of simplified addresses already visited.
  def pages_crawled
    @pages_crawled ||= []
  end

  private

  # Green "==>" progress line for a successful fetch.
  def good(message)
    puts "\e[32m==>\e[0m #{message}"
  end

  # Red "==>" progress line for a failed fetch.
  def bad(message)
    puts "\e[31m==>\e[0m #{message}"
  end

  # Memoized Mechanize agent.
  # NOTE(review): assumes a log/ directory already exists in the CWD —
  # Logger.new raises otherwise; confirm or mkdir_p before shipping.
  def agent
    @agent ||= Mechanize.new do |a|
      a.log = Logger.new('log/crawler.log')
      a.user_agent_alias = 'Mac Safari'
    end
  end

  # Fetch one page and recurse into every link on it. Each page is visited
  # at most once (keyed by simplified address). Any fetch error is recorded
  # under its exception class and crawling continues.
  def crawl_page(url)
    address = simple_address_from_url(url)
    return if pages_crawled.include?(address)
    pages_crawled.push(address)

    begin
      page = agent.get(url)
      good "GET #{url}"

      page.links.map(&:href).each do |href|
        # Mechanize links can have no href (e.g. <a name=...>); nil.sub!
        # would raise and get mis-attributed to THIS page's URL.
        next if href.nil? || href.empty?

        # Non-mutating sub: Mechanize may hand back frozen strings.
        cleaned = href.sub(/\/?$/, '')  # Remove trailing slashes
                      .sub(/#.*?$/, '') # Remove anchors
        next if cleaned.empty?          # anchor-only links ("#top")

        crawl_page(cleaned)
      end
    rescue => e
      failures[e.class.to_s] << url
      bad "GET #{url}"
    end
  end

  # Reduce a URL to scheme + userinfo/host/port/path, dropping query and
  # fragment, so near-duplicate URLs collapse to one visit.
  # URI.split -> [scheme, userinfo, host, port, registry, path, opaque,
  # query, fragment]; [1..-4] keeps userinfo..path.
  def simple_address_from_url(url)
    uri_parts = URI.split(url)
    "#{uri_parts[0]}://#{uri_parts[1..-4].join}"
  end
end

# Entry point: crawl the URL given on the command line, then print a summary.
url = ARGV[0]
# Fail fast with a usage message instead of an obscure URI/Mechanize
# exception deep inside the crawl.
abort "Usage: #{$PROGRAM_NAME} URL" unless url

@crawler = Crawler.new(url)

@crawler.run
@crawler.report