
Elixir Crawling With Pools For Fun and Profit

PoolCrawler

This is a simple example in response to https://www.reddit.com/r/elixir/comments/4gcnmi/struggling_with_getting_this_worker_pool_and/ as well as a reminder for myself.

I used https://github.com/midas/conqueuer to do something similar to what the original poster was trying to accomplish.

Some code has been removed to keep it simple to follow, so just dropping this into a project will not compile. Be sure to look at the comments.

I combined and/or renamed files in this gist to group things together as well.

Also note that you can get the current queue sizes from Conqueuer by doing the following (using the queue names from my code here):

Conqueuer.Queue.size(:CrawlersQueue)
Conqueuer.Queue.size(:ParsersQueue)

defmodule PoolCrawler do
  use Application

  def start(_type, _args) do
    import Supervisor.Spec, warn: false
    
    children = [
      supervisor(PoolCrawler.CrawlerPool.Supervisor, [[], [name: :CrawlerPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :CrawlersQueue]], id: :crawler_queue),
      worker(Conqueuer.Foreman, [[name: :crawlers], [name: :CrawlersForeman]], id: :crawler_foreman),
      supervisor(PoolCrawler.ParserPool.Supervisor, [[], [name: :ParserPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :ParsersQueue]], id: :parser_queue),
      worker(Conqueuer.Foreman, [[name: :parsers], [name: :ParsersForeman]], id: :parser_foreman)
    ]
    
    opts = [strategy: :one_for_one, name: PoolCrawler.Supervisor]
    Supervisor.start_link(children, opts)    
  end


  def crawl(url) do
    # likely want to validate the url here (a sketch of one option follows this module)
    Conqueuer.work(:crawlers, url)
  end

end
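
The comment in crawl/1 above suggests validating the URL before it hits the pool. Here is a minimal sketch of what that could look like inside the PoolCrawler module - the valid_url?/1 helper is hypothetical (not part of the original code) and just leans on URI.parse/1 from the standard library:

  # hypothetical helper - coarse check that only accepts absolute http(s) URLs
  defp valid_url?(url) when is_binary(url) do
    case URI.parse(url) do
      %URI{scheme: scheme, host: host} when scheme in ["http", "https"] and is_binary(host) -> true
      _ -> false
    end
  end
  defp valid_url?(_), do: false

With that in place, crawl/1 could become something like `if valid_url?(url), do: Conqueuer.work(:crawlers, url)`.
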
defmodule PoolCrawler.CrawlerPool.Supervisor do
  use Conqueuer.Pool, name: :crawlers,
                      worker: PoolCrawler.CrawlerPool.Worker,
                      size: 4,
                      max_overflow: 1
end

defmodule PoolCrawler.ParserPool.Supervisor do
  use Conqueuer.Pool, name: :parsers,
                      worker: PoolCrawler.ParserPool.Worker,
                      size: 30,
                      max_overflow: 10
end

defmodule PoolCrawler.CrawlerPool.Worker do
  use Conqueuer.Worker
  
  @default_user_agent "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36"
  @default_width 1720
  @default_height 1340

  def perform({url, width, height, user_agent}, _state) do
    IO.puts "CrawlerWorker.peform"
    IO.puts "url: #{url}"
    IO.puts "width: #{width}"
    IO.puts "height: #{height}"
    IO.puts "user_agent: #{user_agent}"

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  def perform(url, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"

    user_agent = @default_user_agent
    width = @default_width
    height = @default_height

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  defp crawl({url, width, height, user_agent}) do
    # probably want to keep track of counts or some other metrics here

    # call your module that actually does the crawl. I've used PhantomJS via Elixir/Erlang 'Ports' with success.
    # I found some good code to get that working by looking at the Elixir WebDriver implementation. I didn't use
    # that directly, but was able to put together a decent PhantomJS GenServer/Port that does what I need.
    #
    # example call (a rough sketch of such a server follows this module):
    # `{:ok, pid} = SuperDuperPhantomServer.start_link(width, height, user_agent)`
    # `html = SuperDuperPhantomServer.crawl_page(pid, url)`
  end
  
  # if we get no html back, probably want to keep track of that
  defp handle_html(nil, _url), do: nil
  defp handle_html(html, _) do
    # send HTML results to parsing queue
    Conqueuer.work(:parsers, html)
  end

  defp already_crawled?(_url) do
    # if you want to avoid hitting the same url - store previously crawled
    # links someplace. Maybe a GenServer/Agent key/value store? (a sketch follows this module)
    false
  end

end
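
The crawl/1 helper above glosses over the actual page fetch. As a point of reference, here is a rough sketch of the kind of PhantomJS Port/GenServer wrapper described in the comments. It is an assumption-heavy illustration, not the original implementation: the render.js script (which would need to print the rendered HTML to stdout and exit), the timeouts, and the exact module name are all made up for the example.

defmodule SuperDuperPhantomServer do
  use GenServer

  # hypothetical sketch - assumes `phantomjs` is on the PATH and a `render.js`
  # script exists that accepts url/width/height/user_agent arguments, prints
  # the rendered HTML to stdout, and then exits
  def start_link(width, height, user_agent) do
    GenServer.start_link(__MODULE__, {width, height, user_agent})
  end

  def crawl_page(pid, url) do
    GenServer.call(pid, {:crawl, url}, 60_000)
  end

  def init(state), do: {:ok, state}

  def handle_call({:crawl, url}, _from, {width, height, user_agent} = state) do
    port =
      Port.open(
        {:spawn_executable, System.find_executable("phantomjs")},
        [:binary, :exit_status,
         args: ["render.js", url, to_string(width), to_string(height), user_agent]]
      )

    {:reply, collect_html(port, ""), state}
  end

  # read output from the port until the external process exits
  defp collect_html(port, acc) do
    receive do
      {^port, {:data, data}} -> collect_html(port, acc <> data)
      {^port, {:exit_status, 0}} -> acc
      {^port, {:exit_status, _}} -> nil
    after
      60_000 -> nil
    end
  end
end

Returning nil on failure fits handle_html/2 above, which treats nil as "no html came back".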
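
Similarly, already_crawled?/1 above hints at tracking visited URLs. A tiny Agent-backed store would be enough for a single-node crawler - again, a hypothetical sketch rather than anything from the original gist:

defmodule PoolCrawler.VisitedUrls do
  # hypothetical: a MapSet of URLs held in an Agent registered under the module name
  def start_link do
    Agent.start_link(fn -> MapSet.new() end, name: __MODULE__)
  end

  def seen?(url), do: Agent.get(__MODULE__, &MapSet.member?(&1, url))

  def mark(url), do: Agent.update(__MODULE__, &MapSet.put(&1, url))
end

already_crawled?/1 could then call PoolCrawler.VisitedUrls.seen?(url), with mark/1 called after each fetch; the Agent would also need a spot in the supervision tree.
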
defmodule PoolCrawler.ParserPool.Worker do
  use Conqueuer.Worker

  def perform(html, _state) do
    # call your parsing module. In my case this was a GenServer that used Floki for the parsing logic.
    #
    # I then piped the results into process_links/1,
    # like: `MagicParserModule.parse_html(html)[:links] |> process_links`
    # (a rough Floki-based sketch follows this module)
  end

  defp process_links([]), do: nil
  defp process_links([%{url: nil} | rest]), do: process_links(rest)
  defp process_links([link = %{url: url} | rest]) do
    # validate the link before we bother doing anything else
    case validate_url(link) do
      true -> queue_link(link)
      false -> IO.puts "Url issues: `#{url}`"
    end

    # move on to the rest of the links
    process_links(rest)
  end

  defp validate_url(%{url: url}) do
    # makes sense to validate the url somehow based on your needs -
    # maybe strip anchors or anything else unique? this coarse check is just a placeholder
    is_binary(url) and String.starts_with?(url, "http")
  end
  
  # let's push this url into the crawl pool, starting a new cycle
  defp queue_link(%{no_follow: false, url: url, text: _}) do
    # push to the :crawlers pool
    Conqueuer.work(:crawlers, url)
  end
  
  # when parsing the HTML, I was adhering to 'no follow' attributes - so skip this link
  defp queue_link(%{no_follow: true}) do
    # maybe keep track of how many of these nofollow links we see in a stats module?
  end
end
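
For completeness, here is a rough idea of the Floki-based parsing mentioned in perform/2 above. The module name MagicParserModule comes from the comment, but the body is a hypothetical sketch: it just pulls anchor tags out of the HTML and shapes them into the maps that process_links/1 expects.

defmodule MagicParserModule do
  # hypothetical sketch: extract links with Floki and return them under :links
  def parse_html(html) do
    links =
      html
      |> Floki.find("a")
      |> Enum.map(fn {"a", attrs, _children} = anchor ->
        attrs = Enum.into(attrs, %{})

        %{
          url: Map.get(attrs, "href"),
          text: Floki.text([anchor]),
          no_follow: Map.get(attrs, "rel") == "nofollow"
        }
      end)

    %{links: links}
  end
end

With that shape, perform/2 becomes roughly `MagicParserModule.parse_html(html)[:links] |> process_links()`.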