
Elixir Crawling With Pools For Fun and Profit

PoolCrawler

This is a simple example in response to https://www.reddit.com/r/elixir/comments/4gcnmi/struggling_with_getting_this_worker_pool_and/ as well as a reminder for myself.

I used https://github.com/midas/conqueuer to do something similar to what the original poster was trying to accomplish.

Some code has been removed to keep it simple to follow, so just dropping this into a project will not compile. Be sure to look at the comments.

I combined and/or renamed files in this gist to group things together as well.

Also note that you can get the current queue sizes from Conqueuer by doing the following (using the queue names from my code here):

Conqueuer.Queue.size(:CrawlersQueue)
Conqueuer.Queue.size(:ParsersQueue)

defmodule PoolCrawler do
  use Application

  def start(_type, _args) do
    import Supervisor.Spec, warn: false
    
    children = [
      supervisor(PoolCrawler.CrawlerPool.Supervisor, [[], [name: :CrawlerPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :CrawlersQueue]], id: :crawler_queue),
      worker(Conqueuer.Foreman, [[name: :crawlers], [name: :CrawlersForeman]], id: :crawler_foreman),
      supervisor(PoolCrawler.ParserPool.Supervisor, [[], [name: :ParserPoolSupervisor]]),
      worker(Conqueuer.Queue, [[], [name: :ParsersQueue]], id: :parser_queue),
      worker(Conqueuer.Foreman, [[name: :parsers], [name: :ParsersForeman]], id: :parser_foreman)
    ]
    
    opts = [strategy: :one_for_one, name: PoolCrawler.Supervisor]
    Supervisor.start_link(children, opts)    
  end


  def crawl(url) do
    # likely want to validate the url here (a sketch of one option follows this module)
    Conqueuer.work(:crawlers, url)
  end

end
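
The comment in crawl/1 above suggests validating the URL before it hits the pool. Here is a minimal sketch of what that could look like inside the PoolCrawler module - the valid_url?/1 helper is hypothetical (not part of the original code) and just leans on URI.parse/1 from the standard library:

  # hypothetical helper - coarse check that only accepts absolute http(s) URLs
  defp valid_url?(url) when is_binary(url) do
    case URI.parse(url) do
      %URI{scheme: scheme, host: host} when scheme in ["http", "https"] and is_binary(host) -> true
      _ -> false
    end
  end
  defp valid_url?(_), do: false

With that in place, crawl/1 could become something like `if valid_url?(url), do: Conqueuer.work(:crawlers, url)`.
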
defmodule PoolCrawler.CrawlerPool.Supervisor do
  use Conqueuer.Pool, name: :crawlers,
                      worker: PoolCrawler.CrawlerPool.Worker,
                      size: 4,
                      max_overflow: 1
end

defmodule PoolCrawler.ParserPool.Supervisor do
  use Conqueuer.Pool, name: :parsers,
                      worker: PoolCrawler.ParserPool.Worker,
                      size: 30,
                      max_overflow: 10
end

defmodule PoolCrawler.CrawlerPool.Worker do
  use Conqueuer.Worker
  
  @default_user_agent "Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/37.0.2062.120 Safari/537.36"
  @default_width 1720
  @default_height 1340

  def perform({url, width, height, user_agent}, _state) do
    IO.puts "CrawlerWorker.peform"
    IO.puts "url: #{url}"
    IO.puts "width: #{width}"
    IO.puts "height: #{height}"
    IO.puts "user_agent: #{user_agent}"

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  def perform(url, _state) do
    IO.puts "CrawlerWorker.perform"
    IO.puts "url: #{url}"

    user_agent = @default_user_agent
    width = @default_width
    height = @default_height

    crawl({url, width, height, user_agent}) |> handle_html(url)
  end

  defp crawl({url, width, height, user_agent}) do
    # probably want to keep track of counts or some other metrics here

    # call your module that actually does the crawl. I've used PhantomJS via Elixir/Erlang 'Ports' with success.
    # I found some good code to get that working by looking at the Elixir WebDriver implementation. I didn't use
    # that directly, but was able to put together a decent PhantomJS GenServer/Port that does what I need.
    #
    # example call (a rough sketch of such a server follows this module):
    # `{:ok, pid} = SuperDuperPhantomServer.start_link(width, height, user_agent)`
    # `html = SuperDuperPhantomServer.crawl_page(pid, url)`
  end
  
  # if we get no html back, probably want to keep track of that
  defp handle_html(nil, _url), do: nil
  defp handle_html(html, _) do
    # send HTML results to parsing queue
    Conqueuer.work(:parsers, html)
  end

  defp already_crawled?(_url) do
    # if you want to avoid hitting the same url - store previously crawled
    # links someplace. Maybe a GenServer/Agent key/value store? (a sketch follows this module)
    false
  end

end
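
The crawl/1 helper above glosses over the actual page fetch. As a point of reference, here is a rough sketch of the kind of PhantomJS Port/GenServer wrapper described in the comments. It is an assumption-heavy illustration, not the original implementation: the render.js script (which would need to print the rendered HTML to stdout and exit), the timeouts, and the exact module name are all made up for the example.

defmodule SuperDuperPhantomServer do
  use GenServer

  # hypothetical sketch - assumes `phantomjs` is on the PATH and a `render.js`
  # script exists that accepts url/width/height/user_agent arguments, prints
  # the rendered HTML to stdout, and then exits
  def start_link(width, height, user_agent) do
    GenServer.start_link(__MODULE__, {width, height, user_agent})
  end

  def crawl_page(pid, url) do
    GenServer.call(pid, {:crawl, url}, 60_000)
  end

  def init(state), do: {:ok, state}

  def handle_call({:crawl, url}, _from, {width, height, user_agent} = state) do
    port =
      Port.open(
        {:spawn_executable, System.find_executable("phantomjs")},
        [:binary, :exit_status,
         args: ["render.js", url, to_string(width), to_string(height), user_agent]]
      )

    {:reply, collect_html(port, ""), state}
  end

  # read output from the port until the external process exits
  defp collect_html(port, acc) do
    receive do
      {^port, {:data, data}} -> collect_html(port, acc <> data)
      {^port, {:exit_status, 0}} -> acc
      {^port, {:exit_status, _}} -> nil
    after
      60_000 -> nil
    end
  end
end

Returning nil on failure fits handle_html/2 above, which treats nil as "no html came back".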
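
Similarly, already_crawled?/1 above hints at tracking visited URLs. A tiny Agent-backed store would be enough for a single-node crawler - again, a hypothetical sketch rather than anything from the original gist:

defmodule PoolCrawler.VisitedUrls do
  # hypothetical: a MapSet of URLs held in an Agent registered under the module name
  def start_link do
    Agent.start_link(fn -> MapSet.new() end, name: __MODULE__)
  end

  def seen?(url), do: Agent.get(__MODULE__, &MapSet.member?(&1, url))

  def mark(url), do: Agent.update(__MODULE__, &MapSet.put(&1, url))
end

already_crawled?/1 could then call PoolCrawler.VisitedUrls.seen?(url), with mark/1 called after each fetch; the Agent would also need a spot in the supervision tree.
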
defmodule PoolCrawler.ParserPool.Worker do
  use Conqueuer.Worker

  def perform(html, _state) do
    # call your parsing module. In my case this was a GenServer that used Floki for the parsing logic.
    #
    # I then piped the results into process_links/1,
    # like: `MagicParserModule.parse_html(html)[:links] |> process_links`
    # (a rough Floki-based sketch follows this module)
  end

  defp process_links([]), do: nil
  defp process_links([%{url: nil} | rest]), do: process_links(rest)
  defp process_links([link = %{url: url} | rest]) do
    # validate the link before we bother doing anything else
    case validate_url(link) do
      true -> queue_link(link)
      false -> IO.puts "Url issues: `#{url}`"
    end

    # move on to the rest of the links
    process_links(rest)
  end

  defp validate_url(%{url: url}) do
    # makes sense to validate the url somehow based on your needs -
    # maybe strip anchors or anything else unique? this coarse check is just a placeholder
    is_binary(url) and String.starts_with?(url, "http")
  end
  
  # let's push this url into the crawl pool, starting a new cycle
  defp queue_link(%{no_follow: false, url: url, text: _}) do
    # push to the :crawlers pool
    Conqueuer.work(:crawlers, url)
  end
  
  # when parsing the HTML, I was adhering to 'no follow' attributes - so skip this link
  defp queue_link(%{no_follow: true}) do
    # maybe keep track of how many of these nofollow links we see in a stats module?
  end
end
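
For completeness, here is a rough idea of the Floki-based parsing mentioned in perform/2 above. The module name MagicParserModule comes from the comment, but the body is a hypothetical sketch: it just pulls anchor tags out of the HTML and shapes them into the maps that process_links/1 expects.

defmodule MagicParserModule do
  # hypothetical sketch: extract links with Floki and return them under :links
  def parse_html(html) do
    links =
      html
      |> Floki.find("a")
      |> Enum.map(fn {"a", attrs, _children} = anchor ->
        attrs = Enum.into(attrs, %{})

        %{
          url: Map.get(attrs, "href"),
          text: Floki.text([anchor]),
          no_follow: Map.get(attrs, "rel") == "nofollow"
        }
      end)

    %{links: links}
  end
end

With that shape, perform/2 becomes roughly `MagicParserModule.parse_html(html)[:links] |> process_links()`.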