donfanning
8/15/2018 - 1:33 PM

psy_crawler.rb

require 'fileutils'
require 'json'
require 'open-uri'

require 'nokogiri'


# Scraper for the artist listings served under DOMAIN.
#
# get_artists walks the index pages ("0" and "a".."z") and stores, per index,
# one JSON file holding the {name, link} hashes found on that page.
# get_data copies the labelled fields of a single artist page into the given
# artist hash and stores that hash as JSON.
#
# All output is written below "artists/<index>/", relative to the current
# working directory.
module PsyCrawler

  DOMAIN = "http://www.psydb.net".freeze

  # Lower-cased field labels that get_data does not copy into the artist hash.
  IGNORED_DATA = %w(name active tracks remixes updated).freeze

  # Matches an artist anchor inside a serialized table cell.
  # Capture 1 is the site-relative link, capture 2 the anchor text.
  ARTIST_LINK = %r{<a href="(/artists/[0a-z]/.+\.php)">(.+)</a>}

  # Fetches every index page and writes "artists/<i>/<i>_<count>.json" for
  # each one, where <count> is the number of artists found on that page.
  #
  # @return [Array<String>] the list of index keys that were crawled
  def self.get_artists
    ['0', *('a'..'z')].each do |i|
      page = fetch_page("#{DOMAIN}/artists/#{i}/")

      cells = page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]//tr//td[3]')
      matches = cells.map { |td| ARTIST_LINK.match(td.to_s) }.compact
      artists = matches.map { |m| { name: m[2], link: DOMAIN + m[1] } }

      write_json("artists/#{i}", "#{i}_#{artists.size}", artists)
    end
  end

  # Reads the page at artist[:link], adds every labelled field that is not
  # listed in IGNORED_DATA to +artist+ (the hash is modified in place), and
  # writes the result to "artists/<index>/<name>.json".
  #
  # @param index [String] sub-directory of "artists" to write into
  # @param artist [Hash] must provide :name and :link
  # @return [Integer] number of bytes written
  def self.get_data(index, artist)
    page = fetch_page(artist[:link])

    page.xpath('//*[@id="mainDiv"]/table/tr[2]/td[2]/table[2]/tr[2]/td/table//tr').each do |tr|
      label_cell = tr.at_css("td.TBB")
      next if label_cell.nil?

      key = label_cell.content.downcase
      next if IGNORED_DATA.include?(key)

      # A label without a value cell is still recorded, with a nil value.
      value_cell = tr.at_css("td.TB")
      artist[key.to_sym] = value_cell && value_cell.content
    end

    # Whitespace becomes "_"; so does "/", which would otherwise be read as a
    # directory separator and point into a directory that does not exist.
    file_name = artist[:name].gsub(%r{[\s/]}, '_').downcase
    write_json("artists/#{index}", file_name, artist)
  end

  # Downloads +url+ and parses it as HTML. The block form of URI.open closes
  # the underlying handle once parsing is done; Kernel#open no longer accepts
  # URLs on Ruby 3.0+.
  def self.fetch_page(url)
    URI.open(url) { |io| Nokogiri::HTML(io) }
  end
  private_class_method :fetch_page

  # Serializes +data+ as JSON into "<dir_name>/<file_name>.json", creating
  # the directory (including missing parents) first.
  def self.write_json(dir_name, file_name, data)
    FileUtils.mkdir_p(dir_name)
    puts "writing: #{file_name}"
    File.write(File.join(dir_name, "#{file_name}.json"), data.to_json)
  end
  private_class_method :write_json

end

# Script entry point: runs the index crawl as soon as this file is loaded.
PsyCrawler.get_artists