goyoghurt
3/14/2018 - 9:23 AM

iOS App Store Crawler

iOS App Store Crawler

#!/usr/bin/env ruby
# iOS App Store Crawler by Nowa <nowazhu@gmail.com>
# 2010-08-04

require 'rubygems'
require 'hpricot'
require 'open-uri'

# User-Agent header sent with every request (a desktop Chrome identity).
USERAGENT = 'Mozilla/5.0 (Macintosh; U; Intel Mac OS X 10_6_4; en-US) AppleWebKit/533.4 (KHTML, like Gecko) Chrome/5.0.375.99 Safari/533.4'.freeze
# Values for the listing's "letter" query parameter: A-Z plus "*"
# (presumably the bucket for titles that do not start with a letter - confirm).
LETTERS = %w[A B C D E F G H I J K L M N O P Q R S T U V W X Y Z *].freeze

# Root category title => category listing URL, filled from the genre index below.
cates = {}

# Download and parse the genre index page. The request goes through URI#open
# (mixed in by open-uri) rather than Kernel#open, whose URL handling was
# deprecated in Ruby 2.7 and removed in 3.0. The block form closes the
# response once Hpricot has consumed it.
index_uri = URI.parse("http://itunes.apple.com/us/genre/mobile-software-applications/id36")
doc = index_uri.open('User-Agent' => USERAGENT) do |f|
  Hpricot(f)
end

# Print the page title when there is one; a page without <title> no longer
# raises NoMethodError on nil.
page_title = doc.search("//title")[0]
puts page_title.innerText if page_title

# get root categories (the first link seen for a given title wins)
doc.search("//a[@class='top-level-genre']").each do |root_cate|
  cates[root_cate.innerText] ||= root_cate.attributes["href"]
end

puts "Got #{cates.size} categories."

# Fetches +url+ with the crawler's User-Agent and returns the parsed Hpricot
# document.
#
# The URL is opened through URI#open (mixed in by open-uri) instead of
# Kernel#open: listing and detail URLs come from scraped markup, and
# Kernel#open would run a string that starts with "|" as a shell command.
# The block form also closes the response once Hpricot has consumed it.
def fetch_doc(url)
  URI.parse(url).open('User-Agent' => USERAGENT) { |io| Hpricot(io) }
end

# Logs and downloads one page of a category's per-letter listing and returns
# the app links found on it. The caller stops paging when the result is empty.
def fetch_app_links(cate_title, cate_url, letter, page)
  the_url = "#{cate_url}&letter=#{letter}&page=#{page}"
  puts "\tFetching Letter: #{letter} of <#{cate_title}>, Page: #{page}, URL: #{the_url}"
  fetch_doc(the_url).search("//div[@id='selectedcontent']//li/a")
end

# Returns the text of +node+ with every string in +labels+ removed, or "" when
# +node+ is nil (the list on the detail page had fewer items than expected).
def field_text(node, *labels)
  return "" unless node

  labels.inject(node.innerText) { |text, label| text.gsub(label, "") }
end

# Downloads the detail page behind the listing link +app+ and returns a hash
# describing the app. Fields that cannot be found on the page keep the empty
# defaults set below.
def scrape_app(app)
  app_detail = {
    :title => app.innerText,
    :detail_url => app.attributes['href'],
    :developer => {},
    :desc => "",
    :links => {},
    :iphone_screenshots => [],
    :ipad_screenshots => [],
    :icon_175 => "",
    :is_free => nil,
    :price => "",
    :updated_date => "",
    :version => "",
    :version_desc => "",
    :pkg_size => "",
    :languages => "",
    :seller => "",
    :requirements => {
      :devices => [],
      :base_ios => ""
    }
  }
  app_doc = fetch_doc(app_detail[:detail_url])

  # title & developer
  title_box = app_doc.search("//div[@id='title']")
  title_box.search("/h1").each { |h1| app_detail[:title] = h1.innerText }
  title_box.search("/h2").each { |h2| app_detail[:developer][:display_name] = h2.innerText }
  # The developer nick is the second-to-last path segment of the developer link.
  title_box.search("/a").each { |a| app_detail[:developer][:nick] = a.attributes["href"].split("/")[-2] }

  # center-stack
  center = app_doc.search("//div[@class='center-stack']")

  # app desc
  center.search("/div[@metrics-loc='Titledbox_Description']/p").each do |para|
    app_detail[:desc] = para.innerHTML
  end

  # links: anything not labelled "Web Site" is treated as the support link
  center.search("/div[@class='app-links']/a").each do |a|
    symbol = a.innerText.include?("Web Site") ? :website : :support
    app_detail[:links][symbol] = a.attributes["href"]
  end

  # iPhone screenshots
  center.search("//div[@class='content iphone-screen-shots']//img").each do |img|
    app_detail[:iphone_screenshots].push(img.attributes['src'])
  end

  # iPad screenshots
  center.search("//div[@class='content ipad-screen-shots']//img").each do |img|
    app_detail[:ipad_screenshots].push(img.attributes['src'])
  end

  # left-stack
  left = app_doc.search("//div[@id='left-stack']")

  # icon 175x175
  left.search("//img[@class='artwork']").each { |img| app_detail[:icon_175] = img.attributes['src'] }

  # Metadata list. Items are read by position; index 1 is not used.
  list = left.search("//ul[@class='list']/li")
  unless list.empty?
    # is_free & price
    price_text = list[0].innerText
    app_detail[:is_free] = price_text.include?("Free")
    app_detail[:price] = price_text.gsub('$', '') unless app_detail[:is_free]

    # field_text tolerates a short list instead of raising on a nil item.
    app_detail[:updated_date] = field_text(list[2], "\nUpdated:", "\nReleased:")
    app_detail[:version] = field_text(list[3], "\nVersion:", "\nCurrent Version:")
    app_detail[:version_desc] = field_text(list[4])
    app_detail[:pkg_size] = field_text(list[5])
    app_detail[:languages] = field_text(list[6], "\nLanguage:", "\nLanguages:")
    app_detail[:seller] = field_text(list[7], "\nSeller:")
  end

  # requirements: "<devices>. Requires <os> or later."
  left.search("/div[@class='lockup product application']/p").each do |para|
    devices, base_ios = para.innerText.gsub("\nRequirements:", "").split('. Requires ')
    # to_s keeps a paragraph without the ". Requires " separator from raising.
    app_detail[:requirements][:devices] = devices.to_s.gsub('Compatible with ', '').gsub('and ', '')
    app_detail[:requirements][:base_ios] = base_ios.to_s.gsub(' or later.', '').gsub(/(iPhone OS|iOS)/i, '').strip
  end

  app_detail
end

# Walk every root category, every letter and every listing page, scraping and
# printing each app found. Paging for a letter ends at the first empty page.
cates.each do |cate_title, cate_url|
  puts "Get in <#{cate_title}>"

  LETTERS.each do |letter|
    page = 1
    apps = fetch_app_links(cate_title, cate_url, letter, page)

    until apps.empty?
      apps.each do |app|
        puts "\t\tGot App <#{app.innerText}>, URL: #{app.attributes['href']}"

        app_detail = scrape_app(app)

        puts "\t\t\tTitle: #{app_detail[:title]}"
        puts "\t\t\tUpdated: #{app_detail[:updated_date]}"
        puts "\t\t\tVersion: #{app_detail[:version]}"
        puts "\t\t\tSize: #{app_detail[:pkg_size]}"
        puts "\t\t\tLanguages: #{app_detail[:languages]}"
        puts "\t\t\tSeller: #{app_detail[:seller]}"
        puts ""
      end

      page += 1

      puts ""
      apps = fetch_app_links(cate_title, cate_url, letter, page)
      puts ""
    end

    puts ""
  end
end