# Author: gokhanerdogan
# Date: 11/27/2016 - 10:49 PM
#
# File: fb_scrape.rb

# gem install activesupport crack rest-client fastercsv axlsx dbi dbd-pg
require 'rubygems'
require 'active_support'
require 'active_support/core_ext'
require 'crack'
require 'rest-client'
require 'fastercsv'
require 'json'
require 'csv'
require 'axlsx'
require 'dbi'

# PostgreSQL connection settings used by GroupScraper#todb.
# Each value can be overridden through an environment variable so that
# credentials do not have to be edited into the source; the defaults are the
# values this script has always used.
DB_PASSWORD = ENV.fetch('FB_SCRAPE_DB_PASSWORD', 'admin')
DB_USERNAME = ENV.fetch('FB_SCRAPE_DB_USERNAME', 'postgres')
DB_HOST     = ENV.fetch('FB_SCRAPE_DB_HOST', 'localhost')
DB_DB       = ENV.fetch('FB_SCRAPE_DB_NAME', 'postgres')
DB_PORT     = ENV.fetch('FB_SCRAPE_DB_PORT', '5432')

# Logger is used below; require it explicitly instead of relying on another
# library having loaded it.
require 'logger'

# Global logger writing to FB_LOG.txt in the current directory.
# The path (not an already-opened IO) is handed to Logger: rotation such as
# 'monthly' is only performed when Logger owns the file. Logger opens the file
# in append mode with sync enabled, creating it when missing.
$logger = Logger.new('FB_LOG.txt', 'monthly')
$logger.datetime_format = '%Y-%m-%d %H:%M:%S '
$logger.level = Logger::DEBUG


# coding: UTF-8
# To use, get an access token here, by clicking "get access token" 
# and checking user.groups in the dialog box
# https://developers.facebook.com/tools/explorer?method=GET&path=209024949216061%2Ffeed
#
# Run `ruby fb_scrape.rb ACCESS_TOKEN GROUP_ID`
#
# Your CSV should show up as "fb_posts_GROUP_ID.csv" in the same directory. 

# Collects the posts of a Facebook group feed through the Graph API and
# exports them to an xlsx workbook (#toxlsx) or a PostgreSQL table (#todb).
#
# Relies on names defined outside the class: the global $logger, the DB_*
# constants, RestClient, Axlsx, DBI and ActiveSupport's Object#blank?.
class GroupScraper
  # Insert statement used by #todb; one placeholder per column.
  INSERT_SQL = 'INSERT INTO wb.almedya(fb_id, fb_author, fb_author_id, link, message, name, caption, description, json_body, fb_created_time, fb_updated_time, wb_url, wb_code) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'.freeze

  # Post fields written to each worksheet row by #toxlsx, in column order.
  XLSX_FIELDS = [:fb_author, :fb_author_id, :fb_id, :fb_created_time, :message,
                 :name, :caption, :description, :link].freeze

  # access_token - Graph API access token
  # group_id     - id of the group whose feed is read
  def initialize(access_token, group_id)
    @access_token = access_token
    @group_id     = group_id
    @url          = "https://graph.facebook.com/#{@group_id}/feed?access_token=#{@access_token}"
    @data         = []   # one hash per collected post
    @rawdata      = {}   # post id => post serialized as JSON
  end

  # Scrapes the whole feed starting at the group's feed URL.
  def start
    scrape(@url)
  end

  # Fetches +url+ and every following page referenced by paging/next,
  # collecting all posts into @data / @rawdata. Stops at the first page that
  # has no data or no next link. Returns nil.
  #
  # Pages are followed in a loop rather than by recursion so long feeds cannot
  # exhaust the stack.
  def scrape(url)
    next_url = url
    while next_url
      response = RestClient.get(next_url)
      # The URL carries the access token; never write the token to the log.
      $logger.info "Url: #{redact_token(next_url)} Response code: #{response.code}"
      resp = JSON.parse(response)

      posts = resp['data']
      break if posts.nil? || posts.empty?

      posts.each { |fb_post| record_post(fb_post) }

      # The last page may carry no 'paging' object at all.
      next_url = (resp['paging'] || {})['next']
    end
    nil
  end

  # Returns obj.to_s, or +str+ when obj is nil.
  def repNil(obj, str = 'UNKNOWN')
    obj.nil? ? str : obj.to_s
  end

  # Writes all collected posts to 'agroup.xlsx', in a worksheet named after
  # the group id. Missing values are written as 'UNKNOWN'.
  def toxlsx
    package = Axlsx::Package.new
    package.workbook.add_worksheet(:name => @group_id) do |sheet|
      # NOTE(review): the header lists a trailing 'url' column that the rows
      # below do not fill (the permalink computation was commented out in the
      # original) -- confirm whether the column should be dropped or filled.
      sheet.add_row %w[user uid fb_id date message name caption description link url]
      @data.each do |post|
        sheet.add_row(XLSX_FIELDS.map { |field| repNil(post[field]) })
      end
    end
    package.use_shared_strings = true
    package.serialize('agroup.xlsx')
  end

  # Inserts every collected post into wb.almedya inside a single transaction.
  # For each post the link is first submitted to the Wayback Machine (see
  # #saveToWM) and the resulting URL / status code are stored alongside it.
  #
  # Returns the number of rows inserted, or 0 when anything failed (the
  # transaction is rolled back and the error is logged, not raised).
  def todb
    dbh = nil
    sth = nil
    result = 0

    begin
      dbh = DBI.connect("dbi:Pg:#{DB_DB}:#{DB_HOST}:#{DB_PORT}", DB_USERNAME, DB_PASSWORD)
      dbh['AutoCommit'] = false
      sth = dbh.prepare(INSERT_SQL)

      inserted = 0
      @data.each do |post|
        wburl, wbcode = saveToWM(post[:link])
        post_id = post[:fb_id]
        sth.execute(
          post_id,
          post[:fb_author],
          post[:fb_author_id],
          post[:link],
          post[:message],
          post[:name],
          post[:caption],
          post[:description],
          @rawdata[post_id],
          parse_time(post[:fb_created_time]),
          parse_time(post[:fb_updated_time]),
          wburl,
          wbcode.to_i
        )
        # rows reports the last execution only, so accumulate per insert.
        inserted += sth.rows.to_i
      end

      dbh.commit
      result = inserted
    rescue StandardError => e
      $logger.error "An error occurred.Error message: #{e.message} Error message: #{e.backtrace.inspect}"
      # dbh is still nil when the connection itself failed.
      dbh.rollback if dbh
    ensure
      sth.finish if sth
      dbh.disconnect if dbh
    end
    result
  end

  # Asks the Wayback Machine at +baseUrl+ to archive +url+.
  #
  # Returns [archive_url, code]:
  #   * url blank          -> [nil, '-1'], no request is made
  #   * HTTP 200           -> [baseUrl + Content-Location header, '200']
  #   * other HTTP status  -> [nil, that status code as a String]
  #   * any error (including a 200 without Content-Location) -> [nil, '-1']
  def saveToWM(url, baseUrl = 'http://web.archive.org')
    # The archive URL stays nil unless the save succeeds.
    wburl = nil
    code = ''

    if url.blank?
      $logger.info 'url is blank!'
      return wburl, '-1'
    end

    begin
      uri = URI.parse(baseUrl + '/save/' + url)
      http = Net::HTTP.new(uri.host, uri.port)
      http.use_ssl = true if uri.scheme == 'https'
      response = http.request(Net::HTTP::Get.new(uri.request_uri))

      code = response.code
      # Raises TypeError when the header is absent; handled below as a failure.
      wburl = baseUrl + response['content-location'] if code == '200'
    rescue StandardError => e
      $logger.error e.message
      $logger.error e.backtrace.inspect
      code = '-1'
    end
    $logger.info "Wayback machine return #{code} url: #{wburl}"
    return wburl, code
  end

  private

  # Stores the fields of one feed entry in @data and its JSON form in @rawdata.
  def record_post(fb_post)
    # 'from' can be missing from a feed entry; treat the author as unknown.
    author  = fb_post['from'] || {}
    post_id = fb_post['id']

    @data << {
      :fb_id           => post_id,
      :fb_author       => author['name'],
      :fb_author_id    => author['id'],
      :link            => fb_post['link'],
      :message         => fb_post['message'],
      :name            => fb_post['name'],
      :caption         => fb_post['caption'],
      :description     => fb_post['description'],
      :fb_created_time => fb_post['created_time'],
      :fb_updated_time => fb_post['updated_time']
    }

    # Serialize as real JSON (Hash#to_s yields Ruby inspect output, which is
    # not parseable as JSON) since this ends up in the json_body column.
    raw = JSON.generate(fb_post)
    @rawdata[post_id] = raw
    $logger.debug "post id: #{post_id} json: #{raw}"
  end

  # Returns +url+ with the value of its access_token parameter masked.
  def redact_token(url)
    url.to_s.gsub(/(access_token=)[^&]*/, '\1[REDACTED]')
  end

  # Parses a timestamp string into a DateTime; nil stays nil.
  def parse_time(value)
    value.nil? ? nil : DateTime.parse(value)
  end
end


# Access token and group id: taken from the command line
# (`ruby fb_scrape.rb ACCESS_TOKEN GROUP_ID`) when given, otherwise the
# values that were previously hard-coded here are used.
#
# SECURITY NOTE(review): the fallback token below is a credential committed to
# source. It should be revoked and removed once every caller passes the token
# on the command line.
$at  = ARGV[0] || 'EAACEdEose0cBAIn2wNsuZANyYAjtZAMi5YzPe5dGvlmpvINhcfMZBOpkvI4ZCmCEh1kPbY2ihmndbDiZBG2ZBoZBoNz97MujgBuiIDOclEx4qrzZAhHlWl79tzZC0lqWRmfS1UwZC71ptybpZByVIT1EHR5FRIGXpwY7NBQBWYMqL0CqgMBOBvjnFO7FvzVdaB3B6UZD'
$gId = ARGV[1] || '1448280725481695'

# Scrape the group feed and store the posts in the database when this file is
# run directly.
if __FILE__ == $0
  gs = GroupScraper.new($at, $gId)
  gs.start
  gs.todb
end

# gem install crack rest-client fastercsv
require 'rubygems'
require 'crack'
require 'rest_client'
require 'fastercsv'

# To use, get an access token here, by clicking "get access token" 
# and checking user.groups in the dialog box
# https://developers.facebook.com/tools/explorer?method=GET&path=209024949216061%2Ffeed
#
# Run `ruby fb_scrape.rb ACCESS_TOKEN GROUP_ID`
#
# Your CSV should show up as "fb_posts_GROUP_ID.csv" in the same directory. 

# Collects the posts and comments of a Facebook group feed through the Graph
# API and writes them to a CSV file.
#
# Relies on RestClient, Crack::JSON and FasterCSV being loaded by the file.
class GroupScraper
  # access_token - Graph API access token
  # group_id     - id of the group whose feed is read
  def initialize(access_token, group_id)
    @access_token = access_token
    @group_id     = group_id
    @url          = "https://graph.facebook.com/#{@group_id}/feed?access_token=#{@access_token}"
    @data         = []   # posts and comments, in feed order
  end

  # Scrapes the whole feed starting at the group's feed URL.
  def start
    scrape(@url)
  end

  # Fetches +url+ and every following page referenced by paging/next. Each
  # post is appended to @data, followed by its comments; every entry is also
  # printed with `p`. Stops at the first page without data or without a next
  # link. Returns nil.
  #
  # Pages are followed in a loop rather than by recursion so long feeds cannot
  # exhaust the stack.
  def scrape(url)
    next_url = url
    while next_url
      resp  = Crack::JSON.parse(RestClient.get(next_url))
      posts = resp && resp['data']
      break if posts.nil? || posts.empty?

      posts.each do |fb_post|
        post = build_entry(fb_post).merge(:fb_updated_time => fb_post['updated_time'])
        p post
        @data << post

        comments = (fb_post['comments'] || {})['data'] || []
        comments.each do |fb_comment|
          comment = build_entry(fb_comment).merge(:fb_likes => fb_comment['likes'])
          p comment
          @data << comment
        end
      end

      # The last page may carry no 'paging' object at all.
      next_url = (resp['paging'] || {})['next']
    end
    nil
  end

  # Writes every collected entry to "fb_posts_<group_id>.csv" in the current
  # directory, one row per entry after a header row.
  #
  # NOTE(review): FasterCSV targets Ruby 1.8; on later Rubies the standard
  # CSV library offers the same `open`/`<<` calls -- confirm which one this
  # script is expected to run with.
  def to_csv
    FasterCSV.open("fb_posts_#{@group_id}.csv", "w") do |csv|
      csv << %w[name fb_id date text url]
      @data.each do |post|
        csv << [
          post[:fb_author],
          post[:fb_id],
          post[:fb_created_time],
          post[:message],
          permalink_for(post[:fb_id])
        ]
      end
    end
  end

  private

  # Extracts the fields shared by posts and comments from one API item.
  def build_entry(item)
    # 'from' can be missing from an item; treat the author as unknown.
    author = item['from'] || {}
    {
      :fb_id           => item['id'],
      :fb_author       => author['name'],
      :fb_author_id    => author['id'],
      :message         => item['message'],
      :fb_created_time => item['created_time']
    }
  end

  # Builds the group permalink from an id of the form "<group>_<post>".
  # Ids without an underscore (or a nil id) leave the missing part empty.
  def permalink_for(fb_id)
    group_part, post_part = fb_id.to_s.split(/_/)
    "https://www.facebook.com/groups/#{group_part}/permalink/#{post_part}"
  end
end

# Scrape the group feed and export it to CSV when this file is run directly:
#   ruby fb_scrape.rb ACCESS_TOKEN GROUP_ID
if __FILE__ == $0
  # Both arguments are needed to build the feed URL; without them the request
  # can only fail, so stop early with a usage hint instead.
  abort "Usage: ruby #{$0} ACCESS_TOKEN GROUP_ID" if ARGV.length < 2

  access_token, group_id = ARGV
  gs = GroupScraper.new(access_token, group_id)
  gs.start
  gs.to_csv
end