# gem install activesupport crack rest-client fastercsv axlsx dbi dbd-pg
require 'rubygems'
require 'active_support'
require 'active_support/core_ext'
require 'crack'
require 'rest-client'
require 'fastercsv'
require 'json'
require 'csv'
require 'axlsx'
require 'dbi'
# PostgreSQL connection settings. Each value may be overridden through the
# environment variable named on its line; the literal is the fallback, so the
# script behaves as before when none of the variables is set.
DB_PASSWORD = ENV.fetch('FB_SCRAPER_DB_PASSWORD', 'admin')
DB_USERNAME = ENV.fetch('FB_SCRAPER_DB_USERNAME', 'postgres')
DB_HOST = ENV.fetch('FB_SCRAPER_DB_HOST', 'localhost')
DB_DB = ENV.fetch('FB_SCRAPER_DB_NAME', 'postgres')
DB_PORT = ENV.fetch('FB_SCRAPER_DB_PORT', '5432')

# Process-wide logger writing to FB_LOG.txt in the current directory.
# The path is handed to Logger (instead of an already-open File) so that
# Logger owns the file and the 'monthly' rotation is applied; older Logger
# releases only rotate log devices they opened themselves.
$logger = Logger.new('FB_LOG.txt', 'monthly')
$logger.datetime_format = '%Y-%m-%d %H:%M:%S '
$logger.level = Logger::DEBUG
# coding: UTF-8
# To use, get an access token here, by clicking "get access token"
# and checking user.groups in the dialog box
# https://developers.facebook.com/tools/explorer?method=GET&path=209024949216061%2Ffeed
#
# Run `ruby fb_scrape.rb ACCESS_TOKEN GROUP_ID`
#
# Posts are inserted into the PostgreSQL table wb.almedya (see GroupScraper#todb).
# GroupScraper#toxlsx can write them to "agroup.xlsx" instead; no CSV file is produced.
# Collects the posts of a Facebook group feed through the Graph API and
# exports them either to an XLSX workbook (#toxlsx) or to the PostgreSQL
# table wb.almedya (#todb).
#
# Relies on the global $logger and on the DB_* constants being defined by
# the surrounding script.
class GroupScraper
  # @param access_token [String] Graph API access token
  # @param group_id [String] id of the group whose feed is read
  def initialize(access_token, group_id)
    @access_token = access_token
    @group_id = group_id
    @url = "https://graph.facebook.com/#{@group_id}/feed?access_token=#{@access_token}"
    @data = []     # one Hash per post, in the order the API returned them
    @rawdata = {}  # post id => JSON text of the complete post
  end

  # Downloads the whole feed, starting from the first page.
  def start
    scrape(@url)
  end

  # Fetches the page at +url+ and every following page announced in
  # paging.next, recording each post in @data / @rawdata. Stops at the first
  # page without posts or without a next link.
  #
  # Pages are walked in a loop rather than recursively so the stack depth
  # does not grow with the number of pages.
  #
  # @param url [String] feed URL of the first page to fetch
  # @return [nil]
  def scrape(url)
    while url
      response = RestClient.get(url)
      # The URL carries the access token; keep the secret out of the log file.
      $logger.info "Url: #{redact_token(url)} Response code: #{response.code}"
      page = JSON.parse(response)
      posts = page['data']
      break if posts.nil? || posts.empty?

      posts.each { |fb_post| record_post(fb_post) }
      # The last page may omit the paging section altogether.
      url = (page['paging'] || {})['next']
    end
    nil
  end

  # Returns obj.to_s, or +str+ when obj is nil.
  #
  # @param obj [Object, nil] value to render
  # @param str [String] placeholder used for nil
  # @return [String]
  def repNil(obj, str = 'UNKNOWN')
    obj.nil? ? str : obj.to_s
  end

  # Writes all collected posts to 'agroup.xlsx', one row per post, in a
  # worksheet named after the group id. Missing values are written as
  # 'UNKNOWN'. The trailing "url" column holds the post permalink derived
  # from the post id, so every row has as many cells as the header.
  def toxlsx
    package = Axlsx::Package.new
    package.workbook.add_worksheet(:name => @group_id) do |sheet|
      sheet.add_row %w[user uid fb_id date message name caption description link url]
      @data.each do |post|
        sheet.add_row [
          repNil(post[:fb_author]),
          repNil(post[:fb_author_id]),
          repNil(post[:fb_id]),
          repNil(post[:fb_created_time]),
          repNil(post[:message]),
          repNil(post[:name]),
          repNil(post[:caption]),
          repNil(post[:description]),
          repNil(post[:link]),
          repNil(permalink_for(post[:fb_id]))
        ]
      end
    end
    package.use_shared_strings = true
    package.serialize('agroup.xlsx')
  end

  # Inserts every collected post into wb.almedya inside one transaction.
  # For each post the link is first submitted to the Wayback Machine (see
  # #saveToWM) and the resulting URL and status code are stored with it.
  # Any StandardError is logged and the transaction is rolled back.
  #
  # @return [Integer] number of posts inserted, 0 when the transaction failed
  def todb
    insert_stmt = 'INSERT INTO wb.almedya(fb_id, fb_author, fb_author_id, link, message, name, caption, description, json_body, fb_created_time, fb_updated_time, wb_url, wb_code) VALUES (?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)'
    dbh = nil
    sth = nil
    inserted = 0
    begin
      # NOTE(review): legacy colon-separated DSN; confirm the driver honours
      # the port segment.
      dbh = DBI.connect("dbi:Pg:#{DB_DB}:#{DB_HOST}:#{DB_PORT}", DB_USERNAME, DB_PASSWORD)
      dbh['AutoCommit'] = false
      sth = dbh.prepare(insert_stmt)
      @data.each do |post|
        wburl, wbcode = saveToWM(post[:link])
        post_id = post[:fb_id]
        sth.execute(
          post_id,
          post[:fb_author],
          post[:fb_author_id],
          post[:link],
          post[:message],
          post[:name],
          post[:caption],
          post[:description],
          @rawdata[post_id],
          parse_timestamp(post[:fb_created_time]),
          parse_timestamp(post[:fb_updated_time]),
          wburl,
          wbcode.to_i
        )
        inserted += 1
      end
      dbh.commit
    rescue StandardError => e
      $logger.error "An error occurred.Error message: #{e.message} Backtrace: #{e.backtrace.inspect}"
      # dbh is still nil when the connection attempt itself failed.
      dbh.rollback if dbh
      inserted = 0
    ensure
      sth.finish if sth
      dbh.disconnect if dbh
    end
    inserted
  end

  # Asks the Wayback Machine to archive +url+ by requesting
  # "<baseUrl>/save/<url>".
  #
  # @param url [String, nil] address to archive
  # @param baseUrl [String] root of the archive service
  # @return [Array] two elements: the archive URL (nil unless the service
  #   answered 200 with a Content-Location header) and the status code as a
  #   String ('-1' for a blank url, a missing header or any request error)
  def saveToWM(url, baseUrl = 'http://web.archive.org')
    wburl = nil
    if url.blank?
      $logger.info 'url is blank!'
      return [wburl, '-1']
    end

    begin
      uri = URI.parse(baseUrl + '/save/' + url)
      http = Net::HTTP.new(uri.host, uri.port)
      # Enable TLS when the caller passes an https base URL.
      http.use_ssl = (uri.scheme == 'https')
      response = http.request(Net::HTTP::Get.new(uri.request_uri))
      code = response.code
      if code.eql?('200')
        location = response['content-location']
        if location
          wburl = baseUrl + location
        else
          # Without the header there is no archive URL to report.
          $logger.error 'Wayback machine response has no content-location header'
          code = '-1'
        end
      end
    rescue StandardError => e
      $logger.error e.message
      $logger.error e.backtrace.inspect
      wburl = nil
      code = '-1'
    end
    $logger.info "Wayback machine return #{code} url: #{wburl}"
    [wburl, code]
  end

  private

  # Stores one raw Graph API post in @data (selected fields) and @rawdata
  # (complete post serialized as JSON).
  def record_post(fb_post)
    # Some posts come without a "from" section.
    author = fb_post['from'] || {}
    @data << {
      :fb_id => fb_post['id'],
      :fb_author => author['name'],
      :fb_author_id => author['id'],
      :link => fb_post['link'],
      :message => fb_post['message'],
      :name => fb_post['name'],
      :caption => fb_post['caption'],
      :description => fb_post['description'],
      :fb_created_time => fb_post['created_time'],
      :fb_updated_time => fb_post['updated_time']
    }
    # Real JSON rather than Hash#to_s, since the value is bound to json_body.
    raw_json = JSON.generate(fb_post)
    @rawdata[fb_post['id']] = raw_json
    $logger.debug "post id: #{fb_post['id']} json: #{raw_json}"
  end

  # Replaces the value of the access_token query parameter with a placeholder.
  def redact_token(url)
    url.to_s.gsub(/(access_token=)[^&]+/, '\1[REDACTED]')
  end

  # Builds the group permalink for an id of the form "<group>_<post>";
  # returns nil when the id does not have that shape.
  def permalink_for(fb_id)
    group_part, post_part = fb_id.to_s.split('_', 2)
    return nil if group_part.nil? || group_part.empty?
    return nil if post_part.nil? || post_part.empty?

    "https://www.facebook.com/groups/#{group_part}/permalink/#{post_part}"
  end

  # Parses a timestamp string into a DateTime; nil stays nil.
  def parse_timestamp(value)
    value.nil? ? nil : DateTime.parse(value)
  end
end
# Defaults used when the script is started without command-line arguments.
# NOTE(review): an access token committed to source should be treated as
# compromised; prefer passing it on the command line.
$at = 'EAACEdEose0cBAIn2wNsuZANyYAjtZAMi5YzPe5dGvlmpvINhcfMZBOpkvI4ZCmCEh1kPbY2ihmndbDiZBG2ZBoZBoNz97MujgBuiIDOclEx4qrzZAhHlWl79tzZC0lqWRmfS1UwZC71ptybpZByVIT1EHR5FRIGXpwY7NBQBWYMqL0CqgMBOBvjnFO7FvzVdaB3B6UZD'
$gId = '1448280725481695'

# Entry point, only when the file is executed directly:
#   ruby <this file> [ACCESS_TOKEN [GROUP_ID]]
# Omitted arguments fall back to $at / $gId. The feed is downloaded and then
# written to the database.
if __FILE__ == $0
  access_token = ARGV[0] || $at
  group_id = ARGV[1] || $gId
  scraper = GroupScraper.new(access_token, group_id)
  scraper.start
  scraper.todb
end