## Btibert3
## 11/4/2012 - 10:49 PM
##
## Collect Twitter and Insert into MongoDB

## load the packages
## library() is used instead of require() so that a missing package stops the
## script right here rather than failing later with "could not find function"
library(XML)
library(RCurl)
library(rjson)
library(plyr)

## will create a request to the twitter search API - returns JSON
## https://dev.twitter.com/docs/api/1/get/search
## max combo can be 1500 tweets (100*15 pages)
## API limit = 150 requests / hour for requests that are not authenticated
## NOTE(review): this is the v1 search endpoint -- confirm it is still served
TAG <- "#nfl"
## reserved = TRUE is required so that "#" becomes "%23"; left unencoded, the
## "#" would start a URL fragment and the q= parameter would be sent empty
HASHTAG <- URLencode(TAG, reserved = TRUE)
BASE_URL <- "http://search.twitter.com/search.json"
RESULTS <- 10 # results per page (rpp); the API allows up to 100 per page
LANG <- "en"
TYPE <- "recent"
PAGES <- 1 # number of result pages to request

## should make this a function, but use a for loop for basic example
## Requests every page in turn and pools the tweets from all pages into
## twitter_data (a list with one element per tweet).
page_results <- vector("list", PAGES)
for (page in seq_len(PAGES)) {
  URL <- paste0(BASE_URL,
                "?q=", HASHTAG,
                "&rpp=", RESULTS,
                "&lang=", LANG,
                "&result_type=", TYPE,
                "&page=", page)
  ## a failed request is reported and skipped instead of being handed to the
  ## JSON parser
  out <- tryCatch(
    getURL(URL),
    error = function(e) {
      warning("Request for page ", page, " failed: ", conditionMessage(e),
              call. = FALSE)
      NULL
    }
  )
  if (is.null(out)) {
    next
  }
  tweets <- fromJSON(out)$results
  ## assigning NULL with [[<- would delete the slot, so only store real results
  if (!is.null(tweets)) {
    page_results[[page]] <- tweets
  }
}
## flatten the per-page lists one level: list of pages -> list of tweets
twitter_data <- as.list(unlist(page_results, recursive = FALSE))

## load the R mongo library -- assumes that Mongo is running in the terminal
# ## help(mongo)
library(rmongodb)

## great help resource for rmongodb
## http://goo.gl/B6tYz

## follow the basics -- db is the database, ns = the collection of documents
db <- "twitter"
ns <- "brock_test"
## the collection is addressed as "<db>.<ns>" in the calls below
dbns <- paste(db, ns, sep = ".")
# mongo <- mongo.create(db=db)
mongo <- mongo.create()
## stop here when the connection failed, rather than letting every later
## mongo.* call fail against a dead connection
if (!mongo.is.connected(mongo)) {
  stop("Could not connect to MongoDB -- is the server running?",
       call. = FALSE)
}

## @ interactive shell for mongo, type "show dbs" to show "twitter" was created

## create an entry from the first response in the list from twitter search
## needs to be a bson doc -- need to check that list converts ok
## guard against an empty search result, which would otherwise fail below with
## an unhelpful "subscript out of bounds"
if (length(twitter_data) == 0) {
  stop("The search returned no tweets; there is nothing to insert",
       call. = FALSE)
}
tmp <- twitter_data[[1]]
## the surrounding parentheses echo the converted document for inspection
(tmp.bson <- mongo.bson.from.list(tmp))

## Make sure we start with an empty collection

## ensure there is nothing there - TRUE = was dropped, FALSE may mean it
## didn't exist
mongo.drop(mongo, dbns)
mongo.count(mongo, dbns)

## insert the record -- only 1 twitter response -- TRUE = successful
inserted <- mongo.insert(mongo, dbns, tmp.bson)
inserted
## warn (rather than stop) on failure so the disconnect below still runs
if (!isTRUE(inserted)) {
  warning("Insert into ", dbns, " did not report success", call. = FALSE)
}

## did the record stay? -- should show 1, not 0 from above
mongo.count(mongo, dbns)

## query the record
## TODO: no query was written for this step in the example

## disconnect
mongo.disconnect(mongo)

## note the comments below -- I couldn't get RMongo to play nicely,
## but it appears to have a compact syntax.
## NB: rmongodb was produced by 10gen, the same as MongoDB
##
## another Mongo package -- need to install from github
## mongodb needs to be running
# library(devtools)
# install_github("RMongo", "tc")
# library(RMongo)
# mongo <- mongoDbConnect("test", "localhost", 27017)
# str(mongo)
# 
# ## connect to a database, even if not connected
# mongo <- mongoDbConnect("twitter", "localhost", 27017)
# dbShowCollections(mongo)