Btibert3
11/26/2012 - 2:46 PM

Twitter Commands

Twitter Commands

#==============================================================================
# Name: Streaming Twitter using RCurl
# Date: Jan 9th, 2012
#
# saves the data in two formats:
#    - raw text file of the response
#    - adds each raw response as a new element to an R list; 
#         - each element can be processed with apply functions + leverage parallel processing
#         - plays well to hadoop-style processing of data
#
# concerns:  
#    - incoming responses may arrive faster than the machine can process them (CPU, disk speed, etc.)
#    - saving the data in multiple files on hard disk is less than ideal
#         - use couchdb or mongodb?
#
# NOTE:  This code slows down because each response requires opening an 
#        increasingly larger .Rdata file.
#
# MOSTLY PROOF OF CONCEPT - NOT REALLY PRACTICAL FOR DATA COLLECTION.
#==============================================================================
## RCurl supplies getURL(). library() stops the script immediately when the
## package is missing; require() would only warn and return FALSE, letting the
## script run on until the first getURL() call fails.
library(RCurl)
# require(rjson)


# set the directory
# NOTE: machine-specific path; every output file is written relative to it.
setwd("C:/Documents and Settings/BTIBERT/Desktop/Twitter Data")

## set the twitter account credentials
## The environment variables TWITTER_USER / TWITTER_PASSWORD take precedence so
## real credentials need not be stored in this file; the placeholder literals
## are used when the variables are not set.
USER <- Sys.getenv("TWITTER_USER", unset = "twitteracctname")
PASSWORD <- Sys.getenv("TWITTER_PASSWORD", unset = "twitterpassword")
## "user:password" string in the form passed to getURL() as `userpwd`
UPASS <- paste(USER, PASSWORD, sep = ":")

#==============================================================================
## A function that we will use to save the twitter JSON response to disk
## Creates hourly files, each contains a list object, responses are added to list
#==============================================================================
WRITE_TO_FILE <- function(x) {

     ## Persists one raw streaming response to the current working directory.
     ##
     ## inputs:  x = the raw response received from Twitter (a single string)
     ##
     ## side effects, when x is non-empty:
     ##    - x is appended to the hourly text file  twtstream_<mmddYYYYHH>.txt
     ##    - x is appended to the list `twt`, which is then saved to the hourly
     ##      file twtstream_<mmddYYYYHH>.Rdata
     ## returns: NULL (the value of the final cat() call)

     ## build the file names from the current system hour
     fname_base <- paste0("twtstream_", format(Sys.time(), "%m%d%Y%H"))
     fname_txt <- paste0(fname_base, ".txt")
     fname_r <- paste0(fname_base, ".Rdata")

     ## if this hour's R data file already exists, load it; load() restores
     ## the saved `twt` list into this function's own frame
     if (file.exists(fname_r)) {
          load(fname_r)
     }

     ## start a fresh list when nothing was loaded.
     ## inherits = FALSE limits the lookup to this call's frame; without it a
     ## `twt` list living in an enclosing (e.g. global) environment would be
     ## found here and its contents would be saved into the new hourly file
     if (!exists("twt", mode = "list", inherits = FALSE)) {
          cat("creating twitter list obect\n")
          twt <- list()
     }

     ## if the response is not empty, saves two data files
     ## 1) write to text file 2) add to list and save Rdata file
     if (nchar(x) > 0) {

          ## append the response to the text file (no header, no row names);
          ## write.table() quotes character values by default
          write.table(x, file = fname_txt, append = TRUE,
                      row.names = FALSE, col.names = FALSE)

          ## add the response as a new list element and rewrite the Rdata file
          twt <- c(twt, x)
          save(twt, file = fname_r)
     }

     ## progress marker, printed for every call (empty responses included)
     cat("saved response-----------------\n")
}


# test <- function(x) {print(x)}

## windows users will need this CA bundle so the HTTPS connection to Twitter
## can be verified. It is fetched over HTTPS: a CA bundle downloaded over plain
## HTTP could be altered in transit, which would defeat the verification.
download.file(url = "https://curl.se/ca/cacert.pem", destfile = "cacert.pem")

## write the raw JSON data from the Twitter firehose to the files chosen by
## WRITE_TO_FILE, which is passed to getURL() as the `write` callback.
## The while-loop restarts the stream whenever getURL() returns. An R error
## raised by getURL() would otherwise end the loop, so it is caught, reported,
## and followed by a short pause before reconnecting. Only errors are caught;
## a user interrupt still stops the script.
while (TRUE) {
     tryCatch(
          getURL("https://stream.twitter.com/1/statuses/sample.json",
                 userpwd = UPASS,
                 cainfo = "cacert.pem",
                 write = WRITE_TO_FILE),
          error = function(e) {
               message("stream interrupted: ", conditionMessage(e))
               ## brief pause so a persistent failure does not become a
               ## tight reconnect loop
               Sys.sleep(5)
          }
     )
}
#! /bin/bash
# Remove these explanatory comment lines before running, if desired ------
# Example 7: connect to the Twitter firehose. The best approach is the command
#      line (run in Ubuntu 11.10). Doing this within R was attempted, but the
#      different approaches consumed computing resources at an incredible rate.
# Make this file executable in a Linux-type environment with: chmod 755 <file>
# If you are already in the directory containing the file, run it with
#      ./filename.sh
#
# Each curl run is capped at one hour (-m 3600) and then restarted, so the raw
# JSON from the sample stream ends up in hourly files named by the hour in
# which the run started. The raw data can be processed in R or elsewhere.

while true; do
    outfile="twtstream_$(date +%Y%m%d%H).txt"

    # Append (>>) instead of using -o: if curl restarts within the same hour
    # (dropped connection, auth hiccup), -o would truncate the file and discard
    # everything already captured for that hour.
    curl -s -m 3600 -u twitterusername:twitterpassword https://stream.twitter.com/1/statuses/sample.json >> "$outfile"
    rc=$?

    # Exit code 28 is curl's timeout, i.e. the expected hourly rollover.
    # Any other non-zero code is a real failure: pause briefly so a persistent
    # error does not turn into a tight reconnect loop.
    if [ "$rc" -ne 0 ] && [ "$rc" -ne 28 ]; then
        sleep 5
    fi
done