Twitter Commands
#==============================================================================
# Name: Streaming Twitter using RCurl
# Date: Jan 9th, 2012
#
# Saves the data in two formats:
# - raw text file of the response
# - adds each raw response as a new element of an R list;
#   - each element can be processed with apply functions and can leverage
#     parallel processing (see the sketch after this header block)
#   - plays well with Hadoop-style processing of the data
#
# Concerns:
# - incoming responses may arrive faster than the machine can process them
#   (CPU, disk speed, etc.)
# - saving the data in many separate files on the hard disk is less than ideal
# - use CouchDB or MongoDB instead?
#
# NOTE: This code slows down over time because each response requires loading
# an increasingly large .Rdata file.
#
# MOSTLY PROOF OF CONCEPT - NOT REALLY PRACTICAL FOR DATA COLLECTION.
#==============================================================================
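#==============================================================================
## For reference: a sketch of how an hourly .Rdata file could be processed
## later with apply functions. The file name and the rjson package are
## assumptions for illustration; point it at a file that actually exists.
## Wrapped in if (FALSE) so it does not run during collection.
#==============================================================================
if (FALSE) {
  require(rjson)
  load("twtstream_0109201215.Rdata")  # example file name; loads the list `twt`
  ## each list element holds one raw chunk of newline-delimited JSON records
  tweets <- unlist(lapply(twt, function(resp) {
    lines <- strsplit(resp, "\n")[[1]]
    lapply(lines[nchar(lines) > 0], fromJSON)
  }), recursive=FALSE)
  ## e.g. pull the text field from each parsed tweet (deletion notices have none)
  txt <- sapply(tweets, function(t) if (!is.null(t$text)) t$text else NA)
}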
require(RCurl)
# require(rjson)
# set the directory
setwd("C:/Documents and Settings/BTIBERT/Desktop/Twitter Data")
## set the twitter account credentials
USER <- "twitteracctname"
PASSWORD <- "twitterpassword"
UPASS <- paste(USER, PASSWORD, sep=":")
#==============================================================================
## A function that we will use to save the twitter JSON response to disk
## Creates hourly files, each contains a list object, responses are added to list
#==============================================================================
WRITE_TO_FILE <- function(x) {
  # input: x = the raw response received from Twitter
  ## build the file name from the current system hour
  fname_base <- paste("twtstream_", format(Sys.time(), "%m%d%Y%H"), sep="")
  fname_txt <- paste(fname_base, ".txt", sep="")
  fname_r <- paste(fname_base, ".Rdata", sep="")
  ## if the hourly R data file already exists in the data directory, load it
  if (fname_r %in% list.files()) {
    load(fname_r)
  }
  ## check to see if the list exists; if not, create it
  ## be careful: it should have been loaded above if the .Rdata file exists
  if (!exists("twt", mode="list")) {
    cat("creating twitter list object\n")
    twt <- list()
  }
  ## if the response is not empty, save two data files:
  ## 1) append to the text file  2) add to the list and save the .Rdata file
  if (nchar(x) > 0) {
    ## append the raw response to the text file; quote=FALSE keeps the JSON unmodified
    write.table(x, file=fname_txt, append=TRUE, quote=FALSE, row.names=FALSE, col.names=FALSE)
    ## add the response as a new list element and re-save the list
    twt <- c(twt, x)
    save(twt, file=fname_r)
  }
  cat("saved response-----------------\n")
}
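#==============================================================================
## A possible workaround for the slowdown noted above (a sketch, not used
## below): keep the growing list in an environment so nothing is re-loaded
## from disk on each response; the .Rdata file is only written, never read.
## BUFFER and WRITE_TO_FILE2 are illustrative names, not part of the original.
#==============================================================================
BUFFER <- new.env()
WRITE_TO_FILE2 <- function(x) {
  fname_r <- paste("twtstream_", format(Sys.time(), "%m%d%Y%H"), ".Rdata", sep="")
  ## a new hour means a new file, so start a fresh list
  if (!identical(BUFFER$fname, fname_r)) {
    BUFFER$fname <- fname_r
    BUFFER$twt <- list()
  }
  if (nchar(x) > 0) {
    BUFFER$twt <- c(BUFFER$twt, x)
    twt <- BUFFER$twt
    save(twt, file=fname_r)
  }
}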
# test <- function(x) {print(x)}
## Windows users will need this CA certificate bundle to authenticate over SSL;
## it only needs to be downloaded once
if (!file.exists("cacert.pem")) {
  download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")
}
## write the raw JSON from the Twitter firehose to the files built in WRITE_TO_FILE
## enclosed in a while loop so the stream is reopened if getURL returns after an
## error; a hacky way to recover (a cleaner variant follows the loop)
while(TRUE) {
  getURL("https://stream.twitter.com/1/statuses/sample.json",
         userpwd=UPASS,
         cainfo="cacert.pem",
         write=WRITE_TO_FILE)
}
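#==============================================================================
## A less hacky variant of the loop above (a sketch): tryCatch logs the error
## and the loop pauses briefly before reconnecting. Wrapped in if (FALSE) so
## only one of the two loops runs.
#==============================================================================
if (FALSE) {
  while(TRUE) {
    tryCatch(
      getURL("https://stream.twitter.com/1/statuses/sample.json",
             userpwd=UPASS,
             cainfo="cacert.pem",
             write=WRITE_TO_FILE),
      error = function(e) cat("stream dropped:", conditionMessage(e), "\n")
    )
    Sys.sleep(5)  # back off before reopening the connection
  }
}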
# ---- remove the lines below here before running the R code above ----
#! /bin/bash
# Example 7: connect to the Twitter firehose - the best approach is to use the
# command line; run in Ubuntu 11.10.
# Attempted to do this within R, but the different approaches consumed
# computing resources at an incredible rate.
# Make this file executable in a Linux-type environment (run in Ubuntu)
# with chmod 755.
# Saves a data file every hour (closes curl, reopens curl) from the Twitter
# sample firehose as raw JSON; the raw data can be processed in R and
# elsewhere (see the sketch after the loop).
# If you're already in the directory containing the file, just type
# ./filename.sh and press Enter (for example, see below).
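# For example, assuming the script were saved as stream_twitter.sh (name
# chosen for illustration):
#   chmod 755 stream_twitter.sh
#   ./stream_twitter.sh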
while true; do
  curl -s -m 3600 -u twitterusername:twitterpassword https://stream.twitter.com/1/statuses/sample.json -o "twtstream_$(date +%Y%m%d%H).txt"
done
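# For reference, the hourly files above can be read back in R (a sketch; the
# file name is an example and the rjson package is assumed):
#   raw <- readLines("twtstream_2012010915.txt")
#   tweets <- lapply(raw[nchar(raw) > 0], rjson::fromJSON)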