Btibert3
11/28/2011 - 5:34 PM

Basic Web Mining with R

Basic Web Mining with R

#### load the libraries used for the discussion
install.packages("twitteR")
install.packages("sqldf")
install.packages("XML")
install.packages("RCurl") # cURL interface adapted in R


#==============================================================================
# Read in already existing data from the web
#==============================================================================

##### read in a CSV dataset from yahoo finance
yahoo <- read.csv("http://ichart.finance.yahoo.com/table.csv?s=AAPL&a=08&b=7&c=1984&d=10&e=28&f=2011&g=w&ignore=.csv",
                  header=T, stringsAsFactors=F)

dim(yahoo) # check the rows/columns
head(yahoo, n=5) # look at the first 5 rows
str(yahoo) # have R tell us the details of the 
plot(yahoo[,7], type="l")  # data are reverse order


### fix the data to make a plot
yahoo$Date <- as.Date(yahoo$Date) # tell R that this column is a Date
yahoo.2 <- yahoo[order(yahoo$Date),] # sort the data Ascending order
plot(x=yahoo.2$Date, y=yahoo.2[,7], type="l",
     xlab="Date", ylab="Adjusted Close", 
     main="Weekly Adjusted Closing Price for Apple (AAPL)") # plot the data



#==============================================================================
# Read in HTML tables
# http://stackoverflow.com/questions/4393780/scraping-a-wiki-page-for-the-periodic-table-and-all-the-links
# http://stackoverflow.com/questions/1395528/scraping-html-tables-into-r-data-frames-using-the-xml-package
#==============================================================================

# lets parse NHL data
library(XML)
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", 
  		2011, "_skaters.html", sep="")
tables <- readHTMLTable(URL) # parses all of the tables for us
ds.skaters <- tables$stats # if the table is named (View Source Code), can reference by name
dim(ds.skaters)



#==============================================================================
# Collect data from twitter
#==============================================================================

library(twitteR)
rstats <- searchTwitter("#rstats", n=1500) #searches twitter for the last 1500 tweets
length(rstats) # but only searches back a week or so
mode(rstats) # not a data frame, so we need to convert it to rows/columns
rstats.df <- twListToDF(rstats)
dim(rstats.df)
colnames(rstats.df) # print the column names
length(unique(rstats.df$screenName)) # how many different twitter users
install.packages("sqldf") # install a library to use SQL-like commands
library(sqldf)
temp <- rstats.df
twts <- sqldf("SELECT screenName as name, count(*) as tweets FROM temp
              GROUP BY screenName ORDER BY count(*) desc")
head(twts, n=10) # top 10 tweeters about rstats
hist(twts[,2], xlab="# Rstats Tweets", main="")



#==============================================================================
# Social Network Graph of rstats users
# http://www.drewconway.com/zia/?p=1471
#==============================================================================



#==============================================================================
# Streaming twitter using RCURL
#==============================================================================
library(RCurl)
library(rjson)

# set the directory
setwd("C:\\Documents and Settings\\user\\Desktop\\Web Mining Pres")

#### redirects output to a file, but still has problems with getting data back into R!
WRITE_TO_FILE <- function(x) {
     
     if (nchar(x) >0 ) {
          write.table(x, file="Twitter Stream Capture.txt", append=T, row.names=F, col.names=F)
     }
     
}

### windows users will need to get this certificate to authenticate
download.file(url="http://curl.haxx.se/ca/cacert.pem", destfile="cacert.pem")

### write the raw JSON data from the Twitter Firehouse to a text file
getURL("https://stream.twitter.com/1/statuses/sample.json", 
       userpwd=USER:PASSWORD,
       cainfo = "cacert.pem", 
       write=WRITE_TO_FILE)
# Source the file with the Grab Skaters function
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts")
source("SkaterStats.R")

## set the working directory
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM")

## set the seasons
SEASON <- as.character(c(1960:2004, 2006:2011))


#-----------------------------------------------------------------------
# Use the function to loop over the seasons and piece together
#-----------------------------------------------------------------------

## define the seasons -- 2005 dataset doesnt exist
## if I was a good coder I would trap the error, but this works
SEASON <- as.character(c(1960:2004, 2006:2011))

## create an empy dataset that we will append to
dataset <- data.frame()

## loop over the seasons, use the function to grab the data
## and build the dataset
for (S in SEASON) {
  
  require(plyr)
  
	temp <- GrabSkaters(S)
	dataset <- rbind.fill(dataset, temp)
	print(paste("Completed Season ", S, sep=""))
	
	## pause the script so we don't kill their servers
	Sys.sleep(10)
	
}
head(dataset); tail(dataset);
dataset <- dataset[dataset$tm != 'TOT', ]
save(dataset, file="hr skater db.Rdata")


#-----------------------------------------------------------------------
# Lets get a sense for how many players finish 30/100
#-----------------------------------------------------------------------

# Let's look at a scatterplot of the data
png("Scatter Plot.png")
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9,
    xlab="Penalty Mins", ylab="Goals Scored", bty="n", 
		main="Correlate Goals Scored and Penalty Minutes")
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red");
dev.off()

# raw output
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100))
str(dstemp)
with(dstemp, addmargins(table(season, ind30100)))
with(dstemp, addmargins(table(season, ind30100)))

# % of players that meet criteria per season
library(sqldf)
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season"))


# basic plot of the data
barplot(pct$pct, names.arg=pct$season)

# trying to learn ggplot
library(ggplot2)
ds <- dataset[dstemp$g >=30 & dstemp$pim >=100, c("season", "player")]
min(as.numeric(ds$season)); as.numeric(max(ds$season));
p <- ggplot(ds, aes(as.numeric(season))) + geom_bar(fill="black", colour="white", binwidth=1)
p <- p + labs(x="Season", y="# Players") 
ggsave(p, file="Distribution.png")


# look at a listing of the data again
View(pct)

# Who are the players in the 2011 season that meet this criteria
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")]
#######################################################################################
# Function to scrape season skater statistics from Hockey-reference.com
#######################################################################################
GrabSkaters <- function(S) {
  
  # The function takes parameter S which is a string and represents the Season
	# Returns: data frame    
  
  require(XML)
	
	## create the URL
	URL <- paste("http://www.hockey-reference.com/leagues/NHL_", 
			S, "_skaters.html", sep="")
	
	## grab the page -- the table is parsed nicely
	tables <- readHTMLTable(URL)
	ds.skaters <- tables$stats
	
	## determine if the HTML table was well formed (column names are the first record)
	## can either read in directly or need to force column names
	## and 
	
	## I don't like dealing with factors if I don't have to
	## and I prefer lower case
	for(i in 1:ncol(ds.skaters)) {
		ds.skaters[,i] <- as.character(ds.skaters[,i])
		names(ds.skaters) <- tolower(colnames(ds.skaters))
	}
	
	## fix a couple of the column names
	colnames(ds.skaters)
	## names(ds.skaters)[10] <- "plusmin"
	names(ds.skaters)[11] <- "plusmin"
	names(ds.skaters)[18] <- "spct"
	
	## finally fix the columns - NAs forced by coercion warnings
	for(i in c(1, 3, 6:18)) {
		ds.skaters[,i] <- as.numeric(ds.skaters[, i])
	}
	
	## convert toi to seconds, and seconds/game
	## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
	
	## remove the header and totals row
	ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
	## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
	
	## add the year
	ds.skaters$season <- S
	
	## return the dataframe
	return(ds.skaters)
	
}
###############################################################################
# Author: @BrockTibert
#
# Used: R Version 2.12.1, Windows 7 Pro, StatET Plugin for Eclipse
#
#
###############################################################################

#-----------------------------------------------------------------------
# set up script level basics
#-----------------------------------------------------------------------

## I code on the screen, so I set it wider for the console output
options(widht=180)

## libraries
library(XML)
library(plyr)
library(gdata)
library(sqldf)


## directory for the project
DIR <- "C:/Users/Brock/Documents/My Dropbox/Projects/NHL"
setwd(DIR)



#-----------------------------------------------------------------------
# Grab the skaters data from the season page
#-----------------------------------------------------------------------

## set the season
SEASON <- "2011"

## create the URL
URL <- paste("http://www.hockey-reference.com/leagues/NHL_", SEASON, "_skaters.html", sep="")


## grab the page -- the table is parsed nicely
tables <- readHTMLTable(URL)
ds.skaters <- tables$stats

## I don't like dealing with factors if I don't have to
## and I prefer lower case
for(i in 1:ncol(ds.skaters)) {
	ds.skaters[,i] <- as.character(ds.skaters[,i])
	names(ds.skaters) <- tolower(colnames(ds.skaters))
}

## fix a couple of the column names
colnames(ds.skaters)
names(ds.skaters)[10] <- "plusmin"
names(ds.skaters)[17] <- "spct"

## finally fix the columns - NAs forced by coercion warnings
for(i in c(1, 3, 6:18)) {
	ds.skaters[,i] <- as.numeric(ds.skaters[, i])
}

## change the avg time on ice to a time format
ds.skaters$atoi <- strptime(ds.skaters$atoi, "%M:%S")

## remove the header and totals row
ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]

## save the datafile
write.table(ds.skaters, "Skaters.csv", sep=",", row.names=F)

## convert toi to seconds, and seconds/game
ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp

#-----------------------------------------------------------------------
# Quick overview of the 2011 skater data
#-----------------------------------------------------------------------

## quick look of the dataset
summary(ds.skaters)

## Best and worst +/-
head(ds.skaters[ds.skaters$plusmin == max(ds.skaters$plusmin), ])
head(ds.skaters[ds.skaters$plusmin == min(ds.skaters$plusmin), ]) # how is that deal working out for NJ?

## select the columns to analyze
play <- ds.skaters[, c(7:17, 20)]


## fix the NA's to zeros
for(i in 1:ncol(play)) {
#	which(is.na(play[,i])) <- rows that meet the criteria of NA
	play[which(is.na(play[,i])), i] <- 0
}

## look at the correlations
summary(play)
round(cor(play), 2)

#-----------------------------------------------------------------------
# Use PCA and clustering techniques to group the players
#-----------------------------------------------------------------------

## PCA analysis
## http://www.statmethods.net/advstats/factor.html
## the 1st component of princomp(play, cor=F) explained 99% variance
play.pca <- princomp(play, cor=T)


## what did we get for data?
summary(play.pca)
plot(play.pca)
png("PCA biplot.png")
biplot(play.pca)
dev.off()


## lets just use the first 5 = ~= 86% of the variance
play.pca.comp <- play.pca$scores[, 1:5]


## use hierarchical clustering to get a sense of how many clusters 
## to feed to the kmeans procedure - need to feed distance matrix
## pull 40 random cases into the procedure
dist.mat <- dist(play.pca.comp[sample(1:nrow(play.pca.comp), 40), ], method="euclidian")
play.hclust <- hclust(dist.mat, method="ward")
png("Dendrogram.png")
plot(play.hclust)
dev.off()

## I am going to try to 4 clusters
kmean4 <- kmeans(play.pca.comp, centers=4, iter.max=12000)
kmean4$withinss
kmean4$size



#-----------------------------------------------------------------------
# Analyze the clusters
#-----------------------------------------------------------------------

## add the cluster membership to the player data
skaters <- cbind(ds.skaters, kmean4$cluster)
names(skaters)[ncol(skaters)] <- "cluster"

## make them a factor
skaters$cluster2 <- as.factor(skaters$cluster)
skaters$cluster2 <- reorder(skaters$cluster2, new.order=c(4,2,1,3))

## profile clusters by a few variables
table(skaters$cluster2)
ddply(skaters[c("cluster", "pts")], .(cluster), mean, na.rm=T)

## boxplot these clusters
png("4 Cluster by Points Boxplot.png")
boxplot(pts ~ cluster2, data=skaters, main="Boxplot of Clusters by Points \nAs of Feb 7, 2011")
dev.off()


## cluster membership by team
addmargins(table(skaters$tm, skaters$cluster2))
(team.dist <- round(prop.table(table(skaters$cluster2, skaters$tm), 2), 3))


## plot of the table
## Rotate labels = http://stackoverflow.com/questions/1828742/rotating-axis-labels-in-r 
## http://research.stowers-institute.org/efg/R/Color/Chart/
## type colors() to see names for the graph on link above
mycol <- colors()
mycol <- c("red4", "orangered", "burlywood1",  "beige")
png("Team Distribution.png")
barplot(team.dist, horiz=T, cex.names=.75, las=1, 
		col=mycol, main="Distribution of Clusters by Team", xlab="% of Team")
abline(v=.25, lty=2)
abline(v=.5, lty=2)
abline(v=.75, lty=2)
dev.off()