# Btibert3
# 4/1/2011 - 11:06 PM
#
# Player Streakiness in the NHL
#
# Player Streakiness in the NHL

## basics
# R 2.12.2
# windows xp; Yes, I know


## libraries
library(XML)
library(plyr)
library(lubridate)
library(ggplot2)

# Set the working directory
# The project path is specific to the original author's machine. Only switch
# to it when it actually exists, so the script does not abort at this point
# on any other machine; otherwise keep running in the current directory.
PROJECT_DIR <- "~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Player Streakiness"
if (file.exists(path.expand(PROJECT_DIR))) {
  setwd(PROJECT_DIR)
} else {
  message("Project directory not found; staying in ", getwd())
}

# Set the constants
# BASE is the gamelog URL prefix; the season year and a trailing "/" are
# appended to it when each page is requested.
BASE <- "http://www.hockey-reference.com/players/h/hortona01/gamelog/"
# Seasons to download. 2005 is skipped -- presumably there is no gamelog for
# that season (TODO confirm).
SEASON <- c(2004, 2006:2011)


# Loop and grab the data
#
# For every season in SEASON: download the gamelog page, take its "stats"
# table, normalise the column names and types, and tag the rows with the
# season. The per-season tables are collected in a list and bound together
# once at the end instead of growing `ds` on every iteration.

# Positions of the columns that hold numbers (everything else stays character)
NUMERIC_COLS <- c(1:2, 9:19)

season_logs <- vector("list", length(SEASON))

for (k in seq_along(SEASON)) {

  S <- SEASON[k]
  URL <- paste(BASE, S, "/", sep="")
  tables <- readHTMLTable(URL)$stats

  # Fail with a readable message when the page has no "stats" table, rather
  # than with a cryptic error further down.
  if (is.null(tables)) {
    stop("No 'stats' table found at ", URL, call. = FALSE)
  }

  # fix factors and names: every column to character, headers to lower case
  tables[] <- lapply(tables, as.character)
  names(tables) <- tolower(colnames(tables))

  str(tables)
  # Columns 6, 8 and 9 get explicit names -- presumably their scraped headers
  # are blank or ambiguous (TODO confirm against the page).
  names(tables)[6] <- "AwayHome"
  names(tables)[8] <- "WinLoss"
  names(tables)[9]  <- "goals"

  # fix the columns - NAs forced by coercion warnings
  # (non-numeric cells become NA here; rows with an NA rank are dropped later)
  str(tables)
  tables[NUMERIC_COLS] <- lapply(tables[NUMERIC_COLS], as.numeric)
  str(tables)

  tables$year <- S

  season_logs[[k]] <- tables

  # BE KIND when scraping
  Sys.sleep(10)

}

# One bind for all seasons; rbind.fill accepts a list of data frames
ds <- rbind.fill(season_logs)


# Quick look at the combined gamelog: rows per season, first rows, dimensions
table(year = ds$year)
head(ds, n = 30)
dim(ds)

# Keep only rows whose rank survived the numeric coercion. Rows with an NA
# rank are presumably header lines repeated inside the scraped table
# (TODO confirm).
ds <- subset(ds, !is.na(rk))
dim(ds)
head(ds, n = 30)

# Store the cleaned gamelog so the scrape does not have to be repeated
save(ds, file="Horton.Rdata")

# Need to change the date to an actual date in R
# ymd() is lubridate's public parser for year-month-day strings; it replaces
# the parse_date() helper, which current lubridate releases do not provide.
str(ds)
ds$date <- ymd(ds$date)
str(ds)

# Format to the month year  = do so by setting all with the same arbitrary year
# Set the last months of the season to the year plus 1 so the dates are in "order" when plotted
#
# The anchor years are 2011 (October-December) and 2012 (January-September).
# 2012 is a leap year, so a game played on 29 February keeps a valid date;
# with a non-leap anchor year update() would turn it into NA.
# The month is read from the original dates BEFORE any year is overwritten,
# and which() drops NA months, so the subscripted assignments below never see
# an NA index. Rows whose date could not be parsed simply stay NA.
is_spring <- month(ds$date) < 10
spring_rows <- which(is_spring)
autumn_rows <- which(!is_spring)
ds$date[autumn_rows] <- update(ds$date[autumn_rows], year=2011)
ds$date[spring_rows] <- update(ds$date[spring_rows], year=2012)
head(ds, n=40)


# Help received from
# http://stackoverflow.com/questions/5494216/extract-date-in-r

# Build a new data frame with a running goal total inside each season.
# cumsum() follows the row order of `ds`, so the rows are taken as they
# already are within a season.
gamelog <- ddply(ds, "year", transform, cumegoals = cumsum(goals))

# Draw cumulative goals against the aligned dates, one stacked panel per season
print(
  ggplot(gamelog, aes(x = date, y = cumegoals)) +
    geom_point() +
    geom_line() +
    facet_wrap(~ year, ncol = 1)
)