Btibert3
4/2/2011 - 6:39 PM

Function to grab NHL Skater Stats for a season

Function to grab NHL Skater Stats for a season

#######################################################################################
# Function to scrape season skater statistics from Hockey-reference.com
#######################################################################################
GrabSkaters <- function(S) {
  
  # The function takes parameter S which is a string and represents the Season
	# Returns: data frame    
  
  require(XML)
	
	## create the URL
	URL <- paste("http://www.hockey-reference.com/leagues/NHL_", 
			S, "_skaters.html", sep="")
	
	## grab the page -- the table is parsed nicely
	tables <- readHTMLTable(URL)
	ds.skaters <- tables$stats
	
	## determine if the HTML table was well formed (column names are the first record)
	## can either read in directly or need to force column names
	## and 
	
	## I don't like dealing with factors if I don't have to
	## and I prefer lower case
	for(i in 1:ncol(ds.skaters)) {
		ds.skaters[,i] <- as.character(ds.skaters[,i])
		names(ds.skaters) <- tolower(colnames(ds.skaters))
	}
	
	## fix a couple of the column names
	colnames(ds.skaters)
	## names(ds.skaters)[10] <- "plusmin"
	names(ds.skaters)[11] <- "plusmin"
	names(ds.skaters)[18] <- "spct"
	
	## finally fix the columns - NAs forced by coercion warnings
	for(i in c(1, 3, 6:18)) {
		ds.skaters[,i] <- as.numeric(ds.skaters[, i])
	}
	
	## convert toi to seconds, and seconds/game
	## ds.skaters$seconds <- (ds.skaters$toi*60)/ds.skaters$gp
	
	## remove the header and totals row
	ds.skaters <- ds.skaters[!is.na(ds.skaters$rk), ]
	## ds.skaters <- ds.skaters[ds.skaters$tm != "TOT", ]
	
	## add the year
	ds.skaters$season <- S
	
	## return the dataframe
	return(ds.skaters)
	
}
# Source the file with the Grab Skaters function
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/Scripts")
source("SkaterStats.R")

## set the working directory
setwd("~/My Dropbox/Eclipse/Projects/R/NHL/Blog Posts/30 Goals 100 PIM")

## set the seasons
SEASON <- as.character(c(1960:2004, 2006:2011))


#-----------------------------------------------------------------------
# Use the function to loop over the seasons and piece together
#-----------------------------------------------------------------------

## define the seasons -- 2005 dataset doesnt exist
## if I was a good coder I would trap the error, but this works
SEASON <- as.character(c(1960:2004, 2006:2011))

## create an empy dataset that we will append to
dataset <- data.frame()

## loop over the seasons, use the function to grab the data
## and build the dataset
for (S in SEASON) {
  
  require(plyr)
  
	temp <- GrabSkaters(S)
	dataset <- rbind.fill(dataset, temp)
	print(paste("Completed Season ", S, sep=""))
	
	## pause the script so we don't kill their servers
	Sys.sleep(10)
	
}
head(dataset); tail(dataset);
dataset <- dataset[dataset$tm != 'TOT', ]
save(dataset, file="hr skater db.Rdata")


#-----------------------------------------------------------------------
# Lets get a sense for how many players finish 30/100
#-----------------------------------------------------------------------

# Let's look at a scatterplot of the data
png("Scatter Plot.png")
plot(x=dataset$pim, y=dataset$g, type="p", pch=20, cex=.9,
    xlab="Penalty Mins", ylab="Goals Scored", bty="n", 
		main="Correlate Goals Scored and Penalty Minutes")
abline(h=30, lty=2, col="red"); abline(v=100, lty=2, col="red");
dev.off()

# raw output
dstemp <- transform(dataset, ind30100 = as.numeric(g >=30 & pim >=100))
str(dstemp)
with(dstemp, addmargins(table(season, ind30100)))
with(dstemp, addmargins(table(season, ind30100)))

# % of players that meet criteria per season
library(sqldf)
(pct <- sqldf("SELECT season, AVG(ind30100) as pct FROM dstemp GROUP BY season"))


# basic plot of the data
barplot(pct$pct, names.arg=pct$season)

# trying to learn ggplot
library(ggplot2)
ds <- dataset[dstemp$g >=30 & dstemp$pim >=100, c("season", "player")]
min(as.numeric(ds$season)); as.numeric(max(ds$season));
p <- ggplot(ds, aes(as.numeric(season))) + geom_bar(fill="black", colour="white", binwidth=1)
p <- p + labs(x="Season", y="# Players") 
ggsave(p, file="Distribution.png")


# look at a listing of the data again
View(pct)

# Who are the players in the 2011 season that meet this criteria
dataset[dataset$season == "2011" & dataset$g >= 30 & dataset$pim >=100, c("player", "g", "pim")]