## Btibert3
## 3/14/2014 - 12:22 AM
##
## Team by Team distribution of goals during 2013/14 Season

## load the packages
library(XML)
library(RCurl)
library(plyr)
library(ggplot2)
library(reshape2)
library(stringr)

## the page
URL <- "http://www.hockey-reference.com/leagues/NHL_2014_standings.html"

## read the raw page
# getURL() returns the page source as a string, so tell htmlParse() explicitly
# that it is receiving markup rather than a file name or URL.
team_page <- htmlParse(getURL(URL), asText = TRUE)

## parse out the links
links <- xpathSApply(team_page, "//a/@href")
names(links) <- NULL
# Keep only hrefs that end in "/2014.html". The dot is escaped and the pattern
# is anchored at the end so that look-alike paths (e.g. "/2014_html...") are
# not picked up by accident.
team_links <- links[str_detect(links, "/2014\\.html$")]
team_links <- unique(team_links)

## for each link, grab the page, grab the skater stats, save to master dataset
# Each team's table is stored in a preallocated list and bound once after the
# loop; binding inside the loop re-copies the growing master frame every pass.
skater_frames <- vector("list", length(team_links))
for (i in seq_along(team_links)) {
  TEAM <- team_links[[i]]
  # build the page
  URL <- paste0("http://www.hockey-reference.com", TEAM)
  # grab the page
  tm_page <- readHTMLTable(URL, stringsAsFactors = FALSE)
  # grab the skater stats
  tmp_skaters <- tm_page$skaters
  # a page without a "skaters" table should not abort the whole crawl
  if (is.null(tmp_skaters)) {
    warning("no skaters table found for ", TEAM, call. = FALSE)
    next
  }
  # fix a couple of the names
  # NOTE(review): columns are renamed by position (10 and 20); this assumes the
  # site's column order is stable -- confirm against the scraped table.
  names(tmp_skaters)[10] <- "plusminus"
  names(tmp_skaters)[20] <- "shotpct"
  # add the team (first run of three capital letters in the link)
  team <- str_extract(TEAM, "[A-Z]{3}")
  tmp_skaters$team <- team
  # keep this team's rows for the final bind
  skater_frames[[i]] <- tmp_skaters
  # status
  cat("finished ", TEAM, "\n")
}

# merge the per-team data onto the skaters data frame in one pass
skater_frames <- Filter(Negate(is.null), skater_frames)
if (length(skater_frames) > 0) {
  skaters <- rbind.fill(skater_frames)
} else {
  # nothing scraped: fall back to the empty frame the script started from
  skaters <- data.frame(stringsAsFactors = FALSE)
}

## how many skaters does each team have?
# Print the number of skater rows per team, then draw the same counts as a
# bar chart with one bar per team.
ddply(
  skaters,
  "team",
  summarise,
  num_players = length(Player)
)
ggplot(data = skaters, mapping = aes(x = factor(team))) +
  geom_bar() +
  theme_bw() +
  labs(title = "Title", x = "Team", y = "")


## classify the goal scorer types
# Convert the goals column to numeric before binning and print a quick summary.
skaters$G <- as.numeric(skaters$G)
summary(skaters$G)
# Left-closed bins: [0,5), [5,10), [10,20), ..., [40,50), [50,60].
# With right = FALSE, include.lowest = TRUE closes the last bin at 60.
# Goal totals outside 0-60 fall in no bin and become NA.
skaters$skater_type <- cut(skaters$G,
                           breaks = c(0, 5, 10, 20, 30, 40, 50, 60),
                           include.lowest = TRUE,
                           right = FALSE)


## team distributions
# Cross-tabulate team against goal-scorer bracket, then print each team's
# share of skaters per bracket (rows sum to 1, rounded to two decimals).
tbl_dist <- table(team = skaters$team, skater_type = skaters$skater_type)
round(prop.table(tbl_dist, margin = 1), digits = 2) ## row distributions

## metric =  split on 20 + (or a better number)
## scatterplot (x = percentage, y = volume)


## BELOW HERE IS TEMP

## grab the data
## http://goo.gl/0ZurK
# tables = readHTMLTable(URL, stringsAsFactors=F)
# length(tables)
# names(tables)
# head(tables$standings)
## XML package FTW!

## bring the data into a dataframe
# NOTE(review): `tables` is not defined in the visible part of this script
# (the readHTMLTable() call that would create it is commented out) -- confirm
# where it comes from before running this section.
nhl_14 <- tables$stats
colnames(nhl_14) <- tolower(colnames(nhl_14))

## change the rank column and remove the row breaks
# Rows whose rank does not convert to a number, and rows whose team column
# holds the literal header text "Tm", are dropped.
nhl_14$rk <- as.numeric(nhl_14$rk)
nhl_14 <- subset(nhl_14, !is.na(rk) & tm != "Tm")

## STOP: Vanek is rolled up. Need to crawl team by team and summarize that way.

## lets look at the goals column to get a sense of the data
nhl_14$g <- as.numeric(nhl_14$g)
summary(nhl_14$g)

## cut the goals variable into groups
# break5 / break10: left-closed bins of width 5 and 10 over 0-60; with
# right = FALSE, include.lowest = TRUE closes the final bin at 60.
nhl_14$break5 <- cut(nhl_14$g,
                     breaks = seq(0, 60, 5),
                     include.lowest = TRUE,
                     right = FALSE)
nhl_14$break10 <- cut(nhl_14$g,
                      breaks = seq(0, 60, 10),
                      include.lowest = TRUE,
                      right = FALSE)
# core_breaks: right-closed bins [0,10], (10,20], (20,25], ..., (50,60].
# NOTE(review): this set uses right = TRUE while the two above use
# right = FALSE -- confirm the difference is intentional.
nhl_14$core_breaks <- cut(nhl_14$g,
                          breaks = c(0, 10, 20, 25, 30, 40, 50, 60),
                          include.lowest = TRUE,
                          right = TRUE)


## TODO: need to isolate players if they have multiple stints
## TODO: make a single record by selecting the max stint number, but summarize per player

## quick distribution of the types
# core_breaks is a factor produced by cut(), so count its levels with
# geom_bar(); geom_histogram() bins a continuous x and current ggplot2
# releases reject a discrete variable there.
ggplot(nhl_14, aes(core_breaks)) + geom_bar()


## summarize by team
# Count the players in each goal bracket for every team. The original line was
# unfinished (no assignment operator and no summary function); the count below
# mirrors the per-team player count computed earlier in this script.
# NOTE(review): confirm a per-bracket player count is the intended summary.
by_team <- ddply(nhl_14, .(tm, core_breaks), summarise, num_players = length(g))