Team by Team distribution of goals during 2013/14 Season
## load the packages
library(XML)
library(RCurl)
library(plyr)
library(ggplot2)
library(reshape2)
library(stringr)
## the page
URL = "http://www.hockey-reference.com/leagues/NHL_2014_standings.html"
## read the raw page
team_page = getURL(URL)
## asText = TRUE: the argument is the downloaded HTML itself, not a file name
## or URL, so do not leave that decision to htmlParse's guess
team_page = htmlParse(team_page, asText = TRUE)
## parse out the links
## (xpathSApply returns an empty list when nothing matches; flatten to a plain
## unnamed character vector so the string matching below always works)
links = as.character(unlist(xpathSApply(team_page, '//a/@href'), use.names = FALSE))
## keep only hrefs that end in "/2014.html"; the dot is escaped and the
## pattern anchored so look-alikes such as "/2014xhtml" or
## "/2014.html?foo" are not picked up
team_links = links[str_detect(links, "/2014\\.html$")]
team_links = unique(team_links)
## fail early with a clear message instead of producing an empty dataset later
if (length(team_links) == 0) {
  stop("no team links ending in /2014.html found on ", URL, call. = FALSE)
}
## for each link, grab the page, grab the skater stats, save to master dataset
## The per-team tables are collected in a preallocated list and bound once at
## the end, instead of re-copying the growing `skaters` frame on every pass.
team_tables = vector("list", length(team_links))
for (i in seq_along(team_links)) {
  TEAM = team_links[[i]]
  # build the page (kept in its own variable so the standings URL is not overwritten)
  team_url = paste0("http://www.hockey-reference.com", TEAM)
  # grab the page
  tm_page = readHTMLTable(team_url, stringsAsFactors = FALSE)
  # grab the skater stats; [[ ]] avoids the partial name matching `$` does on lists
  tmp_skaters = tm_page[["skaters"]]
  # skip pages without a usable skaters table rather than failing on the renames
  if (is.null(tmp_skaters) || ncol(tmp_skaters) < 20) {
    warning("no usable skaters table for ", TEAM, "; skipping", call. = FALSE)
    next
  }
  # fix a couple of the names
  # NOTE(review): renames are positional (columns 10 and 20) -- confirm the
  # page layout still puts plus/minus and shooting percentage there
  names(tmp_skaters)[10] = "plusminus"
  names(tmp_skaters)[20] = "shotpct"
  # add the team (first run of three capital letters in the link)
  team = str_extract(TEAM, "[A-Z]{3}")
  tmp_skaters$team = team
  # stash this team's rows; they are merged into the skaters data frame below
  team_tables[[i]] = tmp_skaters
  # status
  cat("finished ", TEAM, "\n")
}
# drop skipped teams and bind everything in one call; rbind.fill pads columns
# that are missing from some tables with NA
team_tables = Filter(Negate(is.null), team_tables)
skaters = if (length(team_tables) > 0) {
  rbind.fill(team_tables)
} else {
  data.frame(stringsAsFactors = FALSE)
}
## roster size: one row per team with the number of skater records it has
ddply(.data = skaters, .variables = .(team), .fun = summarise,
      num_players = length(Player))
## the same counts as a bar chart, one bar per team
ggplot(data = skaters, mapping = aes(x = factor(team))) +
  geom_bar() +
  labs(title = "Title", x = "Team", y = "") +
  theme_bw()
## classify the goal scorer types
## goals arrive as text from the scraped table; convert before binning
## (values that do not parse as numbers become NA)
skaters$G = as.numeric(skaters$G)
summary(skaters$G)
## left-closed buckets: [0,5) [5,10) [10,20) [20,30) [30,40) [40,50) [50,60]
## include.lowest = TRUE closes the last bucket so exactly 60 is kept;
## anything outside 0-60 (or NA) gets an NA skater_type.
## TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
skaters$skater_type = cut(skaters$G,
                          breaks = c(0, 5, 10, 20, 30, 40, 50, 60),
                          include.lowest = TRUE,
                          right = FALSE)
## team distributions: cross-tabulate team against goal-scorer bucket
tbl_dist = table(team = skaters$team, skater_type = skaters$skater_type)
## share of each bucket within a team (proportions taken across each row),
## rounded to two decimals
round(prop.table(tbl_dist, margin = 1), digits = 2)
## metric = split on 20 + (or a better number)
## scatterplot (x = percentage, y = volume)
BELOW HERE IS TEMP
## grab the data
## http://goo.gl/0ZurK
# tables = readHTMLTable(URL, stringsAsFactors=F)
# length(tables)
# names(tables)
# head(tables$standings)
## XML package FTW!
## NOTE(review): `tables` is only assigned in the commented-out line above, so
## it must already exist in the session for the code below to run
## pull the stats table into a data frame with lower-case column names
nhl_14 = tables$stats
names(nhl_14) = tolower(names(nhl_14))
## make the rank numeric; rows whose rk does not parse as a number become NA
## and are dropped, together with any row whose tm is the literal "Tm"
nhl_14$rk = as.numeric(nhl_14$rk)
nhl_14 = subset(nhl_14, !is.na(rk) & tm != "Tm")
STOP: Vanek is rolled up. Need to crawl team by team and summarize that way.
## let's look at the goals column to get a sense of the data
nhl_14$g = as.numeric(nhl_14$g)
summary(nhl_14$g)
## cut the goals variable into groups
## TRUE/FALSE are spelled out because T and F are ordinary, reassignable variables.
## break5: width-5 buckets, left-closed -- [0,5) [5,10) ... [55,60]
nhl_14$break5 = cut(nhl_14$g,
                    breaks = seq(0, 60, 5),
                    include.lowest = TRUE,
                    right = FALSE)
## break10: width-10 buckets, left-closed -- [0,10) [10,20) ... [50,60]
nhl_14$break10 = cut(nhl_14$g,
                     breaks = seq(0, 60, 10),
                     include.lowest = TRUE,
                     right = FALSE)
## core_breaks: uneven buckets, right-closed -- [0,10] (10,20] (20,25] ... (50,60]
## NOTE(review): this one uses right = TRUE while the two above use
## right = FALSE, so a boundary value such as 10 lands in the lower bucket
## here and the upper bucket there -- confirm that is intended
nhl_14$core_breaks = cut(nhl_14$g,
                         breaks = c(0, 10, 20, 25, 30, 40, 50, 60),
                         include.lowest = TRUE,
                         right = TRUE)
Need to isolate players if they have multiple stints:
make a single record by selecting the max stint number, but summarize for the player.
## quick distribution of the types
## core_breaks is a factor, so count rows per level with geom_bar();
## geom_histogram() bins a continuous x and rejects a discrete one in
## current ggplot2
ggplot(nhl_14, aes(core_breaks)) + geom_bar()
## summarize by team: number of player rows per team and goal bucket
## NOTE(review): the original line had no assignment operator and no summary
## function; a per-group row count is assumed to be the intent -- confirm
by_team = ddply(nhl_14, .(tm, core_breaks), summarise,
                num_players = length(tm))