## Author: 04pallav
## Date:   9/13/2017 - 9:25 PM
##
## LDA for Hall of Fame (HOF) data

####
#### For Analytics.
####

## Extract a few offensive statistics (numerical variables) from DTA.
num_vars <- c("H", "HR", "RBI", "AVG", "SLG", "OBP")
X <- as.matrix(DTA[, num_vars])

## Standardize each variable (mean 0, sd 1) so all are on a common scale.
X_st <- scale(X, center = TRUE, scale = TRUE)

## Pair the HOF indicator with the standardized predictors.
DTA_st <- data.frame(HOF = DTA$HOF, X_st)
colnames(DTA_st) <- c("HOF", num_vars)

## Number of predictor variables.
p <- ncol(X)

## Summary statistics: mean vector, covariance matrix, correlation matrix.
x_bar <- apply(X, 2, mean)
S <- cov(X)
R <- cor(X)

##
## Principal component analysis.
##

## PCA on the correlation scale (variables centered and scaled).
pca <- prcomp(X, center = TRUE, scale = TRUE)
pca
summary(pca)

## Eigenvalues and eigenvectors of the correlation matrix R.
egn_R <- eigen(R)
egn_R

## Compute the PCs by hand: standardized data times the eigenvectors of R.
PC <- scale(X, center = TRUE, scale = TRUE) %*% egn_R$vectors
## The sample variances of the PCs equal the eigenvalues of R.
apply(PC, 2, var)
egn_R$values

##
## Linear discriminant analysis.
##

library(MASS)

## Fit LDA of HOF membership on the six standardized offensive statistics.
lda_out <- lda(HOF ~ H + HR + RBI + AVG + SLG + OBP, data = DTA_st)
lda_pred <- predict(lda_out, newdata = DTA_st)
LDs <- lda_pred$x

## A naive alternative: equal weights on all six standardized variables.
our_linear_combo <- X_st %*% rep(1 / 6, 6)

## Compare group separation of LD1 vs the equal-weight combination.
par(mfrow = c(1, 2))
boxplot(LDs[DTA$HOF == "Y", 1], LDs[DTA$HOF == "N", 1], ylim = c(-3, 8))
boxplot(our_linear_combo[DTA$HOF == "Y"], our_linear_combo[DTA$HOF == "N"],
  ylim = c(-3, 8))

## Correlations between LD1 and each standardized variable
## (replaces six separate cor() calls; same values, labeled by column).
cor(LDs[, 1], X_st)

## Partial F statistic for H: does H add discriminatory power beyond the
## other five variables?
## NOTE: the original used the length-2 table n_k as if it were a scalar,
## which silently recycled a 2-vector through the matrix arithmetic; we
## extract the per-group counts explicitly.
n_k <- table(DTA$HOF)
n_N <- n_k["N"]
n_Y <- n_k["Y"]

## Group mean vectors, full (all six variables) and reduced (H removed;
## H is column 1 of X_st).
X_bar_Y <- colMeans(X_st[DTA$HOF == "Y", ])
X_bar_N <- colMeans(X_st[DTA$HOF == "N", ])
X_bar_Y_red <- X_bar_Y[-1]
X_bar_N_red <- X_bar_N[-1]

## Group covariance matrices and pooled covariance, full and reduced.
S_Y <- var(X_st[DTA$HOF == "Y", ])
S_N <- var(X_st[DTA$HOF == "N", ])
S_Y_red <- var(X_st[DTA$HOF == "Y", -1])
S_N_red <- var(X_st[DTA$HOF == "N", -1])
S_po <- ((n_Y - 1) * S_Y + (n_N - 1) * S_N) / (n_Y + n_N - 2)
S_po_red <- ((n_Y - 1) * S_Y_red + (n_N - 1) * S_N_red) / (n_Y + n_N - 2)

## Two-sample Hotelling T^2 statistics, full and reduced variable sets.
T2_full <- t(X_bar_Y - X_bar_N) %*% solve(S_po) %*% (X_bar_Y - X_bar_N) /
  (1 / n_Y + 1 / n_N)
T2_red <- t(X_bar_Y_red - X_bar_N_red) %*% solve(S_po_red) %*%
  (X_bar_Y_red - X_bar_N_red) / (1 / n_Y + 1 / n_N)

## Partial F for H (p = 6 variables in the full model); under H0 it is
## F with (1, n_Y + n_N - 2 - p + 1) degrees of freedom.
F_stat_H <- (n_Y + n_N - 2 - p + 1) *
  ((T2_full - T2_red) / (n_Y + n_N - 2 + T2_red))

library(Hotelling)

## Hotelling T^2 tests via the Hotelling package, full and reduced models.
hot_full <- hotelling.test(H + HR + RBI + AVG + SLG + OBP ~ HOF, data = DTA_st)
hot_red <- hotelling.test(HR + RBI + AVG + SLG + OBP ~ HOF, data = DTA_st)

## Convert the full-model T^2 statistic to its F scale:
##   F = (n1 + n2 - p - 1) / ((n1 + n2 - 2) * p) * T^2.
## The original referenced undefined n_1, n_2, T2; pull the group sizes,
## dimension, and statistic from the test object instead.
with(hot_full$stats, (nx + ny - p - 1) / ((nx + ny - 2) * p) * statistic)

## Picture of first LD by HOF group (LDs has one column here, but index it
## explicitly for robustness).
with(DTA, boxplot(LDs[HOF == "Y", 1], LDs[HOF == "N", 1]))

## Investigate players with high values of LD1 who are not in HOF.
ii <- which(DTA$HOF == "N" & LDs[, 1] > 2)
DTA_st[ii, ]

## Compute the discriminant scores directly from the LDA coefficients.
ld <- X_st %*% lda_out$scaling
with(DTA, boxplot(ld[HOF == "Y"], ld[HOF == "N"]))

## The 20 most extreme players on the discriminant scale.
oo <- order(abs(ld), decreasing = TRUE)
data.frame(DTA_st$HOF[oo[1:20]], round(ld[oo[1:20]], 2),
  round(X_st[oo[1:20], ], 2))

## Resubstitution error: predict on the same standardized data used to fit.
## NOTE: the original called predict(lda_out, data = DTA); predict.lda takes
## `newdata`, so `data` was silently absorbed by `...` and the training data
## were used anyway. Make the intent explicit.
HOF_hat <- predict(lda_out, newdata = DTA_st)
with(DTA, table(HOF, HOF_hat$class))

####
#### For DL.
####

## Sample size.
n <- nrow(DTA)

##
## PCA on some of the numerical variables.
##

## These are a few offensive statistics. Big numbers signal greater
## offensive production.
num_vars <- c("H", "HR", "RBI", "AVG", "SLG", "OBP")
X <- as.matrix(DTA[, num_vars])

p <- ncol(X)

## Standardized variables (mean 0, sd 1).
X_st <- scale(X, center = TRUE, scale = TRUE)
DTA_st <- data.frame(HOF = DTA$HOF, X_st)
colnames(DTA_st) <- c("HOF", num_vars)

## Summary statistics: mean vector, covariance matrix, correlation matrix.
x_bar <- colMeans(X)
S <- var(X)
R <- cor(X)

## Eigenvalues and eigenvectors of R.
ee <- eigen(R)
lambda <- ee$values
ee <- ee$vectors

## The first three PCs explain about 94% of variance. The first PC represents
## a weighted average of all the variables and will distinguish players based
## on their overall stats. The second PC represents a weighted difference,
## comparing AVG and OBP to HR, RBI, and SLG. Some players may not have
## exceptional "power" statistics (such as home runs, runs batted in, and
## slugging percentage), but they may nevertheless have been very valuable
## offensive players (e.g. Rod Carew, with high batting average and on base
## percentage but relatively few home runs or RBIs). The third PC is harder
## to interpret. We'll talk about it later if we have time.
pca <- prcomp(X_st)
pca
summary(pca)
plot(pca)

## Extract the first three PCs.
Y <- X_st %*% pca$rotation[, 1:3]

## Variance of PCs equal lambdas.
apply(Y, 2, var)
lambda[1:3]

## Correlation of PC1 with variable 1 equals e_{11} * sqrt(lambda_1).
## (The original multiplied by the full sqrt(lambda) vector.)
## NOTE(review): eigen() and prcomp() may flip eigenvector signs, so the
## check can match up to sign.
cor(Y[, 1], X_st[, 1])
ee[1, 1] * sqrt(lambda[1])

## Full matrix of PC-variable correlations: entry (i, j) is
## e_{ij} * sqrt(lambda_j); equivalent to the original kronecker construction.
ee %*% diag(sqrt(lambda))

##
## Linear discriminant analysis.
##

library(MASS)

## Fit LDA of HOF on the six standardized offensive statistics.
lda_out <- lda(HOF ~ H + HR + RBI + AVG + SLG + OBP, data = DTA_st)

## Compute linear discriminant scores from the fitted coefficients.
ld <- X_st %*% lda_out$scaling

## Visualize linear discriminants: overlaid histograms by HOF status
## (pink = not in HOF, green = in HOF).
with(DTA_st, hist(ld[HOF == "N"], prob = TRUE, xlim = c(-2.5, 6),
  ylim = c(0, 0.55), xlab = "linear discriminants", main = "", col = "pink"))
with(DTA_st, hist(ld[HOF == "Y"], prob = TRUE, xlim = c(-2.5, 6),
  ylim = c(0, 0.55), col = "lightgreen", add = TRUE))

## Investigate extreme values of linear discriminants.
## NOTE(review): order() sorts ascending, so the most extreme players come
## last; add decreasing = TRUE if they are wanted first. Kept as original.
ld_oo <- order(abs(ld))