nshvai
10/24/2016 - 12:38 PM

## Common feature exploration functions for Data Science Competitions in R

Common feature exploration functions for Data Science Competitions in R

``````
# remove duplicate columns
remDupcols <- function(data){
rem = which(!(names(data) %in% colnames(unique(as.matrix(data), MARGIN=2))))
return(rem)
}

# fast parallel cor
bigcorPar <- function(x, nblocks = 10, verbose = TRUE, ncore="all", ...){
library(ff, quietly = TRUE)
require(doMC)
if(ncore=="all"){
ncore = multicore:::detectCores()
registerDoMC(cores = ncore)
} else{
registerDoMC(cores = ncore)
}

NCOL <- ncol(x)

## test if ncol(x) %% nblocks gives remainder 0
if (NCOL %% nblocks != 0){stop("Choose different 'nblocks' so that ncol(x) %% nblocks = 0!")}

## preallocate square matrix of dimension
## ncol(x) in 'ff' single format
corMAT <- ff(vmode = "single", dim = c(NCOL, NCOL))

## split column numbers into 'nblocks' groups
SPLIT <- split(1:NCOL, rep(1:nblocks, each = NCOL/nblocks))

## create all unique combinations of blocks
COMBS <- expand.grid(1:length(SPLIT), 1:length(SPLIT))
COMBS <- t(apply(COMBS, 1, sort))
COMBS <- unique(COMBS)

## iterate through each block combination, calculate correlation matrix
## between blocks and store them in the preallocated matrix on both
## symmetric sides of the diagonal
results <- foreach(i = 1:nrow(COMBS)) %dopar% {
COMB <- COMBS[i, ]
G1 <- SPLIT[[COMB[1]]]
G2 <- SPLIT[[COMB[2]]]
if (verbose) cat("Block", COMB[1], "with Block", COMB[2], "\n")
flush.console()
COR <- cor(x[, G1], x[, G2], ...)
corMAT[G1, G2] <- COR
corMAT[G2, G1] <- t(COR)
COR <- NULL
}

gc()
return(corMAT)
}

# remove highly correlated features from data
remHighcor <- function(data, cutoff, ...){
data_cor <- cor(sapply(data, as.numeric), ...)
data_cor[is.na(data_cor)] <- 0
rem <- findCorrelation(data_cor, cutoff=cutoff, verbose=T)
return(rem)
}

# remove highly correlated features from data faster
remHighcorPar <- function(data, nblocks, ncore, cutoff, ...){
data_cor = bigcorPar(data, nblocks = nblocks, ncore = ncore, ...)
data_cor = data.matrix(as.data.frame(as.ffdf(data_cor)))
data_cor[is.na(data_cor)] <- 0
rem <- findCorrelation(data_cor, cutoff=cutoff, verbose=T)
return(rem)
}

# remove features from data which are highly correlated to those in main
remHighcor0 <- function(data, main, cutoff, ...){
res = cor(sapply(data, as.numeric), sapply(main, as.numeric), ...) # res is names(data) X names(main) matrix
res[is.na(res)] <- 0
res = res > cutoff
rem = unname(which(rowSums(res) > 0))
return(rem)
}

# one-hot/dummy encode factors
# does not depend on train/test split as long as # of factors is same
categtoOnehot <- function(data, fullrank=T, ...){
data[,names(data)] = lapply(data[,names(data)] , factor) # convert character to factors
if (fullrank){
res = as.data.frame(as.matrix(model.matrix( ~., data, ...)))[,-1]
} else {
res = as.data.frame(as.matrix(model.matrix(~ ., data, contrasts.arg = lapply(data, contrasts, contrasts=FALSE), ...)))[,-1]
}
return(res)
}

# orthogonal polynomial encoding for ordered factors - use only for a few 2-5 levels !
# not all features may actually be important and must be removed
# include dependence on train/test split ?
categtoOrthPoly <- function(data, fullrank=T, ...){
data[,names(data)] = lapply(data[,names(data)] , ordered) # convert character to factors
if (fullrank){
res = as.data.frame(as.matrix(model.matrix( ~., data, ...)))[,-1]
} else {
res = as.data.frame(as.matrix(model.matrix(~ ., data, contrasts.arg = lapply(data, contrasts, contrasts=FALSE), ...)))[,-1]
}
return(res)
}

# Out-of-fold mean/sd/median deviation encoding.
# Useful for high cardinality categorical variables.
# always full rank
categtoDeviationenc <- function(char_data, num_data, traininds=NULL, funcs = funs(mean(.,na.rm=T), sd(.,na.rm=T), 'median' = median(.,na.rm=T))){

if(length(traininds) == 0){
train_char_data = char_data
train_num_data = num_data
} else {
train_char_data = char_data[traininds, ]
train_num_data =num_data[traininds, ]
}

res = list()
for(x in names(train_char_data)){
res[[x]] = train_num_data %>% group_by(.dots=train_char_data[,x]) %>% summarise_each(funcs) # calculate mean/sd/median encodings
res[[x]][,-1] = apply(res[[x]][,-1], 2, scale, scale=FALSE, center=TRUE) # calculate deviances of mean/sd/median encodings
# rename columns
colnames(res[[x]])[1] = x
if (ncol(train_num_data) == 1)
colnames(res[[x]])[-1] = paste0(names(train_num_data),'_',names(res[[x]])[-1])
res[[x]] <- merge(char_data[,x,drop=F], res[[x]], all.x=T, by=x)[,-1] # apply encodings to all data
}
res = data.frame(do.call(cbind, res))
return(res)
}

# function to remove equivalent factors
remEquivfactors <- function(x.data, ref.data = NULL){
if(length(ref.data) == 0L){
all = x.data
} else{
all = data.frame(cbind(ref.data, x.data))
}
all[,names(all)] = lapply(all[,names(all), drop=F], function(l){
as.numeric(reorder(x=l, X=seq(1:nrow(all)), FUN=mean))
})
rem = which(!(names(x.data) %in% colnames(unique(as.matrix(all), MARGIN=2, fromLast = F)))) # removal of cols towards end will be preferred
return(rem)
}

# function to create n-way categorical-categorical interactions
nwayInterac <- function(char_data, n){
nway <- as.data.frame(combn(ncol(char_data), n, function(y) do.call(paste0, char_data[,y])))
names(nway) = combn(ncol(char_data), n, function(y) paste0(names(char_data)[y], collapse='.'))
rem = remEquivfactors(x.data = nway, ref.data=NULL)
if(length(rem) > 0)
nway = nway[,-rem]
return(nway)
}

# create xgbfi xlsx output
xgbfi <- function(xgbfi_path = 'code/xgbfi/bin/XgbFeatureInteractions.exe', # need to clone code from xgbfi repo
model_path,
output_path, # if filename then without xlsx extension
d = 3, # upper bound for extracted feature interactions depth
g = -1, # upper bound for interaction start deepening (zero deepening => interactions starting @root only)
t = 100, #upper bound for trees to be parsed
k = 100, # upper bound for exported feature interactions per depth level
s = 'Gain', # score metric to sort by (Gain, Fscore, wFScore, AvgwFScore, AvgGain, ExpGain)
h = 10 # amounts of split value histograms
){

system(paste0('mono ',xgbfi_path,' -m ',model_path,' -o ',output_path,
' -d ',d,' -g ',g,' -t ',t,' -k ',k,' -s ',s,' -h ',h)) # saves output .xlsx file in given ouput directory

}

# function to extract calendar features
# function to call in holidays open data / other open data from python/othersources ?
# function to calcualte mode?

# time series outliers
# time series outliers
tsoutliers <- function(x,plot=FALSE)
{
x <- as.ts(x)
if(frequency(x)>1)
resid <- stl(x,s.window="periodic",robust=TRUE)\$time.series[,3]
else
{
tt <- 1:length(x)
resid <- residuals(loess(x ~ tt))
}
resid.q <- quantile(resid,prob=c(0.25,0.75))
iqr <- diff(resid.q)
limits <- resid.q + 1.5*iqr*c(-1,1)
score <- abs(pmin((resid-limits[1])/iqr,0) + pmax((resid - limits[2])/iqr,0))
if(plot)
{
plot(x)
x2 <- ts(rep(NA,length(x)))
x2[score>0] <- x[score>0]
tsp(x2) <- tsp(x)
points(x2,pch=19,col="red")
return(invisible(score))
}
else
return(score)
}

# time series lag features``````