# cv.learn.func(): learn in cross-validation with a given learning function.
# cv.step.info():  needed in cv.learn.func(); appends/overwrites information that
#                  depends on the chosen lambda of the model.
# cv.learn.func_functions.R: prewritten machine-learning functions.
cv.step.info <- function(full.x, full.y, single.cvstep, s="lambda.min"
,prediction_exclude_covariatenames=NA){ # note: full.y is currently unused
info <- single.cvstep # here I append all further information
tmp.coef <- coef(single.cvstep, s=s)
if(is.list(tmp.coef)){ # then (at least in glmnet) this was a multinomial model
tmp.coef.mat <- lapply(tmp.coef, as.matrix)
coef.mat <- do.call(cbind, tmp.coef.mat)
colnames(coef.mat) <- names(tmp.coef)
}else{ # then we are in the 1-dimensional case
coef.mat <- as.matrix(tmp.coef)
}
info$coef <- coef.mat
nozero.coefmat <- coef.mat[apply(coef.mat, 1, function(x) sum(abs(x))) != 0, ,drop=FALSE]
if(ncol(nozero.coefmat) == 1){ # then we are in the one-dimensional (e.g. binomial) case
info$coefNozero <- nozero.coefmat[, 1]
names(info$coefNozero) <- rownames(nozero.coefmat)
}else{ # else it is multinomial.
info$coefNozero <- nozero.coefmat
}
# if(all(is.na(featurenames)) || all(is.na(featuresymbols))){
# info$coefNozero_sym <- NA
# }else{
# if(!exists("rownames.IDtoSymbol"))
# stop("Load the function rownames.IDtoSymbol()")
# nozero.coefmat_sym <- rownames.IDtoSymbol(coefmat = nozero.coefmat, featurenames = featurenames, featuresymbols = featuresymbols)
# if(ncol(nozero.coefmat_sym) == 1){ # binomial case
# info$coefNozero_sym <- nozero.coefmat_sym[, 1]
# names(info$coefNozero_sym) <- rownames(nozero.coefmat_sym)
# }else{ # multinomial
# info$coefNozero_sym <- nozero.coefmat_sym
# }
# }
# If some confounding factors (e.g. IPS, age, etc.) were included during learning,
# I don't want them to contribute to the prediction score.
# Therefore I set these (given) columns to zero.
if(!all(is.na(prediction_exclude_covariatenames)))
full.x[, prediction_exclude_covariatenames] <- 0
# full.x.intercept <- cbind(1, full.x) # the 1 is for the intercept
# OHE.pred.train <- full.x.intercept[single.cvstep$traintestsample == "train", ] %*% coef.mat
OHE.pred.train <- predict(single.cvstep
, newx = full.x[single.cvstep$traintestsample == "train", ]
, s = s, type="link")
info$pred.train <- OHE.pred.train
info$pred.train.prob <- predict(single.cvstep
, newx = full.x[single.cvstep$traintestsample == "train", ]
, s = s, type="response")
# in 1 dimension, I take the mean + sd of the score;
# in k dimensions, the coefficient matrix just has multiple columns,
# so I center and scale the resulting n x k score matrix per class (= column)
info$pred.train.mean <- apply(OHE.pred.train, 2, mean)
info$pred.train.sd <- apply(OHE.pred.train, 2, sd)
# per class: scaled = (score - mean)/SD
OHE.pred.train.scaled <- sweep(OHE.pred.train, 2, info$pred.train.mean, FUN="-")
OHE.pred.train.scaled <- sweep(OHE.pred.train.scaled, 2, info$pred.train.sd, FUN="/")
info$pred.train.scaled <- OHE.pred.train.scaled
if(any(single.cvstep$traintestsample == "test")){
# OHE.pred.test <- full.x.intercept[single.cvstep$traintestsample == "test", ] %*% coef.mat
OHE.pred.test <- predict(single.cvstep, newx = full.x[single.cvstep$traintestsample == "test", ], s = s, type="link")
info$pred.test <- OHE.pred.test
info$pred.test.prob <- predict(single.cvstep, newx = full.x[single.cvstep$traintestsample == "test", ], s = s, type="response")
OHE.pred.test.scaled <- sweep(OHE.pred.test, 2, info$pred.train.mean, FUN="-")
OHE.pred.test.scaled <- sweep(OHE.pred.test.scaled, 2, info$pred.train.sd, FUN="/")
info$pred.test.scaled <- OHE.pred.test.scaled
}
return(info)
}
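# A minimal usage sketch for cv.step.info() (hedged: the toy data and the glmnet
# calls are illustrative assumptions, not part of the original code). The fitted
# object must carry a $traintestsample vector, which cv.learn.func() below
# attaches automatically:
if(FALSE){
  library(glmnet)
  set.seed(1)
  x <- matrix(rnorm(100 * 20), nrow = 100, ncol = 20)
  y <- rbinom(100, 1, 0.5)
  fit <- cv.glmnet(x = x, y = y, family = "binomial")
  fit$traintestsample <- rep(c("train", "test"), c(80, 20))
  info <- cv.step.info(full.x = x, full.y = y, single.cvstep = fit)
  info$coefNozero             # nonzero coefficients at s = "lambda.min"
  head(info$pred.test.scaled) # test scores, scaled with the training mean/sd
}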
#' Cross-validation with a given learning function
#'
#' @param x
#' data matrix. rows are samples, columns are features
#' @param y
#' the response; length nrow(x), or (e.g. for survival data) a matrix with nrow(x) rows
#' @param cv.samples
#' number of (test) samples per CV step; if NA, only a single fit on all samples is done
#' @param cv.seed
#' optional seed(s): a single value used before every CV step, or one seed per step
#' @param penalty.factor
#' per-feature penalty factor: a single value or a vector of length ncol(x);
#' NA means the same factor (1) for every feature
#' @param weights
#' per-sample weight: a single value or a vector of length nrow(x);
#' NA means the same weight (1) for every sample
#' @param verbose
#' if TRUE, print progress for every CV step
#' @param plot.verbose
#' if TRUE, plot every fitted CV step
#' @param saveRDS.fileDir
#' optional directory; if given, the (growing) result list is saved there as .rds after every CV step
#' @param foldid
#' an optional positive vector of values identifying what fold each observation is in.
#' If you are in fold 2, all samples with 2 are in the test-group!
#' @param s
#' the lambda at which coefficients and predictions are extracted (default "lambda.min")
#' @param FUNC
#' the learning function, called as FUNC(x, y, penalty.factor, weights, ...)
#' @param ...
#' further arguments passed on to FUNC
#'
#' @return
#' a named list ("CVstep1", "CVstep2", ...) with one fitted, annotated model per
#' CV step; the final fit on all samples is named "allSamples"
#' @export
#'
#' @examples
cv.learn.func <- function(x, y, cv.samples=10
,cv.seed=NA
,penalty.factor = NA
,weights=NA
,verbose=TRUE
,plot.verbose=FALSE
,saveRDS.fileDir=NA
,foldid=NA
,s="lambda.min"
,FUNC
, ...){
# make sure x and y have the same number of samples.
# Usually length(y) must equal nrow(x),
# but for survival data (y is a matrix) the second check suffices.
if(!(nrow(x) == length(y) || nrow(x) == nrow(y))){
stop("x and y dimensions are wrong!")
}
current.learningFunction <- match.fun(FUNC)
##### check input
## check if in each CV step are "enough" samples (arbitrary)
if(!is.na(cv.samples)){ # else (NA): fit on all samples only, no CV
if(cv.samples >= nrow(x)){
stop("There must be atleast two cv.steps ( cv.samples is bigger or equal to nrow(x))")
}else if(cv.samples > (nrow(x)-50)){
warning("Less than 50 features are in one run - this can be problematic.")
}
}
## penalty factor: if NA, the same weight (1) for each coefficient
if(all(is.na(penalty.factor))){
tmp.penalty.factor <- rep(1, ncol(x))
}else if(length(penalty.factor) == 1){
tmp.penalty.factor <- rep(penalty.factor, ncol(x))
}else{
tmp.penalty.factor <- penalty.factor
}
if(length(tmp.penalty.factor) != ncol(x)){
stop("penalty.factor: Supply either a single value or a vector of lenght ncol(x)")
}
## weights: if NA, the same weight (1) for each sample
if(all(is.na(weights))){
tmp.weights <- rep(1, nrow(x))
}else if(length(weights) == 1){
tmp.weights <- rep(weights, nrow(x))
}else{
tmp.weights <- weights
}
if(length(tmp.weights) != nrow(x)){
stop("weights: Supply either a single value or a vector of lenght nrow(x)")
}
###### CV steps: randomize unless foldid was given
k <- ceiling(nrow(x) / cv.samples)
if(!all(is.na(foldid))){
if(any(foldid <= 0)){
stop("only positive foldid allowed")
}
index.k <- foldid
k.steps <- c(unique(foldid), -1) # -1: a final step whose test set is empty, i.e. a fit on all samples
}else if(is.na(cv.samples)){
k.steps <- 1
index.k <- rep(0, nrow(x))
}else{
rand.sampleindex <- sample(nrow(x))
index.k <- rep(1:k, rep(cv.samples, k))
index.k <- index.k[rand.sampleindex] # shuffle the indices
k.steps <- 1:(k+1) # step k+1 matches no sample: a final fit on all samples
}
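# A worked example of the fold assignment above (no foldid given):
# nrow(x) = 25, cv.samples = 10  =>  k = 3; index.k is a shuffled vector of
# ten 1s, ten 2s and five 3s; k.steps = 1:4, where step 4 matches no sample
# and therefore trains on all samples (the "allSamples" fit).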
###### CV-step seeds: if given, either use the same seed for every step (one value),
##### or there must be as many seeds as CV steps
cv.seeds_all <- k.steps # placeholder of the correct length
if(!all(is.na(cv.seed))){
if(length(cv.seed) == 1){
cv.seeds_all[TRUE] <- cv.seed
}else{
if(length(cv.seed) != length(k.steps))
stop(paste0("Supply either a single seed (for all cv.steps the same)"
, ", OR a seed for every CVstep(", length(k.steps), ")"))
cv.seeds_all <- cv.seed
}
names(cv.seeds_all) <- k.steps
}else{
cv.seeds_all <- NA
}
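# e.g. cv.seed = 7 runs set.seed(7) before every CV step;
# with 4 steps, cv.seed = 1:4 would give each step its own seed.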
###### now the cross-validation
cv.result <- list()
counter <- 1
for(i in k.steps){
i <- as.character(i)
if(verbose){
cat("Step ", counter, "/", length(k.steps), " started ----- ")
cat(format(Sys.time()), " ----- ")
}
test.mat <- x[index.k == i, ,drop=FALSE]
test.resp <- y[index.k == i]
train.mat <- x[index.k != i, ,drop=FALSE]
train.resp <- y[index.k != i]
train.weights <- tmp.weights[index.k != i]
# all features with a penalty factor of 0 are taken out of the Lp-norm penalty
# and go directly (unpenalized) into the model.
if(! all(is.na(cv.seeds_all)))
set.seed(cv.seeds_all[i])
cv.result[[i]] <- current.learningFunction(
x = train.mat
,y = train.resp
,penalty.factor=tmp.penalty.factor
,weights=train.weights
,...)
# training finished. Now add additional information for investigating the CV step
cv.result[[i]]$traintestsample <- ifelse(index.k == i, "test", "train")
cv.result[[i]]$feature.penalty <- tmp.penalty.factor
cv.result[[i]]$train.mat <- train.mat
cv.result[[i]]$train.resp <- train.resp
cv.result[[i]]$test.mat <- test.mat
cv.result[[i]]$test.resp <- test.resp
cv.result[[i]]$train.weights <- train.weights
## add further information (train/test scores/scaledScores, trainMean/SD)
cv.result[[i]] <- cv.step.info(full.x = x
,full.y = y
,single.cvstep = cv.result[[i]]
,s = s)
if(plot.verbose){
plot(cv.result[[i]], main=paste0("CV - Step ", counter, "/", length(k.steps)))
}
if(verbose){
cat("finished\n")
}
if(!is.na(saveRDS.fileDir)){
if(!dir.exists(saveRDS.fileDir))
dir.create(saveRDS.fileDir, recursive = TRUE)
# zero-pad the step number to the digit width of the number of steps
stringspecification <- paste0("step_%0", nchar(length(k.steps)), "d.rds")
tmp.filepath <- file.path(saveRDS.fileDir, sprintf(stringspecification, as.numeric(i)))
saveRDS(cv.result, file=tmp.filepath) # save the (growing) result list after every step
cat(tmp.filepath, " saved\n")
}
counter <- counter+1
}
names(cv.result) <- paste0("CVstep", k.steps)
# the last list element is the fit on all samples (foldid step "-1", step k+1, or cv.samples = NA)
if(any(names(cv.result)[length(cv.result)] %in% c("CVstep-1", paste0("CVstep", k+1))) ||
is.na(cv.samples))
names(cv.result)[length(cv.result)] <- "allSamples"
return(cv.result)
}
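# A minimal sketch of a full CV run (toy data assumed; fun.glmnet is defined
# below). With 60 samples and cv.samples = 10 this gives 6 CV steps plus a
# final fit on all samples, returned as the element "allSamples":
if(FALSE){
  set.seed(42)
  x <- matrix(rnorm(60 * 30), nrow = 60, ncol = 30)
  y <- rbinom(60, 1, 0.5)
  cv.res <- cv.learn.func(x = x, y = y, cv.samples = 10, cv.seed = 1
                          ,FUNC = fun.glmnet, family = "binomial")
  names(cv.res)              # "CVstep1" ... "CVstep6" "allSamples"
  cv.res$CVstep1$coefNozero  # features selected in the first CV step
}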
fun.glmnet <- function(x, y, alpha=1, penalty.factor, ...){
inst.load.packages("survival", silent = TRUE)
inst.load.packages("glmnet", silent = TRUE)
model <- cv.glmnet(x = x
,y = y
,alpha=alpha
,penalty.factor=penalty.factor
,...)
return(model)
}
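# fun.glmnet also illustrates the interface cv.learn.func() expects from FUNC:
# it is called as FUNC(x, y, penalty.factor, weights, ...) and must return an
# object supporting coef(model, s = ) and predict(model, newx = , s = , type = ).
# A sketch of a custom learner following the same contract (fun.glmnet.ridge is
# a hypothetical name, not part of the original code):
fun.glmnet.ridge <- function(x, y, penalty.factor, ...){
  # ridge regression: alpha = 0 instead of the lasso default alpha = 1
  fun.glmnet(x = x, y = y, alpha = 0, penalty.factor = penalty.factor, ...)
}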
fun.zerosum.binomial <- function(x, y, penalty.factor, ...){
if(!require("zeroSum")){
remotes::install_github("rehbergT/zeroSum/zeroSum") # bare install_github() would need devtools/remotes attached
library("zeroSum")
}
model <- zeroSumCVFit(x = x
,y = y
,penalty.factor=penalty.factor
, zeroSumWeights = penalty.factor
, type = "binomialZS"
,...)
return(model)
}
fun.glmnet.forcedShifts <- function(x, y, alpha=1, penalty.factor, n.times.sample
, shiftfun, genewise.shiftfactor=.01, nlambda=150, weights=rep(1, nrow(x)), ...){
# samples are rows.
# It seems that this shift, and especially multiple copies of the same sample,
# cause convergence problems. So I now apply one big shift per sample plus
# several smaller gene-wise shifts within each sample;
# maybe this resolves the convergence problem.
new.x <- do.call(rbind, rep(list(x), n.times.sample))
new.y <- rep(y, n.times.sample)
new.weights <- rep(weights, n.times.sample) # replicate the sample weights alongside the samples
penalty.zero.values <- new.x[, penalty.factor == 0, drop=FALSE]
samplewise.random.shifts <- shiftfun(nrow(new.x))
new.x.shifted <- sweep(new.x, 1, samplewise.random.shifts, FUN="+")
genewise.samplewise.randomShifts <- shiftfun(nrow(new.x) * ncol(new.x))
genewise.samplewise.randomShifts <- genewise.shiftfactor * genewise.samplewise.randomShifts
dim(genewise.samplewise.randomShifts) <- c(nrow(new.x), ncol(new.x))
new.x.shifted <- new.x.shifted + genewise.samplewise.randomShifts
new.x.shifted[, penalty.factor == 0] <- penalty.zero.values
inst.load.packages("survival", silent = TRUE)
inst.load.packages("glmnet", silent = TRUE)
glmnet.control(fdev = 0) # at the start of the lambda path the fractional deviance
# changes will probably be big; fdev = 0 disables the early stop along the path
# (always continue to the next, smaller lambda). Additionally I increase nlambda.
model <- cv.glmnet(x = new.x.shifted
,y = new.y
,weights=new.weights
,alpha=alpha
,penalty.factor=penalty.factor
,nlambda=nlambda
,...)
glmnet.control(factory = TRUE)
return(model)
}
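# A hedged usage sketch for the forced-shift learner: shiftfun must draw n
# random shifts (one per replicated sample row), e.g. Gaussian ones, and
# n.times.sample controls how often the data are replicated before shifting
# (x and y as in the toy example above):
if(FALSE){
  cv.res <- cv.learn.func(x = x, y = y, cv.samples = 10
                          ,FUNC = fun.glmnet.forcedShifts
                          ,n.times.sample = 3
                          ,shiftfun = function(n) rnorm(n, sd = 1)
                          ,family = "binomial")
}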
## added 5.3.18
fun.zerosum <- function(x, y, penalty.factor=rep(1, ncol(x)), type="binomialZS", ...){
if(!require("zeroSum")){
remotes::install_github("rehbergT/zeroSum/zeroSum") # bare install_github() would need devtools/remotes attached
library("zeroSum")
}
model <- zeroSumCVFit(x = x
,y = y
,penalty.factor=penalty.factor
, zeroSumWeights = penalty.factor
, type = type
,...)
return(model)
}
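# fun.zerosum generalizes fun.zerosum.binomial above: the zeroSum model type
# becomes a parameter. A hedged usage sketch (type strings as defined by the
# zeroSum package; note that cv.learn.func() always forwards a weights
# argument, which zeroSumCVFit() must accept):
if(FALSE){
  cv.res <- cv.learn.func(x = x, y = y, cv.samples = 10
                          ,FUNC = fun.zerosum, type = "binomialZS")
}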