gugl58
4/4/2018 - 8:44 AM

cv.learn.func

cv.learn.func(): learns in cross-validation with a given learning function.
cv.step.info(): used inside cv.learn.func(), and can be re-run to overwrite a step's information for a different lambda of the model.
cv.learn.func_functions.R: prewritten machine-learning functions to pass as FUNC.
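
A minimal usage sketch (hypothetical data; it assumes the helper
inst.load.packages() used inside fun.glmnet below is available, or is replaced
by plain library() calls):

	set.seed(1)
	x <- matrix(rnorm(100 * 20), nrow = 100)   # 100 samples, 20 features
	y <- rbinom(100, 1, 0.5)
	cv.res <- cv.learn.func(x = x, y = y
							, cv.samples = 10      # ~10 test samples per CV step
							, FUNC = fun.glmnet    # one of the prewritten functions below
							, family = "binomial") # passed through ... to cv.glmnet()
	names(cv.res)  # "CVstep1" ... "CVstep10" "allSamples"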

cv.step.info <- function(full.x, full.y, single.cvstep, s="lambda.min"
						 ,prediction_exclude_covariatenames=NA){
	info <- single.cvstep # here I append all further information
	
	tmp.coef <- coef(single.cvstep, s=s)
	if(is.list(tmp.coef)){ # then (at least in glmnet) this was a multinomial fit
		tmp.coef.mat <- lapply(tmp.coef, as.matrix)
		coef.mat <- do.call(cbind, tmp.coef.mat)
		colnames(coef.mat) <- names(tmp.coef)
	}else{ # then we are in the 1-dimensional case
		coef.mat <- as.matrix(tmp.coef)
	}
	
	info$coef <- coef.mat
	nozero.coefmat <- coef.mat[rowSums(abs(coef.mat)) != 0, ,drop=FALSE]
	if(ncol(nozero.coefmat) == 1){ # then we are in the binomial case
		info$coefNozero <- nozero.coefmat[, 1]
		names(info$coefNozero) <- rownames(nozero.coefmat)
	}else{ # else it is multinomial.
		info$coefNozero <- nozero.coefmat
	}
	
	# if(all(is.na(featurenames)) || all(is.na(featuresymbols))){
	# 	info$coefNozero_sym <- NA
	# }else{
	# 	if(!exists("rownames.IDtoSymbol"))
	# 		stop("Load the function rownames.IDtoSymbol()")
	# 	nozero.coefmat_sym <- rownames.IDtoSymbol(coefmat = nozero.coefmat, featurenames = featurenames, featuresymbols = featuresymbols)
	# 	if(ncol(nozero.coefmat_sym) == 1){ # binomial case
	# 		info$coefNozero_sym <- nozero.coefmat_sym[, 1]
	# 		names(info$coefNozero_sym) <- rownames(nozero.coefmat_sym)
	# 	}else{ # multinomial
	# 		info$coefNozero_sym <- nozero.coefmat_sym
	# 	}
	# }
	
	# If confounding factors (e.g. IPS, age, etc.) were included during learning,
	# they should not enter the prediction score.
	# Therefore these (given) columns are set to zero before predicting.
	if(!all(is.na(prediction_exclude_covariatenames))) 
		full.x[, prediction_exclude_covariatenames] <- 0
	
	# full.x.intercept <- cbind(1, full.x) # the 1 is for the intercept
	# OHE.pred.train <- full.x.intercept[single.cvstep$traintestsample == "train", ] %*% coef.mat
	OHE.pred.train <- predict(single.cvstep
							  , newx = full.x[single.cvstep$traintestsample == "train", ]
							  , s = s, type="link")
	info$pred.train <- OHE.pred.train
	info$pred.train.prob <- predict(single.cvstep
									, newx = full.x[single.cvstep$traintestsample == "train", ]
									, s = s, type="response")
	
	# in 1 dimension, take the mean and SD of the score
	# in k dimensions, the coefficient matrix simply has multiple columns,
	# so the resulting n x k score matrix is centered per class (= per column)
	info$pred.train.mean <- apply(OHE.pred.train, 2, mean)
	info$pred.train.sd <- apply(OHE.pred.train, 2, sd)
	
	# per class: scaled = (score - mean)/SD
	OHE.pred.train.scaled <- sweep(OHE.pred.train, 2, info$pred.train.mean, FUN="-")
	OHE.pred.train.scaled <- sweep(OHE.pred.train.scaled, 2, info$pred.train.sd, FUN="/")
	info$pred.train.scaled <- OHE.pred.train.scaled
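	# (equivalent to scale(OHE.pred.train); the train mean/SD are kept so the
	#  identical transformation can be applied to the test scores further down)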
	
	if(any(single.cvstep$traintestsample == "test")){
		# OHE.pred.test <- full.x.intercept[single.cvstep$traintestsample == "test", ] %*% coef.mat
		OHE.pred.test <- predict(single.cvstep, newx = full.x[single.cvstep$traintestsample == "test", ], s = s, type="link")
		info$pred.test <- OHE.pred.test
		info$pred.test.prob <- predict(single.cvstep, newx = full.x[single.cvstep$traintestsample == "test", ], s = s, type="response")
		OHE.pred.test.scaled <- sweep(OHE.pred.test, 2, info$pred.train.mean, FUN="-")
		OHE.pred.test.scaled <- sweep(OHE.pred.test.scaled, 2, info$pred.train.sd, FUN="/")
		info$pred.test.scaled <- OHE.pred.test.scaled
	}
	return(info)
}
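
## If a different lambda should be inspected after the fact, cv.step.info() can
## simply be re-run on a finished CV step. A sketch, assuming cv.res/x/y from
## the usage example at the top:
# step1.1se <- cv.step.info(full.x = x, full.y = y
#                           , single.cvstep = cv.res[["CVstep1"]]
#                           , s = "lambda.1se")
# step1.1se$coefNozero   # nonzero coefficients at lambda.1se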
#' Cross-validated learning with a given learning function
#'
#' @param x 
#' data matrix; rows are samples, columns are features
#' @param y 
#' response: a vector of length nrow(x) or, e.g. for survival data, a matrix with nrow(x) rows
#' @param cv.samples 
#' approximate number of test samples per CV step; NA skips the CV and fits on all samples only
#' @param cv.seed 
#' optional seed(s) for the CV steps: either a single seed used for every step, or one seed per step
#' @param penalty.factor 
#' per-feature penalty factors: a single value or a vector of length ncol(x); NA gives every feature the same weight
#' @param weights 
#' per-sample weights: a single value or a vector of length nrow(x); NA gives every sample the same weight
#' @param verbose 
#' print progress for each CV step
#' @param plot.verbose 
#' plot each fitted CV step
#' @param saveRDS.fileDir 
#' optional directory; if given, the result list collected so far is saved as an .rds checkpoint after each CV step
#' @param foldid 
#' an optional positive vector of values identifying what fold each observation is in.
#' If you are in fold 2, all samples with 2 are in the test-group!
#' @param s 
#' the lambda at which coefficients and predictions are extracted, see cv.step.info()
#' @param FUNC 
#' the learning function, e.g. fun.glmnet; it is called with x, y, penalty.factor, weights and ...
#' @param ... 
#' further arguments passed on to FUNC
#'
#' @return
#' a named list with one entry per CV step plus a final "allSamples" fit on the full data;
#' each entry is the fitted model augmented by the information added in cv.step.info()
#' @export
#'
#' @examples
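#' # a hypothetical call, assuming an example x/y and fun.glmnet from
#' # cv.learn.func_functions.R:
#' \dontrun{
#' cv.res <- cv.learn.func(x = x, y = y, cv.samples = 10
#'                         , FUNC = fun.glmnet, family = "binomial")
#' }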
cv.learn.func <- function(x, y, cv.samples=10
						  ,cv.seed=NA
						  ,penalty.factor = NA
						  ,weights=NA
						  ,verbose=TRUE
						  ,plot.verbose=FALSE
						  ,saveRDS.fileDir=NA
						  ,foldid=NA
						  ,s="lambda.min"
						  ,FUNC
						  , ...){
	# make sure x and y have the same number of samples.
	# Usually length(y) must equal nrow(x),
	# but for survival data (y is a matrix) the second check is the relevant one.
	if(!(nrow(x) == length(y) || nrow(x) == nrow(y))){
		stop("x and y dimensions are wrong!")
	}
		
		
	current.learningFunction <- match.fun(FUNC)
	##### check input
	## check that each CV step leaves "enough" samples (arbitrary threshold)
	if(!is.na(cv.samples)){ # else only the fit on all samples is run
		if(cv.samples >= nrow(x)){
			stop("There must be at least two CV steps (cv.samples is greater than or equal to nrow(x))")
		}else if(cv.samples > (nrow(x)-50)){
			warning("Fewer than 50 training samples remain per CV step - this can be problematic.")
		}
	}
	## penalty factor: if NA, give the same weight to each coefficient
	if(all(is.na(penalty.factor))){
		tmp.penalty.factor <- rep(1, ncol(x))
	}else if(length(penalty.factor) == 1){
		tmp.penalty.factor <- rep(penalty.factor, ncol(x))
	}else{
		tmp.penalty.factor <- penalty.factor
	}
	if(length(tmp.penalty.factor) != ncol(x)){
		stop("penalty.factor: Supply either a single value or a vector of lenght ncol(x)")
	}
	
	## weights: if NA, give the same weight to each sample
	if(all(is.na(weights))){
		tmp.weights <- rep(1, nrow(x))
	}else if(length(weights) == 1){
		tmp.weights <- rep(weights, nrow(x))
	}else{
		tmp.weights <- weights
	}
	if(length(tmp.weights) != nrow(x)){
		stop("weights: Supply either a single value or a vector of lenght nrow(x)")
	}
	
	
	###### CV steps: randomize unless foldid was given
	k <- ceiling(nrow(x) / cv.samples)
	if(!all(is.na(foldid))){
		if(any(foldid <= 0)){
			stop("only positive foldid allowed")
		}
		index.k <- foldid
		k.steps <- c(unique(foldid), -1)
	}else if(is.na(cv.samples)){
		k.steps <- 1
		index.k <- rep(0, nrow(x))
	}else{
		rand.sampleindex <- sample(nrow(x))
		
		index.k <- rep(1:k, rep(cv.samples, k))
		index.k <- index.k[rand.sampleindex] # shuffle the indices
		k.steps <- 1:(k+1)
	}
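	# e.g. with nrow(x) = 25 and cv.samples = 10 (no foldid): k = 3 shuffled
	# folds of sizes 10/10/5, plus the extra step k+1, whose test set is
	# empty (= the fit on all samples).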

	
	###### CV steps seed: if given, either use the same seed for all steps (one value)
	######				  or supply as many seeds as there are steps
	cv.seeds_all <- k.steps
	if(!all(is.na(cv.seed))){
		if(length(cv.seed) == 1){
			cv.seeds_all[TRUE] <- cv.seed
		}else{
			if(length(cv.seed) != length(k.steps))
				stop(paste0("Supply either a single seed (for all cv.steps the same)"
							, ", OR a seed for every CVstep(", length(k.steps), ")"))
			cv.seeds_all <- cv.seed
		}
		names(cv.seeds_all) <- k.steps
	}else{
		cv.seeds_all <- NA
	}
	
	###### now the crossvalidation
	cv.result <- list()
	counter <- 1
	for(i in k.steps){
		i <- as.character(i)
		if(verbose){
			cat("Step ", counter, "/", length(k.steps), " started ----- ")
			cat(format(Sys.time()), " ----- ")
		}
		# y may be a matrix (e.g. survival data), so subset rows in that case
		subset.y <- function(keep) if(is.matrix(y)) y[keep, ,drop=FALSE] else y[keep]
		test.mat <- x[index.k == i, ,drop=FALSE]
		test.resp <- subset.y(index.k == i)
		train.mat <- x[index.k != i, ,drop=FALSE]
		train.resp <- subset.y(index.k != i)
		train.weights <- tmp.weights[index.k != i]
		
		# all features with a penalty factor of 0 are taken out of the Lp-norm
		# and therefore enter the model unpenalized.
		if(! all(is.na(cv.seeds_all)))
			set.seed(cv.seeds_all[i])
		
		cv.result[[i]] <- current.learningFunction(
			x = train.mat
			,y = train.resp
			,penalty.factor=tmp.penalty.factor
			,weights=train.weights
			,...)
		# training finished. Now add additional information for investigation of the CVstep
		cv.result[[i]]$traintestsample <- ifelse(index.k == i, "test", "train")
		cv.result[[i]]$feature.penalty <- tmp.penalty.factor
		cv.result[[i]]$train.mat <- train.mat
		cv.result[[i]]$train.resp <- train.resp
		cv.result[[i]]$test.mat <- test.mat
		cv.result[[i]]$test.resp <- test.resp
		cv.result[[i]]$train.weights <- train.weights
		
		## add further information (train/test scores/scaledScores, trainMean/SD)
		cv.result[[i]] <- cv.step.info(full.x = x
									   ,full.y = y
									   ,single.cvstep = cv.result[[i]]
									   ,s = s)
		if(plot.verbose){
			plot(cv.result[[i]], main=paste0("CV - Step ", counter, "/", length(k.steps)))
		}
		if(verbose){
			cat("finished\n")
		}	
		if(!is.na(saveRDS.fileDir)){
			if(!dir.exists(saveRDS.fileDir))
				dir.create(saveRDS.fileDir, recursive = TRUE)
			# zero-pad the step number so the files sort correctly
			stringspecification <- paste0("step_%0", nchar(length(k.steps)), "d.rds")
			tmp.filepath <- file.path(saveRDS.fileDir, sprintf(stringspecification, as.numeric(i)))
			saveRDS(cv.result, file=tmp.filepath) # checkpoint: all results collected so far
			cat(tmp.filepath, " saved\n")
		}
		counter <- counter+1
	}
	names(cv.result) <- paste0("CVstep", k.steps)
	if(any(names(cv.result)[length(cv.result)] %in% c("CVstep-1", paste0("CVstep", k+1))) ||
	   is.na(cv.samples))
		names(cv.result)[length(cv.result)] <- "allSamples"
	return(cv.result)
	
}
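
## To pool the held-out scores over all CV steps afterwards, something like the
## following works (a sketch; cv.res as in the usage example at the top;
## pred.test is the link-scale test score that cv.step.info() stores per step):
# cv.steps <- cv.res[names(cv.res) != "allSamples"]
# test.scores <- do.call(rbind, lapply(cv.steps, function(step) step$pred.test))
# test.resp <- unlist(lapply(cv.steps, function(step) step$test.resp))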
fun.glmnet <- function(x, y, alpha=1, penalty.factor, ...){
	
	inst.load.packages("survival", silent = TRUE)
	inst.load.packages("glmnet", silent = TRUE)
	model <- cv.glmnet(x = x
					   ,y = y
					   ,alpha=alpha
					   ,penalty.factor=penalty.factor
					   ,...)
	return(model)
}
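
## fun.glmnet() forwards everything in ... to cv.glmnet(), so family, nfolds,
## etc. are chosen at the call site; a hypothetical direct call:
# model <- fun.glmnet(x = x, y = y, family = "binomial"
#                     , penalty.factor = rep(1, ncol(x)))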


fun.zerosum.binomial <- function(x, y, penalty.factor, ...){
	if(!require(zeroSum)){
		remotes::install_github("rehbergT/zeroSum/zeroSum")
		library("zeroSum")
	}
	model <- zeroSumCVFit(x = x
						  ,y = y
						  ,penalty.factor=penalty.factor
						  , zeroSumWeights = penalty.factor
						  , type = "binomialZS"
						  ,...)
	return(model)
}
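
## Note: fun.zerosum.binomial() is the special case type = "binomialZS" of the
## more general fun.zerosum() added later at the bottom of this file.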


fun.glmnet.forcedShifts <- function(x, y, alpha=1, penalty.factor, n.times.sample
									, shiftfun, genewise.shiftfactor=.01, nlambda=150, ...){
	# samples are rows
	# It seems that this shift, and especially the multiple identical samples,
	# causes convergence problems.
	# Therefore one big shift per sample is combined with several smaller,
	# gene-wise shifts within each sample; maybe this resolves the convergence problem.
	new.x <- do.call(rbind, rep(list(x), n.times.sample))
	new.y <- rep(y, n.times.sample)
	
	penalty.zero.values <- new.x[, penalty.factor == 0, drop=FALSE]
	samplewise.random.shifts <- shiftfun(nrow(new.x))
	new.x.shifted <- sweep(new.x, 1, samplewise.random.shifts, FUN="+")
	
	genewise.samplewise.randomShifts <- shiftfun(nrow(new.x) * ncol(new.x))
	genewise.samplewise.randomShifts <- genewise.shiftfactor * genewise.samplewise.randomShifts
	dim(genewise.samplewise.randomShifts) <- c(nrow(new.x), ncol(new.x))
	new.x.shifted <- new.x.shifted + genewise.samplewise.randomShifts
	
	
	new.x.shifted[, penalty.factor == 0] <- penalty.zero.values
	
	
	inst.load.packages("survival", silent = TRUE)
	inst.load.packages("glmnet", silent = TRUE)
	
	glmnet.control(fdev = 0) # at the beginning of learning the fractional deviance changes
	# will probably be big; fdev = 0 disables the early stopping along the lambda path
	# ("always make lambda smaller"). Additionally, nlambda is increased.
	model <- cv.glmnet(x = new.x.shifted
					   ,y = new.y
					   ,alpha=alpha
					   ,penalty.factor=penalty.factor
					   ,nlambda=nlambda
					   ,...)
	glmnet.control(factory = TRUE) # reset glmnet options to factory defaults
	return(model)
}
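
## Usage sketch for fun.glmnet.forcedShifts(); shiftfun generates one random
## shift per (replicated) sample row. All values here are hypothetical:
# model <- fun.glmnet.forcedShifts(x = x, y = y
#                                  , penalty.factor = rep(1, ncol(x))
#                                  , n.times.sample = 3  # each sample enters 3 times
#                                  , shiftfun = function(n) runif(n, -1, 1)
#                                  , family = "binomial")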


## added 5.3.18
fun.zerosum <- function(x, y, penalty.factor=rep(1, ncol(x)), type="binomialZS", ...){
	if(!require(zeroSum)){
		remotes::install_github("rehbergT/zeroSum/zeroSum")
		library("zeroSum")
	}
	model <- zeroSumCVFit(x = x
						  ,y = y
						  ,penalty.factor=penalty.factor
						  , zeroSumWeights = penalty.factor
						  , type = type
						  ,...)
	return(model)
}
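
## Usage sketch: penalty.factor defaults to rep(1, ncol(x)), so every feature is
## penalized equally and gets weight 1 in the zero-sum constraint; type strings
## other than the "binomialZS" used above depend on the zeroSum package version:
# model <- fun.zerosum(x = x, y = y, type = "binomialZS")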