A new R project

10/22/2017 - 5:55 PM

Principal structure for a new project

main.R initialization.R parallelity.R

# This script initializes a parallel workflow. 
# The idea is to use cv.learn.func() for crossvalidation-methods. 
# If "enableParallelization" is true, the parallel version is used. 

# args are the command line parameters, extracted in "initialization.R"
# cip for cippool, clust for cluster
# You have to give the Rscript the argument "cip*", "clust*"
# AND the path to the pcs where R is prepared. 


# parallelity initialization
# Registers doFuture as the foreach backend and selects a future plan:
#  - if a command line argument matching "cip" (cippool) or "clust" (cluster) is
#    present, the argument right after it is read as a CSV file whose first column
#    is passed as the workers of the cluster plan (first level of the plan list);
#    4 local workers form the second level
#  - otherwise 4 local workers are used
# Reads the globals enableParallelization and args (args got from initialization.R).
if(enableParallelization == TRUE){
	library("doFuture")
	registerDoFuture()
	# You have to give the Rscript the argument "cip*", "clust*"
	# AND the path to the pcs where R is prepared.
	is.cluster.declared <- grepl("cip", args) | grepl("clust", args)
	if(any(is.cluster.declared)){
		# Use the first match only: the path that follows the flag may itself contain
		# "cip"/"clust" (e.g. ".../cippool/pcs.csv"), which would otherwise yield
		# several indices and hand read.csv() a vector containing NA.
		where.is.cluster.declared <- which(is.cluster.declared)[1]
		# Fail early with a readable message if the path is missing or wrong.
		if(is.na(args[where.is.cluster.declared + 1]) || !file.exists(args[where.is.cluster.declared + 1])){
			stop(paste0("Cluster argument \"", args[where.is.cluster.declared],
				"\" must be followed by the path to an existing CSV file listing the prepared pcs."))
		}
		prepared.pcs <- read.csv(args[where.is.cluster.declared + 1], stringsAsFactors = FALSE)[, 1]
		# NOTE(review): 'multiprocess' is deprecated in recent releases of the future
		# package - confirm against the installed version before upgrading.
		plan(list(
			tweak(cluster, workers = prepared.pcs),
			tweak(multiprocess, workers = 4L)
		))
	}else{
		plan(multiprocess, workers = 4L) # 4 seems too much for my PC at home.. (but just for the geneRatios)
	}
	# Allow globals of up to 1000 * 1024^2 bytes (1000 MiB) to be exported to a future.
	options(future.globals.maxSize= 1000 * 1024 ^ 2)
}


# Load the default cv.learn.func() from the functions directory.
source.function("cv.learn.func", A_FUNCD)

# When both switches are on, load the parallel variant, bind it to the name
# cv.learn.func and drop the now redundant original binding.
if (enableParallelization && cv.learn.func_inParallel) {
	source.function("cv.learn.func_parallel", A_FUNCD)
	cv.learn.func <- match.fun("cv.learn.func_parallel")
	rm(list = "cv.learn.func_parallel")
}

main.R

# This script calls all other necessary scripts
# Order matters: each flag must be assigned before the script that reads it is sourced.

# Read by initialization.R; TRUE wipes the workspace and the result directories there.
cleanrestart <- FALSE
source("Scripts/initialization.R")
# Read by parallelity.R; TRUE loads doFuture and sets up a future plan.
enableParallelization <- FALSE
cv.learn.func_inParallel <- FALSE  	# if TRUE, the function cv.learn.func() is overwritten 
									# with a parallel version.
									# (only has an effect if enableParallelization is TRUE as well)
source("Scripts/parallelity.R")

# catt is created in initialization.R via define.catt()
catt(" ----- Initialization done ----- ")

# Pipeline stages. Each stage
#   1) points A_PDF.DIR at its own sub-directory of A_PDF.DIRmain,
#   2) creates that directory (showWarnings = FALSE, as in initialization.R, so that
#      reruns do not warn about directories that already exist),
#   3) runs its subscript through source.addtime(), which returns the updated
#      A_RUNTIME object.
# NOTE(review): source.addtime() comes from outside this file; it presumably sources
# the script and records its runtime - confirm.
# NOTE(review): the script paths ending in "01_" look like template placeholders
# that still need the real file names - confirm.

A_PDF.DIR <- file.path(A_PDF.DIRmain, "01_dataloading")
dir.create(A_PDF.DIR, showWarnings = FALSE, recursive = TRUE)
A_RUNTIME <- source.addtime("Scripts/subscripts/01_dataloading/01_loadPhenodata.R", runtimecobject = A_RUNTIME)

A_PDF.DIR <- file.path(A_PDF.DIRmain, "02_RawDataInspection")
dir.create(A_PDF.DIR, showWarnings = FALSE, recursive = TRUE)
A_RUNTIME <- source.addtime("Scripts/subscripts/02_RawDataInspection/01_", runtimecobject = A_RUNTIME)

A_PDF.DIR <- file.path(A_PDF.DIRmain, "03_qualityControl")
dir.create(A_PDF.DIR, showWarnings = FALSE, recursive = TRUE)
A_RUNTIME <- source.addtime("Scripts/subscripts/03_qualityControl/01_", runtimecobject = A_RUNTIME)

A_PDF.DIR <- file.path(A_PDF.DIRmain, "04_learning", "01_method_1")
dir.create(A_PDF.DIR, showWarnings = FALSE, recursive = TRUE)
A_RUNTIME <- source.addtime("Scripts/subscripts/04_learning/01_method_1", runtimecobject = A_RUNTIME)



####### This comes at the end of the main-script
# Summarise the collected runtimes and write them to runtime.txt in the working directory.
# NOTE(review): summarizeRT() is defined outside this file; its return format is assumed
# to be something write() can handle - confirm.
A_RUNTIME <- summarizeRT(A_RUNTIME)
write(A_RUNTIME, "runtime.txt")

# Persist the complete workspace as finished.rda inside A_RDA.WORKSPACE.
save.image(file=file.path(A_RDA.WORKSPACE, "finished.rda"))
# NOTE(review): loading the image right after saving it looks redundant within a single
# run; presumably kept as the entry point for resuming from the saved state - confirm.
load(file.path(A_RDA.WORKSPACE, "finished.rda"))
# Record R version and package versions in the output for reproducibility.
print(sessionInfo())

initialization.R

# The following variables can be modified but should be correct if the data was downloaded
# correctly and one has proper rights in the directory
# 	A_RDA.DIR		- directory where all .rda-files are saved	(except .rda's which are distributed)
# 	A_PDF.DIRmain	- main PDF-directory (all pdfs are stored here)
# 	A_DATA.DIR	- all downloaded files are saved here
# 	A_SUBD	- directory which contains all subscripts (should not be modified usually)
# 	A_FUNCD	- directory holding functions used by the subscripts or main.R (should not be modified usually)

# If you want a clean restart, remove all RDA* and PDF* directories
# If you want a completely clean restart, 
# 	1) remove all RDA* and PDF* directories
# 	2) delete all (used) libraries, they will be reinstalled during the script

# global variables are declared as "A_VARIABLENAME"
# "A_" to sort them on top by Rstudio's environment,  and all in upper case.


# Optional workspace wipe. rm(list = ls()) also removes the flag itself,
# so it is re-created immediately afterwards.
if (cleanrestart) { # not tested with commandline arguments(22.5.18)
	rm(list = ls())
	cleanrestart <- TRUE
}

# Command line arguments after the script name; guarantee that args[1] exists
# so the comparison below never sees NA.
args <- commandArgs(trailingOnly = TRUE)
if (is.na(args[1])) {
	args[1] <- FALSE
}

##### necessary libraries install commands (tested under ubuntu)
# sudo apt-get install libcurl4-openssl-dev libssl-dev
# sudo apt-get install udunits-bin		# probably (not tested) the necessary for udunits2-R-package

# cleanrestart <- FALSE
# A clean restart (flag or first command line argument) deletes all generated
# results, the operational data and the log files.
if (cleanrestart || args[1] == "cleanrestart") {
	unlink(
		c("Results/PDF", "Results/RDA", "Data/OperationalData/*"),
		recursive = TRUE
	)
	unlink(c("runtime.txt", "outMAIN.txt"))
}
# load basic scripts - package
# runallscripts <- TRUE
# Install-if-missing bootstrap for devtools and basicscriptsGG.
# requireNamespace() only checks availability (no attach, no warning when the
# package is missing); the packages are attached explicitly with library(), which
# stops with an error if an installation did not succeed.
if (!requireNamespace("devtools", quietly = TRUE)) {
	# sudo apt-get install libcurl4-openssl-dev libssl-dev
	# HTTPS mirror, so the package is not downloaded over an unencrypted connection
	install.packages("devtools", repos = "https://cran.rstudio.com/")
}
library(devtools)

if (!requireNamespace("basicscriptsGG", quietly = TRUE)) {
	# install_github() is provided by devtools, attached above
	install_github("gugl58/basicscriptsGG")
}
library(basicscriptsGG)





###### setting global variables ######
# Output directories; created on demand, existing directories are left untouched.
A_RDA.DIR <- "Results/RDA"			# all .rda are saved
A_RDA.WORKSPACE <- file.path(A_RDA.DIR, "workspaces")	# saved workspaces
A_PDF.DIRmain <- "Results/PDF"				# all pdf are saved
dir.create(A_RDA.DIR, showWarnings = FALSE, recursive = TRUE)
dir.create(A_RDA.WORKSPACE, showWarnings = FALSE, recursive = TRUE)
dir.create(A_PDF.DIRmain, showWarnings = FALSE, recursive = TRUE)


# Input directories; these must already exist, otherwise the script aborts.
# dir.exists() is used so that a regular file with the same name is rejected too.
A_SUBD <- "Scripts/subscripts"	# directory where subscripts are located
if (!dir.exists(A_SUBD)) {
	stop(paste0("Subscripts-Directory \"", A_SUBD, "\" does not exist."))
}
A_FUNCD <- paste0(A_SUBD, "/#functions")	# directory where the functions used by the subscripts are located
if (!dir.exists(A_FUNCD)) {
	stop(paste0("Functions-Directory \"", A_FUNCD, "\" does not exist."))
}


###### initialising runtimeC-class variable ######
# NOTE(review): define.catt() and RuntimeC() are not defined in this file; presumably
# they are provided by basicscriptsGG, attached above - confirm.
# catt is the printing helper that main.R calls for status messages.
catt <- define.catt()
A_RUNTIME <- RuntimeC()		# initialize runtime-array
A_scriptname <- "\"Skript is not called from source.addtime(), therefore naming is not correct\"" # for testing purposes.
# subscripts print their name usually, but if you source a subscript directly,
# the variable holding its name is not set. Therefore, this variable is set as backup


###### options 
# warn = 1: print each warning immediately instead of collecting them until the
# top-level call returns
options(warn=1) #warnings printed as they occur



##### Install and load all packages later used. (This is redundant, but now if an error occurs in installing it occurs in the beginning)
#inst.load.packages("GEOquery", inst.bioclite = TRUE)
#inst.load.packages("tibble")
#inst.load.packages("plyr")
#inst.load.packages("affy", inst.bioclite = TRUE)
# install_github("rehbergT/zeroSum/zeroSum")
#library("zeroSum")
#inst.load.packages("hgu133plus2.db", inst.bioclite = TRUE) # Genenames to symbols
#inst.load.packages("reshape2") 
#inst.load.packages("ggplot2") 
#inst.load.packages("tidyr") 
#inst.load.packages("ggfittext")

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

A new R project