# R parallel (fork-based where available) processing of a grouped data.table
# use a bash file to watch the following log file:
# tail -f currentStatus.log
# wait for input
# read -n1 -r -p
# Append a timestamped line to the progress log (watch with `tail -f`).
#
# text:   Message to record; a "YYYY-MM-DD HH:MM:SS: " prefix is added.
# append: Append to the existing log (TRUE, default) or truncate it first.
# file:   Log file path; defaults to the project's status log so existing
#         callers are unaffected.
my.log <- function(text, append = TRUE, file = "scoringTemp/currentStatus.log") {
  # Build the full line first so the timestamp and message are written
  # in a single cat() call (less interleaving when workers log concurrently).
  line <- paste0(Sys.time(), ": ", text, "\n")
  cat(line, file = file, append = append)
}
# Create a parallel cluster sized to the machine.
#
# On Linux a fork cluster is used (workers inherit the current session, so
# globals like `dataTable` and `my.log` are available without export);
# elsewhere a PSOCK cluster is created, whose workers start empty.
#
# cores: Number of worker processes; defaults to all detected cores, which
#        preserves the original zero-argument behavior.
# Returns the cluster object; the caller must stop it with stopCluster().
createCluster <- function(cores = parallel::detectCores()) {
  # Look up the OS by name, not by position -- Sys.info()[1] relies on the
  # element order of the vector, [["sysname"]] does not.
  sysname <- Sys.info()[["sysname"]]
  if (sysname == "Linux") {
    # outfile = "" sends worker output to the master's stdout for debugging.
    parallel::makeForkCluster(nnodes = cores, outfile = "")
  } else {
    parallel::makeCluster(cores)
  }
}
cl <- createCluster()
# append = FALSE truncates the log so each run starts a fresh status file.
my.log("Starting to compare...", append = FALSE)
dsNames <- unique(dataTable$dsName)
set.seed(2017)
# Shuffle the dataset names before splitting so chunks get a random mix of
# sizes, then partition them into one sub-list per cluster node.
# NOTE(review): the "4 elements" comment held for a 4-core box; the list
# actually has length(cl) elements -- one per worker.
splitted <- clusterSplit(cl, sample(dsNames)) # one element per cluster node
# Materialize one data.table per worker, filtered to that worker's datasets.
dataTableParts <- lapply(splitted, function(dsNamesPart) {
  dataTable[dsName %in% dsNamesPart]
})
# Score each part in parallel; expected to yield one score data.table per part.
# NOTE(review): on a non-fork (PSOCK) cluster neither my.log nor dataTable is
# exported to the workers, so this body only works under the Linux fork path
# as written -- confirm, or clusterExport() what the workers need.
# NOTE(review): the placeholder body returns my.log()'s invisible NULL; the
# real scoring result must be the last expression for rbindlist() below.
scoreList <- parLapply(cl, dataTableParts, function(dataTablePart) {
  # process dataTablePart
  my.log("Processing bla...")
})
stopCluster(cl)
# Stack the per-worker score tables into a single data.table.
scores <- rbindlist(scoreList)