ofrendo
3/22/2017 - 1:22 PM

R parallel with fork processing of grouped data.table

library(parallel)
library(data.table)

# Use a bash script to watch the following log file while the scoring runs:
#   tail -f scoringTemp/currentStatus.log
# and to wait for a key press before continuing:
#   read -n1 -r -p
# Append a timestamped message to the status log
# (the scoringTemp/ directory must already exist).
my.log <- function(text, append=TRUE) {
  text <- paste0(Sys.time(), ": ", text, "\n")
  cat(text, file="scoringTemp/currentStatus.log", append=append)
}
# Create a cluster with one node per core: a cheap fork cluster on Linux
# (workers inherit the master session via copy-on-write), a PSOCK cluster
# elsewhere. outfile="" sends worker output to the master console.
createCluster <- function() {
  os <- Sys.info()[["sysname"]]
  cores <- parallel::detectCores()
  if (os == "Linux") {
    cl <- parallel::makeForkCluster(nnodes=cores, outfile="")
  } else {
    cl <- parallel::makeCluster(cores)
  }
  cl
}

cl <- createCluster()
my.log("Starting to compare...", append=FALSE)  # start a fresh log for this run

# dataTable is assumed to already exist: a data.table with a dsName column
# identifying the groups to be scored.
dsNames <- unique(dataTable$dsName)
set.seed(2017)
# Shuffle the group names and split them into one chunk per cluster node,
# so each node gets a roughly equal share of the groups.
splitted <- clusterSplit(cl, sample(dsNames))
dataTableParts <- lapply(splitted, function(dsNamesPart) {
  dataTable[dsName %in% dsNamesPart]
})
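
# Note (assumption, not part of the original gist): on Linux the forked
# workers inherit my.log and any packages loaded in the master session, so
# nothing extra is needed. With the PSOCK cluster created on other platforms,
# workers start with an empty workspace, so the logging helper and the
# data.table package would have to be shipped to them before parLapply runs.
# A minimal sketch of that extra step:
if (Sys.info()[["sysname"]] != "Linux") {
  clusterExport(cl, "my.log")                        # make the logging helper visible on the workers
  invisible(clusterEvalQ(cl, library(data.table)))   # load data.table on each worker
}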

# Score each part on its own cluster node; returns one score data.table per part.
scoreList <- parLapply(cl, dataTableParts, function(dataTablePart) {
  # process dataTablePart here and return a score data.table for this part
  my.log("Processing bla...")
})
stopCluster(cl)                 # always shut the workers down again
scores <- rbindlist(scoreList)  # combine the per-part score tables into one data.table
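
# Hypothetical wrap-up (not in the original gist): log completion so the
# tail -f watcher shows that the run has finished and how much was scored.
my.log(paste0("Done: combined ", length(scoreList), " parts into ",
              nrow(scores), " score rows."))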