myrddian
9/28/2018 - 7:06 AM

optimiser.R

model.analysis <-function(test_res, predict_rsp) {


  col_names <- c("TruePositive","FalsePositive",
                 "FalseNegative","Accuracy","Precision","Recall","F1")
  ret_val <- data.frame(matrix(ncol = length(col_names), nrow = 0))
  colnames(ret_val) <- col_names
  ret_val[nrow(ret_val) + 1,] = c(tp,fp,fn,accuracy,precision,recall,f1)
  return(ret_val)
}


#Penalty Calculator
model.penalty_calc <- function(thrs, penal_threshold)
{
  #return zero
  if(penal_threshold == FALSE) {
    return(0)
  }
  
  #Otherwise calculate and return
  n_p <- (thrs*0.5) - 0.25
  
  #Clip return values to within -0.25 and 0.25
  if(n_p > 0.25)
  {
    return(0.25)
  }
  if(n_p < -0.25)
  {
    return(-0.25)
  }
  #Return the penalty
  return(n_p)
}

model.acc_score <- function(analysis_res, penalty_cal)
{
  return(as.numeric(analysis_res$Accuracy) +  penalty_cal )
}

model.actuals_score <- function(analysis_res, penalty_cal)
{
  return(as.numeric(analysis_res$TruePositive) - as.numeric(analysis_res$FalsePositive) - as.numeric(analysis_res$FalseNegative) )
}

model.comp_score <- function(analysis_res, penalty_cal)
{
  return(as.numeric(analysis_res$F1) + as.numeric(analysis_res$Accuracy) +  penalty_cal )
}

#Scoring function to determine model fitness
model.score <- function(analysis_res, penalty_cal)
{
  return(as.numeric(analysis_res$F1) +  penalty_cal )
}

#Generic data massaging function in case its needed
model.correct_factor <- function(inputSet)
{
  inputSet$V21  <- factor(inputSet$V21)
  return(inputSet)
}

model.print_status <- function(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
{
  #Print debug information, don't do this on JUPYTER
  if(talk)
  {
    if(progress_only)
    {
      print(paste("Test No: ", steps))
    }
    else
    {
      print("--------------------------------------")
      print(paste("Test No: ", steps))
      print(paste("Best Found at Step:", best_step))
      print("--------------------------------------")
      if(min_talk == FALSE)
      {
        print(paste("Formula : ", formula))
        print(paste("Best Formula: ", best_form))
        print("--------------------------------------")
        print(paste("Score : ", score))
        print(paste("Best Score: ", best_score))
        print("--------------------------------------")
        print(paste("Threshold: ", i))
        print(paste("Best Thrs:", best_thrs))
        print("--------------------------------------")
        print(paste("Threshold Penalty : ", thr_pn))
        print(paste("Best Thrs penalty :", best_penalty))
        print("--------------------------------------")
      }
      else
      {
        print("=====================================================")
        print("Current Matrix: ")
        print(analysis_res)
        print("Best Analysis Matrix: ")
        print(best_analysis)
        print("=====================================================")
      }
    }
  }
}

model.multi_pick <- function(var_size, var_map, vars, target_var){
  rng_pic_size = sample(1:var_size, 1)
  rng_pic_sel = sample(1:var_size, rng_pic_size)
  
  for(rng_pic in rng_pic_sel)
  {
    if(var_map[rng_pic] == TRUE)
    {
      var_map[rng_pic] = FALSE
    }
    else
    {
      var_map[rng_pic] = TRUE
    }
    #Prevent a null formula
    if(any(var_map) == FALSE)
    {
      var_map[rng_pic] = TRUE
    }
  }
  #Generate new formula
  new_list <- c()
  for(i in 1:var_size)
  {
    if(var_map[i]){
      new_list <- c(new_list,vars[i])
    }
  }
  new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
  formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
  return(formula)
}

model.single_pick <- function(var_size, var_map, vars, target_var){
  
  #Pick variable to toggle
  rng_pic = sample(1:var_size, 1)
  #Toggle the selection
  if(var_map[rng_pic] == TRUE)
  {
    var_map[rng_pic] = FALSE
  }
  else
  {
    var_map[rng_pic] = TRUE
  }
  #Prevent a null formula
  if(any(var_map) == FALSE)
  {
    var_map[rng_pic] = TRUE
  }
  
  #Generate new formula
  new_list <- c()
  for(i in 1:var_size)
  {
    if(var_map[i]){
      new_list <- c(new_list,vars[i])
    }
  }
  new_list <- new_list[!is.na(new_list)] #Remove empty elements in formula
  formula <- paste(target_var, paste("~", paste(new_list, collapse="+"))) #Collapse to something glm can use
  return(formula)
}

#The stochastic hill climbing optimiser
model.optimiser <- function(trainSet, #Training Set
                         testSet,     #Test Set
                         testSetRsp = testSet$V21, #Set the test rsp
                         iter=10000,  #No of iterations to optimise
                         target_var="V21",  #target variable
                         talk=TRUE,  #print optimising information as it runs
                         min_talk = FALSE, #Only put the analysis matrix results
                         progress_only = FALSE, #Only put a progress/percent complete
                         penal_threshold = TRUE,   #panalise low threshold values
                         no_threshold = FALSE, # Dont move the threshold value
                         def_threshold = 0.5, #default threshold
                         thrs_low_min = 0, #Lowest possible threshold the optimiser will take
                         penalty_calc = model.penalty_calc, #Penalty Calculator
                         calc_score = model.score, #Score calculator
                         model_analysis,  #functions to analyse model results
                         run_model, #Model to execute for evaluation
                         data_massager = model.correct_factor,
                         monte_carlo = FALSE , #Do a monte-carlo style optimisation instead 
                         multi_pick = FALSE, #Pick multiple variables to mutate)   
                         ret_conf = FALSE )  #Calculate and return the confusion matrix in the end
{
  #Load lib
  library(caret) # for confusion matrix and other stats
  library(e1071) # for naiveBayes
  library(rpart)
  
  #Correct Factor
  testSet  <- data_massager(testSet)
  trainSet <- data_massager(trainSet)
  
  #Initialise optimiser
  var_size = length(names(trainSet)) -1
  vars  = names(trainSet)
  var_map = logical(length = var_size)
  var_map[1] = TRUE
  best_formula <- var_map
  best_thrs <- 0.5
  true_count = 1
  best_tp <- 0 
  best_f1  <- 0 
  best_score <- 0
  best_penalty <- 0
  best_accuracy <- 0
  best_step <- 0
  best_form <- ""
  best_analysis <- data.frame(matrix(ncol = 0, nrow = 0))
  #Start the hillclimb
  for(steps in 1:iter)
  {
    #Copy the best formula on the current one to mutate
    if(monte_carlo == FALSE )
    {
      var_map <- best_formula
    }
    else
    {
      #Start a new
      var_map = logical(length = var_size)
    }
    
    if(multi_pick)
    {
      formula <- model.multi_pick(var_size, var_map, vars, target_var) 
    }
    else
    {
      formula <- model.single_pick(var_size, var_map, vars, target_var) 
    }
    
    #Generate a random threshold value b/w 0 and 1
    if(no_threshold == FALSE)
    {
      i <- rnorm(1,mean=0.5,sd=0.25)
      if( i < thrs_low_min ) {i <- def_threshold} #Prevent less than 0
      if( i > 1 ) {i <- def_threshold} #Prevent a greater than 1 
    }
    else
    {
      i = def_threshold
    }

    #Run the model
    tst_res <- run_model(formula,trainSet,testSet,i) #Analyse its cost
    analysis_res <- model_analysis(tst_res, testSetRsp)  #Generate analysis data

    #Score
    if(is.nan(as.numeric(analysis_res$F1)) == FALSE)  #If the it has NAN in the confusion matrix reject the mutation
    {
      #Prefer results with a higher threshold, the higher the better, 
      #Move the optimiser to a higher F1 score and higher threshold for the logistic function
      thr_pn <- penalty_calc(i, penal_threshold) #Calculate a penalty for using a lower threshold
      score <-  calc_score(analysis_res, thr_pn )   #Calculate score
      
      model.print_status(talk, min_talk, formula, best_form,steps,best_step,score,best_score, i,best_thrs, thr_pn, best_penalty, analysis_res, best_analysis, progress_only)
     
      if(score > best_score)   #If its score is better accept the mutation as the best so far
      {
        best_formula <- var_map
        best_thrs <- i
        best_tp <- as.numeric(analysis_res$TruePositive)
        best_f1 <- as.numeric(analysis_res$F1)
        best_score <- score
        best_form <- formula
        best_step <- steps
        best_accuracy <- as.numeric(analysis_res$Accuracy)
        best_penalty <- thr_pn
        best_analysis <- analysis_res
        best_res <- tst_res
      }
    }
  }
  print("=====================================================")
  print("=====================================================")
  print("Final Result: ")
  print(paste("Best Score:", best_score))
  print(paste("Best Thrs:", best_thrs))
  print(paste("Best TP: ", best_tp))
  print(paste("Best F1 Score: ", best_f1))
  print(paste("Best Formula: ", best_form))
  print(paste("Best Found at Step:", best_step))
  print("Best Analysis Matrix: ")
  print(best_analysis)
  print("=====================================================")
  print("=====================================================")
  
  if(ret_conf){
    tst_rsp <- as.factor(best_res)
    prd_rsp <- as.factor(testSetRsp)
    return(confusionMatrix(data=tst_rsp, reference=prd_rsp))
  }
}