nievergeltlab
7/13/2017 - 8:49 PM

How competing Cox models of risk of relapse can be compared using ROC analysis. This analysis uses an implementation of the permutation meth

How competing Cox models of risk of relapse can be compared using ROC analysis. This analysis uses an implementation of the permutation methods suggested for comparing competing models

################################### 
### Load R libraries

#Contains survival analysis function
 library(survival)
#Functions to build ROC curves from Cox models
 library(risksetROC)


################################### 
### Data loading

#Function to quickly read .csv files
rcsv <- function(...)
	{
		read.csv(..., header=T, stringsAsFactors=FALSE,na.strings=c("NA", "#N/A", "#NULL!"))
	}

#Read the .csv files, of subject data and survival times
 covs <- rcsv('miestimation/pgbd_mi_data_imputed-stata12.csv') #All predictor variables. Variables with imputations have 35 entries each.
 surv <- rcsv('pgbdsurvivalroc.csv')
 
#Merge data based on ID. 
#Note: all.y makes sure that all subjects in the surv dataframe are included here, whether or not they have data in the covs datasheet
#Note: I set it so that if two variables have the same name, the variables from the covs sheet will have the suffix _covs 
 dat <- merge(covs,surv,by="subject_id",all.y=TRUE,suffixes=c("_covs",""))


################################### 
### Data handling (Make it usable for the analysis)

##Per talking with Peter, we just want to take an average of the multiply imputed variables to use as our data

#This is a list of all imputed variables. 
#The scan function takes user input and saves it as a vector
#This is the lazy way of doing it because you can just copy/paste lists from .csv files
imputed_vars_cat <- scan(what="character",sep=",")
childhoodphysabuse_binary,chronicaffdis_bin,comorbidpersonality2,functmorbid_bin,migraines2,mixedepisodes_collapsed,suicidethobehav_collapsed

imputed_vars_cont <- scan(what="character",sep=",")
anxietybaseline_continuous,negative_leq,sheehanimpairment,sheehantotal,suicidethobehav_collapsed


#This loop will go through the list of imputed variables
#and for each subject, take the median value
#iv is an incrementor variable

for (iv in imputed_vars_cont)
{
 #For an imputed variable, find the column number of all imputations
 #This is done by grepping for the variable in the column names of the data
 #note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result

 var_location <- grep(paste("_",iv,sep=""),names(dat)) 
 ivdat <- dat[,var_location]

 #apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
 #Will return a vector of length
  imputed_collapsed <- apply(ivdat,1,median) 
 #add the vector of 
  dat <- cbind(dat,imputed_collapsed)
  names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}

Mode <- function(x) {
  ux <- unique(x)
  ux[which.max(tabulate(match(x, ux)))]
}

for (iv in imputed_vars_cat)
{
 #For an imputed variable, find the column number of all imputations
 #This is done by grepping for the variable in the column names of the data
 #note: I prepend a _ before iv, so the original, unimputed data does not come up in the grep result

 var_location <- grep(paste("_",iv,sep=""),names(dat)) 
 ivdat <- dat[,var_location]

 #apply: take the input data (ivdat), and performs a function (in this case median) on every row (as indicated by the 1 value)
 #Will return a vector of length
  imputed_collapsed <- apply(ivdat,1,Mode) 
 #add the vector of 
  dat <- cbind(dat,imputed_collapsed)
  names(dat)[dim(dat)[2]] <- paste(iv,"i",sep="_")
}



#subset data to relevant covariates only

dat_sub <- subset(dat, select=c(time,event,age,sex,racecat,li_status_collapsed,anxietybaseline_continuous_i,
					chronicaffdis_bin_i,migraines2_i, suicidethobehav_collapsed_i, 
					mixedepisodes_collapsed_i,comorbidpersonality2_i , sheehantotal_i, 
					negative_leq_i))


################################### 
### Permutation and analysis code 

#Note: As of Feb 6, 2017, the model reported in the paper is the 730 day, full model
### turn off resampling the phenotype to get the baseline analysis, which should be  0.75240930 0.67311803 0.07929127 if the seed is set to 17 


do_not_permute = 0 #Set to 1 if you want to turn off permutations and run the default analysis 
full_model = 1 # Set to 1 to include the sheehan and others in the full model (limited set of clinical predictors or a really expanded set of clinical predictors)
null_baseline = 1 # Set to 1 if you want to compare to a model with aboslutely no covariates 

#Set number of permutations
nperm=10000

if (do_not_permute == 1)
{
 nperm= 1
}

#Makes a matrix that contains number of repetitions dimension, ROC for full, reduced, and difference models, for 365 and 730 days
 roc_outputs <- data.frame(matrix(data=NA,nrow=nperm,ncol=6))
 names(roc_outputs) <- c("AUC_f_365","AUC_r_365","AUC_d_365","AUC_f_730","AUC_r_730","AUC_d_730")

#Print a status update for the first or every 1000th permutation
for (permnum in 1:nperm)
{

 if(permnum %% 1000 == 0 | permnum == 1)
 {
  print(paste("permutation run", permnum))
 }

##Permute the phenotype

#Set the seed for random sampling
 set.seed(permnum)

#There are N = dim(dat_sub)[1] rows in the data. Sampling numbers 1:N without 
#replacement N times will give us a randomly ordered list of row numbers
 shuffled <- sample(1:dim(dat_sub)[1],dim(dat_sub)[1],replace=F)

#Make a permuted dataframe by loading the data in the shuffled order for the 
#phenotype column (columns 1 and 2) and in the typical order for all other covariates

 dat_sub_shuffle <- data.frame(dat_sub[shuffled,c(1:2)],dat_sub[,-c(1:2)]) #For use in permutation analysis
 ##Open issue: does it matter or not if I don't reset the row names?


#If you said to not permute, this uses the original data frame instead of the shuffled one
 if (do_not_permute == 1)
 {
  dat_sub_shuffle <- dat_sub 
 }


#Randomly reorder the data, so that training/test sets contain different observations for each permutation

 set.seed(permnum) #Note: set to 17 for the un-permuted data to recreate original results set.seed=17

 if (do_not_permute == 1)
 {
  set.seed(17)
 }
 new_order <- sample(x=1:dim(dat_sub)[1],    size=dim(dat_sub)[1],   replace=F) #Defines the new ordering
 dat_r <- dat_sub_shuffle[new_order,]  #Accomplishes the reordering
 
#Determine which subjects have a survival event, and which do not
#the which function returns the row number of these subjects
 cases     <- which(dat_r$event == 1)
 controls  <- which(dat_r$event == 0)

#Make separate dataframes for cases and controls
#I do this because it's an easy way to insure that 
#the training/test sets contain the correct proportion of cases
 case_set <- dat_r[cases,]
 control_set <- dat_r[controls,]
 
#Repeat the sequence 1...10
 numberseq <- rep(1:10,dim(dat_r)[1]) 

#Cut the sequence down to the dimension of the datasets
#Now every subject will have been assigned a number from 1 to 10
#We will filter on this number to build training/test sets

 control_numbers <- numberseq[1:length(controls)] #Give each control subject a set number
 case_numbers <- numberseq[1:length(cases)] #Give each case subject a set number

###Perform cox regression on each subset of data

#Make a matrix to hold the predicted AUCs
#Make it into a data frame so variables can be named easily

 pred_base_lps <- data.frame(matrix(data=NA,nrow=dim(dat_r)[1],ncol=2))
 names(pred_base_lps) <- c("red","full")
 row.names(pred_base_lps) <- paste("REM",1:dim(pred_base_lps)[1],sep="") # Have to remove the existing row names because we're going to merge on row names, which have been shuffled.

 #Formula for basic model
 if (null_baseline == 0)
 {
  red_model_formula <- formula(Surv(time,event) ~  age + sex + as.factor(racecat) + as.factor(li_status_collapsed))
 }

 if (null_baseline == 1)
 {
  red_model_formula <- formula(Surv(time,event) ~  1)
 }

 #Formula for model with clinical predictors
 #If choosing to include sheehan, negative leq, and chronicity
 if (full_model ==1)
 {
   model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i + 
							 migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +  
							comorbidpersonality2_i + sheehantotal_i + negative_leq_i + chronicaffdis_bin_i)
 }
 #If choosing to NOT include sheehan, negative leq, and chronicity
 if (full_model ==0)
 {
   model_formula <- formula(Surv(time,event) ~ age + sex + as.factor(racecat) + as.factor(li_status_collapsed) + anxietybaseline_continuous_i + 
							 migraines2_i + as.factor(suicidethobehav_collapsed_i) + mixedepisodes_collapsed_i +  
							comorbidpersonality2_i )
 }

 incstart <- 1 # Need to increment this matrix of AUCs based on the dimension of the test sets, which are not uniformly sized
 
 for (i in 1:10)
 {
    train_set <- rbind(case_set[case_numbers != i,], control_set[control_numbers != i,]) #Get all cases and controls who aren't in the ith set. Combine the set of cases and controls. Regression will be done in these subjects
    test_set  <- rbind(case_set[case_numbers == i,], control_set[control_numbers == i,]) #Get the ith set to make the testing set
    coxmod_base   <- coxph(red_model_formula,data=train_set) #Fit the reduced cox model to the training set
    coxmod_full   <- coxph(model_formula, data= train_set) #Fit the full cox model to the training set 
    pred_base <- predict(coxmod_base,test_set,"lp") #obtain  linear predictors for reduced model on the test set
    pred_full <- predict(coxmod_full,test_set,"lp") #obtain  linear predictors for full model on the test set
    inc <- length(pred_base) 
    pred_base_lps[incstart:(incstart+inc-1),]$red <- pred_base
    pred_base_lps[incstart:(incstart+inc-1),]$full <- pred_full 
    row.names(pred_base_lps)[incstart:(incstart+inc-1)] <- names(pred_full) #Set teh row names to match the existing data row names
    incstart <- incstart + inc  #Increment the counter   
  }

#Based on the row names, merge back with the original data
 dmerge <- merge(pred_base_lps,dat_r,by="row.names")

#Now take the ROC from both models
#Do for each cutoff date 

 t1r365 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=365)
 t1f365 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=365)
 rocd365 <- t1f365$AUC-t1r365$AUC

 t1r730 <- CoxWeights(marker=dmerge$red,Stime = dmerge$time, status=dmerge$event, predict.time=730)
 t1f730 <- CoxWeights(marker=dmerge$full,Stime = dmerge$time, status=dmerge$event, predict.time=730)
 rocd730 <- t1f730$AUC-t1r730$AUC


 roc_outputs[permnum,] <- c(t1f365$AUC, t1r365$AUC, rocd365,t1f730$AUC, t1r730$AUC, rocd730)
 
}

##Handle the output depending on parameters


if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")

}
table(0.7489 > roc_outputs_unpermuted_allpreds[,6])


if (do_not_permute == 0 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_permuted_allpreds_nullbaseline_v1_feb6_2017.R")

}



if (do_not_permute == 1 & full_model == 1 & null_baseline == 0)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_v1_aug10_2016.R")

}

if (do_not_permute == 1 & full_model == 1 & null_baseline == 1)
{

 t1r730_unpermuted_allpreds <-  t1r730
 t1f730_unpermuted_allpreds <-  t1f730

 t1r365_unpermuted_allpreds <-  t1r365
 t1f365_unpermuted_allpreds <-  t1f365

 roc_outputs_unpermuted_allpreds <- roc_outputs
 save(t1r730_unpermuted_allpreds,t1f730_unpermuted_allpreds,t1r365_unpermuted_allpreds,t1f365_unpermuted_allpreds,roc_outputs_unpermuted_allpreds, file="roc_outputs_unpermuted_allpreds_nullbaseline_v1_feb6_2017.R")

}



if (do_not_permute == 1 & full_model == 0 & null_baseline == 0)
{

 t1r730_unpermuted_lesspreds <-  t1r730
 t1f730_unpermuted_lesspreds <-  t1f730

 t1r365_unpermuted_lesspreds <-  t1r365
 t1f365_unpermuted_lesspreds <-  t1f365

 roc_outputs_unpermuted_lesspreds <- roc_outputs
 save(t1r730_unpermuted_lesspreds,t1f730_unpermuted_lesspreds,t1r365_unpermuted_lesspreds,t1f365_unpermuted_lesspreds,roc_outputs_unpermuted_lesspreds, file="roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R")

}

if (do_not_permute == 0 & full_model == 1 & null_baseline == 0)
{

 t1r730_permed_allpreds <-  t1r730
 t1f730_permed_allpreds <-  t1f730

 t1r365_permed_allpreds <-  t1r365
 t1f365_permed_allpreds <-  t1f365

 roc_outputs_permed_allpreds <- roc_outputs
 save(t1r730_permed_allpreds,t1f730_permed_allpreds,t1r365_permed_allpreds,t1f365_permed_allpreds,roc_outputs_permed_allpreds, file="roc_outputs_permed_allpreds_v1_aug10_2016.R")

}

if (do_not_permute == 0 & full_model == 0 & null_baseline == 0)
{

 t1r730_permed_lesspreds <-  t1r730
 t1f730_permed_lesspreds <-  t1f730

 t1r365_permed_lesspreds <-  t1r365
 t1f365_permed_lesspreds <-  t1f365

 roc_outputs_permed_lesspreds <- roc_outputs
 save(t1r730_permed_lesspreds,t1f730_permed_lesspreds,t1r365_permed_lesspreds,t1f365_permed_lesspreds,roc_outputs_permed_lesspreds, file="roc_outputs_permed_lesspreds_v1_aug10_2016.R")

}

load('roc_outputs_permed_allpreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_allpreds_v1_aug10_2016.R')
load('roc_outputs_permed_lesspreds_v1_aug10_2016.R')
load('roc_outputs_unpermuted_lesspreds_v1_aug10_2016.R')



pdf("PGBD_ROC_allpredictors_730days_v1_aug10_2016.pdf")
 plot(t1f730_unpermuted_allpreds$FP,t1f730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r730_unpermuted_allpreds$FP,t1r730_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])[1] / length( roc_outputs_permed_allpreds[,6])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_allpredictors_365days_v1_aug10_2016.pdf")
 plot(t1f365_unpermuted_allpreds$FP,t1f365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r365_unpermuted_allpreds$FP,t1r365_unpermuted_allpreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])[1] / length( roc_outputs_permed_allpreds[,3])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_allpreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_allpreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_lesspredictors_730days_v1_aug10_2016.pdf")
 plot(t1f730_unpermuted_lesspreds$FP,t1f730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r730_unpermuted_lesspreds$FP,t1r730_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])[1] / length( roc_outputs_permed_lesspreds[,6])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f730_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r730_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()

pdf("PGBD_ROC_lesspredictors_365days_v1_aug10_2016.pdf")
 plot(t1f365_unpermuted_lesspreds$FP,t1f365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue",xlab="False Positive Rate (1 - Specificity)",ylab="True Positive Rate (Sensitivity)")
 lines(t1r365_unpermuted_lesspreds$FP,t1r365_unpermuted_lesspreds$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
 abline(0,1,col="black",lty=2)
 pv <- table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])[1] / length( roc_outputs_permed_lesspreds[,3])

 legend("topleft", legend=c(paste("Full model AUC =", round(t1f365_unpermuted_lesspreds$AUC,3)),paste("Reduced model AUC =", round(t1r365_unpermuted_lesspreds$AUC,3)),paste("P-value =", pv)),col=c("blue","red"),pch=c(18,18,NA))
dev.off()






#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_lesspreds[,3])






pdf("roc_unpermuted_lesspreds_730days_v1_aug1_2016.pdf")
plot(t1f730$FP,t1f730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r730$FP,t1r730$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)
dev.off()


plot(t1f365$FP,t1f365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="blue")
lines(t1r365$FP,t1r365$TP,type="l",lwd=2,cex.axis=1.45,main="",col="red")
abline(0,1,col="black",lty=2)




#Default AUC values without the 
par(mfrow=c(2,2))
hist(roc_outputs[,1],freq=FALSE,xlim=c(0.5,.8),main="Full model AUCs")
abline(v= 0.75240930,col="red",lwd=2)
hist(roc_outputs[,2],freq=FALSE,xlim=c(0.5,.8),main="Reduced model AUCs")
abline(v= 0.67311803,col="red",lwd=2)
hist(roc_outputs[,3],freq=FALSE,xlim=c(-.2,.2),main="Full - Reduced  AUCs")
abline(v= 0.07929127,col="red",lwd=2)
dev.off()

#730 days
#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,4] > roc_outputs_permed_lesspreds[,4])
table(roc_outputs_unpermuted_lesspreds[,5] > roc_outputs_permed_lesspreds[,5])
table(roc_outputs_unpermuted_lesspreds[,6] > roc_outputs_permed_lesspreds[,6])

#With Sheehan  
table(roc_outputs_unpermuted_allpreds[,4] > roc_outputs_permed_allpreds[,4])
table(roc_outputs_unpermuted_allpreds[,5] > roc_outputs_permed_allpreds[,5])
table(roc_outputs_unpermuted_allpreds[,6] > roc_outputs_permed_allpreds[,6])

#365 days
#No Sheehan  
table(roc_outputs_unpermuted_lesspreds[,1] > roc_outputs_permed_lesspreds[,1])
table(roc_outputs_unpermuted_lesspreds[,2] > roc_outputs_permed_lesspreds[,2])
table(roc_outputs_unpermuted_lesspreds[,3] > roc_outputs_permed_lesspreds[,3])

#With Sheehan  
table(roc_outputs_unpermuted_allpreds[,1] > roc_outputs_permed_allpreds[,1])
table(roc_outputs_unpermuted_allpreds[,2] > roc_outputs_permed_allpreds[,2])
table(roc_outputs_unpermuted_allpreds[,3] > roc_outputs_permed_allpreds[,3])





#Get correlation of measures
library(psych)
corrs <- corr.test(dat_sub[,c("anxietybaseline_continuous_i",
					"chronicaffdis_bin_i","migraines2_i", "suicidethobehav_collapsed_i", 
					"mixedepisodes_collapsed_i","comorbidpersonality2_i" , "sheehantotal_i", 
					"negative_leq_i")],method="spearman")