nievergeltlab
7/17/2019 - 8:51 PM

Moving window Correlation between CpG sites

Going over a user specified moving window, take the correlation between CpG sites within the window, report the ones with abs ( r ) > given cutoff.

library(Hmisc)
library(data.table)
cormethod="pearson"
windowsize=300000 #
windowjump=10000 #Jump over this increment for correlation analysis
minR=0.6 #minimum correlation to report

#GeMes https://www.cell.com/ajhg/pdfExtended/S0002-9297(14)00069-X 
#http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
flattenCorrMatrix <- function(cormat, pmat) {
  ut <- upper.tri(cormat)
  data.frame(
    row = rownames(cormat)[row(cormat)[ut]],
    column = rownames(cormat)[col(cormat)[ut]],
    cor  =(cormat)[ut],
    p = pmat[ut]
    )
}

#Load annotations
annots <- fread('misc/humanmethylation450_15017482_v1-2.csv',skip=7,data.table=F)

#Order the annotation list by chromosome and position
annots2 <- annots[order(annots$CHR,annots$MAPINFO,decreasing=FALSE),]


##read data

load("cleaned_vPGCb.Rdata") # CpG x Subjects 

#make a dataframe of subject names that goes in the order they appear in the file
dat_order <- as.data.frame(names(dat))

#name the subjects the same as found in the matching column in the batch file
names(dat_order) <- c('methylome_id_fixed')

#establish a note of what the ordering is
dat_order$order <- c(1:(dim(dat)[2]))

#read in covariates
batch <- read.csv("data_analysis/demo2_wholedata_methylation_x3g.csv", header=T,na.strings=c("NA","#N/A","N/A"))

#merge batch data with subject name order data
covariates_ordered0 <- merge(batch,dat_order,by='methylome_id_fixed')

#sort the data by order so the phenotype now lines up
covariates_ordered <- covariates_ordered0[order(covariates_ordered0$order),]

#read in studymax covariate data, to get the studymax ids (if doing studymax)
covars_studymax <- read.csv('data_analysis/demo2_studymaxnov0_methylation_x1.csv',header=T,na.strings=c("NA","#N/A","N/A"),stringsAsFactors=F)

covariates_ordered_subset <- subset(covariates_ordered, visit ==0)

beta.norm <- as.data.frame(as.matrix(dat[, names(dat) %in% covariates_ordered_subset$methylome_id_fixed]))

pheno <- subset(covariates_ordered_subset, select=c(studyid,methylome_id_fixed ))
row.names(pheno) <- pheno$methylome_id_fixed

# Check that the Subjects in the pheno rows matches order of subjects in Beta columns
all(rownames(pheno)==colnames(beta.norm)) 
sum(is.na(match(rownames(pheno), colnames(beta.norm)))) 
#pheno<-pheno[colnames(beta.norm),]
all(rownames(pheno)==colnames(beta.norm)) # this should be true


for (chr in c(1:22,"X","Y")){
 print(chr)
 #Subset annotations to the chromosome
  annots_chr <- subset(annots2, CHR == chr)

 #Find the end of the chromosome
  chrstop=max(annots_chr$MAPINFO)
 
 #Set the starting position 
  startpos=min(annots_chr$MAPINFO)
  
 #Set the stop position
  stoppos=startpos+windowsize

 #Make a variable for tested markers
 testedlist <- NA
 #Loop counter
  reached_limit <- 0
 #Initiate the loop
  keep_looping=TRUE
 #While the window is within the width of the chromosome, perform this command
  while(keep_looping)
  {
   #Identify all CpGs within the window
    annots_loop <- subset(annots_chr,MAPINFO >= startpos & MAPINFO <= stoppos)
   
   #Extract all of these from the data
    cpgs_use <- row.names(dat) %in% annots_loop$IlmnID
   
   #Subset actual data to these CpGs
    dat_loop <- beta.norm[cpgs_use,]
   
   #Only calculate correlations if there is more than 1 CpG!
   if(dim(dat_loop)[1] > 1)
   {
    #Correlate all probes in the data 
     res<- rcorr(t(dat_loop), type=cormethod)
    
    #Flatten correlation matrix (i.e. unwrap it)
     flat <- flattenCorrMatrix(res$r, res$P)
    
    #Name all pairs of CpGs
     flat$pair = paste(flat$row,flat$column,sep="_")

    #Identify pairs that were never tested
     untested <- which(!(flat$pair %in% testedlist) )
    
    #AND also identify untested ones that have mimumum correlation > threshold
     untested_sig <- which(!(flat$pair %in% testedlist) & abs(flat$cor) > minR)
     
    #Put tested markers on the list of tested markers
     testedlist <- c(testedlist,flat[untested,]$pair)
     
    #If there are significant untested markers, append to the untested markers to the output file
    if(length(untested_sig >0))
    {
     write.table(flat[untested_sig,],file="corr_between_cpgs.txt",append=TRUE,quote=F,row.names=F,col.names=F)
    }
   }
   #At the end revise the position by incrementing the window
    startpos=startpos+windowjump
    stoppos=startpos+windowsize

   #Checks to end the loop
    if(stoppos >= chrstop )
    {
     reached_limit <- reached_limit + 1
     #Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
     if(reached_limit > 1)
     {
      keep_looping = FALSE
     }
    }
  } 
}