Going over a user specified moving window, take the correlation between CpG sites within the window, report the ones with abs ( r ) > given cutoff.
library(Hmisc)
library(data.table)
cormethod="pearson"
windowsize=300000 #
windowjump=10000 #Jump over this increment for correlation analysis
minR=0.6 #minimum correlation to report
#GeMes https://www.cell.com/ajhg/pdfExtended/S0002-9297(14)00069-X
#http://www.sthda.com/english/wiki/correlation-matrix-a-quick-start-guide-to-analyze-format-and-visualize-a-correlation-matrix-using-r-software
flattenCorrMatrix <- function(cormat, pmat) {
ut <- upper.tri(cormat)
data.frame(
row = rownames(cormat)[row(cormat)[ut]],
column = rownames(cormat)[col(cormat)[ut]],
cor =(cormat)[ut],
p = pmat[ut]
)
}
#Load annotations
annots <- fread('misc/humanmethylation450_15017482_v1-2.csv',skip=7,data.table=F)
#Order the annotation list by chromosome and position
annots2 <- annots[order(annots$CHR,annots$MAPINFO,decreasing=FALSE),]
##read data
load("cleaned_vPGCb.Rdata") # CpG x Subjects
#make a dataframe of subject names that goes in the order they appear in the file
dat_order <- as.data.frame(names(dat))
#name the subjects the same as found in the matching column in the batch file
names(dat_order) <- c('methylome_id_fixed')
#establish a note of what the ordering is
dat_order$order <- c(1:(dim(dat)[2]))
#read in covariates
batch <- read.csv("data_analysis/demo2_wholedata_methylation_x3g.csv", header=T,na.strings=c("NA","#N/A","N/A"))
#merge batch data with subject name order data
covariates_ordered0 <- merge(batch,dat_order,by='methylome_id_fixed')
#sort the data by order so the phenotype now lines up
covariates_ordered <- covariates_ordered0[order(covariates_ordered0$order),]
#read in studymax covariate data, to get the studymax ids (if doing studymax)
covars_studymax <- read.csv('data_analysis/demo2_studymaxnov0_methylation_x1.csv',header=T,na.strings=c("NA","#N/A","N/A"),stringsAsFactors=F)
covariates_ordered_subset <- subset(covariates_ordered, visit ==0)
beta.norm <- as.data.frame(as.matrix(dat[, names(dat) %in% covariates_ordered_subset$methylome_id_fixed]))
pheno <- subset(covariates_ordered_subset, select=c(studyid,methylome_id_fixed ))
row.names(pheno) <- pheno$methylome_id_fixed
# Check that the Subjects in the pheno rows matches order of subjects in Beta columns
all(rownames(pheno)==colnames(beta.norm))
sum(is.na(match(rownames(pheno), colnames(beta.norm))))
#pheno<-pheno[colnames(beta.norm),]
all(rownames(pheno)==colnames(beta.norm)) # this should be true
for (chr in c(1:22,"X","Y")){
print(chr)
#Subset annotations to the chromosome
annots_chr <- subset(annots2, CHR == chr)
#Find the end of the chromosome
chrstop=max(annots_chr$MAPINFO)
#Set the starting position
startpos=min(annots_chr$MAPINFO)
#Set the stop position
stoppos=startpos+windowsize
#Make a variable for tested markers
testedlist <- NA
#Loop counter
reached_limit <- 0
#Initiate the loop
keep_looping=TRUE
#While the window is within the width of the chromosome, perform this command
while(keep_looping)
{
#Identify all CpGs within the window
annots_loop <- subset(annots_chr,MAPINFO >= startpos & MAPINFO <= stoppos)
#Extract all of these from the data
cpgs_use <- row.names(dat) %in% annots_loop$IlmnID
#Subset actual data to these CpGs
dat_loop <- beta.norm[cpgs_use,]
#Only calculate correlations if there is more than 1 CpG!
if(dim(dat_loop)[1] > 1)
{
#Correlate all probes in the data
res<- rcorr(t(dat_loop), type=cormethod)
#Flatten correlation matrix (i.e. unwrap it)
flat <- flattenCorrMatrix(res$r, res$P)
#Name all pairs of CpGs
flat$pair = paste(flat$row,flat$column,sep="_")
#Identify pairs that were never tested
untested <- which(!(flat$pair %in% testedlist) )
#AND also identify untested ones that have mimumum correlation > threshold
untested_sig <- which(!(flat$pair %in% testedlist) & abs(flat$cor) > minR)
#Put tested markers on the list of tested markers
testedlist <- c(testedlist,flat[untested,]$pair)
#If there are significant untested markers, append to the untested markers to the output file
if(length(untested_sig >0))
{
write.table(flat[untested_sig,],file="corr_between_cpgs.txt",append=TRUE,quote=F,row.names=F,col.names=F)
}
}
#At the end revise the position by incrementing the window
startpos=startpos+windowjump
stoppos=startpos+windowsize
#Checks to end the loop
if(stoppos >= chrstop )
{
reached_limit <- reached_limit + 1
#Is this the first time the stop position is longer than the chromosome? If yes, may not have looked at the end of the chromosome yet
if(reached_limit > 1)
{
keep_looping = FALSE
}
}
}
}