alexander-r
6/14/2016 - 2:10 PM

## Metabolmics data normalization functions From: Normalization of metabolomics data with applications to correlation maps (Jauhiainen et.al, B

Metabolmics data normalization functions From: Normalization of metabolomics data with applications to correlation maps (Jauhiainen et.al, Bioinformatics (2014) 30 (15): 2155-2161; doi: 10.1093/bioinformatics/btu175)

``````# This program is free software; you can redistribute it and/or
# modify it under the terms of the GNU General Public License as
#
# Author: Alexandra Jauhiainen, alexandra.jauhiainen@ki.se

updateMean <- function(X,y,Z,Sigma,samples,return.fit=F){
# Internal function to update mean in the iterative normalization procedure.
# Args:
#   X,Z: regressor matrices
#   y: response vector
#   Sigma: current estimate of covariance matrix
#   samples: vector of sample indicator
#   return.fit: If TRUE the fitted mixed model is returned, in addition to the other parameters
# Returns:
#   Updated fitted values and updated residuals, and, optional, the fitted model.

Sigma.inv <- solve(Sigma)

n <- length(levels(samples))
m <- length(y)/n

e <- eigen(Sigma.inv)
V <- e\$vectors
B <- V %*% diag(sqrt(e\$values)) %*% t(V)
Ytmp <- matrix(y,nrow=n,ncol=m)

Y2 <-NULL
for(i in 1:n){
Y2 <- rbind(Y2,t(B%*%(Ytmp[i,])))
}

X2 <- matrix(NA,nrow=nrow(X),ncol=ncol(X))
Z2 <- matrix(NA,nrow=nrow(Z),ncol=ncol(Z))
for(samp in levels(samples)){
pick <- samples==samp
Xtmp <- X[pick,]
Ztmp <- Z[pick,]
X2[pick,]<-B%*%Xtmp
Z2[pick,]<-B%*%Ztmp
}
# new values in Y2, X2, Z2
fit3 <- hglm(X = X2, y = as.numeric(Y2), Z = Z2, family = gaussian(link = identity),conv=1e-6)
fitted.vals <- matrix(fit3\$fv,nrow=n,ncol=m)
resid.vals <- matrix(as.numeric(Y2)-fit3\$fv,nrow=n,ncol=m)

fitted.new <-NULL
for(i in 1:n){
fitted.new <- rbind(fitted.new,t(solve(B)%*%(fitted.vals[i,])))
}

resid.new <-NULL
for(i in 1:n){
resid.new <- rbind(resid.new,t(solve(B)%*%(resid.vals[i,])))
}
if(!return.fit)
list(fitted=fitted.new,resid=resid.new)
else
list(fitted=fitted.new,resid=resid.new,fit=fit3)
}

# Internal function to update the covariance matrix in the iterative normalization procedure.
# Args:
#   fitted.vals: Current fitted values.
#   Y: Response values, as a matrix.
#   resid.vals: Current residuals.
#   use.resid: If TRUE the current residuals are used to estimate covaraince matrix. If FALSE the residuals a re-calculated.
#   eps: small value to avoid numerical instability in estimating the covaraince matrix.
# Returns:
#   An updated estimate of the covariance matrix.

m <- ncol(resid.vals)
n <- nrow(resid.vals)

M <- matrix(0,ncol=m,nrow=m)
if(use.resid){
for(i in 1:n){
M<- M + resid.vals[i,] %*% t(resid.vals[i,])
}
}else{
for(i in 1:n){
M<- M + (Y-fitted.vals)[i,] %*% t((Y-fitted.vals)[i,])
}
}
Sigma.cur <- M/n + eps*diag(m)
Sigma.cur
}

normalizeMixed <- function(Y,cohort,batch,iterative=T,return.cov=F,scale.bv=F,conv.eps=1e-5){
# Performs a mixed model normalization with a simultaneous estimation of
# a covariance matrix by using an iterative procedure.
# Args:
#   Y: Response matrix, metabolites x samples.
#   cohort: A vector indicating cohorts for each sample, can be set to NULL if no cohorts are defined.
#   batch: A vector indicating batches for each sample, can be set to NULL if no batches are defined.
#   iterative: if iteration is to be performed, defaults to true
#   return.cov: If TRUE the estimated covariance matrix is returned together with the normalized values, provided that iterative is set to TRUE
#   scale.bv: If TRUE, a model is fit initially to correct for differing variances for the batches. Requires the nlme package.
#   conv.eps: Small value to indicate convergence.
# Returns:
#   The normalized Y matrix, and, optional, the covariance matrix.
require(hglm)

Y <- t(Y)
m <- ncol(Y)
n <- nrow(Y)

met <- sprintf("%s%2\$0*3\$d", "m", 1:m,nchar(m))
met <- rep(met,each=n)
sample <- sprintf("%s%2\$0*3\$d", "s", 1:n,nchar(n))
sample <- rep(sample,times=m)

if(is.null(batch) & is.null(cohort)){
message("A model with only samples is used.")
dat <- data.frame(y=as.numeric(Y),met=as.factor(met),sample=as.factor(sample))
X<-model.matrix(~1+met,data=dat)
Z <- model.matrix(~0+sample,data=dat)
}
if(is.null(cohort) & !is.null(batch)){
message("A model with batches and samples is used.")
batch <- as.factor(batch)
batch <- rep(batch,times=m)
dat <- data.frame(y=as.numeric(Y),batch=as.factor(batch),met=as.factor(met),sample=as.factor(sample))
X<-model.matrix(~1+met,data=dat)
Z <- model.matrix(~0+batch,data=dat)
Z <- cbind(model.matrix(~0+sample,data=dat),Z)
if(scale.bv){
require(nlme)
fit1 <- lme(y~met, random=~1|batch/sample,weights = varIdent(form=~1|batch),data=dat)
dat\$y <- residuals(fit1,type="pearson") + fitted(fit1)
}
}
if(!is.null(cohort) & !is.null(batch)){
message("A model with cohorts, batches, and samples is used.")
cohort <- as.factor(cohort)
cohort <- rep(cohort, times=m)
dat <- data.frame(y=as.numeric(Y),cohort=as.factor(cohort),batch=as.factor(batch),met=as.factor(met),sample=as.factor(sample))
X<-model.matrix(~1+met,data=dat)
Z <- model.matrix(~0+cohort,data=dat)
Z <- cbind(model.matrix(~0+batch,data=dat),Z)
Z <- cbind(model.matrix(~0+sample,data=dat),Z)
if(scale.bv){
require(nlme)
fit1 <- lme(y~met, random=~1|cohort/batch/sample,weights = varIdent(form=~1|batch),data=dat)
dat\$y <- residuals(fit1,type="pearson") + fitted(fit1)
}
}

y = dat\$y
fit2 <- hglm(X = X, y = y, Z = Z, family = gaussian(link = identity))

if(iterative){
fitted.cur <- fitted.start <- matrix(fit2\$fv,nrow=n,ncol=m)
eps <- 1e-10
Sigma.cur <- Sigma.start<-cov(matrix(dat\$y-fit2\$fv,nrow=n,ncol=m)) + eps*diag(m)

j<-1
converged<-F
while(!converged){

out.mean <- updateMean(X,y,Z,Sigma.cur,dat\$sample)

diff<-abs(Sigma.cur[upper.tri(Sigma.cur)]-Sigma.new[upper.tri(Sigma.new)])
if(all(diff<conv.eps)){
converged<-T
final.fit <- updateMean(X,y,Z,Sigma.new,dat\$sample,return.fit=T)
}
Sigma.cur<-Sigma.new
colnames(Sigma.cur)<-rownames(Sigma.cur)<-colnames(Y)
j<-j+1
}

colnames(Sigma.cur)<-rownames(Sigma.cur)<-colnames(Y)
colnames(Sigma.start)<-rownames(Sigma.start)<-colnames(Y)

YN <- matrix(final.fit\$resid,ncol=m,nrow=n) + matrix(c(0,final.fit\$fit\$fixef[-1])+final.fit\$fit\$fixef[1],nrow=n,ncol=m,byrow=T)
dimnames(YN)<-dimnames(Y)
}else{
Sigma.cur=NULL
YN <- matrix(fit2\$resid,ncol=m,nrow=n) + matrix(c(0,fit2\$fixef[-1])+fit2\$fixef[1],nrow=n,ncol=m,byrow=T)
dimnames(YN)<-dimnames(Y)
}

if(return.cov){
list(Y=t(YN),Sigma=Sigma.cur)
} else{
t(YN)
}
}

heatmap <- function(cor.mat){
# Plots a red and blue heatmap
# Args:
#   cor.mat: a square matrix (e.g. a correlation matrix)
# Returns:
#   A plot is produced. No return arguments.

f <- function(m) t(m)[,nrow(m):1]

m <- nrow(cor.mat)

ColorLevels <- seq(-1,1,length.out=101)
ColorRamp <- rev(redblue(100))

layout(matrix(c(1,1,2,0),2,2),widths=c(4,0.65))

par(mar=c(3,1,1,3))
image(1:m,1:m,f(cor.mat),col=ColorRamp,breaks=ColorLevels,xaxt = "n", yaxt = "n",ylab="",xlab="")
axis(1, 1:m, labels = colnames(cor.mat), las = 2, line = -0.5, tick = 0)
axis(4, 1:m, labels = rev(colnames(cor.mat)), las = 2, line = -0.5, tick = 0)

par(mar = c(1,3,1.5,1))
image(1, ColorLevels,
matrix(data=ColorLevels, ncol=length(ColorLevels),nrow=1),
col=ColorRamp,
xlab="",ylab="",
xaxt="n",cex.axis=1)
}
``````