alexander-r
6/14/2016 - 2:16 PM

Loess data normalization based on time-line, includes plotting normalized vs non-normalized data

Loess data normalization based on time-line, includes plotting normalized vs non-normalized data

##
## mclapply.hack.R
##
## Nathan VanHoudnos
## nathanvan AT northwestern FULL STOP edu
## July 14, 2014
##
## A script to implement a hackish version of 
## parallel:mclapply() on Windows machines.
## On Linux or Mac, the script has no effect
## beyond loading the parallel library. 

require(parallel)    

## Define the hack
mclapply.hack <- function(...) {
    ## Create a cluster
    size.of.list <- length(list(...)[[1]])
    cl <- makeCluster( min(size.of.list, detectCores()) )

    ## Find out the names of the loaded packages 
    loaded.package.names <- c(
        ## Base packages
        sessionInfo()$basePkgs,
        ## Additional packages
        names( sessionInfo()$otherPkgs ))

    tryCatch( {

       ## Copy over all of the objects within scope to
       ## all clusters. 
       this.env <- environment()
       while( identical( this.env, globalenv() ) == FALSE ) {
           clusterExport(cl,
                         ls(all.names=TRUE, env=this.env),
                         envir=this.env)
           this.env <- parent.env(environment())
       }
       clusterExport(cl,
                     ls(all.names=TRUE, env=globalenv()),
                     envir=globalenv())
       
       ## Load the libraries on all the clusters
       ## N.B. length(cl) returns the number of clusters
       parLapply( cl, 1:length(cl), function(xx){
           lapply(loaded.package.names, function(yy) {
               require(yy , character.only=TRUE)})
       })
       
       ## Run the lapply in parallel 
       return( parLapply( cl, ...) )
    }, finally = {        
       ## Stop the cluster
       stopCluster(cl)
    })
}

## Warn the user if they are using Windows
if( Sys.info()[['sysname']] == 'Windows' ){
    message(paste(
      "\n", 
      "   *** Microsoft Windows detected ***\n",
      "   \n",
      "   For technical reasons, the MS Windows version of mclapply()\n",
      "   is implemented as a serial function instead of a parallel\n",
      "   function.",
      "   \n\n",
      "   As a quick hack, we replace this serial version of mclapply()\n",
      "   with a wrapper to parLapply() for this R session. Please see\n\n",
      "     http://www.stat.cmu.edu/~nmv/2014/07/14/implementing-mclapply-on-windows \n\n",
      "   for details.\n\n"))
}

## If the OS is Windows, set mclapply to the
## the hackish version. Otherwise, leave the
## definition alone. 
mclapply <- switch( Sys.info()[['sysname']],
   Windows = {mclapply.hack}, 
   Linux   = {mclapply},
   Darwin  = {mclapply})

## end mclapply.hack.R
setwd("M:/DataAnalysis/_Reports/EX00417 (Bluebird Metabolomics Pilot)/A003 - Untargeted/Documents/Dereplication/2016-03-04/NEG")

# Load libraries and functions
library(ggplot2)
library(reshape2)
library(bisoreg)
library(plyr)

source("M:/DataAnalysis/Automation scripts & soft/R/mclapply.hack.R", local=T)

# Original for span vals: 0.25, 1, by = 0.05
# If number of pooled samples < 5, fold = number of pooled
#
# If pooled don't flank all samples (have to extrapolate):
# theta.fit <- function(x, y, span) loess(y ~ x, span = span, control = loess.control(surface = "direct"))
#
# "folds" should be <= number of pooled samples
#

lwrap <- function (x, y, span.vals = seq(1, 10, by = 0.5), folds = 5){
  mae <- numeric(length(span.vals))
  theta.fit <- function(x, y, span) loess(y ~ x, span = span)
  theta.predict <- function(fit, x0) predict(fit, newdata = x0)
  ii = 0
  for (span in span.vals) {
    ii <- ii + 1
    y.cv <- crossval(x, y, theta.fit, theta.predict, span = span, ngroup = folds)$cv.fit
    fltr <- !is.na(y.cv)
    mae[ii] <- mean(abs(y[fltr] - y.cv[fltr]))
  }
  span <- span.vals[which.min(mae)]
  out <- loess(y ~ x, span = span, control = loess.control(surface = "direct"))
  return(out)
}

adjustArea <- function(x){
  tgts <- subset(x, select = c(TIME, AREA), TYPE=="POOLED")
  lwr <- lwrap(tgts$TIME, tgts$AREA)
  x$fitted <- predict(lwr, x$TIME)
  x$adj = round(x$AREA / x$fitted * median(tgts$AREA))
  return(x)
}

dat <- read.delim("EX00417_NEG_DATA_NO_DUPLICATES.TXT", check.names = FALSE)

tdat <- as.data.frame(t(dat))
colnames(tdat) <- as.character(dat[,1])
dat <- tdat[-1,]
dat$Sample <- row.names(dat)
rm(tdat)
indx <- sapply(dat, is.factor)
indx[1]<-FALSE
dat[indx] <- lapply(dat[indx], function(x) as.numeric(as.character(x)))

dat.m <- melt(dat, id.vars = c("Sample", "TYPE", "TIME"), variable.name = "TARGET", value.name = "AREA", na.rm = T)
dat.s <- split(dat.m, dat.m$TARGET)

dadj <- adjustArea(dat.s[[1]])

dplot <- ggplot(dadj, aes(TIME, y = value, color = variable, shape = factor(TYPE))) +
  geom_point(aes(y = adj, col = "adj"), size=3) +
  geom_line(aes(y = fitted, col = "fitted")) +
  geom_point(aes(y = AREA, col = "AREA"))
  
NAME	20150507-EX00417-A003-IN0016-PO-CS0000009.1-N	20150507-EX00417-A003-IN0016-PO-CS0000009.2-N	20150507-EX00417-A003-IN0016-PO-CS0000009.3-N	20150507-EX00417-A003-IN0016-PO-CS0000009.4-N	20150507-EX00417-A003-IN0016-PO-CS0000009.5-N	20150507-EX00417-A003-IN0016-S00017715-N	20150507-EX00417-A003-IN0016-S00017716-N	20150507-EX00417-A003-IN0016-S00017717-N	20150507-EX00417-A003-IN0016-S00017718-N	20150507-EX00417-A003-IN0016-S00017719-N	20150507-EX00417-A003-IN0016-S00017720-N	20150507-EX00417-A003-IN0016-S00017721-N	20150507-EX00417-A003-IN0016-S00017722-N	20150507-EX00417-A003-IN0016-S00017723-N	20150507-EX00417-A003-IN0016-S00017724-N	20150507-EX00417-A003-IN0016-S00017725-N	20150507-EX00417-A003-IN0016-S00017726-N	20150507-EX00417-A003-IN0016-S00017727-N	20150507-EX00417-A003-IN0016-S00017728-N	20150507-EX00417-A003-IN0016-S00017729-N	20150507-EX00417-A003-IN0016-S00017730-N	20150507-EX00417-A003-IN0016-S00017731-N	20150507-EX00417-A003-IN0016-S00017732-N	20150507-EX00417-A003-IN0016-S00017733-N	20150507-EX00417-A003-IN0016-S00017734-N	20150507-EX00417-A003-IN0016-S00017735-N	20150507-EX00417-A003-IN0016-S00017736-N	20150507-EX00417-A003-IN0016-S00017737-N	20150507-EX00417-A003-IN0016-S00017738-N	20150507-EX00417-A003-IN0016-S00017739-N	20150507-EX00417-A003-IN0016-S00017740-N	20150507-EX00417-A003-IN0016-S00017741-N	20150507-EX00417-A003-IN0016-S00017742-N	20150507-EX00417-A003-IN0016-S00017743-N	20150507-EX00417-A003-IN0016-S00017744-N	20150507-EX00417-A003-IN0016-S00017745-N	20150507-EX00417-A003-IN0016-S00017746-N
TYPE	POOLED	POOLED	POOLED	POOLED	POOLED	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE	SAMPLE
TIME	0	5.65	11.3	16.95	22.6	0.63	1.26	1.88	2.51	3.14	3.77	4.39	5.02	6.28	6.91	7.53	8.16	8.79	9.42	10.04	10.67	11.93	12.55	13.18	13.81	14.44	15.07	15.69	16.32	17.58	18.2	18.83	19.46	20.09	20.71	21.34	21.97
L-HISTIDINE (M-H)-	85511	84182	84460	81713	73492	61040	88556	27650	75078	95983	99869	66847	75195	80357	34602	74307	70824	81666	94960	65535	54473	44840	82196	93111	67506	72186	66416	62415	70721	81199	55746	77000	35269	77939	73921	75588	58132
CITRIC ACID (M-H)-	8495116	6615815	7294822	7329470	6494593	5285724	7429132	2422089	7370853	8033781	7998543	6080085	7445175	7302322	3060311	7190515	8211321	7524816	7341764	6216463	6151818	4784126	7078060	6813009	7212430	6065239	8153841	6578876	4683127	7016849	7137953	9381467	3167750	8881683	7852891	7553011	6312367
SUCCINIC ACID (M-H)-	105507	106669	102665	102646	99346	70086	102363	23709	74260	104657	100744	84135	84588	68097	19068	96911	84150	73252	78657	116426	83675	42668	94628	62724	79655	87166	97551	84735	96981	82080	106453	35261	29145	111736	92888	103276	69304
TRANS-ACONITIC ACID (M-H)-	387520	440274	424221	126330	75133	303391	467775	158946	154158	230590	136774	105146	304805	279782	209932	558651	660411	118712	355268	283335	393161	771528	788553	334986	316278	286394	116297	319577	377277	283404	555786	712513	221401	422831	302480	377889	318259
L-TRYPTOPHAN-15N2 [ISTD] (M-H)-	18906	18081	15152	17959	17378	13177	20116					11774	16048			15316	14043			16505	13870	8060	16526	14937	13506	13561	16819			8462	12875	14173	1999	14535	11631		13498
MANDELIC ACID #2 (M-H)-	101770	104046	99808	100814	105358	796039	82235		29116		882144					790941						4053				14036	45904					2103		138342	14658		
ZEATIN [ISTD] (M+Cl)-	339212	333869	346860	356563	333685	238428	318941	74414	327442	325604	349110	331978	340313	320461	89713	339945	357351	348773	368487	351765	349453	184334	356807	355272	353466	316420	359382	329872	361255	350826	323857	326036	83091	359986	363606	336547	325351
L-[15N]ANTHRANILIC ACID [ISTD] (M-H)-	17565	18356	20912	17861	18632	13141	18955		16744	14052	16640	19051	18730	16649	5410	18670	18994	17036	19985	18064	18600	9919	16695	15475	18751	19334	17400	17231	14839	20450	16034	20551	4324	20712	23862	18955	16717
XANTHINE (M-H)-	57048	58689	51868	55983	54251	53100	39411	17790	36613	35473	51345	49189	47860	40156	11381	118681	95377	40972	44895	95990	75865	38564	76604	50720	43091	43055	35460	38299	55747	26176	31392	38874	11277	52438	56737	40726	41125
THEOPHYLLINE (M-H)-	513117	516259	519962	543743	498036	58438	95124	67807	215293	562421	439414	6832	6457	54161	13992	426364	109640		38096	36421	25458	124558	267763	111210	94747	160404	310702		4299420	6018	3625453	10142	47885	194782	160746	5421	35457
16:0 LYSO PC (M+Cl)-	46350	39134	36170	37790	41527	12740	54038	7229	35525	42177	42309	47847	39361	49558	21002	50780	51119	37636	42092	43920	38268	47550	60673	40755	41594	35699	41793	59793	58280	47288	21415	46023	23607	27930	28524	48580	51681
BENZOIC ACID (M-H)-	13126	14592	12036	12881	12882	9015	9756	5471	18177	9125	9968	8760	7822	13893			11334	11220	12353	12563	7060		8654	15901	6109	9845	6422	8311	8035	11925	11859	8443		15170	13377	8208	8652
STEARIC ACID (M-H)-	164026	187122	198809	235696	291097	130631	301223	53764	216708	202130	188941	197644	111428	186968	51649	235284	225385	202622	223448	263602	180345	161018	248369	162251	233651	276912	282746	333284	354394	272801	267884	390999	92183	490128	965168	310622	321834
GLUTAMINE (M-H)-	175692	187355	159946	173314	126829	135850	153111		131361	178711	173852	181790	195776	182939	122440	139759	157307	156659	182952	157326	121089	90751	128255	195301	166108	161864	148349	283778	264157	146268	130429	183219	134019	149559	155530	152809	131704
ETHYLMALONIC ACID (M-H)-	17656	17571	18647	4811	17633	5140			10186		9772	3436				8428	8972		1884	3803			1830	18971	18391	52487	58855	5747				3713		24646	13613	2001	
DEOXYURIDINE (M+Cl)-	24366	24054	24495	25754	21036	13903	12464	1180	32701	24193	35609	21661	19944	18231		12819	8835	27500	27357		8536	5180		18388	15545	19682	25155	23090		21803	17440	20880		43095	15645	20951	15344
AZELAIC ACID (M-H)-	47102	44828	41977	46604	43483	30350	48072	9759	38964	34822	38841	36667	33319	34686	9423	36141	37012	41063	44079	84615	28154	14753	31202	30747	38383	31529	26395	30993	31814	28452	40489	51558	10354	45173	42790	41247	37309