genomewalker
4/9/2018 - 6:50 PM

osd2014_dada2_inference

library(dada2); packageVersion("dada2")

# File parsing
wdir <- "/scratch/antonio/dada_osd2014"
setwd(wdir)
pathF <-  file.path(wdir, "files") 
pathR <- file.path(wdir, "files")
filtpathF <- file.path(pathF, "filtered") # Filtered forward files go into the pathF/filtered/ subdirectory
filtpathR <- file.path(pathR, "filtered") # ...
fastqFs <- sort(list.files(pathF, pattern="R1.fastq.gz"))
fastqRs <- sort(list.files(pathR, pattern="R2.fastq.gz"))
if(length(fastqFs) != length(fastqRs)) stop("Forward and reverse files do not match.")

sample.names <- sapply(strsplit(fastqFs, "_R1.fastq.gz"), `[`, 1)


filtpathF <- file.path(filtpathF, gsub("R1.fastq.gz","R1.filt.fastq.gz", fastqFs))
filtpathR <- file.path(filtpathR, gsub("R2.fastq.gz","R2.filt.fastq.gz", fastqRs))

fastqFs <- file.path(pathF, fastqFs)
fastqRs <- file.path(pathR, fastqRs)

qpF <- plotQualityProfile(fastqFs[1:5], aggregate = TRUE)
qpR <- plotQualityProfile(fastqRs[1:5], aggregate = TRUE)

# Filter and Trim reads (TruncLen and maxEE based on previous adjustments with reads.out mostly >= 70%)
ft <- filterAndTrim(fwd=fastqFs[1:5], filt=filtpathF[1:5],
                    rev=fastqRs[1:5], filt.rev=filtpathR[1:5],
                    truncLen=c(220,200), maxEE=c(8,8), maxN=0, rm.phix=TRUE,
                    compress=TRUE, verbose=TRUE, multithread=TRUE)




# Learn forward error rates
errF <- learnErrors(filtpathF, nread = 2e6, multithread = 64) 
# Learn reverse error rates
errR <- learnErrors(filtpathR, nread = 2e6, multithread = 64) 

ErrorPlotF <- plotErrors(errF, nominalQ = TRUE)
ErrorPlotR <- plotErrors(errR, nominalQ = TRUE)

#Dereplication
derepFs <- derepFastq(filtpathF, verbose=TRUE)
derepRs <- derepFastq(filtpathR, verbose=TRUE)

#Sample Inference
dadaFs <- dada(derepFs, err=errF, multithread = TRUE)
dadaRs <- dada(derepRs, err=errR, multithread = TRUE)

dadaFs[[1]]

#Merge paired reads
mergers <- mergePairs(dadaFs, derepFs, dadaRs, derepRs, verbose = TRUE)
head(mergers[[1]])

#Construct sequence table
seqtab <- makeSequenceTable(mergers)
dim(seqtab)
table(nchar(getSequences(seqtab)))

#Remove chimeras
seqtab.nochim <- removeBimeraDenovo(seqtab, method="consensus", multithread = TRUE, verbose = TRUE)
dim(seqtab.nochim)
sum(seqtab.nochim)/sum(seqtab)

#Track reads throughout the pipeline
getN <- function(x) sum(getUniques(x))
track <- cbind(ft, sapply(dadaFs, getN), sapply(mergers, getN), rowSums(seqtab), rowSums(seqtab.nochim))
colnames(track) <- c("input", "filtered", "denoised", "merged", "tabled", "nonchim")
rownames(track) <- sample.names
head(track)

as.data.frame(track)
write.table(track, "/scratch/aprondzi/data/paired/DADA_All.txt", sep="\\t")


track_long <- as.data.frame(track) %>%
  rownames_to_column(var = "sample") %>%
  mutate(sample = gsub('/scratch/aprondzi/data/paired', '', sample)) %>%
  gather(variable, value, -sample) %>%
  tbl_df()

track_long$variable <- factor(track_long$variable, levels = colnames(track))

ggplot(track_long, aes(variable, value, fill = variable)) +
  geom_bar(stat = "identity", position = "dodge") + 
  scale_fill_brewer(palette="Paired") +
  facet_wrap(~sample, scales = "free_y") +
  xlab("DADA2 steps") +
  ylab("Number of sequences") 


saveRDS(seqtab, "/bioinf/home/aprondzi/")
"