Toolbox
osd.kos$label <- mapvalues(osd.kos$label,
c('OSD114_2014-06-21_50m_NPL022', 'OSD118_2014-06-24_0.2m_NPL022', 'OSD72_2014-07-21_0.8m_NPL022', 'OSD159_2014-06-21_2m_NPL022'),
c('OSD114_2014-06-21_1m_NPL022', 'OSD118_2014-06-21_0.2m_NPL022', 'OSD72_2014-06-21_0.8m_NPL022', 'OSD159_2014-06-19_2m_NPL022'))
osd.kos<-subset(osd.kos,
label != 'OSD41_2014-06-21_1m_NPL022' & label != 'OSD151_2014-06-21_0m_NPL022' &
label != 'OSD107_2014-06-20_0m_NPL022' & label != 'OSD109_2014-06-20_0m_NPL022' &
label != 'OSD168_2014-06-21_2m_NPL022' & label != 'OSD108_2014-06-20_0m_NPL022' &
label != 'OSD186_2014-06-21_2.4m_NPL022' & label != 'OSD116_2014-06-21_0m_NPL022' &
label != 'OSD110_2014-06-21_0m_NPL022' & label != 'OSD10_2014-06-21_1m_NPL022')
exploreData<-function(f.physeq){
tdt = data.table(TotalCounts = taxa_sums(f.physeq), OTU = taxa_names(f.physeq))
print(ggplot(tdt, aes(TotalCounts)) +
geom_histogram() +
ggtitle("Histogram of Total Counts"))
# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]
# Define the plot
pCumSum = ggplot(taxcumsum, aes(TotalCounts, CumSum)) +
geom_point() +
xlab("Filtering Threshold, Minimum Total Counts") +
ylab("OTUs Filtered") +
ggtitle("OTUs that would be filtered vs. the minimum count threshold")
print(pCumSum)
# Prevalence
mdt = fast_melt(f.physeq)
prevdt = mdt[, list(Prevalence = sum(count > 0),
TotalCounts = sum(count)),
by = taxaID]
print(ggplot(prevdt, aes(Prevalence)) +
geom_histogram() +
ggtitle("Histogram of Taxa Prevalence"))
cat("0es: ",prevdt[(Prevalence <= 0), .N], "\n")
cat("Singletones: ",prevdt[(Prevalence <= 1), .N], "\n")
cat("Singletones && doubletones:", prevdt[(Prevalence <= 2), .N], "\n")
prevcumsum = prevdt[, .N, by = Prevalence]
setkey(prevcumsum, Prevalence)
prevcumsum[, CumSum := cumsum(N)]
pPrevCumSum = ggplot(prevcumsum, aes(Prevalence, CumSum)) +
geom_point() +
xlab("Filtering Threshold, Prevalence") +
ylab("OTUs Filtered") +
ggtitle("OTUs that would be filtered vs. the minimum count threshold")
print(pPrevCumSum)
return(prevdt)
}
rb.prevdt<-exploreData(rb.physeq)
# rb.prevdt.rar<-exploreData(rb.physeq.rar)
re.prevdt<-exploreData(re.physeq)
# re.prevdt.rar<-exploreData(re.physeq.rar)
ri.prevdt<-exploreData(ri.physeq)
filterPrev<-function(prevdt = prevdt, f.physeq = f.physeq, prev = prev){
keepTaxa = prevdt[(Prevalence >= prev*nsamples(f.physeq) & TotalCounts > summary(prevdt$TotalCounts)[2]), taxaID]
return(keepTaxa)
}
prev <- 0.1
(rb.physeq.filt<-filterPrev(prevdt = rb.prevdt, f.physeq = rb.physeq, prev = prev))
(re.physeq.filt<-filterPrev(re.prevdt, re.physeq, prev = prev))
(ri.physeq.filt<-filterPrev(ri.prevdt, ri.physeq, prev = prev))
fast_melt = function(physeq){
require(data.table)
require(phyloseq)
# supports "naked" otu_table as `physeq` input.
otutab = as(otu_table(physeq), "matrix")
if(!taxa_are_rows(physeq)){otutab <- t(otutab)}
otudt = data.table(otutab, keep.rownames = TRUE)
setnames(otudt, "rn", "taxaID")
# Enforce character taxaID key
otudt[, taxaIDchar := as.character(taxaID)]
otudt[, taxaID := NULL]
setnames(otudt, "taxaIDchar", "taxaID")
# Melt count table
mdt = melt.data.table(otudt,
id.vars = "taxaID",
variable.name = "SampleID",
value.name = "count")
# Remove zeroes, NAs
mdt <- mdt[count > 0][!is.na(count)]
# Calculate relative abundance
mdt[, RelativeAbundance := count / sum(count), by = SampleID]
if(!is.null(tax_table(physeq, errorIfNULL = FALSE))){
# If there is a tax_table, join with it. Otherwise, skip this join.
taxdt = data.table(as(tax_table(physeq, errorIfNULL = TRUE), "matrix"), keep.rownames = TRUE)
setnames(taxdt, "rn", "taxaID")
# Enforce character taxaID key
taxdt[, taxaIDchar := as.character(taxaID)]
taxdt[, taxaID := NULL]
setnames(taxdt, "taxaIDchar", "taxaID")
# Join with tax table
setkey(taxdt, "taxaID")
setkey(mdt, "taxaID")
mdt <- taxdt[mdt]
}
return(mdt)
}
exploreData<-function(f.physeq){
tdt = data.table(TotalCounts = taxa_sums(f.physeq), OTU = taxa_names(f.physeq))
# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]
# Define the plot
#print(pCumSum)
# Prevalence
mdt = fast_melt(f.physeq)
prevdt = mdt[, list(Prevalence = sum(count > 0),
TotalCounts = sum(count)),
by = taxaID]
cat("0es: ",prevdt[(Prevalence <= 0), .N], "\n")
cat("Singletones: ",prevdt[(Prevalence <= 1), .N], "\n")
cat("Singletones && doubletones:", prevdt[(Prevalence <= 2), .N], "\n")
return(prevdt)
}
filterPrev<-function(prevdt = prevdt, f.physeq = f.physeq, prev = prev){
keepTaxa = prevdt[(Prevalence >= prev*nsamples(f.physeq) & TotalCounts > summary(prevdt$TotalCounts)[2]), taxaID]
return(keepTaxa)
}