genomewalker
2/21/2016 - 8:24 PM

Toolbox

Toolbox

osd.kos$label <- mapvalues(osd.kos$label,
                           c('OSD114_2014-06-21_50m_NPL022', 'OSD118_2014-06-24_0.2m_NPL022', 'OSD72_2014-07-21_0.8m_NPL022', 'OSD159_2014-06-21_2m_NPL022'),
                           c('OSD114_2014-06-21_1m_NPL022', 'OSD118_2014-06-21_0.2m_NPL022', 'OSD72_2014-06-21_0.8m_NPL022', 'OSD159_2014-06-19_2m_NPL022'))

osd.kos<-subset(osd.kos, 
                label != 'OSD41_2014-06-21_1m_NPL022'    & label != 'OSD151_2014-06-21_0m_NPL022' &
                  label != 'OSD107_2014-06-20_0m_NPL022'   & label != 'OSD109_2014-06-20_0m_NPL022' &
                  label != 'OSD168_2014-06-21_2m_NPL022'   & label != 'OSD108_2014-06-20_0m_NPL022' &
                  label != 'OSD186_2014-06-21_2.4m_NPL022' & label != 'OSD116_2014-06-21_0m_NPL022' &
                  label != 'OSD110_2014-06-21_0m_NPL022'   & label != 'OSD10_2014-06-21_1m_NPL022')
exploreData<-function(f.physeq){
tdt = data.table(TotalCounts = taxa_sums(f.physeq), OTU = taxa_names(f.physeq))
print(ggplot(tdt, aes(TotalCounts)) + 
  geom_histogram() + 
  ggtitle("Histogram of Total Counts"))

# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]
# Define the plot
pCumSum = ggplot(taxcumsum, aes(TotalCounts, CumSum)) + 
  geom_point() +
  xlab("Filtering Threshold, Minimum Total Counts") +
  ylab("OTUs Filtered") +
  ggtitle("OTUs that would be filtered vs. the minimum count threshold") 
print(pCumSum)

# Prevalence

mdt = fast_melt(f.physeq)
prevdt = mdt[, list(Prevalence = sum(count > 0), 
                    TotalCounts = sum(count)),
             by = taxaID] 


print(ggplot(prevdt, aes(Prevalence)) + 
  geom_histogram() + 
  ggtitle("Histogram of Taxa Prevalence"))

cat("0es: ",prevdt[(Prevalence <= 0), .N], "\n")

cat("Singletones: ",prevdt[(Prevalence <= 1), .N], "\n")

cat("Singletones && doubletones:", prevdt[(Prevalence <= 2), .N], "\n")

prevcumsum = prevdt[, .N, by = Prevalence]
setkey(prevcumsum, Prevalence)
prevcumsum[, CumSum := cumsum(N)]
pPrevCumSum = ggplot(prevcumsum, aes(Prevalence, CumSum)) + 
  geom_point() +
  xlab("Filtering Threshold, Prevalence") +
  ylab("OTUs Filtered") +
  ggtitle("OTUs that would be filtered vs. the minimum count threshold")
print(pPrevCumSum)
return(prevdt)
}

rb.prevdt<-exploreData(rb.physeq)
# rb.prevdt.rar<-exploreData(rb.physeq.rar)

re.prevdt<-exploreData(re.physeq)
# re.prevdt.rar<-exploreData(re.physeq.rar)

ri.prevdt<-exploreData(ri.physeq)


filterPrev<-function(prevdt = prevdt, f.physeq = f.physeq, prev = prev){
keepTaxa = prevdt[(Prevalence >= prev*nsamples(f.physeq) & TotalCounts > summary(prevdt$TotalCounts)[2]), taxaID]
return(keepTaxa)
}

prev <- 0.1
(rb.physeq.filt<-filterPrev(prevdt = rb.prevdt, f.physeq = rb.physeq, prev = prev))
(re.physeq.filt<-filterPrev(re.prevdt, re.physeq, prev = prev))
(ri.physeq.filt<-filterPrev(ri.prevdt, ri.physeq, prev = prev))
fast_melt = function(physeq){
  require(data.table)
  require(phyloseq)
  # supports "naked" otu_table as `physeq` input.
  otutab = as(otu_table(physeq), "matrix")
  if(!taxa_are_rows(physeq)){otutab <- t(otutab)}
  otudt = data.table(otutab, keep.rownames = TRUE)
  setnames(otudt, "rn", "taxaID")
  # Enforce character taxaID key
  otudt[, taxaIDchar := as.character(taxaID)]
  otudt[, taxaID := NULL]
  setnames(otudt, "taxaIDchar", "taxaID")
  # Melt count table
  mdt = melt.data.table(otudt, 
                        id.vars = "taxaID",
                        variable.name = "SampleID",
                        value.name = "count")
  # Remove zeroes, NAs
  mdt <- mdt[count > 0][!is.na(count)]
  # Calculate relative abundance
  mdt[, RelativeAbundance := count / sum(count), by = SampleID]
  if(!is.null(tax_table(physeq, errorIfNULL = FALSE))){
    # If there is a tax_table, join with it. Otherwise, skip this join.
    taxdt = data.table(as(tax_table(physeq, errorIfNULL = TRUE), "matrix"), keep.rownames = TRUE)
    setnames(taxdt, "rn", "taxaID")
    # Enforce character taxaID key
    taxdt[, taxaIDchar := as.character(taxaID)]
    taxdt[, taxaID := NULL]
    setnames(taxdt, "taxaIDchar", "taxaID")
    # Join with tax table
    setkey(taxdt, "taxaID")
    setkey(mdt, "taxaID")
    mdt <- taxdt[mdt]
  }
  return(mdt)
}

exploreData<-function(f.physeq){
tdt = data.table(TotalCounts = taxa_sums(f.physeq), OTU = taxa_names(f.physeq))


# taxa cumulative sum
taxcumsum = tdt[, .N, by = TotalCounts]
setkey(taxcumsum, TotalCounts)
taxcumsum[, CumSum := cumsum(N)]
# Define the plot
#print(pCumSum)

# Prevalence

mdt = fast_melt(f.physeq)
prevdt = mdt[, list(Prevalence = sum(count > 0), 
                    TotalCounts = sum(count)),
             by = taxaID] 

cat("0es: ",prevdt[(Prevalence <= 0), .N], "\n")

cat("Singletones: ",prevdt[(Prevalence <= 1), .N], "\n")

cat("Singletones && doubletones:", prevdt[(Prevalence <= 2), .N], "\n")
return(prevdt)
}


filterPrev<-function(prevdt = prevdt, f.physeq = f.physeq, prev = prev){
keepTaxa = prevdt[(Prevalence >= prev*nsamples(f.physeq) & TotalCounts > summary(prevdt$TotalCounts)[2]), taxaID]
return(keepTaxa)
}