genomewalker
12/10/2018 - 8:53 AM

Create files for alluvials plots

#!/usr/bin/env Rscript

args = commandArgs(trailingOnly=TRUE)

# test if there is at least one argument: if not, return an error
if (length(args) == 0) {
  stop("At least one argument must be supplied (input file).n", call = FALSE)
} else if (length(args) == 1) {
  # default output file
  args[2] = "alluvial.tsv"
}


needed = c("tidyverse", "magrittr",  "parallel")
is.installed <- function(pkg){
  is.element(pkg, installed.packages()[,1])
}

if (!is.installed("crayon")){suppressMessages(install.packages("crayon"))}
suppressMessages(library(crayon))

missing_package <- FALSE
cat("\nChecking if all packages are installed...\n\n")
# For loop to run through each of the packages
for (p in 1:length(needed)){
  if(is.installed(needed[p])){
    cat(sprintf("%-10s: %s", needed[p], green("Installed\n")))
  }else{
    cat(sprintf("%-10s: %s", needed[p], red("Not installed\n")))
    missing_package <- TRUE
  }
}

quit_not_installed <- function(){
  cat("\nMissing packages, please install them.\n")
  quit(save = "no", status = 1)
}

if (missing_package) {
  quit_not_installed()
}else{
  cat("\nAll packages installed.\n")}

suppressMessages(library(tidyverse))
suppressMessages(library(magrittr))
suppressMessages(library(parallel))

ncores <- parallel::detectCores()/2

cat(paste0("\nReading file ", args[1], "..."))
suppressMessages(cl_tax_orfs <- read_tsv(args[1], col_names = TRUE) %>%
                   mutate(cl_name = as.character(cl_name)))
cat(green(" done\n\n"))

cat(paste("File", args[1], "has", nrow(cl_tax_orfs), "rows and", ncol(cl_tax_orfs), "columns\n\n"))

majority_vote <- function (x, seed = 12345) {
  set.seed(seed)
  whichMax <- function(x) {
    m <- seq_along(x)[x == max(x, na.rm = TRUE)]
    if (length(m) > 1)
      sample(m, size = 1)
    else m
  }
  x <- as.vector(x)
  tab <- table(x)
  m <- whichMax(tab)
  out <- list(table = tab, ind = m, majority = names(tab)[m])
  return(out)
}

# Analyse annotations -----------------------------------------------------
# cl_tax_orfs %>%
#   group_by(cl_name, category) %>%
#   count() %>%
#   arrange(desc(n)) %>%
#   group_by(category) %>%
#   skimr::skim()

propagate_annotation <-function(X, data = data){
  cls <- data %>%
    dplyr::filter(cl_name == X)

  consensus_superkingdom <- cls %>%
    dplyr::filter(!is.na(superkingdom)) %>%
    summarise(consensus_superkingdom = ifelse(n() < 1, NA,  majority_vote(superkingdom)$majority)) %>% .$consensus_superkingdom

  consensus_phylum <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  !is.na(phylum)) %>%
    summarise(consensus_phylum = ifelse(n() < 1, paste(consensus_superkingdom, "NA", sep = "_"), majority_vote(phylum)$majority)) %>% .$consensus_phylum

  consensus_class <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  phylum == consensus_phylum,
                  !is.na(class)) %>%
    summarise(consensus_class = ifelse(n() < 1, paste(consensus_phylum, "NA", sep = "_"), majority_vote(class)$majority)) %>% .$consensus_class

  consensus_order <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  phylum == consensus_phylum,
                  class == consensus_class,
                  !is.na(order)) %>%
    summarise(consensus_order = ifelse(n() < 1, paste(consensus_class, "NA", sep = "_"), majority_vote(order)$majority)) %>% .$consensus_order

  consensus_family <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  phylum == consensus_phylum,
                  class == consensus_class,
                  order == consensus_order,
                  !is.na(family)) %>%
    summarise(consensus_family = ifelse(n() < 1, paste(consensus_order, "NA", sep = "_"), majority_vote(family)$majority)) %>% .$consensus_family

  consensus_genus <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  phylum == consensus_phylum,
                  class == consensus_class,
                  order == consensus_order,
                  family == consensus_family,
                  !is.na(genus)) %>%
    summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus

  tibble(cl_name = X, consensus_superkingdom, consensus_phylum, consensus_class,
         consensus_order, consensus_family, consensus_genus)
}


cat(paste("Propagating taxonomic annotations at cluster level using", cyan(ncores), "cores... "))

cl_tax_consensus <- mclapply(cl_tax_orfs$cl_name %>% unique(),
                             propagate_annotation, data = cl_tax_orfs, mc.cores = ncores) %>%
  bind_rows()

tax_ranks <- c("consensus_superkingdom", "consensus_phylum", "consensus_class", "consensus_order", "consensus_family", "consensus_genus")

cat(green("done\n"))

# Quick look to the consensus annotations ---------------------------------


#map(tax_ranks, function(X){cl_tax_consensus %>% group_by_(X) %>% count(sort = TRUE) %>% ungroup()})

#cl_tax_consensus %>% filter(is.na(consensus_phylum))


# Write results -----------------------------------------------------------

# cl_tax_orfs %>%
#   select(supercluster, cl_name) %>%
#   inner_join(cl_tax_consensus %>% select(cl_name, consensus_class, consensus_phylum, consensus_superkingdom)) %>%
#   write_tsv("~/Downloads/pr2alluvial.tsv")


# Annotate at the ORF level -----------------------------------------------
# Uses the data generated above

propagate_annotation_na <-function(X, data = data){
  cls <- data[X,] %>%
    select(genus, family, order, class, phylum, superkingdom, orf, cl_name, supercluster)

  consensus_superkingdom <- cls %>%
    summarise(consensus_superkingdom = ifelse(is.na(superkingdom), NA, superkingdom))%>% .$consensus_superkingdom

  consensus_phylum <- cls %>%
    summarise(consensus_phylum = ifelse(is.na(phylum), paste(consensus_superkingdom, "NA", sep = "_"), phylum)) %>% .$consensus_phylum

  consensus_class <- cls %>%
    summarise(consensus_class = ifelse(is.na(class), paste(consensus_phylum, "NA", sep = "_"), class)) %>% .$consensus_class

  consensus_order <- cls %>%
    summarise(consensus_order = ifelse(is.na(order), paste(consensus_class, "NA", sep = "_"), order)) %>% .$consensus_order

  consensus_family <- cls %>%
    summarise(consensus_family = ifelse(is.na(family), paste(consensus_order, "NA", sep = "_"), family)) %>% .$consensus_family

  consensus_genus <- cls %>%
    dplyr::filter(superkingdom == consensus_superkingdom,
                  phylum == consensus_phylum,
                  class == consensus_class,
                  order == consensus_order,
                  family == consensus_family,
                  !is.na(genus)) %>%
    summarise(consensus_genus = ifelse(n() < 1, paste(consensus_family, "NA", sep = "_"), majority_vote(genus)$majority)) %>% .$consensus_genus

  tibble(supercluster = cls$supercluster, orf = cls$orf, cl_name = cls$cl_name, consensus_superkingdom, consensus_phylum, consensus_class,
         consensus_order, consensus_family, consensus_genus)
}

cat("Collecting consensus annotations... ")
pr_clusters_consensus <- cl_tax_orfs %>%
  select(supercluster, cl_name) %>%
  inner_join(cl_tax_consensus %>% select(cl_name, consensus_superkingdom, consensus_phylum, consensus_class, consensus_order, consensus_family, consensus_genus), by = "cl_name")

cat(green("done"),"\nCollecting ORFs with taxonomic annotations... ")
pr_clusters_no_na <- cl_tax_orfs %>%
  filter(!(is.na(superkingdom) | is.na(phylum)))

cat(green("done"), "\nCollecting ORFs without taxonomic annotations... ")
cl_tax_consensus_na  <- cl_tax_orfs %>%
  filter(is.na(superkingdom) | is.na(phylum)) %>%
  select(supercluster, cl_name, orf) %>%
  unique() %>%
  inner_join(pr_clusters_consensus %>% select(-supercluster), by = "cl_name") %>% unique()

cat(green("done"),"\nPropagating taxonomic annotations at the ORF level using", cyan(ncores), "cores... ")
cl_tax_consensus_no_na <- mclapply(1:nrow(pr_clusters_no_na),
                                   propagate_annotation_na, data = pr_clusters_no_na, mc.cores = ncores) %>%
  bind_rows()
cat(paste0(green("done"), "\nExporting data for alluvial plot drawing to file ", silver(args[2]), "... "))

bind_rows(cl_tax_consensus_no_na,
          cl_tax_consensus_na) %>%
  write_tsv(args[2])
cat(green("done\n"))