genomewalker
3/30/2018 - 4:18 PM

GH phylo analisis

library(phytools)
library(ape)
library(nLTT)
library(tidyverse)

gh117_tree <- midpoint.root(read.tree(file = "~/Downloads/Tao/RAxML_bipartitions.GH117"))
gh50_tree <- midpoint.root(read.tree(file = "~/Downloads/Tao/RAxML_bipartitions.GH50"))
gh16_tree <- midpoint.root(read.tree(file = "~/Downloads/Tao/RAxML_bipartitions.GH16"))
gh86_tree <- midpoint.root(read.tree(file = "~/Downloads/Tao/RAxML_bipartitions.GH86"))
gh2_tree <- midpoint.root(read.tree(file = "~/Downloads/Tao/RAxML_bipartitions.GH2"))

gh117_tree_um <- chronos(gh117_tree, lambda = 0, model = 'relaxed', calibration = makeChronosCalib(gh117_tree, node = 'root', age.min = 1, age.max = 1, interactive = FALSE, soft.bounds = FALSE))
gh50_tree_um <- chronos(gh50_tree, lambda = 0, model = 'relaxed', calibration = makeChronosCalib(gh50_tree, node = 'root', age.min = 1, age.max = 1, interactive = FALSE, soft.bounds = FALSE))
gh16_tree_um <- chronos(gh16_tree, lambda = 0, model = 'relaxed', calibration = makeChronosCalib(gh16_tree, node = 'root', age.min = 1, age.max = 1, interactive = FALSE, soft.bounds = FALSE))
gh86_tree_um <- chronos(gh86_tree, lambda = 0, model = 'relaxed', calibration = makeChronosCalib(gh86_tree, node = 'root', age.min = 1, age.max = 1, interactive = FALSE, soft.bounds = FALSE))
gh2_tree_um <- chronos(gh2_tree, lambda = 0, model = 'relaxed', calibration = makeChronosCalib(gh2_tree, node = 'root', age.min = 1, age.max = 1, interactive = FALSE, soft.bounds = FALSE))

colors <- RColorBrewer::brewer.pal(6, "Set1")

gh16_cd <- gh16_tree$tip.label  %>% 
  tbl_df() %>% 
  separate(value, into = "tax", remove = FALSE, sep = "_N", extra = "drop") %>% 
  mutate(color = case_when(tax == "Actinobacteria" ~ colors[[1]],
                           tax == "Bacteroidetes/Chlorobi_group" ~ colors[[2]],
                           tax == "Firmicutes" ~ colors[[3]],
                           tax == "Gammaproteobacteria" ~ colors[[4]],
                           tax == "Planctomycetes" ~ colors[[5]],
                           tax == "Alphaproteobacteria" ~ colors[[6]])) #%>%
#select(value, color, tax) %>% write_tsv("~/Downloads/Tao/GH16_itol.tsv", col_names = FALSE)
write.tree(gh16_tree, "~/Downloads/Tao/RAxML_bipartitions_rooted.GH16")
write.tree(gh16_tree_um, "~/Downloads/Tao/RAxML_bipartitions_rooted_um.GH16")


gh86_cd <- gh86_tree$tip.label  %>% 
  tbl_df() %>% 
  separate(value, into = "tax", remove = FALSE, sep = "_N", extra = "drop") %>% 
  mutate(color = case_when(tax == "Actinobacteria" ~ colors[[1]],
                           tax == "Bacteroidetes/Chlorobi_group" ~ colors[[2]],
                           tax == "Firmicutes" ~ colors[[3]],
                           tax == "Gammaproteobacteria" ~ colors[[4]],
                           tax == "Planctomycetes" ~ colors[[5]],
                           tax == "Alphaproteobacteria" ~ colors[[6]])) #%>%
#select(value, color, tax) %>% write_tsv("~/Downloads/Tao/GH86_itol.tsv", col_names = FALSE)
write.tree(gh86_tree, "~/Downloads/Tao/RAxML_bipartitions_rooted.GH86")
write.tree(gh86_tree_um, "~/Downloads/Tao/RAxML_bipartitions_rooted_um.GH86")



gh2_cd <- gh2_tree$tip.label  %>% 
  tbl_df() %>% 
  separate(value, into = "tax", remove = FALSE, sep = "_N", extra = "drop") %>% 
  mutate(color = case_when(tax == "Actinobacteria" ~ colors[[1]],
                           tax == "Bacteroidetes/Chlorobi_group" ~ colors[[2]],
                           tax == "Firmicutes" ~ colors[[3]],
                           tax == "Gammaproteobacteria" ~ colors[[4]],
                           tax == "Planctomycetes" ~ colors[[5]],
                           tax == "Alphaproteobacteria" ~ colors[[6]],
                           tax == "pdb_5T98_A_Chain_A_1_Glycoside_Hydrolase" ~ "#000000",
                           tax == "5T9A_A_PDBID_CHAIN_SEQUENCE" ~ "#000000")) #%>%
#select(value, color, tax) %>% write_tsv("~/Downloads/Tao/GH2_itol.tsv", col_names = FALSE)
write.tree(gh2_tree, "~/Downloads/Tao/RAxML_bipartitions_rooted.GH2")
write.tree(gh2_tree_um, "~/Downloads/Tao/RAxML_bipartitions_rooted_um.GH2")


gh50_cd <- gh50_tree$tip.label  %>% 
  tbl_df() %>% 
  separate(value, into = "tax", remove = FALSE, sep = "_N", extra = "drop") %>% 
  mutate(color = case_when(tax == "Actinobacteria" ~ colors[[1]],
                           tax == "Bacteroidetes/Chlorobi_group" ~ colors[[2]],
                           tax == "Firmicutes" ~ colors[[3]],
                           tax == "Gammaproteobacteria" ~ colors[[4]],
                           tax == "Planctomycetes" ~ colors[[5]],
                           tax == "Alphaproteobacteria" ~ colors[[6]])) #%>%
#select(value, color, tax) %>% write_tsv("~/Downloads/Tao/GH50_itol.tsv", col_names = FALSE)
write.tree(gh50_tree, "~/Downloads/Tao/RAxML_bipartitions_rooted.GH50")
write.tree(gh50_tree_um, "~/Downloads/Tao/RAxML_bipartitions_rooted_um.GH50")


gh117_cd <- gh117_tree$tip.label  %>% 
  tbl_df() %>% 
  separate(value, into = "tax", remove = FALSE, sep = "_N", extra = "drop") %>% 
  mutate(color = case_when(tax == "Actinobacteria" ~ colors[[1]],
                           tax == "Bacteroidetes_Chlorobi_group" ~ colors[[2]],
                           tax == "Firmicutes" ~ colors[[3]],
                           tax == "Gammaproteobacteria" ~ colors[[4]],
                           tax == "Planctomycetes" ~ colors[[5]],
                           tax == "Alphaproteobacteria" ~ colors[[6]])) 
#select(value, color, tax) %>% write_tsv("~/Downloads/Tao/GH117_itol.tsv", col_names = FALSE)
write.tree(gh117_tree, "~/Downloads/Tao/RAxML_bipartitions_rooted.GH117", )
write.tree(gh117_tree_um, "~/Downloads/Tao/RAxML_bipartitions_rooted_um.GH117")



pb_parms <- function(X, n){
  h<-max(nodeHeights(X))
  x<-seq(0,h,by=h/100)
  b<-(log(ape::Ntip(X))-log(2))/h
  trees<-pbtree(b=b, n=ape::Ntip(X), t=h, nsim=n, method="direct", quiet=TRUE)
  g<-sapply(trees,function(x) ltt(x,plot=FALSE)$gamma)
  ltt <- ltt(trees, plot=FALSE)
  list(h = h, x = x,b = b, trees = trees, tree = X, ltt = ltt, g = g)
}

library(ggtree)
gh117_pb_p <- pb_parms(gh117_tree_um, 100)
gh16_pb_p <- pb_parms(gh16_tree_um, 100)
gh86_pb_p <- pb_parms(gh86_tree_um, 100)
gh2_pb_p <- pb_parms(gh2_tree_um, 100)
gh50_pb_p <- pb_parms(gh50_tree_um, 100)


class(gh117_tree_um) <- "phylo"
class(gh50_tree_um) <- "phylo"
class(gh16_tree_um) <- "phylo"
class(gh86_tree_um) <- "phylo"
class(gh2_tree_um) <- "phylo"


plot_pb <- function(X, Y, Z){
  X1 <- X
  setNames(exp(1:Ntip(X1)), X1$tip.label)
  X_ltt <- ltt(X, plot = FALSE)
  X_ltt_df <- data_frame(time = X_ltt$times, lineages = X_ltt$ltt)
  
  j <- fortify(X) %>% tbl_df %>%  mutate(y1 = log(y))
  f <- max(j$y1)/Ntip(X)
  j <- j %>% mutate(y = exp(y*f))
  
  
  cls <- Y %>% select(tax, color) %>% unique()
  cls_v <- cls$color
  names(cls_v) <- cls$tax
  
  Y_states <- Y$tax
  names(Y_states) <- Y$value
  fitER <- ace(Y_states, X, model="ER", type="discrete", marginal = TRUE)
  fitER_l <- as_data_frame(fitER$lik.anc)
  fitER_l$node <- 1:X$Nnode+Ntip(X)
  n_col <- dim(fitER_l)[2] - 1
  pies <- nodepie(fitER_l, cols=1:n_col, alpha=0.8, color = cls_v)
  
  plot_label <- sprintf("bold(\"Pybus\" ~ gamma == %0.4f)", round(X_ltt$gamma, 4))
  g1 <- ggtree(j,ladderize=T) %<+% Y + 
    geom_tippoint(aes(x=x+0.005, color = tax, fill = tax), shape = 22) +
    geom_step(data = X_ltt_df, aes(time, lineages), color = "red", size = 1) + 
    theme_bw() +
    coord_cartesian() +
    scale_y_log10() +
    xlab("Relative time") +
    ylab("Lineages (log10)") +
    theme(legend.position = "top",
          legend.title = element_blank()) +
    annotate("text", label = plot_label, x = 0.05 , y = max(X_ltt_df$lineages), parse = TRUE) +
    scale_fill_manual(values = cls_v) +
    scale_color_manual(values = cls_v)
  
  l <- bind_rows(lapply(1:100, function (x, y) {data_frame(time = y$ltt[[x]]$times, lineages = y$ltt[[x]]$ltt, rep = x)}, y = Z))
  g2 <- ggplot(l, aes(time, lineages)) +
    geom_step(aes(group = rep), alpha = 0.7, color = "grey60") +
    geom_step(data = X_ltt_df, aes(time, lineages), size = 1) + 
    geom_line(data = data_frame(x = c(0,Z$h), y = c(2,ape::Ntip(Z$tree))), aes(x, y), size = 0.9, color = "red",linetype = 2) +
    scale_y_log10() +
    theme_bw() +
    xlab("Relative time") +
    ylab("Lineages (log10)") +
    annotate("text", label = plot_label, x = 0.05 , y = max(X_ltt_df$lineages), parse = TRUE)
  
  g3 <-  ggtree(X,ladderize=T) %<+% Y + 
    geom_tippoint(aes(x=x+0.005, color = tax, fill = tax), shape = 22) +
    #geom_step(data = X_ltt_df, aes(time, lineages), color = "red", size = 1) + 
    theme_bw() +
    coord_cartesian() +
    #scale_y_log10() +
    xlab("Relative time") +
    ylab("Lineages") +
    theme(legend.position = "top",
          legend.title = element_blank()) +
     #annotate("text", label = plot_label, x = 0.05 , y = max(X_ltt_df$lineages), parse = TRUE) +
     scale_fill_manual(values = cls_v) +
     scale_color_manual(values = cls_v)
  g3 <- inset(g3, pies)
  
  list(ltt_tree = g1, pb_ltt = g2, ltt_asr = g3, asr = fitER_l)
}

gh117_plots <- plot_pb(gh117_tree_um, gh117_cd, gh117_pb_p)
gh16_plots <- plot_pb(gh16_tree_um, gh16_cd, gh16_pb_p)
gh86_plots <- plot_pb(gh86_tree_um, gh86_cd, gh86_pb_p)
gh50_plots <- plot_pb(gh50_tree_um, gh50_cd, gh50_pb_p)
gh2_plots <- plot_pb(X = gh2_tree_um, Y = gh2_cd, Z = gh2_pb_p)


phytools::ltt(gh117_tree_um, plot = FALSE)$gamma
phytools::ltt(gh50_tree_um, plot = FALSE)$gamma
phytools::ltt(gh16_tree_um, plot = FALSE)$gamma
phytools::ltt(gh86_tree_um, plot = FALSE)$gamma
phytools::ltt(gh2_tree_um, plot = FALSE)$gamma


rand_trees <- function(X, tree){
  n <- sample(seq(0,0.9,0.05),1)
  ntips <- ape::Ntip(tree)
  tipsToDrop <- sample(ntips,round(ntips*n), replace=FALSE)
  prunedTree <- ape::drop.tip(tree,tipsToDrop,trim.internal=TRUE)
  ltt <- ltt(prunedTree, plot=FALSE)
  list(prunedTree = prunedTree, g = ltt$gamma, n = n)
}

gh117_rand_t <- lapply(1:1000, rand_trees, tree = gh117_pb_p$tree)
gh50_rand_t <- lapply(1:1000, rand_trees, tree = gh50_pb_p$tree)
gh16_rand_t <- lapply(1:1000, rand_trees, tree = gh16_pb_p$tree)
gh86_rand_t <- lapply(1:1000, rand_trees, tree = gh86_pb_p$tree)
gh2_rand_t <- lapply(1:1000, rand_trees, tree = gh2_pb_p$tree)


comb <- bind_rows(purrr::map_dbl(gh117_rand_t, "g") %>% 
                    tbl_df() %>% 
                    mutate(class = "GH117"),
                  purrr::map_dbl(gh50_rand_t, "g") %>% 
                    tbl_df() %>% 
                    mutate(class = "GH50"),
                  purrr::map_dbl(gh16_rand_t, "g") %>% 
                    tbl_df() %>% 
                    mutate(class = "GH16"),
                  purrr::map_dbl(gh86_rand_t, "g") %>% 
                    tbl_df() %>% 
                    mutate(class = "GH86"),
                  purrr::map_dbl(gh2_rand_t, "g") %>% 
                    tbl_df() %>% 
                    mutate(class = "GH2"))

classes <- c("GH117", "GH50", "GH16", "GH86", "GH2")

compare_means(value ~ class,  data = comb, p.adjust.method = "BH") %>% View

comparisons <- combn(classes, 2, simplify = FALSE)

comb$class <- factor(comb$class, levels = c("GH2", "GH16", "GH50", "GH86", "GH117"))
plot_label <- sprintf("bold(\"Pybus\" ~ gamma)")
ggplot(comb, aes(class, value)) +
  geom_boxplot(fill = "grey90") +
  #stat_compare_means(comparisons = comparisons, label = "p.signif", hide.ns = TRUE) +
  xlab("") +
  ylab(expression(paste("Pybus ", gamma))) +
  theme_bw() +
  theme(legend.position = "none")
#!/bin/bash

INFILE=${1}
NAM=$(basename ${INFILE} _alignment_filtered.fasta)
LOG=${NAM}.log

CDHIT_BIN=cd-hit
FAMSA_BIN=~/opt/FAMSA/famsa
ZORRO_BIN=~/zorro_linux_x86_64
RAXML_BIN=raxmlHPC-PTHREADS-AVX2
ODSEQ_BIN=~/opt/OD-Seq_mg/OD-seq

# Remove identical sequences
printf "Dereplicating sequences..."
${CDHIT_BIN} -i ${INFILE} -c 1 -d 0 -o ${NAM}_reduced &> ${LOG}

# Align sequence
printf "done\nAligning sequences..."
${FAMSA_BIN} ${NAM}_reduced ${NAM}_aln.fasta >> ${LOG} 2>&1

# Checking for outliers
printf "done\nChecking for outliers..."
${ODSEQ_BIN} -i ${NAM}_aln.fasta -c ${NAM}_core.fasta -o ${NAM}_outlier.fasta >> ${LOG} 2>&1

# REalign core sequences
printf "done\nAligning core sequences..."
${FAMSA_BIN} ${NAM}_core.fasta ${NAM}_core_aln.fasta >> ${LOG} 2>&1


# Mask alignment
printf "done\nMasking alignment..."
${ZORRO_BIN} ${NAM}_core_aln.fasta | awk '{print int($1+0.5)}' > ${NAM}_mask

# Build phylogeny with RAxML and Zorro masking
printf "done\nInferring tree..."
${RAXML_BIN} -T 16 -a ${NAM}_mask -f a -m PROTGAMMAAUTO -s ${NAM}_core_aln.fasta -n ${NAM} -x 1234 -N autoMRE -p 1234 >> ${LOG} 2>&1
printf "done\n"