tsne_bgc.r

3/25/2017 - 1:54 PM
library(tidyverse)
library(vegan)
library(viridis)
library(cowplot)
# Read data
nrps_file <- read_csv("~/Downloads/domain_heatmap/nrps/nrps_threshold_0.9/output_nrps_0.9.csv", col_names = T, trim_ws = T) %>%
  dplyr::rename(domain = X1)
lanc_file <- read_csv("~/Downloads/domain_heatmap/lanc/lanc_threshold_0.9/output_lanc_0.9.csv", col_names = T, trim_ws = T) %>%
  dplyr::rename(domain = X1)
pks_file <- read_csv("~/Downloads/domain_heatmap/pks/pks_threshold_0.9/output_pks_0.9.csv", col_names = T, trim_ws = T) %>%
  dplyr::rename(domain = X1)

# clean it a bit
names(nrps_file) <- gsub("as_genomes//", "", names(nrps_file))
names(lanc_file) <- gsub("as_genomes//", "", names(lanc_file))
names(pks_file) <- gsub("as_genomes//", "", names(pks_file))

# Calculate proportion and transform 0.01 to 0
nrps_file_m_long <- nrps_file %>%
  reshape2::melt() %>%
  dplyr::rename(genome = variable) %>%
  tbl_df %>%
  mutate(value = ifelse(value <= 0.01, 0, value)) %>%
  group_by(genome) %>%
  mutate(prop = value/sum(value), domain = paste("nrps_",domain, sep = "")) 

lanc_file_m_long <- lanc_file %>%
  reshape2::melt() %>%
  dplyr::rename(genome = variable) %>%
  tbl_df %>%
  mutate(value = ifelse(value <= 0.01, 0, value)) %>%
  group_by(genome) %>%
  mutate(prop = value/sum(value), domain = paste("lanc_",domain, sep = "")) 

pks_file_m_long <- pks_file %>%
  reshape2::melt() %>%
  dplyr::rename(genome = variable) %>%
  tbl_df %>%
  mutate(value = ifelse(value <= 0.01, 0, value)) %>%
  group_by(genome) %>%
  mutate(prop = value/sum(value), domain = paste("pks_",domain, sep = "")) 


# Combine data
all_domains <- bind_rows(nrps_file_m_long, lanc_file_m_long, pks_file_m_long)

# Convert to a matrix
all_domains_m <- all_domains %>%
  dplyr::select(-prop) %>%
  spread(domain, value, fill =0) %>%
  as.data.frame()
row.names(all_domains_m) <- all_domains_m$genome
all_domains_m$genome <- NULL
all_domains_m <- as.matrix(all_domains_m)


# Calculate diversity indices
nrps_rich <- specnumber(nrps_file_m)
nrps_shannon <- vegan::diversity(nrps_file_m)
nrps_invsimp <- vegan::diversity(nrps_file_m, "invsimpson")
nrps_pielou <- nrps_shannon/log(nrps_rich)

nrps_diversity <- data.frame(richness = nrps_rich, shannon = nrps_shannon, invsimp = nrps_invsimp, eveness = nrps_pielou)
nrps_diversity$genome <- row.names(nrps_diversity)
nrps_diversity_long <- nrps_diversity %>%
  reshape2::melt()



# Let's check some numbers


all_domains %>%
  group_by(genome) %>%
  summarise(N = sum(value)) %>%
  ggplot(aes(N)) +
  geom_density(alpha = 0.8, fill = "#333333") +
  theme_bw() +
  xlab("Domains per genome")

all_domains %>%
  group_by(domain) %>%
  summarise(N = sum(value)) %>%
  ggplot(aes(N)) +
  geom_density(alpha = 0.8, fill = "#333333") +
  theme_bw() +
  xlab("Genomes per domain")



# Calculate t-sne
library(Rtsne)
library(cluster)


set.seed(2016)

# Need to play with perplexity recommended values from 5 to 50
tsne <- Rtsne(
  as.matrix(log10(all_domains_m + 0.001)),
  check_duplicates = FALSE,
  perplexity = 15,
  theta = 0.5,
  dims = 2,
  verbose = TRUE,
  max_iter = 10000
)

tsne_data <- as.data.frame(tsne$Y)

ggplot(tsne_data, aes(x=V1, y=V2)) +
  geom_point(size=1.25) +
  guides(colour = guide_legend(override.aes = list(size=6))) +
  xlab("") + ylab("") +
  theme_no_axes() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank())


# Use PAM clustering to identify the clustering in the t-SNE dimension reduction
# We will use the silhoutte width to choose the number of cluster, we can use dynamic tree cutting too
# We use the default euclidean distance metric, we might change it

sil_width <- c(NA)
for(i in 2:100){
  pam_fit <- pam(tsne_data,
                 k = i)
  sil_width[i] <- pam_fit$silinfo$avg.width
}

plot(1:100, sil_width,
     xlab = "Number of clusters",
     ylab = "Silhouette Width")
lines(1:100, sil_width)

# Best number of clusters:
n_cls <- which(sil_width==max(sil_width, na.rm = T))

# We run pam clustering with the max number deined by the silhoutte
pam_fit <- pam(embedding, k = n_cls, keep.diss = T)

# Get medoids
med <- row.names(all_domains_m[pam_fit$id.med, ])

# Extract t-SNE coordinates
tsne_data <- tsne$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering),
         name = rownames(all_domains_m))

# Plot t-SNE + PAM + medoids
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols1 <- rev(getPalette(n_cls))

ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
  geom_point(size=1.25) +
  xlab("") + ylab("") +
  ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
  scale_color_manual(values = cols1) +
  theme_no_axes() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  ggtitle("t-SNE: medoid genomes")

tsne_data %>% 
  group_by(cluster) %>%
  dplyr::count()%>%
  arrange(n) %>%
  ggplot(aes(n)) +
  geom_density(fill = "#333333", alpha = 0.7) +
  theme_bw() +
  xlab("Size of the clusters")

div_list <- vector(mode = "list")

for (i in unique(tsne_data$cluster)){
  tsne_subset <- tsne_data %>%
    filter(cluster == i)
  
  if (dim(tsne_subset)[1] > 1){
    all_domains_m_ss <- all_domains_m[tsne_subset$name,]
    # Calculate diversity indices for each memeber of the cluster:
    all_rich <- specnumber(all_domains_m_ss)
    all_shannon <- vegan::diversity(all_domains_m_ss)
    diversity <- data.frame(richness = all_rich, shannon = all_shannon)
    diversity$name <- row.names(diversity)
    div_list[[i]] <- diversity %>%
      reshape2::melt()
  }else{
    all_rich <- specnumber(all_domains_m_ss)
    all_shannon <- vegan::diversity(all_domains_m_ss)
    diversity <- data.frame(richness = all_rich, shannon = all_shannon)
    diversity$name <- tsne_subset$name
    div_list[[i]] <- diversity %>%
      reshape2::melt()
  }
}

div_list <- bind_rows(div_list)


# Get clusters at least the median of the cluster sizes
cl_median_size <- tsne_data %>% 
  group_by(cluster) %>%
  dplyr::count()%>%
  .$n %>%
  median

cl_selection <- tsne_data %>% 
  group_by(cluster) %>%
  dplyr::count()%>%
  filter(n >= cl_median_size) %>%
  .$cluster %>%
  droplevels()


tsne_data_selection <- tsne_data %>%
  filter(cluster %in% cl_selection) %>%
  mutate(medoid = ifelse(name %in% med, "TRUE", "FALSE"))


max_sh_div <- tsne_data_selection %>%
  left_join(div_list) %>%
  filter(variable == "shannon") %>%
  group_by(cluster) %>%
  filter(value == max(value)) %>%
  .$name

getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- rev(getPalette(length(max_sh_div)))

ggplot(tsne_data, aes(x=X, y=Y)) +
  geom_point(size=1.25, alpha = 0.1, color = "grey10") +
  geom_point(data = tsne_data_selection,  aes(x=X, y=Y, color = cluster),size=1.25) +
  xlab("") + ylab("") +
  ggrepel::geom_label_repel(data = tsne_data_selection %>% filter(name %in% max_sh_div), aes(X,Y, label = name, color = cluster), size = 2) +
  scale_color_manual(values = cols) +
  theme_no_axes() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  ggtitle("t-SNE: genomes with largest Shannon domain diversity")

checkm_results <- read_tsv("~/Downloads/InBioPharm_checkm_932_genomes_06022017.txt", col_names = T, trim_ws = T) %>%
  dplyr::rename(genome = `Bin Id`) %>%
  mutate(genome = gsub("_scaffold-sequences", "", genome))


checkm_results_long <- checkm_results %>% melt(id = "genome") %>% tbl_df

for (i in unique(tsne_data_selection$cluster)){
  
  tsne_subset <- tsne_data_selection %>%
    filter(cluster == i)
  
  all_domains_m_ss <- all_domains_m[tsne_subset$name,]
  
  # Calculate diversity indices for each memeber of the cluster:
  
  all_rich <- specnumber(all_domains_m_ss)
  all_shannon <- vegan::diversity(all_domains_m_ss)
  all_invsimp <- vegan::diversity(all_domains_m_ss, "invsimpson")
  all_pielou <- all_shannon/log(all_rich)
  
  diversity <- data.frame(richness = all_rich, shannon = all_shannon, invsimp = all_invsimp, eveness = all_pielou)
  diversity$genome <- row.names(diversity)
  diversity_long <- diversity %>%
    reshape2::melt()
  
  max_sh_div <- diversity_long %>%
    filter(variable == "shannon") %>%
    filter(value == max(value)) %>%
    .$genome
  
  # Plot heatmap
  row_annotations <- sapply(strsplit(colnames(all_domains_m_ss), "_"), "[", 1)
  mat_col <- data.frame(Domain = row_annotations)
  rownames(mat_col) <- colnames(all_domains_m_ss)
  
  
  m_all_domains_m_ss <- all_domains_m_ss
  colnames(m_all_domains_m_ss) <- row_annotations
  
  m_all_domains_m_ss <- melt(m_all_domains_m_ss) %>%
    group_by(Var2) %>%
    summarise(N=sum(value)) %>%
    ungroup() %>%
    mutate(prop = N/sum(N))
  
  mat_colors <- list(Domain = brewer.pal(3, "Set1"))
  names(mat_colors$Domain) <- unique(mat_col$Domain)
  med_gen <- tsne_subset %>% filter(medoid == TRUE) %>% .$name
  main <- paste("Cluster_", i,"; Medoid: ", med_gen,"; Max Shannon div:", max_sh_div, sep = "")
  
  
  pheat <- pheatmap(
    mat               = log10(t(all_domains_m_ss) + 0.001),
    color             = viridis(100),
    border_color      = "white",
    show_colnames     = TRUE,
    show_rownames     = FALSE,
    annotation_row    = mat_col,
    annotation_colors = mat_colors,
    drop_levels       = TRUE,
    fontsize          = 6,
    main              = main,
    clustering_method = "complete",
    clustering_distance_rows = "euclidean",
    clustering_distance_cols = "euclidean")
  
  
  t_plot <- ggplot(tsne_data, aes(x=X, y=Y)) +
    geom_point(size=1.25, alpha = 0.1, color = "grey10") +
    geom_point(data = tsne_subset,  aes(x=X, y=Y, color = cluster),size=1.25) +
    xlab("") + ylab("") +
    ggrepel::geom_label_repel(data = tsne_subset %>% filter(name %in% med), aes(X,Y, label = name, color = cols1[i]), size = 2) +
    theme_no_axes() +
    theme(axis.text = element_blank(),
          axis.ticks = element_blank(),
          legend.position = "none") +
    ggtitle("t-SNE: medoid genomes")
  
  p_s <- ggplot(diversity_long %>% filter(variable == "shannon"), aes(genome, value)) +
    geom_bar(stat = "identity", fill = "#333333", alpha = 0.6) +
    geom_bar(data = diversity_long %>% filter(variable == "shannon", genome == max_sh_div),  aes(genome, value), stat = "identity", fill = "#C02942") +
    theme_bw() +
    theme(axis.text = element_text(size = 6)) +
    xlab("") +
    ylab("Shannon diversity (bits)")
    
   p_d <- ggplot(m_all_domains_m_ss, aes(Var2, prop)) +
      geom_bar(stat = "identity") +
      theme_bw() +
      xlab("Domain") +
      ylab("Relative abundance") +
      scale_y_continuous(labels = scales::percent) +
     theme(axis.text = element_text(size = 6),
           axis.title = element_text(size = 8))
     
   checkm_results_long_filt <- checkm_results_long %>% filter(genome %in% tsne_subset$name) %>%
     filter(variable == "Completeness" | 
            variable == "Contamination" | 
            variable == "Strain heterogeneity" |
            variable == "Genome size (bp)" | 
            variable == "# scaffolds" |
            variable == "GC")
   
   if (dim(checkm_results_long_filt)[1] > 0){
   p_ch <- ggplot(checkm_results_long_filt, aes(genome, as.numeric(value))) +
     geom_bar(stat = "identity") +
     facet_wrap(~variable, scales = "free_x", strip.position = "top") +
     theme_bw() +
     xlab("") +
     ylab("") +
     theme(axis.text = element_text(size = 6),
           axis.title = element_text(size = 8)) +
     scale_y_continuous(labels = human_numbers) +
     coord_flip()
   
   
    fname <- paste("~/Downloads/tsne_cluster", i, ".png", sep = "")
    #print(cowplot::plot_grid(t_plot, pheat$gtable, p_s, p_d , ncol = 2))
    ggdraw() +
      draw_plot(t_plot,        x = 0,   y = 0.6, width = .5, height = .38) +
      draw_plot(pheat$gtable,  x = 0.5, y = 0.6, width = .5, height = .38) +
      draw_plot(p_s,  x = 0, y = 0.31, width = .5, height = .29) +
      draw_plot(p_d,  x = 0.5, y = 0.31, width = .5, height = .29) +
      draw_plot(p_ch,  x = 0, y = 0, width = 1, height = .3) +
      draw_plot_label(c("A", "B", "C", "D", "E"), c(0, 0.5, 0, 0.5, 0), c(1, 1, 0.6, 0.6, 0.3), size = 10)
    ggsave(fname, width = 11.29, height = 11.29, units = "in")
   }
}


human_numbers <- function(x = NULL, smbl ="", signif = 1){
  humanity <- function(y){
    
    if (!is.na(y)){
      tn <- round(abs(y) / 1e12, signif)
      b <- round(abs(y) / 1e9, signif)
      m <- round(abs(y) / 1e6, signif)
      k <- round(abs(y) / 1e3, signif)
      
      if ( y >= 0 ){
        y_is_positive <- ""
      } else {
        y_is_positive <- "-"
      }
      
      if ( k < 1 ) {
        paste0( y_is_positive, smbl, round(abs(y), signif ))
      } else if ( m < 1){
        paste0 (y_is_positive, smbl,  k , "K")
      } else if (b < 1){
        paste0 (y_is_positive, smbl, m ,"M")
      }else if(tn < 1){
        paste0 (y_is_positive, smbl, b ,"bn")
      } else {
        paste0 (y_is_positive, smbl,  comma(tn), "tn")
      }
    } else if (is.na(y) | is.null(y)){
      "-"
    }
  }
  
  sapply(x,humanity)
}
Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

tsne_bgc.r