genomewalker
6/11/2017 - 3:38 PM

t-sne code for basti

t-sne code for basti

library(tidyverse)
library(igraph)
library(Rtsne)
library(Matrix)
library(reshape2)
library(parallel)
library(pbmcapply)
library(cluster)
library(ggforce)
library(RColorBrewer)

setwd("/scratch/smiksch/labrot2")
getwd()

get_rank <- function(x, rank = 'g:'){
  l <- strsplit(x, ';')[[1]]
  l <- l[grepl(rank, l)]
  if (length(l) < 1) {
    l <- NA
  }
  return(l)
}


dist_refseq_full <- read_delim("dist_refseq_v82_21_400_clean_pless-10_dless0.05_onlyunic.txt", col_names = FALSE, progress = T, trim_ws = TRUE, delim = " ")

dist_refseq_full <- rename(.data = dist_refseq_full, node1 = X1, node2 = X2, weight = X3, p_value = X4)

genomtax2 <- read_delim("genomes_simpletax.unique.quoted.without_notfound.txt", col_names = FALSE, progress = T, trim_ws = TRUE, delim = '"')

genometax <- genomtax2 %>%
  rename(tax_id = X1, taxonomy = X2, genome_id = X3) %>%
  rowwise %>%
  mutate(genus = get_rank(taxonomy, '^g:'), specie = get_rank(taxonomy, '^s:'))


number_of_genomes_in_genus <- function(x, data = data){
  g_id <- data %>% filter(genus == x) %>% .$genome_id
  g_dist <-  dist_refseq_full %>% filter(node1 %in% g_id , node2 %in% g_id)
  cat(x, "\n")
  g_net <- graph_from_data_frame(g_dist, directed = FALSE) %>% igraph::simplify(edge.attr.comb = "min")
  return(data.frame(genus = x, elements = vcount(g_net)))
}

list_of_genera <- genometax %>% 
  filter(!is.na(genus)) %>%
  .$genus %>%
  unique

number_genomes <- pbmclapply(list_of_genera, number_of_genomes_in_genus, data = genometax, mc.cores = 64)

df_n_genomes <- bind_rows(number_genomes)%>% tbl_df

list_of_genera_over50 <- df_n_genomes %>% 
  filter(elements > 50) %>% 
  .$genus
list_of_genera_24 <- df_n_genomes %>% 
  filter(elements == 24) %>% 
  .$genus

list_of_genera_under50 <- df_n_genomes %>% 
  filter(elements < 50) %>% 
  .$genus

list_of_under50 <- genometax %>%
  filter(genus %in% list_of_genera_under50)



get_genomes_from_genus <- function(X, data = data){
  #require(cluster)
  #X <- "g:Proteus"
  #data <- genometax
  g_id <- data %>% filter(genus == X) %>% .$genome_id
  g_dist <-  dist_refseq_full %>% filter(node1 %in% g_id , node2 %in% g_id)
  g_net <- graph_from_data_frame(g_dist, directed = FALSE) %>% igraph::simplify(edge.attr.comb = "min")
  g_mat <- as_adjacency_matrix(g_net, sparse = TRUE, attr = 'weight') 
  g_mat <- as.matrix(g_mat)
  mad <- mad(g_dist$weight, constant = 1)
  median <- median(g_dist$weight)
  p <- floor((nrow(g_mat)-1)/3)
  cat("Running t-sne...\n")
  set.seed(123)
  g_tsne <- Rtsne::Rtsne(g_mat,
                         is_distance = TRUE, 
                         perplexity = p,
                         check_duplicates = FALSE,
                         initial_dims = nrow(g_mat),
                         theta = 0.0,
                         dims = 2,
                         verbose = FALSE,
                         max_iter = 2000,
                         
  )
  tsne_data <- as.data.frame(g_tsne$Y)
  
  get_pam_nclust <- function(X){
    pam_fit <- clara(tsne_data, k = X, sample = 100)
    silwidth <- pam_fit$silinfo$avg.width
    return(silwidth)
  }
  nmax <- floor((nrow(g_mat)-1)/2)
  nmax <- 50
  cat("Finding k for PAM...\n")
  sil_width <- mclapply(2:nmax, get_pam_nclust, mc.cores = 50)
  
  sil_width <- unlist(sil_width)
  
  
  sil <- ggplot(data.frame(cls = 2:nmax, width = sil_width), aes(cls, width)) +
    geom_point() +
    geom_line() +
    xlab("Number of clusters") + 
    ylab("Silhouette Width") +
    theme_light()
  
  
  # Best number of clusters:
  n_cls <- which(sil_width==max(sil_width, na.rm = T))
  
  
  # We run pam clustering with the max number deined by the silhoutte
  pam_fit <- clara(tsne_data, k = n_cls, samples = 100)
  
  # Get medoids
  med <- row.names(g_mat[pam_fit$i.med, ])
  
  # Extract t-SNE coordinates
  tsne_data <- g_tsne$Y %>%
    data.frame() %>%
    setNames(c("X", "Y")) %>%
    mutate(cluster = factor(pam_fit$clustering),
           name = rownames(g_mat))
  
  # Plot t-SNE + PAM + medoids
  getPalette = colorRampPalette(brewer.pal(9, "Set1"))
  cols1 <- rev(getPalette(n_cls))
  
  medplot <- ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
    geom_point(size=1.25) +
    xlab("") + ylab("") +
    #ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
    scale_color_manual(values = cols1) +
    theme_no_axes() +
    theme(axis.text = element_blank(),
          axis.ticks = element_blank(),
          legend.position = "none") +
    ggtitle("t-SNE: medoid genomes")
  
  endlist <- list(mad = mad, median = median, mat = g_mat, tsne = g_tsne, silhou = sil, medoids = med, plot = medplot)
  #print(X)
  return(endlist)
  #return(pam_clusters)
}

test1 <- pbmclapply(list_of_genera_over50[1:10], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test1) <- gsub('g:', "", list_of_genera_over50[1:10])
test2 <- pbmclapply(list_of_genera_over50[11:20], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test2) <- gsub('g:', "", list_of_genera_over50[11:20])
test3 <- pbmclapply(list_of_genera_over50[21:30], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test3) <- gsub('g:', "", list_of_genera_over50[21:30])
test4 <- pbmclapply(list_of_genera_over50[31:40], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test4) <- gsub('g:', "", list_of_genera_over50[31:40])
test5 <- pbmclapply(list_of_genera_over50[41:50], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test5) <- gsub('g:', "", list_of_genera_over50[41:50])
test6 <- pbmclapply(list_of_genera_over50[51:60], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test6) <- gsub('g:', "", list_of_genera_over50[51:60])
test7 <- pbmclapply(list_of_genera_over50[61:70], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test7) <- gsub('g:', "", list_of_genera_over50[61:70])
test8 <- pbmclapply(list_of_genera_over50[71:73], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test8) <- gsub('g:', "", list_of_genera_over50[71:73])


test1 <- vector(mode = "list")
for (i in list_of_genera_over50[1:10]){
  cat(i, "\n")
  test1[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test2 <- vector(mode = "list")
for (i in list_of_genera_over50[11:20]){
  cat(i, "\n")
  test2[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test3 <- vector(mode = "list")
for (i in list_of_genera_over50[21:30]){
  cat(i, "\n")
  test3[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test4 <- vector(mode = "list")
for (i in list_of_genera_over50[31:40]){
  cat(i, "\n")
  test4[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test5 <- vector(mode = "list")
for (i in list_of_genera_over50[41:50]){
  cat(i, "\n")
  test5[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test6 <- vector(mode = "list")
for (i in list_of_genera_over50[51:60]){
  cat(i, "\n")
  test6[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test7 <- vector(mode = "list")
for (i in list_of_genera_over50[61:70]){
  cat(i, "\n")
  test7[[i]] <- get_genomes_from_genus(i, data = genometax)
}

test8 <- vector(mode = "list")
for (i in list_of_genera_over50[71:73]){
  cat(i, "\n")
  test8[[i]] <- get_genomes_from_genus(i, data = genometax)
}