t-sne code for basti
library(tidyverse)
library(igraph)
library(Rtsne)
library(Matrix)
library(reshape2)
library(parallel)
library(pbmcapply)
library(cluster)
library(ggforce)
library(RColorBrewer)
setwd("/scratch/smiksch/labrot2")
getwd()
get_rank <- function(x, rank = 'g:'){
l <- strsplit(x, ';')[[1]]
l <- l[grepl(rank, l)]
if (length(l) < 1) {
l <- NA
}
return(l)
}
dist_refseq_full <- read_delim("dist_refseq_v82_21_400_clean_pless-10_dless0.05_onlyunic.txt", col_names = FALSE, progress = T, trim_ws = TRUE, delim = " ")
dist_refseq_full <- rename(.data = dist_refseq_full, node1 = X1, node2 = X2, weight = X3, p_value = X4)
genomtax2 <- read_delim("genomes_simpletax.unique.quoted.without_notfound.txt", col_names = FALSE, progress = T, trim_ws = TRUE, delim = '"')
genometax <- genomtax2 %>%
rename(tax_id = X1, taxonomy = X2, genome_id = X3) %>%
rowwise %>%
mutate(genus = get_rank(taxonomy, '^g:'), specie = get_rank(taxonomy, '^s:'))
number_of_genomes_in_genus <- function(x, data = data){
g_id <- data %>% filter(genus == x) %>% .$genome_id
g_dist <- dist_refseq_full %>% filter(node1 %in% g_id , node2 %in% g_id)
cat(x, "\n")
g_net <- graph_from_data_frame(g_dist, directed = FALSE) %>% igraph::simplify(edge.attr.comb = "min")
return(data.frame(genus = x, elements = vcount(g_net)))
}
list_of_genera <- genometax %>%
filter(!is.na(genus)) %>%
.$genus %>%
unique
number_genomes <- pbmclapply(list_of_genera, number_of_genomes_in_genus, data = genometax, mc.cores = 64)
df_n_genomes <- bind_rows(number_genomes)%>% tbl_df
list_of_genera_over50 <- df_n_genomes %>%
filter(elements > 50) %>%
.$genus
list_of_genera_24 <- df_n_genomes %>%
filter(elements == 24) %>%
.$genus
list_of_genera_under50 <- df_n_genomes %>%
filter(elements < 50) %>%
.$genus
list_of_under50 <- genometax %>%
filter(genus %in% list_of_genera_under50)
get_genomes_from_genus <- function(X, data = data){
#require(cluster)
#X <- "g:Proteus"
#data <- genometax
g_id <- data %>% filter(genus == X) %>% .$genome_id
g_dist <- dist_refseq_full %>% filter(node1 %in% g_id , node2 %in% g_id)
g_net <- graph_from_data_frame(g_dist, directed = FALSE) %>% igraph::simplify(edge.attr.comb = "min")
g_mat <- as_adjacency_matrix(g_net, sparse = TRUE, attr = 'weight')
g_mat <- as.matrix(g_mat)
mad <- mad(g_dist$weight, constant = 1)
median <- median(g_dist$weight)
p <- floor((nrow(g_mat)-1)/3)
cat("Running t-sne...\n")
set.seed(123)
g_tsne <- Rtsne::Rtsne(g_mat,
is_distance = TRUE,
perplexity = p,
check_duplicates = FALSE,
initial_dims = nrow(g_mat),
theta = 0.0,
dims = 2,
verbose = FALSE,
max_iter = 2000,
)
tsne_data <- as.data.frame(g_tsne$Y)
get_pam_nclust <- function(X){
pam_fit <- clara(tsne_data, k = X, sample = 100)
silwidth <- pam_fit$silinfo$avg.width
return(silwidth)
}
nmax <- floor((nrow(g_mat)-1)/2)
nmax <- 50
cat("Finding k for PAM...\n")
sil_width <- mclapply(2:nmax, get_pam_nclust, mc.cores = 50)
sil_width <- unlist(sil_width)
sil <- ggplot(data.frame(cls = 2:nmax, width = sil_width), aes(cls, width)) +
geom_point() +
geom_line() +
xlab("Number of clusters") +
ylab("Silhouette Width") +
theme_light()
# Best number of clusters:
n_cls <- which(sil_width==max(sil_width, na.rm = T))
# We run pam clustering with the max number deined by the silhoutte
pam_fit <- clara(tsne_data, k = n_cls, samples = 100)
# Get medoids
med <- row.names(g_mat[pam_fit$i.med, ])
# Extract t-SNE coordinates
tsne_data <- g_tsne$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering),
name = rownames(g_mat))
# Plot t-SNE + PAM + medoids
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols1 <- rev(getPalette(n_cls))
medplot <- ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
geom_point(size=1.25) +
xlab("") + ylab("") +
#ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
scale_color_manual(values = cols1) +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
ggtitle("t-SNE: medoid genomes")
endlist <- list(mad = mad, median = median, mat = g_mat, tsne = g_tsne, silhou = sil, medoids = med, plot = medplot)
#print(X)
return(endlist)
#return(pam_clusters)
}
test1 <- pbmclapply(list_of_genera_over50[1:10], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test1) <- gsub('g:', "", list_of_genera_over50[1:10])
test2 <- pbmclapply(list_of_genera_over50[11:20], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test2) <- gsub('g:', "", list_of_genera_over50[11:20])
test3 <- pbmclapply(list_of_genera_over50[21:30], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test3) <- gsub('g:', "", list_of_genera_over50[21:30])
test4 <- pbmclapply(list_of_genera_over50[31:40], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test4) <- gsub('g:', "", list_of_genera_over50[31:40])
test5 <- pbmclapply(list_of_genera_over50[41:50], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test5) <- gsub('g:', "", list_of_genera_over50[41:50])
test6 <- pbmclapply(list_of_genera_over50[51:60], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test6) <- gsub('g:', "", list_of_genera_over50[51:60])
test7 <- pbmclapply(list_of_genera_over50[61:70], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test7) <- gsub('g:', "", list_of_genera_over50[61:70])
test8 <- pbmclapply(list_of_genera_over50[71:73], get_genomes_from_genus, data = genometax, mc.cores = 72)
names(test8) <- gsub('g:', "", list_of_genera_over50[71:73])
test1 <- vector(mode = "list")
for (i in list_of_genera_over50[1:10]){
cat(i, "\n")
test1[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test2 <- vector(mode = "list")
for (i in list_of_genera_over50[11:20]){
cat(i, "\n")
test2[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test3 <- vector(mode = "list")
for (i in list_of_genera_over50[21:30]){
cat(i, "\n")
test3[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test4 <- vector(mode = "list")
for (i in list_of_genera_over50[31:40]){
cat(i, "\n")
test4[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test5 <- vector(mode = "list")
for (i in list_of_genera_over50[41:50]){
cat(i, "\n")
test5[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test6 <- vector(mode = "list")
for (i in list_of_genera_over50[51:60]){
cat(i, "\n")
test6[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test7 <- vector(mode = "list")
for (i in list_of_genera_over50[61:70]){
cat(i, "\n")
test7[[i]] <- get_genomes_from_genus(i, data = genometax)
}
test8 <- vector(mode = "list")
for (i in list_of_genera_over50[71:73]){
cat(i, "\n")
test8[[i]] <- get_genomes_from_genus(i, data = genometax)
}