tsne+pam
library(Rtsne)
library(cluster)
library(tidyverse)
# Need to play with perplexity recommended values from 5 to 50
tsne <- Rtsne(
as.matrix(all_domains_m_trans),
check_duplicates = FALSE,
initial_dims = nrow(all_domains_m),
perplexity = 15,
theta = 0.0,
dims = 2,
verbose = FALSE,
max_iter = 2000,
is_distance = TRUE,
)
tsne_data <- as.data.frame(tsne$Y)
# ggplot(tsne_data, aes(x=V1, y=V2)) +
# geom_point(size=1.25) +
# guides(colour = guide_legend(override.aes = list(size=6))) +
# xlab("") + ylab("") +
# theme_no_axes() +
# theme(axis.text = element_blank(),
# axis.ticks = element_blank())
# Use PAM clustering to identify the clustering in the t-SNE dimension reduction
# We will use the silhoutte width to choose the number of cluster, we can use dynamic tree cutting too
# We use the default euclidean distance metric, we might change it
nmax <- 100
get_pam_nclust <- function(X){
pam_fit <- pam(tsne_data, k = X)
silwidth <- pam_fit$silinfo$avg.width
return(silwidth)
}
sil_width <- pbmclapply(2:nmax, get_pam_nclust, mc.cores = 4)
sil_width <- unlist(sil_width)
# plot(2:nmax, sil_width,
# xlab = "Number of clusters",
# ylab = "Silhouette Width")
# lines(2:nmax, sil_width)
# Best number of clusters:
n_cls <- which(sil_width==max(sil_width, na.rm = T))
# We run pam clustering with the max number deined by the silhoutte
pam_fit <- pam(tsne_data, k = n_cls, keep.diss = T)
# Get medoids
med <- row.names(all_domains_m[pam_fit$id.med, ])
# Extract t-SNE coordinates
tsne_data <- tsne$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering),
name = rownames(all_domains_m))
# Plot t-SNE + PAM + medoids
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols1 <- rev(getPalette(n_cls))
ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
geom_point(size=1.25) +
xlab("") + ylab("") +
#ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
scale_color_manual(values = cols1) +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
ggtitle("t-SNE: medoid genomes")