genomewalker
6/6/2017 - 10:56 AM

tsne+pam

tsne+pam

library(Rtsne)
library(cluster)
library(tidyverse)
# Need to play with perplexity recommended values from 5 to 50
tsne <- Rtsne(
  as.matrix(all_domains_m_trans),
  check_duplicates = FALSE,
  initial_dims = nrow(all_domains_m),
  perplexity = 15,
  theta = 0.0,
  dims = 2,
  verbose = FALSE,
  max_iter = 2000,
  is_distance = TRUE,
    
)

tsne_data <- as.data.frame(tsne$Y)

# ggplot(tsne_data, aes(x=V1, y=V2)) +
#   geom_point(size=1.25) +
#   guides(colour = guide_legend(override.aes = list(size=6))) +
#   xlab("") + ylab("") +
#   theme_no_axes() +
#   theme(axis.text = element_blank(),
#         axis.ticks = element_blank())


# Use PAM clustering to identify the clustering in the t-SNE dimension reduction
# We will use the silhoutte width to choose the number of cluster, we can use dynamic tree cutting too
# We use the default euclidean distance metric, we might change it
nmax <- 100

get_pam_nclust <- function(X){
  pam_fit <- pam(tsne_data, k = X)
  silwidth <- pam_fit$silinfo$avg.width
  return(silwidth)
}

sil_width <- pbmclapply(2:nmax, get_pam_nclust, mc.cores = 4)

sil_width <- unlist(sil_width)


# plot(2:nmax, sil_width,
#      xlab = "Number of clusters",
#      ylab = "Silhouette Width")
# lines(2:nmax, sil_width)

# Best number of clusters:
n_cls <- which(sil_width==max(sil_width, na.rm = T))

# We run pam clustering with the max number deined by the silhoutte
pam_fit <- pam(tsne_data, k = n_cls, keep.diss = T)

# Get medoids
med <- row.names(all_domains_m[pam_fit$id.med, ])

# Extract t-SNE coordinates
tsne_data <- tsne$Y %>%
  data.frame() %>%
  setNames(c("X", "Y")) %>%
  mutate(cluster = factor(pam_fit$clustering),
         name = rownames(all_domains_m))

# Plot t-SNE + PAM + medoids
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols1 <- rev(getPalette(n_cls))

ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
  geom_point(size=1.25) +
  xlab("") + ylab("") +
  #ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
  scale_color_manual(values = cols1) +
  theme_no_axes() +
  theme(axis.text = element_blank(),
        axis.ticks = element_blank(),
        legend.position = "none") +
  ggtitle("t-SNE: medoid genomes")