library(tidyverse)
library(vegan)
library(viridis)
library(cowplot)
# Read data
nrps_file <- read_csv("~/Downloads/domain_heatmap/nrps/nrps_threshold_0.9/output_nrps_0.9.csv", col_names = T, trim_ws = T) %>%
dplyr::rename(domain = X1)
lanc_file <- read_csv("~/Downloads/domain_heatmap/lanc/lanc_threshold_0.9/output_lanc_0.9.csv", col_names = T, trim_ws = T) %>%
dplyr::rename(domain = X1)
pks_file <- read_csv("~/Downloads/domain_heatmap/pks/pks_threshold_0.9/output_pks_0.9.csv", col_names = T, trim_ws = T) %>%
dplyr::rename(domain = X1)
# clean it a bit
names(nrps_file) <- gsub("as_genomes//", "", names(nrps_file))
names(lanc_file) <- gsub("as_genomes//", "", names(lanc_file))
names(pks_file) <- gsub("as_genomes//", "", names(pks_file))
# Calculate proportion and transform 0.01 to 0
nrps_file_m_long <- nrps_file %>%
reshape2::melt() %>%
dplyr::rename(genome = variable) %>%
tbl_df %>%
mutate(value = ifelse(value <= 0.01, 0, value)) %>%
group_by(genome) %>%
mutate(prop = value/sum(value), domain = paste("nrps_",domain, sep = ""))
lanc_file_m_long <- lanc_file %>%
reshape2::melt() %>%
dplyr::rename(genome = variable) %>%
tbl_df %>%
mutate(value = ifelse(value <= 0.01, 0, value)) %>%
group_by(genome) %>%
mutate(prop = value/sum(value), domain = paste("lanc_",domain, sep = ""))
pks_file_m_long <- pks_file %>%
reshape2::melt() %>%
dplyr::rename(genome = variable) %>%
tbl_df %>%
mutate(value = ifelse(value <= 0.01, 0, value)) %>%
group_by(genome) %>%
mutate(prop = value/sum(value), domain = paste("pks_",domain, sep = ""))
# Combine data
all_domains <- bind_rows(nrps_file_m_long, lanc_file_m_long, pks_file_m_long)
# Convert to a matrix
all_domains_m <- all_domains %>%
dplyr::select(-prop) %>%
spread(domain, value, fill =0) %>%
as.data.frame()
row.names(all_domains_m) <- all_domains_m$genome
all_domains_m$genome <- NULL
all_domains_m <- as.matrix(all_domains_m)
# Calculate diversity indices
nrps_rich <- specnumber(nrps_file_m)
nrps_shannon <- vegan::diversity(nrps_file_m)
nrps_invsimp <- vegan::diversity(nrps_file_m, "invsimpson")
nrps_pielou <- nrps_shannon/log(nrps_rich)
nrps_diversity <- data.frame(richness = nrps_rich, shannon = nrps_shannon, invsimp = nrps_invsimp, eveness = nrps_pielou)
nrps_diversity$genome <- row.names(nrps_diversity)
nrps_diversity_long <- nrps_diversity %>%
reshape2::melt()
# Let's check some numbers
all_domains %>%
group_by(genome) %>%
summarise(N = sum(value)) %>%
ggplot(aes(N)) +
geom_density(alpha = 0.8, fill = "#333333") +
theme_bw() +
xlab("Domains per genome")
all_domains %>%
group_by(domain) %>%
summarise(N = sum(value)) %>%
ggplot(aes(N)) +
geom_density(alpha = 0.8, fill = "#333333") +
theme_bw() +
xlab("Genomes per domain")
# Calculate t-sne
library(Rtsne)
library(cluster)
set.seed(2016)
# Need to play with perplexity recommended values from 5 to 50
tsne <- Rtsne(
as.matrix(log10(all_domains_m + 0.001)),
check_duplicates = FALSE,
perplexity = 15,
theta = 0.5,
dims = 2,
verbose = TRUE,
max_iter = 10000
)
tsne_data <- as.data.frame(tsne$Y)
ggplot(tsne_data, aes(x=V1, y=V2)) +
geom_point(size=1.25) +
guides(colour = guide_legend(override.aes = list(size=6))) +
xlab("") + ylab("") +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank())
# Use PAM clustering to identify the clustering in the t-SNE dimension reduction
# We will use the silhoutte width to choose the number of cluster, we can use dynamic tree cutting too
# We use the default euclidean distance metric, we might change it
sil_width <- c(NA)
for(i in 2:100){
pam_fit <- pam(tsne_data,
k = i)
sil_width[i] <- pam_fit$silinfo$avg.width
}
plot(1:100, sil_width,
xlab = "Number of clusters",
ylab = "Silhouette Width")
lines(1:100, sil_width)
# Best number of clusters:
n_cls <- which(sil_width==max(sil_width, na.rm = T))
# We run pam clustering with the max number deined by the silhoutte
pam_fit <- pam(embedding, k = n_cls, keep.diss = T)
# Get medoids
med <- row.names(all_domains_m[pam_fit$id.med, ])
# Extract t-SNE coordinates
tsne_data <- tsne$Y %>%
data.frame() %>%
setNames(c("X", "Y")) %>%
mutate(cluster = factor(pam_fit$clustering),
name = rownames(all_domains_m))
# Plot t-SNE + PAM + medoids
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols1 <- rev(getPalette(n_cls))
ggplot(tsne_data, aes(x=X, y=Y, color = cluster)) +
geom_point(size=1.25) +
xlab("") + ylab("") +
ggrepel::geom_label_repel(data = tsne_data %>% filter(name %in% med), aes(X,Y, label = name), size = 2) +
scale_color_manual(values = cols1) +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
ggtitle("t-SNE: medoid genomes")
tsne_data %>%
group_by(cluster) %>%
dplyr::count()%>%
arrange(n) %>%
ggplot(aes(n)) +
geom_density(fill = "#333333", alpha = 0.7) +
theme_bw() +
xlab("Size of the clusters")
div_list <- vector(mode = "list")
for (i in unique(tsne_data$cluster)){
tsne_subset <- tsne_data %>%
filter(cluster == i)
if (dim(tsne_subset)[1] > 1){
all_domains_m_ss <- all_domains_m[tsne_subset$name,]
# Calculate diversity indices for each memeber of the cluster:
all_rich <- specnumber(all_domains_m_ss)
all_shannon <- vegan::diversity(all_domains_m_ss)
diversity <- data.frame(richness = all_rich, shannon = all_shannon)
diversity$name <- row.names(diversity)
div_list[[i]] <- diversity %>%
reshape2::melt()
}else{
all_rich <- specnumber(all_domains_m_ss)
all_shannon <- vegan::diversity(all_domains_m_ss)
diversity <- data.frame(richness = all_rich, shannon = all_shannon)
diversity$name <- tsne_subset$name
div_list[[i]] <- diversity %>%
reshape2::melt()
}
}
div_list <- bind_rows(div_list)
# Get clusters at least the median of the cluster sizes
cl_median_size <- tsne_data %>%
group_by(cluster) %>%
dplyr::count()%>%
.$n %>%
median
cl_selection <- tsne_data %>%
group_by(cluster) %>%
dplyr::count()%>%
filter(n >= cl_median_size) %>%
.$cluster %>%
droplevels()
tsne_data_selection <- tsne_data %>%
filter(cluster %in% cl_selection) %>%
mutate(medoid = ifelse(name %in% med, "TRUE", "FALSE"))
max_sh_div <- tsne_data_selection %>%
left_join(div_list) %>%
filter(variable == "shannon") %>%
group_by(cluster) %>%
filter(value == max(value)) %>%
.$name
getPalette = colorRampPalette(brewer.pal(9, "Set1"))
cols <- rev(getPalette(length(max_sh_div)))
ggplot(tsne_data, aes(x=X, y=Y)) +
geom_point(size=1.25, alpha = 0.1, color = "grey10") +
geom_point(data = tsne_data_selection, aes(x=X, y=Y, color = cluster),size=1.25) +
xlab("") + ylab("") +
ggrepel::geom_label_repel(data = tsne_data_selection %>% filter(name %in% max_sh_div), aes(X,Y, label = name, color = cluster), size = 2) +
scale_color_manual(values = cols) +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
ggtitle("t-SNE: genomes with largest Shannon domain diversity")
checkm_results <- read_tsv("~/Downloads/InBioPharm_checkm_932_genomes_06022017.txt", col_names = T, trim_ws = T) %>%
dplyr::rename(genome = `Bin Id`) %>%
mutate(genome = gsub("_scaffold-sequences", "", genome))
checkm_results_long <- checkm_results %>% melt(id = "genome") %>% tbl_df
for (i in unique(tsne_data_selection$cluster)){
tsne_subset <- tsne_data_selection %>%
filter(cluster == i)
all_domains_m_ss <- all_domains_m[tsne_subset$name,]
# Calculate diversity indices for each memeber of the cluster:
all_rich <- specnumber(all_domains_m_ss)
all_shannon <- vegan::diversity(all_domains_m_ss)
all_invsimp <- vegan::diversity(all_domains_m_ss, "invsimpson")
all_pielou <- all_shannon/log(all_rich)
diversity <- data.frame(richness = all_rich, shannon = all_shannon, invsimp = all_invsimp, eveness = all_pielou)
diversity$genome <- row.names(diversity)
diversity_long <- diversity %>%
reshape2::melt()
max_sh_div <- diversity_long %>%
filter(variable == "shannon") %>%
filter(value == max(value)) %>%
.$genome
# Plot heatmap
row_annotations <- sapply(strsplit(colnames(all_domains_m_ss), "_"), "[", 1)
mat_col <- data.frame(Domain = row_annotations)
rownames(mat_col) <- colnames(all_domains_m_ss)
m_all_domains_m_ss <- all_domains_m_ss
colnames(m_all_domains_m_ss) <- row_annotations
m_all_domains_m_ss <- melt(m_all_domains_m_ss) %>%
group_by(Var2) %>%
summarise(N=sum(value)) %>%
ungroup() %>%
mutate(prop = N/sum(N))
mat_colors <- list(Domain = brewer.pal(3, "Set1"))
names(mat_colors$Domain) <- unique(mat_col$Domain)
med_gen <- tsne_subset %>% filter(medoid == TRUE) %>% .$name
main <- paste("Cluster_", i,"; Medoid: ", med_gen,"; Max Shannon div:", max_sh_div, sep = "")
pheat <- pheatmap(
mat = log10(t(all_domains_m_ss) + 0.001),
color = viridis(100),
border_color = "white",
show_colnames = TRUE,
show_rownames = FALSE,
annotation_row = mat_col,
annotation_colors = mat_colors,
drop_levels = TRUE,
fontsize = 6,
main = main,
clustering_method = "complete",
clustering_distance_rows = "euclidean",
clustering_distance_cols = "euclidean")
t_plot <- ggplot(tsne_data, aes(x=X, y=Y)) +
geom_point(size=1.25, alpha = 0.1, color = "grey10") +
geom_point(data = tsne_subset, aes(x=X, y=Y, color = cluster),size=1.25) +
xlab("") + ylab("") +
ggrepel::geom_label_repel(data = tsne_subset %>% filter(name %in% med), aes(X,Y, label = name, color = cols1[i]), size = 2) +
theme_no_axes() +
theme(axis.text = element_blank(),
axis.ticks = element_blank(),
legend.position = "none") +
ggtitle("t-SNE: medoid genomes")
p_s <- ggplot(diversity_long %>% filter(variable == "shannon"), aes(genome, value)) +
geom_bar(stat = "identity", fill = "#333333", alpha = 0.6) +
geom_bar(data = diversity_long %>% filter(variable == "shannon", genome == max_sh_div), aes(genome, value), stat = "identity", fill = "#C02942") +
theme_bw() +
theme(axis.text = element_text(size = 6)) +
xlab("") +
ylab("Shannon diversity (bits)")
p_d <- ggplot(m_all_domains_m_ss, aes(Var2, prop)) +
geom_bar(stat = "identity") +
theme_bw() +
xlab("Domain") +
ylab("Relative abundance") +
scale_y_continuous(labels = scales::percent) +
theme(axis.text = element_text(size = 6),
axis.title = element_text(size = 8))
checkm_results_long_filt <- checkm_results_long %>% filter(genome %in% tsne_subset$name) %>%
filter(variable == "Completeness" |
variable == "Contamination" |
variable == "Strain heterogeneity" |
variable == "Genome size (bp)" |
variable == "# scaffolds" |
variable == "GC")
if (dim(checkm_results_long_filt)[1] > 0){
p_ch <- ggplot(checkm_results_long_filt, aes(genome, as.numeric(value))) +
geom_bar(stat = "identity") +
facet_wrap(~variable, scales = "free_x", strip.position = "top") +
theme_bw() +
xlab("") +
ylab("") +
theme(axis.text = element_text(size = 6),
axis.title = element_text(size = 8)) +
scale_y_continuous(labels = human_numbers) +
coord_flip()
fname <- paste("~/Downloads/tsne_cluster", i, ".png", sep = "")
#print(cowplot::plot_grid(t_plot, pheat$gtable, p_s, p_d , ncol = 2))
ggdraw() +
draw_plot(t_plot, x = 0, y = 0.6, width = .5, height = .38) +
draw_plot(pheat$gtable, x = 0.5, y = 0.6, width = .5, height = .38) +
draw_plot(p_s, x = 0, y = 0.31, width = .5, height = .29) +
draw_plot(p_d, x = 0.5, y = 0.31, width = .5, height = .29) +
draw_plot(p_ch, x = 0, y = 0, width = 1, height = .3) +
draw_plot_label(c("A", "B", "C", "D", "E"), c(0, 0.5, 0, 0.5, 0), c(1, 1, 0.6, 0.6, 0.3), size = 10)
ggsave(fname, width = 11.29, height = 11.29, units = "in")
}
}
human_numbers <- function(x = NULL, smbl ="", signif = 1){
humanity <- function(y){
if (!is.na(y)){
tn <- round(abs(y) / 1e12, signif)
b <- round(abs(y) / 1e9, signif)
m <- round(abs(y) / 1e6, signif)
k <- round(abs(y) / 1e3, signif)
if ( y >= 0 ){
y_is_positive <- ""
} else {
y_is_positive <- "-"
}
if ( k < 1 ) {
paste0( y_is_positive, smbl, round(abs(y), signif ))
} else if ( m < 1){
paste0 (y_is_positive, smbl, k , "K")
} else if (b < 1){
paste0 (y_is_positive, smbl, m ,"M")
}else if(tn < 1){
paste0 (y_is_positive, smbl, b ,"bn")
} else {
paste0 (y_is_positive, smbl, comma(tn), "tn")
}
} else if (is.na(y) | is.null(y)){
"-"
}
}
sapply(x,humanity)
}