genomewalker
8/29/2017 - 10:34 AM

cluster evaluation plots

cluster evaluation plots

library(tidyverse)
library(RSQLite)
library(cowplot)
library(ggrepel)

db_file <- "~/Downloads/cluster_val (2).sqlite3"
con <- dbConnect(SQLite(), db_file)
rejected <- tbl(con, "clval_rejected") %>% collect(n = Inf)
repres <- tbl(con, "clval_repres") %>% collect(n = Inf)
replic <- tbl(con, "cl_replicate") %>% collect(n = Inf)
stats <- tbl(con, "cl_alns_stat") %>% collect(n = Inf)
dbDisconnect(con)


rep_count <- replic %>% group_by(clname, target) %>% count()
rep_count_cl <- replic %>% 
  group_by(clname) %>% 
  count() %>%
  rename(dup = n)


db_file <- "~/Downloads/cluster_val_stat (1).sqlite3"
con <- dbConnect(SQLite(), db_file)
anot <- tbl(con, "clval_annot") %>% collect(n = Inf)
anot_rep <- tbl(con, "clval_rep_annot") %>% collect(n = Inf)
anot_norep <- tbl(con, "clval_norep_annot") %>% collect(n = Inf)
dbDisconnect(con)

# How many clusters do have rejected sequences
rej_desc <- data.frame(class = c("Rejected (9,528)", "Kept (185,984)"), num = c(length(unique(rejected$clname)), dim(repres)[1] - dim(rejected)[1]))


p <- ggplot(rej_desc, aes(class, num)) +
  geom_bar(stat = "identity") +
  scale_y_continuous(labels = scales::comma) +
  ylab("Number of clusters") +
  xlab("") +
  #ggtitle("Clusters with rejected ORFs (Rej = 9,528; n = 200,000)") +
  theme_light() +
  theme(plot.title = element_text(size=8))

prop_rej <- rejected %>% 
  group_by(clname) %>% 
  count() %>% 
  rename(rej = n) %>%
  left_join(repres) %>%
  mutate(prop_rej = rej/norfs) %>%
  arrange(desc(prop_rej)) %>%
  left_join(anot %>% select(-old_repres)) %>%
  left_join(bind_rows(anot_rep, anot_norep)) %>%
  mutate(annot_type = ifelse(is.na(annot_type), "None", annot_type), 
         annot_type = ifelse(annot_type == "mono_annot", "Mono", annot_type),  
         annot_type = ifelse(annot_type == "multi", "Multi", annot_type),
         annot_type = ifelse(annot_type == "singl", "Singl", annot_type),
         jacc_m_sc_raw = jacc_m_sc/prop_annot
         ) %>%
  left_join(rep_count_cl) %>%
  mutate(dup = ifelse(is.na(dup), 0, dup), prop_dup = dup/norfs) %>%
  left_join(stats)
  
 
prop_rej$annot <- factor(prop_rej$annot, levels = c("rep_annot", "norep_annot", "no_annot"))
prop_rej$annot_type <- factor(prop_rej$annot_type, levels = c("Mono", "Multi", "Singl", "None"))

prop_rej %>% filter(prop_rej > 0.1) %>% group_by(annot) %>% count()
prop_rej %>% filter(prop_rej > 0.1) %>% group_by(annot_type) %>% count()


class_names <- c(
  `rep_annot` = "Representative with PFAM (305)",
  `norep_annot` = "Representative without PFAM (76)",
  `no_annot` = "Without annotation (345)",
  `Mono` = "Mono (236)",
  `Multi` = "Multi (96)",
  `Singl` = "Singl (49)",
  `None` = "None (345)"
)


p1 <- ggplot(prop_rej, aes(prop_rej,norfs)) +
  geom_jitter(alpha = 0.5) +
  geom_rug(data = prop_rej, aes(color=annot_type), alpha = 1/2, position = "jitter") +
  theme_light() +
  xlab("Proportion of rejected ORFs per cluster") +
  ylab("Cluster size (# of ORFs)") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(labels = scales::percent)+
  facet_wrap(~annot_type, scales = "free", labeller = as_labeller(class_names), nrow = 1) +
  ggsci::scale_color_jco(name = "Cluster type", guide=FALSE)

p2 <- plyr::ldply(seq(0,1, 0.1), function(x) {data.frame(threshold = x, clusters = dim(prop_rej %>% filter(prop_rej > x))[1])}) %>%
  ggplot(aes(threshold, clusters)) +
  geom_line() +
  geom_point() +
  ggrepel::geom_label_repel(aes(label = clusters), size = 2, box.padding = unit(0.4, "lines"),point.padding = unit(0.3, "lines")) +
  theme_light() +
  xlab("Proportion of rejected ORFs per cluster") +
  ylab("Number of clusters") +
  scale_y_continuous(labels = scales::comma) +
  scale_x_continuous(breaks = seq(0,1, 0.1), labels = scales::percent)


p4 <- ggplot(prop_rej %>% filter(!is.na(mean)), aes(mean, prop_rej)) +
  geom_jitter(alpha = 0.5) +
  geom_jitter(data = prop_rej %>% filter(prop_rej > 0.1), aes(mean, prop_rej, size = norfs), color = "#C84359", alpha = 0.5) +
  geom_rug(data = prop_rej %>% filter(!is.na(mean)), aes(color=annot), alpha = 1/2, position = "jitter") +
  theme_light() +
  ylab("Proportion of rejected ORFs per cluster") +
  xlab("Average ORF similarity per cluster") +
  scale_x_continuous(labels = scales::percent) +
  scale_y_continuous(labels = scales::percent)+
  facet_wrap(~annot_type, scales = "free", labeller = as_labeller(class_names), nrow = 1) +
  ggsci::scale_color_jco(name = "Cluster type", labels = c("Rep. annot.", "Other annot", "None")) +
  scale_size_continuous(name = "# of ORFs")


p5 <- ggplot(prop_rej %>% filter(prop_rej > 0.1), aes(norfs, fill = "Rejected")) +
  geom_histogram(data = prop_rej %>% filter(prop_rej <= 0.1), aes(norfs, fill = "Kept"), color = "white", size = 0.1) +
  geom_histogram(color = "white", size = 0.1) +
  theme_light() +
  xlab("Cluster size (log10)") +
  ylab("Number of clusters") +
  scale_x_log10() +
  scale_fill_manual(values = c("#4A4A4A", "#F0D999"), name = "") +
  theme(legend.position = c(0.88, 0.85))

library(ggtern)

lines <- data.frame(x = c(0.5, 0, 0.5), 
                    y = c(0.5, 0.5, 0), 
                    z = c(0, 0.5, 0.5), 
                    xend = c(1, 1, 1)/3, 
                    yend = c(1, 1, 1)/3, 
                    zend = c(1, 1, 1)/3)

p4 <- ggtern(data=prop_rej, mapping=aes(x=mean,y=prop_rej,z=prop_dup, size = norfs)) +
  geom_segment(data = lines, 
               aes(x, y, z, 
                   xend = xend, yend = yend, zend = zend), 
               color = 'grey', size = 0.5) +
  geom_point(, alpha = 0.3) + 
  labs( x       = "",
        xarrow  = "Mean ORF similarity per cluster",
        y       = "",
        yarrow  = "Proportion of rejected ORFs per cluster",
        z       = "",
        zarrow  = "Proportion of duplicated ORFs per cluster") +
  #weight_percent() +
  facet_wrap(~annot_type, labeller = as_labeller(class_names), nrow = 1) +
  theme_light() +
  theme_showarrows() + 
  #theme_clockwise() +
  theme_nomask() 

+
  #geom_text(data=prop_rej %>% filter(!is.na(jacc_m_sc)), aes(label = clname)) +
  theme(
    strip.background =
  )

ggdraw() + 
  draw_plot(p, x = 0, y = .5, width = .33, height = .5) +
  draw_plot(p2, x = .33, y = .5, width = .33, height = .5) +
  draw_plot(p5, x = .66, y = .5, width = .33, height = .5) +
  draw_plot(p4, x = 0, y = 0, width = 1, height = .5) +
  draw_plot_label(label = c("a", "b", "c", "d"), size = 14,
                  x = c(0, 0.33, 0.66, 0), y = c(1, 1, 1, 0.53))