cluster evaluation plots
library(tidyverse)
library(RSQLite)
library(cowplot)
library(ggrepel)
db_file <- "~/Downloads/cluster_val (2).sqlite3"
con <- dbConnect(SQLite(), db_file)
rejected <- tbl(con, "clval_rejected") %>% collect(n = Inf)
repres <- tbl(con, "clval_repres") %>% collect(n = Inf)
replic <- tbl(con, "cl_replicate") %>% collect(n = Inf)
stats <- tbl(con, "cl_alns_stat") %>% collect(n = Inf)
dbDisconnect(con)
rep_count <- replic %>% group_by(clname, target) %>% count()
rep_count_cl <- replic %>%
group_by(clname) %>%
count() %>%
rename(dup = n)
db_file <- "~/Downloads/cluster_val_stat (1).sqlite3"
con <- dbConnect(SQLite(), db_file)
anot <- tbl(con, "clval_annot") %>% collect(n = Inf)
anot_rep <- tbl(con, "clval_rep_annot") %>% collect(n = Inf)
anot_norep <- tbl(con, "clval_norep_annot") %>% collect(n = Inf)
dbDisconnect(con)
# How many clusters do have rejected sequences
rej_desc <- data.frame(class = c("Rejected (9,528)", "Kept (185,984)"), num = c(length(unique(rejected$clname)), dim(repres)[1] - dim(rejected)[1]))
p <- ggplot(rej_desc, aes(class, num)) +
geom_bar(stat = "identity") +
scale_y_continuous(labels = scales::comma) +
ylab("Number of clusters") +
xlab("") +
#ggtitle("Clusters with rejected ORFs (Rej = 9,528; n = 200,000)") +
theme_light() +
theme(plot.title = element_text(size=8))
prop_rej <- rejected %>%
group_by(clname) %>%
count() %>%
rename(rej = n) %>%
left_join(repres) %>%
mutate(prop_rej = rej/norfs) %>%
arrange(desc(prop_rej)) %>%
left_join(anot %>% select(-old_repres)) %>%
left_join(bind_rows(anot_rep, anot_norep)) %>%
mutate(annot_type = ifelse(is.na(annot_type), "None", annot_type),
annot_type = ifelse(annot_type == "mono_annot", "Mono", annot_type),
annot_type = ifelse(annot_type == "multi", "Multi", annot_type),
annot_type = ifelse(annot_type == "singl", "Singl", annot_type),
jacc_m_sc_raw = jacc_m_sc/prop_annot
) %>%
left_join(rep_count_cl) %>%
mutate(dup = ifelse(is.na(dup), 0, dup), prop_dup = dup/norfs) %>%
left_join(stats)
prop_rej$annot <- factor(prop_rej$annot, levels = c("rep_annot", "norep_annot", "no_annot"))
prop_rej$annot_type <- factor(prop_rej$annot_type, levels = c("Mono", "Multi", "Singl", "None"))
prop_rej %>% filter(prop_rej > 0.1) %>% group_by(annot) %>% count()
prop_rej %>% filter(prop_rej > 0.1) %>% group_by(annot_type) %>% count()
class_names <- c(
`rep_annot` = "Representative with PFAM (305)",
`norep_annot` = "Representative without PFAM (76)",
`no_annot` = "Without annotation (345)",
`Mono` = "Mono (236)",
`Multi` = "Multi (96)",
`Singl` = "Singl (49)",
`None` = "None (345)"
)
p1 <- ggplot(prop_rej, aes(prop_rej,norfs)) +
geom_jitter(alpha = 0.5) +
geom_rug(data = prop_rej, aes(color=annot_type), alpha = 1/2, position = "jitter") +
theme_light() +
xlab("Proportion of rejected ORFs per cluster") +
ylab("Cluster size (# of ORFs)") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(labels = scales::percent)+
facet_wrap(~annot_type, scales = "free", labeller = as_labeller(class_names), nrow = 1) +
ggsci::scale_color_jco(name = "Cluster type", guide=FALSE)
p2 <- plyr::ldply(seq(0,1, 0.1), function(x) {data.frame(threshold = x, clusters = dim(prop_rej %>% filter(prop_rej > x))[1])}) %>%
ggplot(aes(threshold, clusters)) +
geom_line() +
geom_point() +
ggrepel::geom_label_repel(aes(label = clusters), size = 2, box.padding = unit(0.4, "lines"),point.padding = unit(0.3, "lines")) +
theme_light() +
xlab("Proportion of rejected ORFs per cluster") +
ylab("Number of clusters") +
scale_y_continuous(labels = scales::comma) +
scale_x_continuous(breaks = seq(0,1, 0.1), labels = scales::percent)
p4 <- ggplot(prop_rej %>% filter(!is.na(mean)), aes(mean, prop_rej)) +
geom_jitter(alpha = 0.5) +
geom_jitter(data = prop_rej %>% filter(prop_rej > 0.1), aes(mean, prop_rej, size = norfs), color = "#C84359", alpha = 0.5) +
geom_rug(data = prop_rej %>% filter(!is.na(mean)), aes(color=annot), alpha = 1/2, position = "jitter") +
theme_light() +
ylab("Proportion of rejected ORFs per cluster") +
xlab("Average ORF similarity per cluster") +
scale_x_continuous(labels = scales::percent) +
scale_y_continuous(labels = scales::percent)+
facet_wrap(~annot_type, scales = "free", labeller = as_labeller(class_names), nrow = 1) +
ggsci::scale_color_jco(name = "Cluster type", labels = c("Rep. annot.", "Other annot", "None")) +
scale_size_continuous(name = "# of ORFs")
p5 <- ggplot(prop_rej %>% filter(prop_rej > 0.1), aes(norfs, fill = "Rejected")) +
geom_histogram(data = prop_rej %>% filter(prop_rej <= 0.1), aes(norfs, fill = "Kept"), color = "white", size = 0.1) +
geom_histogram(color = "white", size = 0.1) +
theme_light() +
xlab("Cluster size (log10)") +
ylab("Number of clusters") +
scale_x_log10() +
scale_fill_manual(values = c("#4A4A4A", "#F0D999"), name = "") +
theme(legend.position = c(0.88, 0.85))
library(ggtern)
lines <- data.frame(x = c(0.5, 0, 0.5),
y = c(0.5, 0.5, 0),
z = c(0, 0.5, 0.5),
xend = c(1, 1, 1)/3,
yend = c(1, 1, 1)/3,
zend = c(1, 1, 1)/3)
p4 <- ggtern(data=prop_rej, mapping=aes(x=mean,y=prop_rej,z=prop_dup, size = norfs)) +
geom_segment(data = lines,
aes(x, y, z,
xend = xend, yend = yend, zend = zend),
color = 'grey', size = 0.5) +
geom_point(, alpha = 0.3) +
labs( x = "",
xarrow = "Mean ORF similarity per cluster",
y = "",
yarrow = "Proportion of rejected ORFs per cluster",
z = "",
zarrow = "Proportion of duplicated ORFs per cluster") +
#weight_percent() +
facet_wrap(~annot_type, labeller = as_labeller(class_names), nrow = 1) +
theme_light() +
theme_showarrows() +
#theme_clockwise() +
theme_nomask()
+
#geom_text(data=prop_rej %>% filter(!is.na(jacc_m_sc)), aes(label = clname)) +
theme(
strip.background =
)
ggdraw() +
draw_plot(p, x = 0, y = .5, width = .33, height = .5) +
draw_plot(p2, x = .33, y = .5, width = .33, height = .5) +
draw_plot(p5, x = .66, y = .5, width = .33, height = .5) +
draw_plot(p4, x = 0, y = 0, width = 1, height = .5) +
draw_plot_label(label = c("a", "b", "c", "d"), size = 14,
x = c(0, 0.33, 0.66, 0), y = c(1, 1, 1, 0.53))