library(tidyverse)
mmseqs_m8 <- read_tsv("~/Downloads/rep_aln.sorted.bh.gz", col_names = FALSE, progress = TRUE)
unk_rep_length <- read_delim("~/Downloads/unk_orf_rep_length.tsv.gz", col_names = FALSE, progress = TRUE, delim = " ") %>%
rename(orf = X1, length = X3, cls_size = X2)
mmseqs_m8_cov <- mmseqs_m8 %>%
select(X1, X2, X3, X4, X11, X12) %>%
dplyr::rename(query = X1, subject = X2, id = X3, aln_len = X4, evalue = X11, bscore = X12) %>%
separate(query, c("qcls", "qproj", "query"), extra = "drop", sep = "\\|") %>%
separate(subject, c("scls", "sproj", "subject"), extra = "drop", sep = "\\|") %>%
separate(qcls, c("qclass", "qcls_num"), remove = FALSE) %>%
separate(scls, c("sclass", "scls_num"), remove = FALSE) %>%
left_join(unk_rep_length %>% rename(query = orf)) %>%
rename(lquery = length) %>%
left_join(unk_rep_length %>% rename(subject = orf)) %>%
rename(lsubject = length) %>%
mutate(qcov = aln_len/lquery, scov = aln_len/lsubject)
mmseqs_m8_cov80 <- mmseqs_m8_cov %>%
filter(qclass != "apc" & sclass != "apc") %>%
#filter(qclass != "dpc" & sclass != "dpc") %>%
filter(qcov >= 0.8, scov >= 0.8)
mmseqs_m8_cov60 %>%
filter(qclass == "kpc" & sclass == "kpc") %>%
select(query, subject, id, qcov, scov)
# OLD: TARA_022_SRF_<-0.22_scaffold51013_1_-_2_1156_orf-1
# NEW: malaspina|ICM0001MP1091_1003384_+_4879_5982_orf-8|-
#
# OLD: TARA_046_SRF_<-0.22_scaffold11162_1_-_278_1699_orf-2
# NEW: ICM0062MP0780_1000290_-_1331_3001_orf-2
mmseqs_m8_cov80 <- mmseqs_m8_cov80 %>%
filter(id >= 0.9) %>%
group_by(qclass, sclass) %>%
count() %>%
ungroup() %>%
mutate_each(funs(as.character), qclass, sclass) %>%
mutate( V1 = pmin(qclass, sclass),
V2 = pmax(qclass, sclass)) %>%
group_by(V1, V2) %>%
summarise(n=sum(n)) %>%
arrange(desc(n)) %>%
unite(class, V1, V2, sep = "_vs_")
mmseqs_m8_cov80$class <- factor(mmseqs_m8_cov80$class, levels = mmseqs_m8_cov80 %>% arrange(desc(n)) %>% .$class)
ggplot(mmseqs_m8_cov80, aes(class, n)) +
geom_bar(stat = "identity") +
theme_light() +
scale_y_continuous(labels = scales::comma) +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
xlab("") +
ylab("# clusters")