genomewalker
10/16/2017 - 11:08 AM

Intercluster similarity comparison

library(tidyverse)

mmseqs_m8 <- read_tsv("~/Downloads/rep_aln.sorted.bh.gz", col_names = FALSE, progress = TRUE)
unk_rep_length <- read_delim("~/Downloads/unk_orf_rep_length.tsv.gz", col_names = FALSE, progress = TRUE, delim = " ") %>%
  rename(orf = X1, length = X3, cls_size = X2)

mmseqs_m8_cov <- mmseqs_m8 %>%
  select(X1, X2, X3, X4, X11, X12) %>%
  dplyr::rename(query = X1, subject = X2, id = X3, aln_len = X4, evalue = X11, bscore = X12) %>%
  separate(query, c("qcls", "qproj", "query"), extra = "drop", sep = "\\|") %>%
  separate(subject, c("scls", "sproj", "subject"), extra = "drop", sep = "\\|") %>%
  separate(qcls, c("qclass", "qcls_num"), remove = FALSE) %>%
  separate(scls, c("sclass", "scls_num"), remove = FALSE) %>%
  left_join(unk_rep_length %>% rename(query = orf)) %>%
  rename(lquery = length) %>%
  left_join(unk_rep_length %>% rename(subject = orf)) %>%
  rename(lsubject = length) %>%
  mutate(qcov = aln_len/lquery, scov = aln_len/lsubject)

mmseqs_m8_cov80 <- mmseqs_m8_cov %>%
  filter(qclass != "apc" & sclass != "apc") %>%
  #filter(qclass != "dpc" & sclass != "dpc") %>%
  filter(qcov >= 0.8, scov >= 0.8)

mmseqs_m8_cov60 %>%
  filter(qclass == "kpc" & sclass == "kpc") %>%
  select(query, subject, id, qcov, scov)  

# OLD: TARA_022_SRF_<-0.22_scaffold51013_1_-_2_1156_orf-1 
# NEW: malaspina|ICM0001MP1091_1003384_+_4879_5982_orf-8|-
#   
# OLD: TARA_046_SRF_<-0.22_scaffold11162_1_-_278_1699_orf-2
# NEW: ICM0062MP0780_1000290_-_1331_3001_orf-2


mmseqs_m8_cov80 <- mmseqs_m8_cov80 %>%
  filter(id >= 0.9) %>%
  group_by(qclass, sclass) %>%
  count() %>%
  ungroup() %>%
  mutate_each(funs(as.character), qclass, sclass) %>%
  mutate( V1 = pmin(qclass, sclass), 
          V2 = pmax(qclass, sclass)) %>%
  group_by(V1, V2) %>%
  summarise(n=sum(n)) %>%
  arrange(desc(n)) %>%
  unite(class, V1, V2, sep = "_vs_") 

mmseqs_m8_cov80$class <- factor(mmseqs_m8_cov80$class, levels = mmseqs_m8_cov80 %>% arrange(desc(n)) %>% .$class)

ggplot(mmseqs_m8_cov80, aes(class, n)) +
  geom_bar(stat = "identity") +
  theme_light() +
  scale_y_continuous(labels = scales::comma) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  xlab("") +
  ylab("# clusters")