genomewalker
2/3/2018 - 9:05 PM

Environmental unknowns vs MAGs

Environmnetal unknowns in Tom's MAGs

We will screen Tom's MAGs for environmental unknowns.

Preparing TOM's data

mkdir /scratch/eus_vs_mags
cd eus_vs_mags
wget http://files.metagenomics.eu/TARA_non_redundant_MAGs_AA.fa.gz
wget http://files.metagenomics.eu/TARA_MAGs_v3_metadata.txt
wget http://files.metagenomics.eu/orf2mag.tsv.gz
wget http://files.metagenomics.eu/tara_delmont_taxids.csv

Preparing profiles for MMSEQS2

We need the ffindex files with the HHM profiles, and convert them to MMSEQS2 format. First we copy the HHM profiles to scratch and we convert them:

cd /scratch/eus_vs_mags

cp /bioinf/projects/megx/UNKNOWNS/2017_11/classification/results/ffindex_data/marine_hmp_db_03112017_eu_hhm.ffdata marine_hmp_db_03112017_eu_hhm

cp /bioinf/projects/megx/UNKNOWNS/2017_11/classification/results/ffindex_data/marine_hmp_db_03112017_eu_hhm.ffindex marine_hmp_db_03112017_eu_hhm.index

~/opt/MMseqs2_uv/build/bin/mmseqs convertprofiledb marine_hmp_db_03112017_eu_hhm marine_hmp_db_03112017_eu_hhm_profileDB --threads 16

Searching the MAGs

We will create a DB for MMSEQS2 using the AA sequences from Tom and do a profile search:

mmseqs createdb TARA_non_redundant_MAGs_AA.fa.gz magsDB
mmseqs search magsDB marine_hmp_db_03112017_eu_hhm_profileDB mag_results tmp -c 0.8 --cov-mode 0 -e 1e-5 --threads 64
mmseqs convertalis magsDB marine_hmp_db_03112017_eu_hhm_profileDB mag_results mag_results.m8 --format-mode 2

Parse and plot the results

For plotting use the script here

library(tidyverse)
# Let's read the files
m_vs_u <- read_tsv("mags_eu_results.m8", col_names = FALSE, progress = TRUE) %>%
  select(X1, X2, X3, X11) %>%
  rename(gene_callers_id = X1, unk_id = X2, id = X3, evalue = X11) %>%
  mutate(gene_callers_id = as.character(gene_callers_id),  unk_id = as.character(unk_id)) %>%
  filter(evalue <= 1e-25)
m_genes <- read_tsv("orf2mag.tsv.gz", col_names = TRUE, progress = TRUE) %>%
  select(gene_callers_id, contig) %>%
  mutate(gene_callers_id = as.character(gene_callers_id))
mag_cdata <- read_tsv("TARA_MAGs_v3_metadata.txt", col_names = TRUE)
mag_tax <- read_csv("tara_delmont_taxids.csv", col_names = FALSE) %>%
  separate(X3,into = c("domain","phylum","class", "order", "family", "genus"), sep = ";", fill = "right", extra = "drop") %>%
  rename(MAG=X1)

# Some data massaging
# Combine tables and clean MAG names
m_vs_u_mag <- m_vs_u %>%
  left_join(m_genes) %>%
  tidyr::extract(contig, into = paste("V", 1:4, sep = ""), regex = "([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)") %>%
  tidyr::unite(col = MAG, V1,V2,V3,V4, sep = "_", remove = TRUE) 

mag_n_orfs <- m_genes %>%
  tidyr::extract(contig, into = paste("V", 1:4, sep = ""), regex = "([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)_([[:alnum:]]+)") %>%
  unite(col = MAG, V1,V2,V3,V4, sep = "_", remove = TRUE) %>%
  group_by(MAG) %>%
  count() %>%
  rename(n_orf = n)

# Count proportion of ORFs in each MAG
m_vs_u_mag_n <- m_vs_u_mag %>%
  select(MAG, unk_id) %>%
  unique() %>%
  group_by(MAG) %>%
  count() %>%
  ungroup() %>%
  #tidyr::complete(uclass, MAG, fill = list(n = 0)) %>%
  left_join(mag_n_orfs) %>%
  mutate(prop = n/n_orf) %>%
  left_join(mag_tax)

# Get the 25 more abundants
m_vs_u_mag_n_25 <- m_vs_u_mag_n %>%
  dplyr::top_n(25) %>%
  .$MAG


m_vs_u_mag_n %>% group_by(phylum) %>% count

# Plot them
ggplot(m_vs_u_mag_n , aes(MAG, n, color = phylum)) +
  geom_bar(stat = "identity") +
  theme_light() +
  xlab("") +
  ylab("# hits") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Calculate histograms
all <- ggplot(m_vs_u_mag_n, aes(prop)) +
  geom_histogram(alpha = 0.5, fill = "#2E3239") +
  theme_light() +
  ylab("Frequency") +
  xlab("Proportion of unknown ORFs") +
  scale_x_continuous(labels = scales::percent)

eupc <- ggplot(m_vs_u_mag_n %>% filter(uclass == "eupc"), aes(prop)) +
  geom_histogram(alpha = 0.5, fill = "#2E3239") +
  theme_light() +
  ylab("Frequency") +
  xlab("Proportion of env. unknowns ORFs") +
  scale_x_continuous(labels = scales::percent)

gupc <- ggplot(m_vs_u_mag_n %>% filter(uclass == "gupc"), aes(prop)) +
  geom_histogram(alpha = 0.5, fill = "#2E3239") +
  theme_light() +
  ylab("Frequency") +
  xlab("Proportion of gen. unknowns ORFs") +
  scale_x_continuous(labels = scales::percent)

# Plot them
ggpubr::ggarrange(all, gupc, eupc, nrow = 1, ncol = 3)

m_vs_u_mag %>%
  group_by(MAG, uclass) %>%
  count %>% View