genomewalker
6/28/2017 - 9:53 PM

Tiny scripts to create amino acid sequences with different types of mutations

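unknown_clust.sh: the MMseqs clustering workflow invoked in the usage example at the end.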
#!/bin/bash
set -e
set -x
#[ -z "$MMDIR" ] && echo "Please set the environment variable \$MMDIR to your MMSEQS installation directory." && exit 1;
[ "$#" -lt 2  ] && echo "Please provide <sequenceDB> <outDir>"  && exit 1;
[ ! -f "$1"   ] && echo "Sequence database $1 not found!"       && exit 1;
[   -d "$2"   ] && echo "Output directory $2 exists already!"   && exit 1;

function abspath() {
    if [ -d "$1" ]; then
        (cd "$1"; pwd)
    elif [ -f "$1" ]; then
        if [[ $1 == */* ]]; then
            echo "$(cd "${1%/*}"; pwd)/${1##*/}"
        else
            echo "$(pwd)/$1"
        fi
    fi
}

export OMP_NUM_THREADS=4
MMSEQSEXE="mmseqs"
RELEASE="${3:-$(gdate "+%Y_%m")}"
SHORTRELEASE="${4:-$(gdate "+%y%m")}"

INPUT=$1
OUTDIR=$2/$RELEASE

TMPDIR=$OUTDIR/tmp
mkdir -p "$TMPDIR"

OUTDIR=$(abspath "$OUTDIR")
TMPDIR=$(abspath "$TMPDIR")

COMMON="${COMMON:-}" # extra options shared by all parameter sets below (empty unless set in the environment)
PREFILTER_COMMON="$COMMON --diag-score 1 --min-ungapped-score 15 --spaced-kmer-mode 1 --max-seq-len 32768 --split-mode 1"
PREFILTER0_PAR="--max-seqs 20  --comp-bias-corr 0 --k-score 145 ${PREFILTER_COMMON}"
PREFILTER1_PAR="--max-seqs 100 --comp-bias-corr 1 --k-score 135 ${PREFILTER_COMMON}"
PREFILTER2_PAR="--max-seqs 300 --comp-bias-corr 1 --k-score 90 -k 6 ${PREFILTER_COMMON}"
ALIGNMENT_COMMON="$COMMON -e 0.001 --max-seq-len 32768 --max-rejected 2147483647"
ALIGNMENT0_PAR="--max-seqs 20  -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 0  ${ALIGNMENT_COMMON}"
ALIGNMENT1_PAR="--max-seqs 100 -c 0.9 --alignment-mode 2 --min-seq-id 0.9 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
ALIGNMENT2_PAR="--max-seqs 300 -c 0.8 --alignment-mode 3 --min-seq-id 0.3 --comp-bias-corr 1 ${ALIGNMENT_COMMON}"
CLUSTER0_PAR="--cluster-mode 2"
CLUSTER1_PAR="--cluster-mode 0"
CLUSTER2_PAR="--cluster-mode 0"
SEARCH_PAR="$COMMON --profile --k-score 100"
CSTRANSLATE_PAR="-x 0.3 -c 4 -A $HHLIB/data/cs219.lib -D $HHLIB/data/context_data.lib -I ca3m -f -b"
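# Note: SEARCH_PAR and CSTRANSLATE_PAR are defined but not used in the steps below;
# CSTRANSLATE_PAR would additionally require $HHLIB to point at an HH-suite installation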

SEQUENCE_DB="$OUTDIR/aaprot_db"

printf "###############################\n"
printf "#### Creating DB\n"
printf "###############################\n"

#export OMP_PROC_BIND=true
export OMP_NUM_THREADS=4
# we split all sequences that are above 14k in N/14k parts
mmseqs createdb "$INPUT" "${SEQUENCE_DB}" --max-seq-len 32768

gdate --rfc-3339=seconds
printf "###############################\n"
printf "#### Filtering redundancy\n"
printf "###############################\n"

# filter redundancy
INPUT="${SEQUENCE_DB}"
mmseqs clusthash $INPUT "$TMPDIR/aln_redundancy" --min-seq-id 0.9 --threads 4
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_redundancy" "$TMPDIR/clu_redundancy" ${CLUSTER1_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_redundancy.index" > "$TMPDIR/order_redundancy"
mmseqs createsubdb "$TMPDIR/order_redundancy" $INPUT "$TMPDIR/input_step0"

gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Cluster at 90%% \n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# go down to 90% and merge fragments (accept fragment if dbcov >= 0.95 && seqId >= 0.9)
STEP=0
INPUT="$TMPDIR/input_step0"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER0_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT0_PAR} --threads 4
gdate --rfc-3339=seconds
export OMP_NUM_THREADS=4
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER0_PAR} --threads 4
gdate --rfc-3339=seconds
gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step1"

gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=16
# go down to 90% (this step is needed to create big clusters)
STEP=1
INPUT="$TMPDIR/input_step1"
mmseqs prefilter "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" ${PREFILTER1_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align "$INPUT" "$INPUT" "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT1_PAR} --threads 4
gdate --rfc-3339=seconds
#export OMP_NUM_THREADS=64
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_step$STEP" ${CLUSTER1_PAR} --threads 4

gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Merge redundancy to 90%% clustering\n"
printf "###############################\n"

# create database aaclust 90% (we need to merge redundancy, step_0 and step_1)
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust90_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1
gdate --rfc-3339=seconds

gawk '{ print $1 }' "$TMPDIR/clu_step$STEP.index" > "$TMPDIR/order_step$STEP"
mmseqs createsubdb "$TMPDIR/order_step$STEP" $INPUT "$TMPDIR/input_step2"

#export OMP_NUM_THREADS=16
# now we cluster down to 30% sequence id to produce a 30% and 50% clustering
STEP=2
INPUT=$TMPDIR/input_step2
gdate --rfc-3339=seconds
mmseqs prefilter $INPUT $INPUT "$TMPDIR/pref_step$STEP" ${PREFILTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs align $INPUT $INPUT "$TMPDIR/pref_step$STEP" "$TMPDIR/aln_step$STEP" ${ALIGNMENT2_PAR} --threads 4
gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Cluster at 50%%\n"
printf "###############################\n"
#export OMP_NUM_THREADS=64
# cluster down to 50%
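# keep only alignments whose sequence identity (column 3 of the alignment DB,
# selected by --filter-column 3) is at least 0.5, i.e. 0.5xx-0.9xx or 1.000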
mmseqs filterdb "$TMPDIR/aln_step$STEP" "$TMPDIR/aln_aaclust50" \
    --filter-column 3 --filter-regex '(0\.[5-9][0-9]{2}|1\.000)' --threads 4
gdate --rfc-3339=seconds
mmseqs clust $INPUT "$TMPDIR/aln_aaclust50" "$TMPDIR/clu_aaclust50" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust50_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust50
gdate --rfc-3339=seconds

printf "###############################\n"
printf  "#### Cluster at 30%%\n"
printf "###############################\n"

# cluster down to 30%
mmseqs clust $INPUT "$TMPDIR/aln_step$STEP" "$TMPDIR/clu_aaclust30" ${CLUSTER2_PAR} --threads 4
gdate --rfc-3339=seconds
mmseqs mergeclusters "${SEQUENCE_DB}" $OUTDIR/aaclust30_$RELEASE \
    "$TMPDIR/clu_redundancy" $TMPDIR/clu_step0 $TMPDIR/clu_step1 $TMPDIR/clu_aaclust30
gdate --rfc-3339=seconds

printf "###############################\n"
printf "#### Prepare output\n"
printf "###############################\n"
#export OMP_NUM_THREADS=16
# generate the final aaclust output: the _seed and _consensus FASTA files and the .tsv
# also generate the profiles needed for the aaboost
for i in 30 50 90; do
    mmseqs result2profile "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_profile" --threads 16
    ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h"
    ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_h.index"
    ln -sf "${SEQUENCE_DB}_h" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h"
    ln -sf "${SEQUENCE_DB}_h.index" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus_h.index"
    #fixme: won't work with updating
    mmseqs mergedbs "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$OUTDIR/aaprot_db_h" "$OUTDIR/aaprot_db" --prefixes ">"
    rm -f "$TMPDIR/aaclust${i}_${RELEASE}_seed.index"

    gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_seed"

    mmseqs summarizeheaders "${SEQUENCE_DB}_h" "${SEQUENCE_DB}_h" "$OUTDIR/aaclust${i}_${RELEASE}" "$TMPDIR/aaclust${i}_${RELEASE}_summary" --summary-prefix "${i}-${SHORTRELEASE}"
    mmseqs mergedbs "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_summary" "$TMPDIR/aaclust${i}_${RELEASE}_profile_consensus" --prefixes $'>\n'
    rm -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus.index"
    gsed -i 's/\x0//g' "$TMPDIR/aaclust${i}_${RELEASE}_consensus"

    mmseqs createtsv "${SEQUENCE_DB}" "${SEQUENCE_DB}" "$OUTDIR/aaclust${i}_$RELEASE" "$TMPDIR/aaclust${i}_$RELEASE.tsv"
    mv -f "$TMPDIR/aaclust${i}_${RELEASE}_seed" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
    mv -f "$TMPDIR/aaclust${i}_${RELEASE}_consensus" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta"

    gtar -cv --use-compress-program=pigz --show-transformed-names --transform "s|${TMPDIR:1}/|aaclust${i}_${RELEASE}/|g" -f "$OUTDIR/aaclust${i}_${RELEASE}.tar.gz" "$TMPDIR/aaclust${i}_$RELEASE.tsv" "$TMPDIR/aaclust${i}_${RELEASE}_consensus.fasta" "$TMPDIR/aaclust${i}_${RELEASE}_seed.fasta"
done
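
plot_mutants.r: plots cluster composition and compares different ways of picking a cluster representative.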
#!/usr/bin/env Rscript
suppressMessages(library("optparse", quietly = TRUE))
suppressMessages(library(tidyverse, quietly = TRUE))
suppressMessages(library(igraph, quietly = TRUE))
suppressMessages(library(pbmcapply, quietly = TRUE))
suppressMessages(library(cowplot, quietly = TRUE))
options(warn=-1)

option_list = list(
  make_option(c("-a", "--aln"), type="character", default=NULL, 
              help="cluster alignment file", metavar="character"),
  make_option(c("-s", "--stats"), type="character", default=NULL, 
              help="cluster statistic file", metavar="character"),
  make_option(c("-n", "--namefile"), type="character", default=NULL, 
              help="Filename to save", metavar="character")
)

opt_parser = OptionParser(option_list=option_list);
opt = parse_args(opt_parser);

if (is.null(opt$aln) || is.null(opt$stats) || is.null(opt$namefile)){
  print_help(opt_parser)
  stop("The --aln, --stats and --namefile arguments must all be supplied\n", call.=TRUE)
}

# Binary-search for the highest edge-weight threshold that still keeps the
# graph connected; returns the trimmed graph and the index of the chosen weight.
binary_search_filter_1 <- function(g = NULL, low = NULL, high = NULL, w = NULL) {
  if (high < low) {
    return(NULL)
  } else {
    mid <- floor((low + high) / 2)
    x <- delete.edges(g, which(E(g)$weight <= w[mid]))

    if ((mid - low) == 0) {
      x <- delete.edges(g, which(E(g)$weight < w[mid - 1]))
      return(list(weight = mid - 1, graph = x))
    }

    if ((high - low) == 0) {
      x <- delete.edges(g, which(E(g)$weight < w[mid + 1]))
      return(list(weight = mid, graph = x))
    }

    if (!is.connected(x)) {
      # graph fell apart: search the lower half of the weights
      binary_search_filter_1(g, low, mid - 1, w)
    } else {
      # still connected: search the upper half
      binary_search_filter_1(g, mid + 1, high, w)
    }
  }
}

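# For a single cluster X: build the member-alignment graph, trim it to the
# highest weight threshold that keeps it connected, and record which member
# each centrality measure (strength, betweenness, PageRank, eigenvector
# centrality, closeness) would pick as representative, next to the MMseqs one.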
get_refs <- function(X, aln, data){

  if ((data %>% filter(cls_num == X) %>% .$num_mem >= 2) & (data %>% filter(cls_num == X) %>% .$num_ref > 0)) {
    
    ref <- data %>% filter(cls_num == X) %>% .$reference
    
    l <- aln %>% 
      filter(cls == X) %>%
      graph_from_data_frame(directed = FALSE) %>%
      simplify(remove.multiple = TRUE, remove.loops = TRUE, edge.attr.comb = "max")
    weights <- unique(sort(E(l)$weight, decreasing = FALSE))
    if (length(weights) > 0){
      l <- binary_search_filter_1(g = l, low = 1, high = length(weights), w = weights)$graph
      s <- strength(l) %>% sort %>% tail(1) %>% names()
      b <- betweenness(l, weights = 1/E(l)$weight, directed = FALSE, normalized = TRUE) %>% sort %>% tail(1) %>% names()
      p <- page_rank(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
      e <- eigen_centrality(l, weights = E(l)$weight, directed = FALSE)$vector %>% sort %>% tail(1) %>% names()
      c <- closeness(l, weights = 1/E(l)$weight, normalized = TRUE) %>% sort %>% tail(1) %>% names()
      d <- data.frame(cls = X, cl_ref = ref, s_ref = s, b_ref = b, pr_ref = p, e_ref = e, c_ref = c, d = graph.density(l))
      return(d)
    }else{
      d <- data.frame(cls = X, cl_ref = ref, s_ref = NA, b_ref = NA, pr_ref = NA, e_ref = NA, c_ref = NA, d = NA)
      return(d)
    }
  }
  # }else{
  #   d <- data.frame(cls = X, cl_ref = ref, s_ref = NA, b_ref = NA, pr_ref = NA, e_ref = NA, d = NA)
  # }
}

suppressMessages(df <- read_tsv(opt$stats, col_names = T))
suppressMessages(alns <- read_tsv(opt$aln, col_names = F))

alns <- alns %>%
  select(X2,X3,X4, X14) %>%
  rename(node1 = X2, node2 = X3, weight = X4, cls = X14) %>%
  filter(!is.na(cls))
  

# Size of clusters and mix

cat("Number of clusters:", df %>% .$cls_num %>% length,
"\nNumber of singletons:", df %>% filter(num_mem == 1) %>% .$cls_num %>% length,
"\nNumber of clusters > 1 members:", df %>% filter(num_mem > 1) %>% .$cls_num %>% length, "\n")

df_sing <- df %>% 
  filter(num_mem == 1) %>% 
  select(starts_with('num')) %>% 
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>% 
  gather() %>%
  mutate(key = gsub('num_',"", key)) %>% 
  mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N=sum(value)) %>%
  mutate(class = "Singletons")


df_no_ref <- df %>% 
  filter(num_mem > 1, num_ref < 1) %>% 
  select(starts_with('num')) %>% 
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>% 
  gather() %>%
  mutate(key = gsub('num_',"", key)) %>% 
  mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N=sum(value)) %>%
  mutate(class = "Without seeds")


df_ref <- df %>% 
  filter(num_mem > 1, num_ref > 0) %>% 
  select(starts_with('num')) %>% 
  select(-num_mem, -num_ref, -num_sub, -num_ins, -num_del, -num_all, -num_diff) %>% 
  gather() %>%
  mutate(key = gsub('num_',"", key)) %>% 
  mutate(Identity = paste(gsub('ins|sub|del|all',"", key), "%", sep = ""), type = gsub('90|50|30',"", key)) %>%
  select(-key) %>%
  group_by(Identity, type) %>%
  summarise(N=sum(value)) %>%
  mutate(class = "With seeds")

df_mut <- bind_rows(df_sing, df_no_ref, df_ref)

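# generate_mutants.py used mutation rates of 0.1, 0.5 and 0.7 for the 90/50/30
# sets, so relabel the identities as the fraction of modified residues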
df_mut$Identity <- plyr::mapvalues(df_mut$Identity, from = c("90%", "50%", "30%"), to = c("10%", "50%", "70%"))

df_mut$Identity <- factor(df_mut$Identity, levels = c("10%", "50%", "70%"))
p1 <- ggplot(df_mut %>% filter(class == "Singletons"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00")) +
  guides(fill=guide_legend(title="Modified\nresidues"))

p2 <- ggplot(df_mut %>% filter(class == "Without seeds"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))

p3 <- ggplot(df_mut %>% filter(class == "With seeds"), aes(type, N, fill = Identity)) +
  geom_bar(stat = "identity") +
  theme_light() +
  ylab("Number of ORFs") +
  xlab("Type of mutation") +
  facet_wrap(~class) +
  scale_fill_manual(values = c("#FF0000", "#00A08A", "#F2AD00"))

prow <- plot_grid( p1 + theme(legend.position="none"),
                   p2 + theme(legend.position="none"),
                   p3 + theme(legend.position="none"),
                   align = 'vh',
                   labels = c("A", "B", "C"),
                   hjust = -1,
                   nrow = 1
)

legend <- get_legend(p1)

p <- plot_grid( prow, legend, rel_widths = c(3, .3))
fname <- paste(opt$namefile, "_seed.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")


### 991 267
#432 236

p1 <- bind_rows(df %>% 
  filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"), 
  df %>% 
    filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
  ggplot(aes(num_diff, fill = class)) +
  geom_bar(alpha = 0.8) +
  theme_light() +
  xlab("Different ORFs per cluster") +
  ylab("Number of clusters") +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = c("#C84359", "#4A4A4A"))

p2 <- bind_rows(df %>% 
                  filter(num_mem > 1, num_ref < 1) %>% mutate(class = "Without seeds"), 
                df %>% 
                  filter(num_mem > 1, num_ref >= 1) %>% mutate(class = "With seeds")) %>%
  ggplot(aes(median_iden, fill = class)) +
  geom_density(alpha = 0.8) +
  theme_light() +
  ylab("Density") +
  xlab("Median similarity") +
  theme(legend.title = element_blank()) +
  scale_fill_manual(values = c("#C84359", "#4A4A4A"))


prow <- plot_grid( p1 + theme(legend.position="none"),
                   p2 + theme(legend.position="none"),
                   align = 'vh',
                   labels = c("A", "B"),
                   hjust = -1,
                   nrow = 1
)


legend <- get_legend(p1)

p <- plot_grid(prow, legend, rel_widths = c(3, .5))


fname <- paste(opt$namefile, "_medsim.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, width = 11, height = 3, units = "in", device = "pdf")


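# compute the alternative cluster representatives for every cluster in parallel (4 cores)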
k <- pbmclapply(X = df$cls_num, get_refs, aln = alns, data = df, mc.cores = 4, ignore.interactive = TRUE)

k <- bind_rows(k) %>% tbl_df()

d <- data_frame(
  MMSEQS = k %>% filter(grepl("ref", cl_ref)) %>% .$cls %>% length(),
  Strength = k %>% filter(grepl("ref", s_ref)) %>% .$cls %>% length(),
  Betweenness = k %>% filter(grepl("ref", b_ref)) %>% .$cls %>% length(),
  Pagerank = k %>% filter(grepl("ref", pr_ref)) %>% .$cls %>% length(),
  Eigenvector = k %>% filter(grepl("ref", e_ref)) %>% .$cls %>% length(),
  Closeness = k %>% filter(grepl("ref", c_ref)) %>% .$cls %>% length()
) %>% gather %>% mutate(Proportion = value/dim(k)[1])

d$key <- factor(d$key, levels = d %>% arrange(desc(value)) %>% .$key)

#581 384
p <- ggplot(d, aes(key, Proportion)) +
  geom_bar(stat = "identity", fill = "#4A4A4A") +
  theme_light() +
  xlab("Representative method") +
  geom_text(aes(label=value), hjust=0.5, vjust=-.3, size = 3) +
  scale_y_continuous(labels = scales::percent) +
  ggtitle("Seed sequences as cluster representative") +
  theme(plot.title = element_text(size = 10, face = "bold"))
  
fname <- paste(opt$namefile, "_repmet.pdf", sep = "")
fname <- file.path(getwd(), fname)
ggsave(filename = fname, plot = p, device = "pdf", width = 8, height = 4, units = "in")

#k %>% mutate(same_ref = ifelse(cl_ref == e_ref, TRUE, FALSE)) %>% group_by(same_ref) %>% count

# k %>%
#   ggplot(aes(d)) +
#   geom_density(fill = "#4A4A4A") +
#   theme_light() +
#   ylab("Density") +
#   xlab("Graph density")
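
What looks like an earlier version of parse_clusters.py; it lacks the per-mutation-type counts that plot_mutants.r expects.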
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import *


def get_args():
    '''This function parses and return arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script to parse mmseqs tsv files')
    # Add arguments
    parser.add_argument(
        '-t', '--tsvfile',
        type=str,
        help='MMSEQS tsv file',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Output name',
        required=True)
    parser.add_argument(
        '-f', '--fastafile',
        type=str,
        help='Fasta file with ORFs used for clustering',
        required=True)
    # Array for all arguments passed to script
    args = parser.parse_args()
    # Assign args to variables
    infile = args.tsvfile
    outfile = args.output
    fasta_file = args.fastafile
    # Return all variable values
    return infile, outfile, fasta_file


class MyCounter(Counter):
    def __str__(self):
        return ",".join('{}:{}'.format(k, v) for k, v in self.items())


def create_db(f_file, outfile):
    command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']

    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def search_db(outfile):
    tmp = outfile + '_tmp'
    os.mkdir(tmp)
    command = ['mmseqs',
               'search',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               tmp, '-a',
               '--comp-bias-corr', '0',
               '-e', '10',
               '--threads', '4'
               ]
    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def get_alns(outfile):
    command = ['mmseqs',
               'convertalis',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               outfile + '_seqDB.aln.m8'
               ]
    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def parse_alns(n_cls, values, outfile):
    df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
    mean = df.iloc[:, 2].mean()
    median = df.iloc[:, 2].median()
    mad = robust.mad(df.iloc[:, 2], c=1)

    df.insert(12, '', 'cls_' + str(n_cls))

    return df, mean, median, mad


def clean_db(filename):
    try:
        for fl in glob.glob(filename):
            os.remove(fl)
    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise exception if a different error occurred


def clean_tmp(outfile):
    try:
        shutil.rmtree(outfile + '_tmp')
    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise exception if a different error occurred


def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
    # write elements to file
    tf_name = outfile + "_cls_tmp.fasta"
    tmp_fasta = open(tf_name, "w")
    for k in values:
        SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
    tmp_fasta.close()

    clean_tmp(outfile)
    clean_db(outfile + "_seqDB*")
    create_db(tf_name, outfile)
    search_db(outfile)
    get_alns(outfile)
    df, mean, median, mad = parse_alns(n_cls, values, outfile)

    return df, mean, median, mad


def create_seq_dic(fasta_file, seq_dic):
    seq_dic = SeqIO.index(fasta_file, "fasta")
    return seq_dic


if __name__ == "__main__":

    infile, outfile, fasta_file = get_args()

    d1 = defaultdict(list)
    dic = defaultdict(list)
    sdic = create_seq_dic(fasta_file, dic)
    n_cls = 0
    pd_all = []

    with open(infile) as f:
        for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])

    # sort clusters by size, largest first
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda kv: len(kv[1]),
                            reverse=True))

    with open(outfile + ".tsv", 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
        header = ["cls_num", "num_mem", "reference",
                  "num_diff", "diff_orf",
                  "mean_iden", "median_iden", "mad_iden",
                  "members"]

        writer.writerow(header)
        # t = trange(len(d1), desc='cls_' + n_cls, leave=False)
        for key, value in tqdm(d1.items(), desc='Processing', leave=False):
            n_cls = n_cls + 1

            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
            if len(value) > 1:
                df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
                pd_all.append(df)
            else:
                df = mean = median = mad = 'NA'

            writer.writerow(["cls_" + str(n_cls), len(value), key, len(counts),
                             MyCounter(counts), mean, median, mad,
                             ','.join(value)])
        df = pd.concat(pd_all, ignore_index=True)
        df.to_csv(outfile + "_alns.tsv", sep='\t')
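
parse_clusters.py: parses the MMseqs cluster TSV, re-aligns the members of each cluster, and writes the per-cluster statistics table used by plot_mutants.r.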
import argparse
from collections import defaultdict, OrderedDict, Counter
import csv
import re
from Bio import SeqIO
import subprocess
import os
import errno
import glob
import shutil
from statsmodels import robust
import pandas as pd
from tqdm import *


def get_args():
    '''This function parses and return arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script to parse mmseqs tsv files')
    # Add arguments
    parser.add_argument(
        '-t', '--tsvfile',
        type=str,
        help='MMSEQS tsv file',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Output name',
        required=True)
    parser.add_argument(
        '-f', '--fastafile',
        type=str,
        help='Fasta file with ORFs used for clustering',
        required=True)
    # Array for all arguments passed to script
    args = parser.parse_args()
    # Assign args to variables
    infile = args.tsvfile
    outfile = args.output
    fasta_file = args.fastafile
    # Return all variable values
    return infile, outfile, fasta_file


class MyCounter(Counter):
    def __str__(self):
        return ",".join('{}:{}'.format(k, v) for k, v in self.items())


def create_db(f_file, outfile):
    command = ['mmseqs', 'createdb', f_file, outfile + '_seqDB']

    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def search_db(outfile):
    tmp = outfile + '_tmp'
    os.mkdir(tmp)
    command = ['mmseqs',
               'search',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               tmp, '-a',
               '--comp-bias-corr', '0',
               '-e', '10',
               '--threads', '4'
               ]
    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def get_alns(outfile):
    command = ['mmseqs',
               'convertalis',
               outfile + '_seqDB',
               outfile + '_seqDB',
               outfile + '_seqDB.aln.db',
               outfile + '_seqDB.aln.m8'
               ]
    try:
        subprocess.check_output(command)

    except subprocess.CalledProcessError as e:
        print(e.output)
        print(e.returncode)


def parse_alns(n_cls, values, outfile):
    df = pd.read_table(outfile + '_seqDB.aln.m8', header=None)
    mean = df.iloc[:, 2].mean()
    median = df.iloc[:, 2].median()
    mad = robust.mad(df.iloc[:, 2], c=1)

    df.insert(12, '', 'cls_' + str(n_cls))

    return df, mean, median, mad


def clean_db(filename):
    try:
        for fl in glob.glob(filename):
            os.remove(fl)
    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise exception if a different error occurred


def clean_tmp(outfile):
    try:
        shutil.rmtree(outfile + '_tmp')
    except OSError as e:  # this would be "except OSError, e:" before Python 2.6
        if e.errno != errno.ENOENT:  # errno.ENOENT = no such file or directory
            raise  # re-raise exception if a different error occurred


def mmseqs_seq_net(values, seq_dic, n_cls, outfile):
    # write elements to file
    tf_name = outfile + "_cls_tmp.fasta"
    tmp_fasta = open(tf_name, "w")
    for k in values:
        SeqIO.write(seq_dic[k], tmp_fasta, "fasta")
    tmp_fasta.close()

    clean_tmp(outfile)
    clean_db(outfile + "_seqDB*")
    create_db(tf_name, outfile)
    search_db(outfile)
    get_alns(outfile)
    df, mean, median, mad = parse_alns(n_cls, values, outfile)

    return df, mean, median, mad


def create_seq_dic(fasta_file, seq_dic):
    seq_dic = SeqIO.index(fasta_file, "fasta")
    return seq_dic


if __name__ == "__main__":

    infile, outfile, fasta_file = get_args()

    d1 = defaultdict(list)
    dic = defaultdict(list)
    sdic = create_seq_dic(fasta_file, dic)
    n_cls = 0
    pd_all = []

    with open(infile) as f:
        for line in f:
            k = line.strip().split('\t')
            d1[k[0]].append(k[1])

    # sort clusters by size, largest first
    d1 = OrderedDict(sorted(d1.items(),
                            key=lambda kv: len(kv[1]),
                            reverse=True))

    rref = re.compile(r'.*_ref')
    rsub = re.compile(r'.*-sub.*')
    rins = re.compile(r'.*-ins.*')
    rdel = re.compile(r'.*-del.*')
    rall = re.compile(r'.*-all.*')

    rsub90 = re.compile(r'.*90-sub.*')
    rins90 = re.compile(r'.*90-ins.*')
    rdel90 = re.compile(r'.*90-del.*')
    rall90 = re.compile(r'.*90-all.*')

    rsub50 = re.compile(r'.*50-sub.*')
    rins50 = re.compile(r'.*50-ins.*')
    rdel50 = re.compile(r'.*50-del.*')
    rall50 = re.compile(r'.*50-all.*')

    rsub30 = re.compile(r'.*30-sub.*')
    rins30 = re.compile(r'.*30-ins.*')
    rdel30 = re.compile(r'.*30-del.*')
    rall30 = re.compile(r'.*30-all.*')

    with open(outfile + ".tsv", 'w', newline='') as csv_file:
        writer = csv.writer(csv_file, delimiter="\t")
        header = ["cls_num", "num_mem", "is_rep_ref", "reference", "num_ref",
                  "num_sub", "num_ins", "num_del", "num_all", "num_diff", "diff_orf",
                  "mean_iden", "median_iden", "mad_iden",
                  "num_sub90", "num_ins90", "num_del90", "num_all90",
                  "num_sub50", "num_ins50", "num_del50", "num_all50",
                  "num_sub30", "num_ins30", "num_del30", "num_all30",
                  "members"]

        writer.writerow(header)
        # t = trange(len(d1), desc='cls_' + n_cls, leave=False)
        for key, value in tqdm(d1.items(), desc='Processing', leave=False):
            n_cls = n_cls + 1
            nref = sum(1 for m in value if rref.match(m))

            nsub = sum(1 for m in value if rsub.match(m))
            nins = sum(1 for m in value if rins.match(m))
            ndel = sum(1 for m in value if rdel.match(m))
            nall = sum(1 for m in value if rall.match(m))

            nsub90 = sum(1 for m in value if rsub90.match(m))
            nins90 = sum(1 for m in value if rins90.match(m))
            ndel90 = sum(1 for m in value if rdel90.match(m))
            nall90 = sum(1 for m in value if rall90.match(m))

            nsub50 = sum(1 for m in value if rsub50.match(m))
            nins50 = sum(1 for m in value if rins50.match(m))
            ndel50 = sum(1 for m in value if rdel50.match(m))
            nall50 = sum(1 for m in value if rall50.match(m))

            nsub30 = sum(1 for m in value if rsub30.match(m))
            nins30 = sum(1 for m in value if rins30.match(m))
            ndel30 = sum(1 for m in value if rdel30.match(m))
            nall30 = sum(1 for m in value if rall30.match(m))

            repres = bool(rref.match(key))

            counts = Counter([re.sub(r"_\d\d-\w\w\w_..*|_ref", "", m) for m in value])
            if len(value) > 1:
                df, mean, median, mad = mmseqs_seq_net(value, sdic, n_cls, outfile)
                pd_all.append(df)
            else:
                df = mean = median = mad = 'NA'

            writer.writerow(["cls_" + str(n_cls), len(value), repres, key, nref,
                             nsub, nins, ndel, nall, len(counts),
                             MyCounter(counts), mean, median, mad,
                             nsub90, nins90, ndel90, nall90,
                             nsub50, nins50, ndel50, nall50,
                             nsub30, nins30, ndel30, nall30,
                             ','.join(value)])
        df = pd.concat(pd_all, ignore_index=True)
        df.to_csv(outfile + "_alns.tsv", sep='\t')
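
generate_mutants.py: wraps EMBOSS msbar to generate the mutated amino acid sequences.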
from Bio import SeqIO
from Bio.SeqRecord import SeqRecord
import argparse
import re
import subprocess
from tqdm import *


def get_args():
    '''This function parses and return arguments passed in'''
    # Assign description to the help doc
    parser = argparse.ArgumentParser(
        description='Script uses EMBOSS msbar '
                    'to generate mutated AA sequences')
    # Add arguments
    parser.add_argument(
        '-g', '--genome',
        type=str,
        help='Genome file with amino acid sequences.',
        required=True)
    parser.add_argument(
        '-n', '--nmut',
        type=str,
        help='Number of mutated sequences per sequence.',
        required=True)
    parser.add_argument(
        '-o', '--output',
        type=str,
        help='Prefix for the output FASTA files.',
        required=True)
    # Array for all arguments passed to script
    args = parser.parse_args()
    # Assign args to variables
    infile = args.genome
    n_mut = args.nmut
    prefix = args.output
    # Return all variable values
    return infile, n_mut, prefix


def run_msbar(iden, mutrate, record, iter, muttype):
    rec = SeqRecord(record.seq,
                    id=record.id,
                    name=record.name,
                    description="")

    seq_len = len(rec.seq)
    mut = int((mutrate * seq_len))
    ident = re.sub(r"\|", "-", rec.id) + "_" + str(iden) + "_" + str(iter)
    rec.id = ident
    rec.description = ""
    SeqIO.write(rec, "tmp.fasta", "fasta")
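    # EMBOSS msbar -point codes: 2 = insertions, 3 = deletions, 4 = changes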
    command = ['msbar',
               '-sequence', 'tmp.fasta',
               '-count', str(mut),
               '-point', str(muttype),
               '-auto', '-stdout',
               '-block', "0"
               ]
    try:
        seqmut = subprocess.Popen(command, stdout=subprocess.PIPE,
                                  universal_newlines=True)
    except OSError as e:
        # Popen raises OSError (not CalledProcessError) if msbar cannot be run
        print(e)

    r = SeqIO.read(seqmut.stdout, "fasta")
    return r


def run_msbar_sub(file90, file50, file30, i):
    aa90sub = run_msbar(iden="90-sub",
                        mutrate=0.1,
                        record=record,
                        muttype=4,
                        iter=i)
    aa50sub = run_msbar(iden="50-sub",
                        mutrate=0.5,
                        record=record,
                        muttype=4,
                        iter=i)
    aa30sub = run_msbar(iden="30-sub",
                        mutrate=0.7,
                        record=record,
                        muttype=4,
                        iter=i)
    SeqIO.write(aa90sub, file90, "fasta")
    SeqIO.write(aa50sub, file50, "fasta")
    SeqIO.write(aa30sub, file30, "fasta")


def run_msbar_ins(file90, file50, file30, i):
    aa90ins = run_msbar(iden="90-ins",
                        mutrate=0.1,
                        record=record,
                        muttype=2,
                        iter=i)
    aa50ins = run_msbar(iden="50-ins",
                        mutrate=0.5,
                        record=record,
                        muttype=2,
                        iter=i)
    aa30ins = run_msbar(iden="30-ins",
                        mutrate=0.7,
                        record=record,
                        muttype=2,
                        iter=i)
    SeqIO.write(aa90ins, file90, "fasta")
    SeqIO.write(aa50ins, file50, "fasta")
    SeqIO.write(aa30ins, file30, "fasta")


def run_msbar_del(file90, file50, file30, i):
    aa90del = run_msbar(iden="90-del",
                        mutrate=0.1,
                        record=record,
                        muttype=3,
                        iter=i)
    aa50del = run_msbar(iden="50-del",
                        mutrate=0.5,
                        record=record,
                        muttype=3,
                        iter=i)
    aa30del = run_msbar(iden="30-del",
                        mutrate=0.7,
                        record=record,
                        muttype=3,
                        iter=i)
    SeqIO.write(aa90del, file90, "fasta")
    SeqIO.write(aa50del, file50, "fasta")
    SeqIO.write(aa30del, file30, "fasta")


def run_msbar_all(file90, file50, file30, i):
    aa90all = run_msbar(iden="90-all",
                        mutrate=0.1,
                        record=record,
                        muttype="2,3,4",
                        iter=i)
    aa50all = run_msbar(iden="50-all",
                        mutrate=0.5,
                        record=record,
                        muttype="2,3,4",
                        iter=i)
    aa30all = run_msbar(iden="30-all",
                        mutrate=0.7,
                        record=record,
                        muttype="2,3,4",
                        iter=i)
    SeqIO.write(aa90all, file90, "fasta")
    SeqIO.write(aa50all, file50, "fasta")
    SeqIO.write(aa30all, file30, "fasta")


if __name__ == "__main__":

    infile, n_mut, prefix = get_args()

    out90sub = prefix + "_aa90_sub.fasta"
    out50sub = prefix + "_aa50_sub.fasta"
    out30sub = prefix + "_aa30_sub.fasta"

    out90ins = prefix + "_aa90_ins.fasta"
    out50ins = prefix + "_aa50_ins.fasta"
    out30ins = prefix + "_aa30_ins.fasta"

    out90del = prefix + "_aa90_del.fasta"
    out50del = prefix + "_aa50_del.fasta"
    out30del = prefix + "_aa30_del.fasta"

    out90all = prefix + "_aa90_all.fasta"
    out50all = prefix + "_aa50_all.fasta"
    out30all = prefix + "_aa30_all.fasta"

    output90sub = open(out90sub, "w")
    output50sub = open(out50sub, "w")
    output30sub = open(out30sub, "w")

    output90ins = open(out90ins, "w")
    output50ins = open(out50ins, "w")
    output30ins = open(out30ins, "w")

    output90del = open(out90del, "w")
    output50del = open(out50del, "w")
    output30del = open(out30del, "w")

    output90all = open(out90all, "w")
    output50all = open(out50all, "w")
    output30all = open(out30all, "w")

    for record in SeqIO.parse(infile, "fasta"):
        t = trange(int(n_mut), desc=record.id, leave=False)
        for i in t:
            run_msbar_sub(file90=output90sub,
                          file50=output50sub,
                          file30=output30sub,
                          i=i)

            run_msbar_ins(file90=output90ins,
                          file50=output50ins,
                          file30=output30ins,
                          i=i)

            run_msbar_del(file90=output90del,
                          file50=output50del,
                          file30=output30del,
                          i=i)

            run_msbar_all(file90=output90all,
                          file50=output50all,
                          file30=output30all,
                          i=i)

    output90sub.close()
    output50sub.close()
    output30sub.close()

    output90ins.close()
    output50ins.close()
    output30ins.close()

    output90del.close()
    output50del.close()
    output30del.close()

    output90all.close()
    output50all.close()
    output30all.close()
# Example for N. maritimus
# First we generate a set of mutants
python generate_mutants.py -g GCF_000018465.1_ASM1846v1_protein.faa -n 3 -o nmarit

# We label our seeds or references
awk '{if ($0 ~ /^>/) {print $1"_ref"}else{print $0}}' ../GCF_000018465.1_ASM1846v1_protein.faa > nmarit_aa_ref.fasta

# Concatenate mutants and references
cat nmar*sub.fasta nmar*ins.fasta nmar*del.fasta nmar*all.fasta nmarit_aa_ref.fasta > nmarit_all_aa.fasta

# Cluster them with the same approach we used for our clusters
# (code adapted to work on macOS and Linuxbrew coreutils)
bash unknown_clust.sh nmarit_all_aa.fasta nmarit_aa_all_cls
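
# The g-prefixed tools (gdate, gsed, gawk, gtar) are the GNU versions installed
# via Homebrew/Linuxbrew; something like the following should provide them
# (exact formula names are an assumption, check your tap):
# brew install coreutils gnu-sed gawk gnu-tar pigz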

# Parse clusters
cp nmarit_aa_all_cls/2017_07/tmp/aaclust30_2017_07.tsv nmarit_aaclust30_2017_06.tsv
python parse_clusters.py -t nmarit_aaclust30_2017_06.tsv  -o nmarit_aaclust30_2017_06_all -f nmarit_all_aa.fasta

# Plot results
./plot_mutants.r -a nmarit_aaclust30_2017_06_all_alns.tsv -s nmarit_aaclust30_2017_06_all.tsv -n nmarit