Contig and gene renaming

2/1/2016 - 2:33 PM

#!/bin/bash -l
set -e
#set -x
set -o pipefail

function error_handler() {
    echo "Error occurred in script at line: ${1}."
    echo "Line exited with status: ${2}"

}

trap 'error_handler ${LINENO} $?' ERR

set -o errexit
set -o errtrace
set -o nounset

module load samtools-rocksort
module load coreutils

declare -r CWD=$(pwd)
declare -r DIR=$(readlink -e $1)
declare -r CONTIGDIR=/bioinf/projects/osd/main/2014/06/analysis-results/assembly-evaluation/mapping/bySample/BWA/owncloud
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMD=$(basename $DIR)
declare -r NAMS=$(basename $DIR | cut -f1 -d '.' | sed -e 's/_contigs//')
declare -r OUTDIR=$(pwd)/${NAM}

echo "Subsetting BAM file from ${NAMS}..."

#INFILES
declare -r CONTIGFILE=${CONTIGDIR}/${NAM}/${NAMS}.contigs.fasta.gz


samtools view -bh ${DIR}/${NAM}.sorted.markdup.sorted.bam $(zcat ${CONTIGFILE} | grep '>' | tr -d '>' | tr $"\n" " ") > ${DIR}/${NAM}.cov2.bam

cd ${DIR}

samtools rocksort -@ 4 -m 2G ${DIR}/${NAM}.cov2.bam ${DIR}/${NAM}.cov2.sorted

samtools index ${DIR}/${NAM}.cov2.sorted.bam

samtools flagstat ${DIR}/${NAM}.cov2.bam ${DIR}${NAM}.cov2.sorted.bam > ${DIR}/${NAM}.cov2.sorted.flagstat

cd ${CWD}

rename-orf-coAssembly_OSD.awk

#Execution:
#awk -v ASSEMBLER="idba-ud" -f ~/opt/scripts/rename-orf-coAssembly.awk osd2014-coassembly_idba-ud_contigs.gt500.nt.fasta
#Header format
#>osd{YEAR}-coassembly_{ASSEMBLER}_contig-{CONTIGNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}

{
    if($0 ~ /^>/){
	split($1,a,"_")
	if ($7 == "1"){
	    STRAND = "+"
	}else{
	    STRAND = "-"
	}
	print a[1]"_"a[2]"_"a[3]"_"STRAND"_"$3"_"$5"_orf-"a[4]
    }else{
	O=gsub("\*","",$0)
	print $O
    }
}

rename-orf-bySample-TARA.awk

#Execution:
#awk -v -f ~/opt/scripts/rename_tara_orfs.awk TARA_004_DCM_0.22-1.6_scaffold.nt.fasta
#Header format
#>{TARASAMPLE+SCAFFOLDNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}
#TARA_004_DCM_0.22-1.6_scaffold2_1_2_+_1334_1912_orf_2
{
    if($0 ~ /^>/){
	split($1,a,"_")
	if ($7 == "1"){
	    STRAND = "+"
	}else{
	    STRAND = "-"
	}
	print a[1]"_"a[2]"_"a[3]"_"a[4]"_"a[5]"_"a[6]"_"STRAND"_"$3"_"$5"_orf-"a[7]
    }else{
	O=gsub("\*","",$0)
	print $O
    }
}

rename-orf-bySample-OSD.awk

#Execution:
#awk -v ASSEMBLER="idba-ud" -f ~/opt/scripts/rename-orf-bySample-OSD.awk osd2014-coassembly_idba-ud_contigs.gt500.nt.fasta
#Header format
#>osd{YEAR}-coassembly_{ASSEMBLER}_contig-{CONTIGNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}
#OSD143 2014-06-21 1m NPL022 idba-ud contig-0 3
#1      2          3  4      5       6        7
{
    if($0 ~ /^>/){
	split($1,a,"_")
	if ($7 == "1"){
	    STRAND = "+"
	}else{
	    STRAND = "-"
	}
	print a[1]"_"a[2]"_"a[3]"_"a[4]"_"a[5]"_"a[6]"_"STRAND"_"$3"_"$5"_orf-"a[7]
    }else{
	O=gsub("\*","",$0)
	print $O
    }
}

publish-assemblies-bySample.sh

#!/bin/bash
set -e
#set -x
set -o pipefail

function error_handler() {
    echo "Error occurred in script at line: ${1}." >> error.txt
    echo "FILE:$2" >> error.txt
    echo "Line exited with status: ${3}" >> error.txt

}

trap 'error_handler ${LINENO} ${DIR} $?' ERR

set -o errexit
set -o errtrace
set -o nounset

module load coreutils
declare -r COVERAGE=$1
declare -r DIR=$(readlink -e $2)
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMS=$(basename $DIR _contigs.gt500-mapping)
declare -r OUTDIR=$(pwd)/${NAM}

#INFILES
declare -r CONTIGFILE=${DIR}/${NAM}.fa
declare -r CONTIGCOVERAGE=${DIR}/${NAM}.sorted.markdup.sorted.coverage.percontig
declare -r GENECOVERAGE=${DIR}/${NAM}.bam.bedtools.d.hist


#OUTFILES
declare -r OUTFILT=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.txt
declare -r OUTFILTIDS=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.ids.txt
declare -r OUTFILTFASTA=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.fasta
declare -r OUTFILTLT=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.txt
declare -r OUTFILTLTIDS=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.ids.txt
declare -r NTFILE=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r AAFILE=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r NTFILEFILT=${OUTDIR}/${NAMS}.orfs.nt.contigs-gt${COVERAGE}.fasta
declare -r AAFILEFILT=${OUTDIR}/${NAMS}.orfs.aa.contigs-gt${COVERAGE}.fasta
declare -r ORFIDSFILT=${OUTDIR}/${NAMS}.orfs.ids.coverage-lt${COVERAGE}.txt
declare -r OUTGENECONTIG=${OUTDIR}/${NAMS}.orf-coverage.txt
declare -r OUTGENECONTIGGTCOVERAGE=${OUTDIR}/${NAMS}.orf-coverage.gt${COVERAGE}.txt

[[ -d ${OUTDIR} ]] && echo "Deleting existig folder ${OUTDIR}";rm -rf ${OUTDIR}
mkdir -p ${OUTDIR}


#filter contigs by coverage
echo "Processing coverage file: ${CONTIGCOVERAGE}"
echo "Coverage selected: ${COVERAGE}"
echo "Contig file: ${CONTIGFILE}"

echo "Filtering contigs..."
tail -n+2 ${CONTIGCOVERAGE} | \
    sort --parallel=8 -Vk4,4 -k1,1 | \
    mawk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}

tail -n+2 ${CONTIGCOVERAGE} | \
    sort --parallel=8 -Vk4,4 -k1,1 | \
    mawk -v COVERAGE=${COVERAGE} '$4 < COVERAGE' | tee ${OUTFILTLT} | cut -f1 | mawk '{print $0"_"}' > ${OUTFILTLTIDS}

#filter contigs
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${OUTFILTIDS} in=${CONTIGFILE} out=${OUTFILTFASTA} include=t overwrite=t

#rename ORF files
echo "Renaming ORF files..."
mawk -f ~/opt/scripts/rename-genes-bySample.awk ${DIR}/${NAM}.nt.fasta > ${NTFILE}
mawk -f ~/opt/scripts/rename-genes-bySample.awk ${DIR}/${NAM}.aa.fasta > ${AAFILE}

# get ORFs ids
LC_ALL=C grep -F -f ${OUTFILTLTIDS} <(grep '>' ${NTFILE} | tr -d '>') > ${ORFIDSFILT}

echo "Generating hist files..."
bamToBed -i ${DIR}/${NAM}.sorted.markdup.sorted.bam | sort --parallel=8 -k1,1 -k2,2n > ${DIR}/${NAM}.sorted.markdup.sorted.bed
gff2bed < ${DIR}/${NAM}.gff | sort -k1,1 -k2,2n > ${DIR}/${NAM}.bed
bedtools coverage -hist -sorted -b ${DIR}/${NAM}.sorted.markdup.sorted.bed -a ${DIR}/${NAM}.bed > ${GENECOVERAGE}


#Get gene coverage
echo "Getting ORF coverage..."
LC_ALL=C mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${GENECOVERAGE} | sort -V --parallel=8 > ${OUTGENECONTIG}

LC_ALL=C grep -F -f <(mawk '{print $0"_"}' ${OUTFILTIDS}) ${OUTGENECONTIG} > ${OUTGENECONTIGGTCOVERAGE}



#Filter genes by CONTIG COVERAGE
echo "Filtering genes by contig coverage..."
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${NTFILE} out=${NTFILEFILT} include=f overwrite=t

/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${AAFILE} out=${AAFILEFILT} include=f overwrite=t


# What we need
#1.Contigs with selected COVERAGE
#2.NT and AA sequences for the genes on selcted contigs
#3.File with coverage for each gene

declare -r OUTCONTIGS=${OUTDIR}/${NAMS}.contigs.fasta
declare -r OUTORFSNT=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r OUTORFSAA=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r OUTORFCOV=${OUTDIR}/${NAMS}.orfs.coverage.txt

mv ${OUTFILTFASTA} ${OUTCONTIGS}
mv ${NTFILEFILT} ${OUTORFSNT}
mv ${AAFILEFILT} ${OUTORFSAA}
mv ${OUTGENECONTIGGTCOVERAGE} ${OUTORFCOV}
# Delete non useful files
rm ${OUTDIR}/*lt${COVERAGE}*
rm ${OUTDIR}/*gt${COVERAGE}*
rm ${OUTGENECONTIG}

get_orf_coverage_from_bam.awk

#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
    if ($0 !~ /^all/){
	split($4,a,"_");
	b=$1"_"$6"_"$2+1"_"$3"_orf-"a[2]"\t"$13;
	c[b]=c[b] + ($11*$14)
    }
}END{
    for (var in c){
	print var"\t"c[var]
    }
 }

get_orf_coverage_from_bam-coAssembly.awk

#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
    if ($0 i ~ /^all/){
	next;
    }else{
	split($4,a,"_");
	b=$1"_"$6"_"$2+1"_"$3"_orf-"a[2]"\t"$13;
	c[b]=c[b] + ($11*$14)
    }
}END{
    for (var in c){
	print var"\t"c[var]
    }
}

get_orf_coverage_from_bam-bySample.awk

#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
    if ($0 ~ /^all/){
	next;
    }else{
	split($9,a,";");
	split(a[1],d,"_")
	b=$1"_"$7"_"($4)"_"$5"_orf-"d[2]"\t"$12;
	c[b]=c[b] + ($10*$13)
    }
}END{
    for (var in c){
	print var"\t"c[var]
    }
 }

rename-orf-bySample-GOS.awk

cat ~/opt/scripts/rename_GOS.awk
# Reads file with samples
# Rename sequences:
# >Sample_name-Read_num

{
    if(NR == FNR){
	## If this is the first file, save the values of $1
	## in the array n.
	n[$1] = $2

    }
    ## If we have moved on to the 2nd file
    else{
	if ($0 ~ /^>/){
	    gsub(">","",$1)
	    for (i=1;i<=NF;i++){
		if ($i ~ /sample_id/){
		    O=i
		    gsub("/sample_id=","",$O)
		    if ($O in n) {
			V=n[$O]
		    }
		}
	    }
	    print ">"V"-"$1
	}else{
	    print $0
	}
    }
}

# if($3 in n){

# 	}

publish-assemblies-coAssembly.sh

#!/bin/bash
set -e
#set -x
set -o pipefail

function error_handler() {
    echo "Error occurred in script at line: ${1}."
    echo "Line exited with status: ${2}"

}

trap 'error_handler ${LINENO} $?' ERR

set -o errexit
set -o errtrace
set -o nounset

#module load coreutils
declare -r COVERAGE=$1
declare -r DIR=$(readlink -e $2)
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMS=$(basename $DIR | cut -f1 -d '.' | sed -e 's/_contigs//')
declare -r OUTDIR=$(pwd)/${NAM}

#INFILES
declare -r CONTIGFILE=${DIR}/${NAM}.fa
declare -r CONTIGCOVERAGE=${DIR}/${NAM}.sorted.markdup.sorted.coverage.percontig
declare -r GENECOVERAGE=${DIR}/${NAM}.bam.bedtools.d.hist

#OUTFILES
declare -r OUTFILT=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.txt
declare -r OUTFILTIDS=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.ids.txt
declare -r OUTFILTFASTA=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.fasta
declare -r OUTFILTLT=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.txt
declare -r OUTFILTLTIDS=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.ids.txt
declare -r NTFILE=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r AAFILE=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r NTFILEFILT=${OUTDIR}/${NAMS}.orfs.nt.contigs-gt${COVERAGE}.fasta
declare -r AAFILEFILT=${OUTDIR}/${NAMS}.orfs.aa.contigs-gt${COVERAGE}.fasta
declare -r ORFIDSFILT=${OUTDIR}/${NAMS}.orfs.ids.coverage-lt${COVERAGE}.txt
declare -r OUTGENECONTIG=${OUTDIR}/${NAMS}.orf-coverage.txt
declare -r OUTGENECONTIGGTCOVERAGE=${OUTDIR}/${NAMS}.orf-coverage.gt${COVERAGE}.txt

[[ -d ${OUTDIR} ]] && echo "Deleting existig folder ${OUTDIR}";rm -rf ${OUTDIR}
mkdir -p ${OUTDIR}


#filter contigs by coverage
echo "Processing coverage file: ${CONTIGCOVERAGE}"
echo "Coverage selected: ${COVERAGE}"
echo "Contig file: ${CONTIGFILE}"

echo "Filtering contigs..."
tail -n+2 ${CONTIGCOVERAGE} | \
    sort --parallel=8 -Vk4,4 -k1,1 | \
    mawk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}

tail -n+2 ${CONTIGCOVERAGE} | \
    sort --parallel=8 -Vk4,4 -k1,1 | \
    mawk -v COVERAGE=${COVERAGE} '$4 < COVERAGE' | tee ${OUTFILTLT} | cut -f1 | mawk '{print $0"_"}' > ${OUTFILTLTIDS}

#filter contigs
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${OUTFILTIDS} in=${CONTIGFILE} out=${OUTFILTFASTA} include=t overwrite=t

#rename ORF files
echo "Renaming ORF files..."
mawk -f ~/opt/scripts/rename-genes-coAssembly.awk ${DIR}/${NAM}.nt.fasta > ${NTFILE}
mawk -f ~/opt/scripts/rename-genes-coAssembly.awk ${DIR}/${NAM}.aa.fasta > ${AAFILE}

# get ORFs ids
LC_ALL=C grep -F -f ${OUTFILTLTIDS} <(grep '>' ${NTFILE} | tr -d '>') > ${ORFIDSFILT}


#Get gene coverage
echo "Getting ORF coverage..."
LC_ALL=C mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${GENECOVERAGE} | sort -V --parallel=8 > ${OUTGENECONTIG}

LC_ALL=C grep -F -f <(mawk '{print $0"_"}' ${OUTFILTIDS}) ${OUTGENECONTIG} > ${OUTGENECONTIGGTCOVERAGE}



#Filter genes by CONTIG COVERAGE
echo "Filtering genes by contig coverage..."
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${NTFILE} out=${NTFILEFILT} include=f overwrite=t

/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${AAFILE} out=${AAFILEFILT} include=f overwrite=t


# What we need
#1.Contigs with selected COVERAGE
#2.NT and AA sequences for the genes on selcted contigs
#3.File with coverage for each gene

declare -r OUTCONTIGS=${OUTDIR}/${NAMS}.contigs.fasta
declare -r OUTORFSNT=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r OUTORFSAA=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r OUTORFCOV=${OUTDIR}/${NAMS}.orfs.abundance.txt

mv ${OUTFILTFASTA} ${OUTCONTIGS}
mv ${NTFILEFILT} ${OUTORFSNT}
mv ${AAFILEFILT} ${OUTORFSAA}
mv ${OUTGENECONTIGGTCOVERAGE} ${OUTORFCOV}
# Delete non useful files
rm ${OUTDIR}/*lt${COVERAGE}*
rm ${OUTDIR}/*gt${COVERAGE}*
rm ${OUTGENECONTIG}

gen_contig_cov_per_bam_table.py

#!/usr/bin/env python
"""
@author: inodb

This is the same script as gen_input_table.py from CONCOCT. It is used in
metassemble to determine contig coverage from a bam file and a fasta file.
"""
import sys
import os
import argparse
import subprocess
from signal import signal, SIGPIPE, SIG_DFL

from Bio import SeqIO, SeqUtils


def get_gc_and_len_dict(fastafile):
    """Creates a dictionary with the fasta id as key and GC and length as keys
    for the inner dictionary."""
    out_dict = {}

    for rec in SeqIO.parse(fastafile, "fasta"):
        out_dict[rec.id] = {}
        out_dict[rec.id]["length"] = len(rec.seq)
        out_dict[rec.id]["GC"] = SeqUtils.GC(rec.seq)

    return out_dict


def get_bedcov_dict(bedcoverage):
    """Uses the BEDTools genomeCoverageBed histogram output to determine mean
    coverage and percentage covered for each contig.

    Returns dict with fasta id as key and percentage covered and cov_mean as
    keys for the inner dictionary."""
    out_dict = {}

    # Check if given argument is a file, otherwise use the content of the
    # variable
    if os.path.isfile(bedcoverage):
        fh = open(bedcoverage)
    else:
        fh = bedcoverage.split('\n')[:-1]

    for line in fh:
        cols = line.split()

        try:
            d = out_dict[cols[0]]
        except KeyError:
            d = {}
            out_dict[cols[0]] = d

        if int(cols[1]) == 0:
            d["percentage_covered"] = 100 - float(cols[4]) * 100.0
        else:
            d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])

    return out_dict


def print_sample_columns(t, header="cov_mean_sample_"):
    sys.stdout.write((("\t" + header + "%s") * len(t)) % t)


def print_input_table(fastadict, bedcovdicts, samplenames=None):
    """Writes the input table for Probin to stdout. See hackathon google
    docs."""

    # Header
    sys.stdout.write("contig\tlength\tGC")
    if samplenames is None:
        # Use index if no sample names given in header
        print_sample_columns(tuple(range(len(bedcovdicts))), "cov_mean_sample_")
        print_sample_columns(tuple(range(len(bedcovdicts))), "percentage_covered_sample_")
    else:
        # Use given sample names in header
        assert(len(samplenames) == len(bedcovdicts))
        print_sample_columns(tuple(samplenames), "cov_mean_sample_")
        print_sample_columns(tuple(samplenames), "percentage_covered_sample_")
    sys.stdout.write("\n")

    # Content
    assert(len(fastadict) > 0)
    for acc in fastadict:
        # fasta stats
        sys.stdout.write("%s\t%d\t%f" %
            (
                acc,
                fastadict[acc]['length'],
                fastadict[acc]['GC']
            )
        )

        # Print mean
        for bcd in bedcovdicts:
            try:
                # Print cov mean
                sys.stdout.write("\t%f" % (bcd[acc]["cov_mean"]))
            except KeyError:
                # No reads mapped to this contig
                sys.stdout.write("\t0")

        # Print percentage covered
        for bcd in bedcovdicts:
            try:
                # Print percentage covered
                sys.stdout.write("\t%f" % (bcd[acc]["percentage_covered"]))
            except KeyError:
                if acc in bcd and "cov_mean" in bcd[acc]:
                    # all reads were covered
                    sys.stdout.write("\t100")
                else:
                    # No reads mapped to this contig
                    sys.stdout.write("\t0")

        sys.stdout.write("\n")


def gen_contig_cov_per_bam_table(fastafile, bamfiles, samplenames=None, isbedfiles=False):
    """Reads input files into dictionaries then prints everything in the table
    format required for running ProBin."""
    bedcovdicts = []

    # Determine coverage information from bam file using BEDTools
    for i, bf in enumerate(bamfiles):
        if isbedfiles is False:
            p = subprocess.Popen(["genomeCoverageBed", "-ibam", bf], stdout=subprocess.PIPE)
            out, err = p.communicate()
            if p.returncode != 0:
                sys.stderr.write(out)
                raise Exception('Error with genomeCoverageBed')
            else:
                bedcovdicts.append(get_bedcov_dict(out))
        else:
            bedcovdicts.append(get_bedcov_dict(bf))

    print_input_table(get_gc_and_len_dict(fastafile), bedcovdicts, samplenames=samplenames)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fastafile", help="Contigs fasta file")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to contigs")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr as bamfiles.")
    parser.add_argument("--isbedfiles", action='store_true',
        help="The bamfiles argument are outputs of genomeCoverageBed, not the actual bam file. Skips running genomeCoverageBed from within this script.")
    args = parser.parse_args()

    # Get sample names
    if args.samplenames is not None:
        samplenames = [s[:-1] for s in open(args.samplenames).readlines()]
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names in samplenames should be equal to nr of given bamfiles")
    else:
        samplenames = None

    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)

    gen_contig_cov_per_bam_table(args.fastafile, args.bamfiles,
        samplenames=samplenames, isbedfiles=args.isbedfiles)

filter_contigs_by_coverage.sh

#!/bin/bash
set -e
declare -r COVERAGE=$1
declare -r FILE=$2
declare -r CONTIGS=$3

NAM=$(echo $FILE | cut -f 1,2 -d '.')

echo "Processing coverage file: $FILE"
echo "Coverage selected: $1"
echo "Contig file: $CONTIGS"

declare OUTFILT=${NAM}.contigs.gt${COVERAGE}.txt
declare OUTFILTIDS=${NAM}.contigs.gt${COVERAGE}.ids.txt
declare OUTFILTFASTA=${NAM}.contigs.gt${COVERAGE}.fasta

echo "Filtering contigs..."
tail -n+2 ${FILE} | \
    sort --parallel=8 -Vk4,4 -k1,1  | \
    awk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}

#filter contigs
/bioinf/software/bbmap/bbmap-34.00/filterbyname.sh names=$OUTFILTIDS in=${CONTIGS} out=${OUTFILTFASTA} include=t overwrite=t

#filter genes from contigs

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

Contig and gene renaming