genomewalker
6/28/2017 - 10:06 AM

bwa_map_tara.sh

#Read a hist file from bedtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
    # Skip the whole-file summary rows bedtools appends ("all <depth> ...").
    # (The original tested `$0 i ~ /^all/`: the stray uninitialized `i` was
    # concatenated into the match target and only worked while `i` was empty.)
    if ($0 ~ /^all/){
	next;
    }else{
	# $4 is the BED name field; keep the second "_"-separated token as
	# the ORF number (presumably "<prefix>_<orfnum>" -- TODO confirm).
	split($4,a,"_");
	# Key: contig, $6, 1-based start ($2+1 converts from BED 0-based),
	# end, ORF id; $13 is carried along tab-separated.  Parentheses make
	# the intended arithmetic-before-concatenation explicit.
	b=$1"_"$6"_"($2+1)"_"$3"_orf-"a[2]"\t"$13;
	# Accumulate depth ($11) weighted by fraction of the gene at that
	# depth ($14): sum(depth * fraction) = mean coverage of the gene.
	c[b]=c[b] + ($11*$14)
    }
}END{
    # Emit one "<gene-key>\t<coverage>" line per gene (unordered).
    for (var in c){
	print var"\t"c[var]
    }
}
#!/usr/bin/env python
"""
@author: inodb

This is the same script as gen_input_table.py from CONCOCT. It is used in
metassemble to determine contig coverage from a bam file and a fasta file.
"""
import sys
import os
import argparse
import subprocess
from signal import signal, SIGPIPE, SIG_DFL

from Bio import SeqIO, SeqUtils


def get_gc_and_len_dict(fastafile):
    """Creates a dictionary with the fasta id as key and GC and length as keys
    for the inner dictionary."""
    stats = {}

    for record in SeqIO.parse(fastafile, "fasta"):
        sequence = record.seq
        # One inner dict per record: total length and GC percentage.
        stats[record.id] = {
            "length": len(sequence),
            "GC": SeqUtils.GC(sequence),
        }

    return stats


def get_bedcov_dict(bedcoverage):
    """Uses the BEDTools genomeCoverageBed histogram output to determine mean
    coverage and percentage covered for each contig.

    bedcoverage may be either a path to a histogram file or the histogram
    content itself as one newline-separated string.

    Returns dict with fasta id as key and percentage covered and cov_mean as
    keys for the inner dictionary."""
    out_dict = {}

    # Check if given argument is a file, otherwise use the content of the
    # variable.  Remember whether we opened a handle so it gets closed
    # (the original leaked the file handle).
    is_file = os.path.isfile(bedcoverage)
    if is_file:
        fh = open(bedcoverage)
    else:
        # Drop the trailing empty element produced by the final newline.
        fh = bedcoverage.split('\n')[:-1]

    try:
        for line in fh:
            # Histogram columns: contig, depth, bases at that depth, contig
            # length, fraction of contig at that depth -- assumes
            # genomeCoverageBed layout; TODO confirm for the version in use.
            cols = line.split()

            d = out_dict.setdefault(cols[0], {})

            if int(cols[1]) == 0:
                # Depth-zero row gives the uncovered fraction of the contig.
                d["percentage_covered"] = 100 - float(cols[4]) * 100.0
            else:
                # Mean coverage = sum(depth * fraction_at_depth).
                d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
    finally:
        if is_file:
            fh.close()

    return out_dict


def print_sample_columns(t, header="cov_mean_sample_"):
    """Write one tab-prefixed column label per entry of *t* to stdout,
    each label being *header* followed by the entry (no trailing newline)."""
    labels = "".join("\t%s%s" % (header, entry) for entry in t)
    sys.stdout.write(labels)


def print_input_table(fastadict, bedcovdicts, samplenames=None):
    """Writes the input table for Probin to stdout. See hackathon google
    docs."""
    write = sys.stdout.write

    # Header row: fixed columns, then one cov_mean and one percentage
    # column per sample (indices when no sample names were supplied).
    write("contig\tlength\tGC")
    if samplenames is None:
        labels = tuple(range(len(bedcovdicts)))
    else:
        assert(len(samplenames) == len(bedcovdicts))
        labels = tuple(samplenames)
    print_sample_columns(labels, "cov_mean_sample_")
    print_sample_columns(labels, "percentage_covered_sample_")
    write("\n")

    # Content rows, one per contig in the fasta dictionary.
    assert(len(fastadict) > 0)
    for acc in fastadict:
        info = fastadict[acc]
        write("%s\t%d\t%f" % (acc, info['length'], info['GC']))

        # Mean coverage per sample; missing entry means nothing mapped.
        for bcd in bedcovdicts:
            try:
                write("\t%f" % (bcd[acc]["cov_mean"]))
            except KeyError:
                write("\t0")

        # Percentage covered per sample.  A contig with cov_mean but no
        # percentage entry had no depth-zero row, i.e. fully covered.
        for bcd in bedcovdicts:
            try:
                write("\t%f" % (bcd[acc]["percentage_covered"]))
            except KeyError:
                if acc in bcd and "cov_mean" in bcd[acc]:
                    write("\t100")
                else:
                    write("\t0")

        write("\n")


def gen_contig_cov_per_bam_table(fastafile, bamfiles, samplenames=None, isbedfiles=False):
    """Reads input files into dictionaries then prints everything in the table
    format required for running ProBin.

    fastafile -- contigs fasta file path.
    bamfiles -- BAM file paths, or genomeCoverageBed outputs if isbedfiles.
    samplenames -- optional column labels, one per entry of bamfiles.
    """
    bedcovdicts = []

    # Determine coverage information from each bam file using BEDTools
    # (the original looped with enumerate but never used the index).
    for bf in bamfiles:
        if isbedfiles:
            # Already genomeCoverageBed output; parse it directly.
            bedcovdicts.append(get_bedcov_dict(bf))
        else:
            p = subprocess.Popen(["genomeCoverageBed", "-ibam", bf], stdout=subprocess.PIPE)
            out, err = p.communicate()
            if p.returncode != 0:
                # Surface whatever genomeCoverageBed printed before failing.
                sys.stderr.write(out)
                raise Exception('Error with genomeCoverageBed')
            bedcovdicts.append(get_bedcov_dict(out))

    print_input_table(get_gc_and_len_dict(fastafile), bedcovdicts, samplenames=samplenames)


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fastafile", help="Contigs fasta file")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to contigs")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr as bamfiles.")
    parser.add_argument("--isbedfiles", action='store_true',
        help="The bamfiles argument are outputs of genomeCoverageBed, not the actual bam file. Skips running genomeCoverageBed from within this script.")
    args = parser.parse_args()

    # Get sample names, one per line.  rstrip('\n') instead of s[:-1]: the
    # old slice silently chopped the last character of the final name when
    # the file had no trailing newline; `with` also closes the handle.
    if args.samplenames is not None:
        with open(args.samplenames) as fh:
            samplenames = [line.rstrip('\n') for line in fh]
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names in samplenames should be equal to nr of given bamfiles")
    else:
        samplenames = None

    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)

    gen_contig_cov_per_bam_table(args.fastafile, args.bamfiles,
        samplenames=samplenames, isbedfiles=args.isbedfiles)
#!/bin/bash
# SLURM batch script: download TARA reads, QC them, map against a
# co-assembly with BWA, remove duplicates, and build coverage tables.
# -x traces commands; -e aborts on the first failing command.
set -x
set -e

# Cluster environment modules (site-specific versions).
module purge
module load python/2.7__qiime
module load samtools/dnanexus
module load java/1.8.0

export JAVA_HOME=/system/software/linux-x86_64/java/jdk-1.8.0
# Locally installed bedtools/bedops take precedence over system copies.
export PATH=~/opt/bedtools2/bin:~/opt/bedops/bin/:$PATH

# 'NOM' looks like a placeholder sample name, presumably substituted per
# sample before submission -- TODO confirm.
declare -r NAM='NOM'
declare -r CON=${NAM}.fasta
declare -r GFF=${NAM}.gff
# CPUs granted by SLURM and per-thread memory for samtools sort.
declare -r NSLOTS=${SLURM_JOB_CPUS_PER_NODE}
declare -r MEM=3G

# Third-party tool locations (picard for MarkDuplicates; TRIM is only
# referenced by the commented-out trimmomatic alternative below).
declare -r MDUP=~/opt/picard-tools-1.133/picard.jar
declare -r BWA=~/opt/bwa/bwa
declare -r TRIM=~/opt/Trimmomatic-0.32/trimmomatic-0.32.jar

#Prepare data

declare -r RES=/data/oerc-microb3/afernandezguerra/TARA/results/${NAM}-mapping
declare -r TMPDIR=${RES}
#create folder (start from a clean results directory every run)
[[ -d ${RES} ]] && rm -rf ${RES}
mkdir -p ${RES}

cd ${RES}

# Get read files and QC them with trimmomatic
declare -r AFILE=/data/oerc-microb3/afernandezguerra/TARA/input/assem-file-name2url.txt

#get read files: lines of the name->url table matching this sample
RFILES=$(grep ${NAM} ${AFILE})

#How many files do we have
NRFILE=$(echo "${RFILES}" | wc -l)

#we need to download them, we use gnu parallel to get them all at the same time
#echo "${RFILES}" | cut -f2 -d ' ' | ~/opt/gnu-parallel/bin/parallel -j ${NRFILE} "curl -s -O {}"

# Fetch each FASTQ over rsync instead; --append resumes partial transfers
# and the inner while retries until rsync succeeds (5 s between attempts).
echo "${RFILES}" | \
    cut -f 2 -d ' ' | \
    while read LINE
    do
        N=$(basename "${LINE}")
	while ! rsync --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" UV2000.mpi-bremen.de:/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} . ;do sleep 5;done
    done

# QC outputs: paired reads (R1/R2) and orphaned single reads (SR).
PE1=${RES}/${NAM}.R1.fastq
PE2=${RES}/${NAM}.R2.fastq
SE=${RES}/${NAM}.SR.fastq

#we need to combine them: concatenate all downloaded runs, one file per mate
cat *_1.fastq.gz > R1.fastq.gz &
cat *_2.fastq.gz > R2.fastq.gz &
wait

rm *_1.fastq.gz &
rm *_2.fastq.gz &
wait


# Adapter/quality trimming with bbduk: phred33 input, min length 45,
# Q20 quality-trim both ends, right-side adapter trim against tara.fa.gz.
~/opt/bbmap/bbduk.sh in=R1.fastq.gz in2=R2.fastq.gz out1="${PE1}" out2="${PE2}" outs="${SE}" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=~/opt/bbmap/resources/tara.fa.gz hdist=1 tbo tpe t=${NSLOTS}

#remove downloaded files
#rm ${RES}/*.fastq.gz

#quality control with trimmomatic and tara adapters (min len 45 and Q20)
#java -jar ${TRIM} PE -threads ${NSLOTS} -phred33 R1.fastq.gz R2.fastq.gz s1_pe s1_se s2_pe s2_se ILLUMINACLIP:/home/oerc-microb3/afernandezguerra/opt/Trimmomatic-0.32/adapters/tara.fa:2:30:10:8:true LEADING:3 SLIDINGWINDOW:4:20 MINLEN:45

#~/opt/bbmap/bbduk.sh in=R1.fastq.gz in2=R2.fastq.gz out1=${PE1} out2=${PE2} outs=${SE} qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=~/opt/bbmap/resources/tara.fa.gz hdist=1 tbo tpe

#rename files
#mv s1_pe ${PE1}
#mv s2_pe ${PE2}

#cat s1_se s2_se > ${SE}
#rm s1_se s2_se

# Raw concatenated reads are no longer needed after QC.
rm R1.fastq.gz &
rm R2.fastq.gz &
wait

#link contig file (and ORF annotations) into the results directory
ln -s /data/oerc-microb3/afernandezguerra/TARA/input/assemblies/${NAM}.fasta ${RES}/${CON}
ln -s /data/oerc-microb3/afernandezguerra/TARA/input/ORFS/${NAM}_scaffold.gff ${RES}/${GFF}

#${BBMA} in1=${IN1},${IN3} in2=${IN2},null ref=${RES}/${CON} out=${RES}/${NAM}.sam outm=${RES}/${NAM}.mapped.fastq nodisk threads=4 ambiguous=best usejni=t append

cd ${RES}

# BWA-MEM mapping: single-end orphans and paired-end reads mapped
# separately, merged into one BAM further down. -M marks shorter split
# hits as secondary (picard compatibility).
${BWA} index ${RES}/${CON}
${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${SE} > ${RES}/${NAM}.se.sam
${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${PE1} ${PE2} > ${RES}/${NAM}.pe.sam

# ${BWA} index ${RES}/${CON}
# ${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${SE} > ${RES}/${NAM}.se.sam
# ${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${PE1} ${PE2} > ${RES}/${NAM}.pe.sam

rm ${PE1} ${PE2} ${SE}

# gzip ${PE1} &
# gzip ${PE2} &
# gzip ${SE} &
# wait

samtools faidx ${RES}/${CON}

# We filter out unmapped reads (-F 4) and ones with mapq < 10, then sort.
# NOTE(review): "rocksort" appears to be a custom/dnanexus samtools sort
# subcommand -- confirm it exists in the loaded samtools module.
samtools view -@ ${NSLOTS} -q 10 -F 4 -u -bt ${RES}/${CON}.fai ${RES}/${NAM}.se.sam | samtools rocksort -@ ${NSLOTS} -m ${MEM} - ${RES}/${NAM}.se
samtools view -@ ${NSLOTS} -q 10 -F 4 -u -bt ${RES}/${CON}.fai ${RES}/${NAM}.pe.sam | samtools rocksort -@ ${NSLOTS} -m ${MEM} - ${RES}/${NAM}.pe

# Merge PE and SE alignments into a single BAM (pinned samtools 0.1.19).
/system/software/linux-x86_64/samtools/0.1.19/samtools merge -@ ${NSLOTS} ${RES}/${NAM}.bam ${RES}/${NAM}.pe.bam ${RES}/${NAM}.se.bam

#samtools merge ${RES}/${NAM}.sam ${RES}/${NAM}.pe.sam ${RES}/${NAM}.se.sam

# Intermediate SAM/BAM files are superseded by the merged BAM.
rm ${RES}/${NAM}.pe.sam ${RES}/${NAM}.se.sam
rm ${RES}/${NAM}.pe.bam ${RES}/${NAM}.se.bam

#samtools faidx ${RES}/${CON}

#samtools view -@ ${NSLOTS} -bt ${RES}/${CON}.fai ${RES}/${NAM}.sam > ${RES}/${NAM}.bam

samtools rocksort -@ ${NSLOTS} -m ${MEM} ${RES}/${NAM}.bam ${RES}/${NAM}.sorted

samtools index ${RES}/${NAM}.sorted.bam

# Picard JVM options. NOTE(review): MaxPermSize is obsolete on Java 8+
# (ignored with a warning) -- harmless but could be dropped.
JAVA_OPT="-Xms2g -Xmx32g -XX:ParallelGCThreads=4 -XX:MaxPermSize=2g -XX:+CMSClassUnloadingEnabled"

# Mark duplicates and sort
java $JAVA_OPT \
     -jar ${MDUP} MarkDuplicates \
     INPUT=${RES}/${NAM}.sorted.bam \
     OUTPUT=${RES}/${NAM}.sorted.markdup.bam \
     METRICS_FILE=${RES}/${NAM}.sorted.markdup.metrics \
     AS=TRUE \
     VALIDATION_STRINGENCY=LENIENT \
     MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \
     REMOVE_DUPLICATES=TRUE

# Re-sort and index the duplicate-free BAM; flagstat gives a mapping summary.
samtools rocksort -@ ${NSLOTS} -m ${MEM} ${RES}/${NAM}.sorted.markdup.bam ${RES}/${NAM}.sorted.markdup.sorted
samtools index ${RES}/${NAM}.sorted.markdup.sorted.bam
samtools flagstat ${RES}/${NAM}.sorted.markdup.sorted.bam > ${RES}/${NAM}.sorted.markdup.sorted.flagstat

rm ${RES}/${NAM}.sorted.markdup.bam
rm ${RES}/*.fasta.*

# Determine Genome Coverage and mean coverage per contig
~/opt/bedtools2/bin/genomeCoverageBed -ibam ${RES}/${NAM}.sorted.markdup.sorted.bam > ${RES}/${NAM}.sorted.markdup.sorted.coverage
    # generate table with length and coverage stats per contig (From http://github.com/BinPro/CONCOCT)
python ~/opt/scripts/gen_contig_cov_per_bam_table.py --isbedfiles ${RES}/${CON} ${RES}/${NAM}.sorted.markdup.sorted.coverage > ${RES}/${NAM}.sorted.markdup.sorted.coverage.percontig

#ORF PREDICTION
#~/opt/Prodigal/prodigal -i ${RES}/${CON} -a ${RES}/${NAM}.aa.fasta -d ${RES}/${NAM}.nt.fasta -p meta -o ${RES}/${NAM}.gff -f gff

#COUNT FEATURES: per-ORF read counts, mapq >= 10 only
~/opt/bedtools2/bin/bedtools multicov -q 10 -bams ${RES}/${NAM}.sorted.markdup.sorted.bam -bed ${RES}/${NAM}.gff > ${RES}/${NAM}.bam.bedtools.d.cnt

#~/opt/bedtools2/bin/bedtools coverage -hist -b ${RES}/${NAM}.sorted.markdup.sorted.bam -a ${RES}/${GFF} > ${RES}/${NAM}.bam.bedtools.d.hist

# Convert BAM and GFF to coordinate-sorted BED (LC_ALL=C for byte order)
# so 'bedtools coverage -sorted' can stream them.
~/opt/bedtools2/bin/bamToBed -i ${RES}/${NAM}.sorted.markdup.sorted.bam |LC_ALL=C ~/opt/coreutils/bin/sort --parallel=${NSLOTS} -k1,1 -k2,2n > ${RES}/${NAM}.sorted.markdup.sorted.bed

~/opt/bedops/bin/gff2bed < ${RES}/${GFF} | LC_ALL=C ~/opt/coreutils/bin/sort --parallel=${NSLOTS} -k1,1 -k2,2n > ${RES}/${NAM}.bed


~/opt/bedtools2/bin/bedtools coverage -hist -sorted -b ${RES}/${NAM}.sorted.markdup.sorted.bed -a ${RES}/${NAM}.bed > ${RES}/${NAM}.bam.bedtools.d.hist

# Per-ORF coverage from the histogram (the awk script at the top of this
# gist), version-sorted for stable output.
LC_ALL=C ~/opt/mawk/bin/mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${RES}/${NAM}.bam.bedtools.d.hist | LC_ALL=C ~/opt/coreutils/bin/sort -V --parallel=${NSLOTS} > ${RES}/${NAM}.orf-coverage.txt

rm ${RES}/${NAM}.bam ${RES}/${NAM}.sorted.bam

# Compress the bulky intermediate tables in place.
find ${RES} -name "*.bed" -print -exec gzip {} \;
find ${RES} -name "*.hist" -print -exec gzip {} \;
find ${RES} -name "*.coverage" -print -exec gzip {} \;

echo "DONE"