#Read a hist file from bedtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
    # Skip the whole-file summary histogram rows bedtools appends ("all ...").
    # NOTE: the original tested `$0 i ~ /^all/` -- the stray `i` was an
    # uninitialized variable concatenated to the record; same result, but
    # accidental. Cleaned up to a plain match.
    if ($0 ~ /^all/) {
        next;
    } else {
        # $4 is expected to look like <prefix>_<orfnumber>; keep the ORF number.
        split($4, a, "_");
        # Build the per-gene key: contig, strand, start+1 (presumably 0-based
        # BED start converted to 1-based -- confirm against gff2bed output),
        # end, orf id, then $13 carried along tab-separated.
        # Parentheses around ($2 + 1) make the precedence explicit: in awk,
        # `+` binds tighter than concatenation, so behavior is unchanged.
        b = $1 "_" $6 "_" ($2 + 1) "_" $3 "_orf-" a[2] "\t" $13;
        # Accumulate depth ($11) weighted by the fraction of the feature
        # covered at that depth ($14).
        c[b] = c[b] + ($11 * $14);
    }
}
END {
    # Emit one line per gene: key <TAB> summed coverage.
    for (var in c) {
        print var "\t" c[var];
    }
}
#!/usr/bin/env python
"""
@author: inodb
This is the same script as gen_input_table.py from CONCOCT. It is used in
metassemble to determine contig coverage from a bam file and a fasta file.
"""
import sys
import os
import argparse
import subprocess
from signal import signal, SIGPIPE, SIG_DFL
from Bio import SeqIO, SeqUtils
def get_gc_and_len_dict(fastafile):
    """Parse a fasta file and map each record id to a dict with that
    sequence's "length" and "GC" percentage (via Bio.SeqUtils.GC)."""
    stats = {}
    for record in SeqIO.parse(fastafile, "fasta"):
        sequence = record.seq
        stats[record.id] = {
            "length": len(sequence),
            "GC": SeqUtils.GC(sequence),
        }
    return stats
def get_bedcov_dict(bedcoverage):
    """Uses the BEDTools genomeCoverageBed histogram output to determine mean
    coverage and percentage covered for each contig.

    bedcoverage -- either a path to a histogram file or the histogram text
                   itself (raw genomeCoverageBed output).

    Returns dict with fasta id as key and percentage covered and cov_mean as
    keys for the inner dictionary."""
    out_dict = {}
    # Check if given argument is a file, otherwise use the content of the
    # variable
    if os.path.isfile(bedcoverage):
        # Use a context manager so the handle is closed (the previous
        # version leaked the open file).
        with open(bedcoverage) as fh:
            lines = fh.readlines()
    else:
        # splitlines() -- unlike split('\n')[:-1] -- does not drop the last
        # data line when the text lacks a trailing newline.
        lines = bedcoverage.splitlines()
    for line in lines:
        cols = line.split()
        if not cols:
            # Ignore blank lines instead of raising IndexError below.
            continue
        # One inner dict per contig id (cols[0]).
        d = out_dict.setdefault(cols[0], {})
        if int(cols[1]) == 0:
            # Zero-depth histogram row: cols[4] is the uncovered fraction,
            # so percentage covered = 100 - uncovered%.
            d["percentage_covered"] = 100 - float(cols[4]) * 100.0
        else:
            # Mean coverage = sum(depth * fraction_of_contig_at_that_depth).
            d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
    return out_dict
def print_sample_columns(t, header="cov_mean_sample_"):
    """Write one tab-prefixed column label to stdout for each element of t,
    each label being `header` followed by that element (no trailing newline)."""
    labels = "".join("\t%s%s" % (header, item) for item in t)
    sys.stdout.write(labels)
def print_input_table(fastadict, bedcovdicts, samplenames=None):
    """Write the coverage input table for ProBin to stdout: a header row,
    then one row per contig with length, GC, and per-sample mean coverage
    and percentage covered."""
    out = sys.stdout.write

    # --- Header row ---
    out("contig\tlength\tGC")
    if samplenames is None:
        # No sample names given: label the columns with 0..n-1 indices.
        labels = tuple(range(len(bedcovdicts)))
    else:
        # One name per coverage dict is required.
        assert(len(samplenames) == len(bedcovdicts))
        labels = tuple(samplenames)
    for prefix in ("cov_mean_sample_", "percentage_covered_sample_"):
        out((("\t" + prefix + "%s") * len(labels)) % labels)
    out("\n")

    # --- One row per contig ---
    assert(len(fastadict) > 0)
    for acc in fastadict:
        # fasta stats: id, length, GC
        out("%s\t%d\t%f" % (acc, fastadict[acc]['length'], fastadict[acc]['GC']))
        # Mean coverage, one column per sample
        for bcd in bedcovdicts:
            info = bcd.get(acc, {})
            if "cov_mean" in info:
                out("\t%f" % (info["cov_mean"]))
            else:
                # No reads mapped to this contig
                out("\t0")
        # Percentage covered, one column per sample
        for bcd in bedcovdicts:
            info = bcd.get(acc, {})
            if "percentage_covered" in info:
                out("\t%f" % (info["percentage_covered"]))
            elif "cov_mean" in info:
                # Coverage exists but no zero-depth histogram row was seen,
                # i.e. every base of the contig was covered.
                out("\t100")
            else:
                # No reads mapped to this contig
                out("\t0")
        out("\n")
def gen_contig_cov_per_bam_table(fastafile, bamfiles, samplenames=None, isbedfiles=False):
    """Reads input files into dictionaries then prints everything in the table
    format required for running ProBin.

    fastafile   -- contigs fasta file
    bamfiles    -- BAM files, or genomeCoverageBed outputs when isbedfiles
    samplenames -- optional list of sample names, same length as bamfiles
    isbedfiles  -- when True, skip running genomeCoverageBed and treat each
                   entry of bamfiles as its output (file path or text)
    """
    bedcovdicts = []
    # The enumerate() index of the original loop was never used.
    for bf in bamfiles:
        if isbedfiles is False:
            # Determine coverage information from the bam file using BEDTools
            p = subprocess.Popen(["genomeCoverageBed", "-ibam", bf], stdout=subprocess.PIPE)
            out, err = p.communicate()
            # On Python 3 communicate() returns bytes; decode so both the
            # error path and the histogram parser get text (py2-compatible).
            if not isinstance(out, str):
                out = out.decode()
            if p.returncode != 0:
                sys.stderr.write(out)
                raise Exception('Error with genomeCoverageBed')
            else:
                bedcovdicts.append(get_bedcov_dict(out))
        else:
            # Already a genomeCoverageBed histogram -- parse it directly.
            bedcovdicts.append(get_bedcov_dict(bf))
    print_input_table(get_gc_and_len_dict(fastafile), bedcovdicts, samplenames=samplenames)
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("fastafile", help="Contigs fasta file")
    parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to contigs")
    parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr as bamfiles.")
    parser.add_argument("--isbedfiles", action='store_true',
        help="The bamfiles argument are outputs of genomeCoverageBed, not the actual bam file. Skips running genomeCoverageBed from within this script.")
    args = parser.parse_args()
    # Get sample names
    if args.samplenames is not None:
        # rstrip('\n') instead of s[:-1]: the old slicing chopped the last
        # character of the final name whenever the file lacked a trailing
        # newline. The context manager also closes the previously-leaked
        # file handle.
        with open(args.samplenames) as fh:
            samplenames = [s.rstrip('\n') for s in fh]
        if len(samplenames) != len(args.bamfiles):
            raise Exception("Nr of names in samplenames should be equal to nr of given bamfiles")
    else:
        samplenames = None
    # ignore broken pipe error when piping output
    # http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
    signal(SIGPIPE, SIG_DFL)
    gen_contig_cov_per_bam_table(args.fastafile, args.bamfiles,
                                 samplenames=samplenames, isbedfiles=args.isbedfiles)
#!/bin/bash
# Cluster mapping pipeline: map TARA reads to a (co-)assembly and derive
# per-contig and per-ORF coverage tables. Runs under SLURM (uses
# SLURM_JOB_CPUS_PER_NODE below).
set -x
set -e
# Load the tool stack from environment modules.
module purge
module load python/2.7__qiime
module load samtools/dnanexus
module load java/1.8.0
export JAVA_HOME=/system/software/linux-x86_64/java/jdk-1.8.0
export PATH=~/opt/bedtools2/bin:~/opt/bedops/bin/:$PATH
# NAM is the sample/assembly name; 'NOM' looks like a placeholder that is
# substituted per job -- confirm how this script is instantiated.
declare -r NAM='NOM'
declare -r CON=${NAM}.fasta
declare -r GFF=${NAM}.gff
# Threads and per-thread sort memory.
declare -r NSLOTS=${SLURM_JOB_CPUS_PER_NODE}
declare -r MEM=3G
# Tool locations (Picard MarkDuplicates, BWA, Trimmomatic).
declare -r MDUP=~/opt/picard-tools-1.133/picard.jar
declare -r BWA=~/opt/bwa/bwa
declare -r TRIM=~/opt/Trimmomatic-0.32/trimmomatic-0.32.jar
#Prepare data
declare -r RES=/data/oerc-microb3/afernandezguerra/TARA/results/${NAM}-mapping
# NOTE(review): making TMPDIR point at the results dir also redirects the
# temp files of child tools -- presumably intentional for disk space; verify.
declare -r TMPDIR=${RES}
#create folder (wipe any previous run first)
[[ -d ${RES} ]] && rm -rf ${RES}
mkdir -p ${RES}
cd ${RES}
# Get read files and QC them with trimmomatic
declare -r AFILE=/data/oerc-microb3/afernandezguerra/TARA/input/assem-file-name2url.txt
#get read files (lines of AFILE matching this sample name)
RFILES=$(grep ${NAM} ${AFILE})
#How many files do we have
NRFILE=$(echo "${RFILES}" | wc -l)
#we need to download them, we use gnu parallel to get them all at the same time
#echo "${RFILES}" | cut -f2 -d ' ' | ~/opt/gnu-parallel/bin/parallel -j ${NRFILE} "curl -s -O {}"
# Fetch each FASTQ over rsync; retry every 5 s until success (--append
# resumes partially transferred files).
echo "${RFILES}" | \
cut -f 2 -d ' ' | \
while read LINE
do
N=$(basename "${LINE}")
while ! rsync --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" UV2000.mpi-bremen.de:/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} . ;do sleep 5;done
done
# Final QC-trimmed output files: paired ends and singletons.
PE1=${RES}/${NAM}.R1.fastq
PE2=${RES}/${NAM}.R2.fastq
SE=${RES}/${NAM}.SR.fastq
#we need to combine them (concatenate per-run files into one R1/R2 pair)
cat *_1.fastq.gz > R1.fastq.gz &
cat *_2.fastq.gz > R2.fastq.gz &
wait
rm *_1.fastq.gz &
rm *_2.fastq.gz &
wait
# Adapter/quality trimming with BBDuk (min len 45, Q20 trim, TARA adapters).
~/opt/bbmap/bbduk.sh in=R1.fastq.gz in2=R2.fastq.gz out1="${PE1}" out2="${PE2}" outs="${SE}" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=~/opt/bbmap/resources/tara.fa.gz hdist=1 tbo tpe t=${NSLOTS}
#remove downloaded files
#rm ${RES}/*.fastq.gz
#quality control with trimmomatic and tara adapters (min len 45 and Q20)
#java -jar ${TRIM} PE -threads ${NSLOTS} -phred33 R1.fastq.gz R2.fastq.gz s1_pe s1_se s2_pe s2_se ILLUMINACLIP:/home/oerc-microb3/afernandezguerra/opt/Trimmomatic-0.32/adapters/tara.fa:2:30:10:8:true LEADING:3 SLIDINGWINDOW:4:20 MINLEN:45
#~/opt/bbmap/bbduk.sh in=R1.fastq.gz in2=R2.fastq.gz out1=${PE1} out2=${PE2} outs=${SE} qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=~/opt/bbmap/resources/tara.fa.gz hdist=1 tbo tpe
#rename files
#mv s1_pe ${PE1}
#mv s2_pe ${PE2}
#cat s1_se s2_se > ${SE}
#rm s1_se s2_se
# Raw concatenated inputs are no longer needed after trimming.
rm R1.fastq.gz &
rm R2.fastq.gz &
wait
#link contig file
ln -s /data/oerc-microb3/afernandezguerra/TARA/input/assemblies/${NAM}.fasta ${RES}/${CON}
ln -s /data/oerc-microb3/afernandezguerra/TARA/input/ORFS/${NAM}_scaffold.gff ${RES}/${GFF}
#${BBMA} in1=${IN1},${IN3} in2=${IN2},null ref=${RES}/${CON} out=${RES}/${NAM}.sam outm=${RES}/${NAM}.mapped.fastq nodisk threads=4 ambiguous=best usejni=t append
cd ${RES}
# Map singletons and pairs separately with BWA-MEM (-M marks shorter split
# hits as secondary, for Picard compatibility).
${BWA} index ${RES}/${CON}
${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${SE} > ${RES}/${NAM}.se.sam
${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${PE1} ${PE2} > ${RES}/${NAM}.pe.sam
# ${BWA} index ${RES}/${CON}
# ${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${SE} > ${RES}/${NAM}.se.sam
# ${BWA} mem -M -t ${NSLOTS} ${RES}/${CON} ${PE1} ${PE2} > ${RES}/${NAM}.pe.sam
# Trimmed reads are no longer needed once the SAMs exist.
rm ${PE1} ${PE2} ${SE}
# gzip ${PE1} &
# gzip ${PE2} &
# gzip ${SE} &
# wait
samtools faidx ${RES}/${CON}
# We filter out unmapped reads and one with mapq < 10
samtools view -@ ${NSLOTS} -q 10 -F 4 -u -bt ${RES}/${CON}.fai ${RES}/${NAM}.se.sam | samtools rocksort -@ ${NSLOTS} -m ${MEM} - ${RES}/${NAM}.se
samtools view -@ ${NSLOTS} -q 10 -F 4 -u -bt ${RES}/${CON}.fai ${RES}/${NAM}.pe.sam | samtools rocksort -@ ${NSLOTS} -m ${MEM} - ${RES}/${NAM}.pe
# Merge SE and PE BAMs; uses the 0.1.19 samtools explicitly -- presumably
# the loaded module's merge is incompatible; confirm before changing.
/system/software/linux-x86_64/samtools/0.1.19/samtools merge -@ ${NSLOTS} ${RES}/${NAM}.bam ${RES}/${NAM}.pe.bam ${RES}/${NAM}.se.bam
#samtools merge ${RES}/${NAM}.sam ${RES}/${NAM}.pe.sam ${RES}/${NAM}.se.sam
rm ${RES}/${NAM}.pe.sam ${RES}/${NAM}.se.sam
rm ${RES}/${NAM}.pe.bam ${RES}/${NAM}.se.bam
#samtools faidx ${RES}/${CON}
#samtools view -@ ${NSLOTS} -bt ${RES}/${CON}.fai ${RES}/${NAM}.sam > ${RES}/${NAM}.bam
samtools rocksort -@ ${NSLOTS} -m ${MEM} ${RES}/${NAM}.bam ${RES}/${NAM}.sorted
samtools index ${RES}/${NAM}.sorted.bam
JAVA_OPT="-Xms2g -Xmx32g -XX:ParallelGCThreads=4 -XX:MaxPermSize=2g -XX:+CMSClassUnloadingEnabled"
# Mark duplicates and sort
java $JAVA_OPT \
-jar ${MDUP} MarkDuplicates \
INPUT=${RES}/${NAM}.sorted.bam \
OUTPUT=${RES}/${NAM}.sorted.markdup.bam \
METRICS_FILE=${RES}/${NAM}.sorted.markdup.metrics \
AS=TRUE \
VALIDATION_STRINGENCY=LENIENT \
MAX_FILE_HANDLES_FOR_READ_ENDS_MAP=1000 \
REMOVE_DUPLICATES=TRUE
samtools rocksort -@ ${NSLOTS} -m ${MEM} ${RES}/${NAM}.sorted.markdup.bam ${RES}/${NAM}.sorted.markdup.sorted
samtools index ${RES}/${NAM}.sorted.markdup.sorted.bam
samtools flagstat ${RES}/${NAM}.sorted.markdup.sorted.bam > ${RES}/${NAM}.sorted.markdup.sorted.flagstat
rm ${RES}/${NAM}.sorted.markdup.bam
rm ${RES}/*.fasta.*
# Determine Genome Coverage and mean coverage per contig
~/opt/bedtools2/bin/genomeCoverageBed -ibam ${RES}/${NAM}.sorted.markdup.sorted.bam > ${RES}/${NAM}.sorted.markdup.sorted.coverage
# generate table with length and coverage stats per contig (From http://github.com/BinPro/CONCOCT)
python ~/opt/scripts/gen_contig_cov_per_bam_table.py --isbedfiles ${RES}/${CON} ${RES}/${NAM}.sorted.markdup.sorted.coverage > ${RES}/${NAM}.sorted.markdup.sorted.coverage.percontig
#ORF PREDICTION
#~/opt/Prodigal/prodigal -i ${RES}/${CON} -a ${RES}/${NAM}.aa.fasta -d ${RES}/${NAM}.nt.fasta -p meta -o ${RES}/${NAM}.gff -f gff
#COUNT FEATURES
~/opt/bedtools2/bin/bedtools multicov -q 10 -bams ${RES}/${NAM}.sorted.markdup.sorted.bam -bed ${RES}/${NAM}.gff > ${RES}/${NAM}.bam.bedtools.d.cnt
#~/opt/bedtools2/bin/bedtools coverage -hist -b ${RES}/${NAM}.sorted.markdup.sorted.bam -a ${RES}/${GFF} > ${RES}/${NAM}.bam.bedtools.d.hist
# Per-ORF coverage histogram: BAM -> sorted BED, GFF -> sorted BED, then
# bedtools coverage -hist -sorted on the two (LC_ALL=C keeps sort order
# byte-wise, matching what -sorted expects).
~/opt/bedtools2/bin/bamToBed -i ${RES}/${NAM}.sorted.markdup.sorted.bam |LC_ALL=C ~/opt/coreutils/bin/sort --parallel=${NSLOTS} -k1,1 -k2,2n > ${RES}/${NAM}.sorted.markdup.sorted.bed
~/opt/bedops/bin/gff2bed < ${RES}/${GFF} | LC_ALL=C ~/opt/coreutils/bin/sort --parallel=${NSLOTS} -k1,1 -k2,2n > ${RES}/${NAM}.bed
~/opt/bedtools2/bin/bedtools coverage -hist -sorted -b ${RES}/${NAM}.sorted.markdup.sorted.bed -a ${RES}/${NAM}.bed > ${RES}/${NAM}.bam.bedtools.d.hist
# Collapse the histogram into per-ORF mean coverage with the awk script.
LC_ALL=C ~/opt/mawk/bin/mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${RES}/${NAM}.bam.bedtools.d.hist | LC_ALL=C ~/opt/coreutils/bin/sort -V --parallel=${NSLOTS} > ${RES}/${NAM}.orf-coverage.txt
# Clean up intermediates and compress the bulky text outputs.
rm ${RES}/${NAM}.bam ${RES}/${NAM}.sorted.bam
find ${RES} -name "*.bed" -print -exec gzip {} \;
find ${RES} -name "*.hist" -print -exec gzip {} \;
find ${RES} -name "*.coverage" -print -exec gzip {} \;
echo "DONE"