Contig and gene renaming
#!/bin/bash -l
set -e
#set -x
set -o pipefail
function error_handler() {
echo "Error occurred in script at line: ${1}."
echo "Line exited with status: ${2}"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errexit
set -o errtrace
set -o nounset
module load samtools-rocksort
module load coreutils
declare -r CWD=$(pwd)
declare -r DIR=$(readlink -e $1)
declare -r CONTIGDIR=/bioinf/projects/osd/main/2014/06/analysis-results/assembly-evaluation/mapping/bySample/BWA/owncloud
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMD=$(basename $DIR)
declare -r NAMS=$(basename $DIR | cut -f1 -d '.' | sed -e 's/_contigs//')
declare -r OUTDIR=$(pwd)/${NAM}
echo "Subsetting BAM file from ${NAMS}..."
#INFILES
declare -r CONTIGFILE=${CONTIGDIR}/${NAM}/${NAMS}.contigs.fasta.gz
samtools view -bh ${DIR}/${NAM}.sorted.markdup.sorted.bam $(zcat ${CONTIGFILE} | grep '>' | tr -d '>' | tr $"\n" " ") > ${DIR}/${NAM}.cov2.bam
cd ${DIR}
samtools rocksort -@ 4 -m 2G ${DIR}/${NAM}.cov2.bam ${DIR}/${NAM}.cov2.sorted
samtools index ${DIR}/${NAM}.cov2.sorted.bam
samtools flagstat ${DIR}/${NAM}.cov2.bam ${DIR}${NAM}.cov2.sorted.bam > ${DIR}/${NAM}.cov2.sorted.flagstat
cd ${CWD}
#Execution:
#awk -v ASSEMBLER="idba-ud" -f ~/opt/scripts/rename-orf-coAssembly.awk osd2014-coassembly_idba-ud_contigs.gt500.nt.fasta
#Header format
#>osd{YEAR}-coassembly_{ASSEMBLER}_contig-{CONTIGNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}
{
if($0 ~ /^>/){
split($1,a,"_")
if ($7 == "1"){
STRAND = "+"
}else{
STRAND = "-"
}
print a[1]"_"a[2]"_"a[3]"_"STRAND"_"$3"_"$5"_orf-"a[4]
}else{
O=gsub("\*","",$0)
print $O
}
}
#Execution:
#awk -v -f ~/opt/scripts/rename_tara_orfs.awk TARA_004_DCM_0.22-1.6_scaffold.nt.fasta
#Header format
#>{TARASAMPLE+SCAFFOLDNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}
#TARA_004_DCM_0.22-1.6_scaffold2_1_2_+_1334_1912_orf_2
{
if($0 ~ /^>/){
split($1,a,"_")
if ($7 == "1"){
STRAND = "+"
}else{
STRAND = "-"
}
print a[1]"_"a[2]"_"a[3]"_"a[4]"_"a[5]"_"a[6]"_"STRAND"_"$3"_"$5"_orf-"a[7]
}else{
O=gsub("\*","",$0)
print $O
}
}
#Execution:
#awk -v ASSEMBLER="idba-ud" -f ~/opt/scripts/rename-orf-bySample-OSD.awk osd2014-coassembly_idba-ud_contigs.gt500.nt.fasta
#Header format
#>osd{YEAR}-coassembly_{ASSEMBLER}_contig-{CONTIGNUM}_{STRAND}_{START}_{END}_orf-{GENENUM}
#OSD143 2014-06-21 1m NPL022 idba-ud contig-0 3
#1 2 3 4 5 6 7
{
if($0 ~ /^>/){
split($1,a,"_")
if ($7 == "1"){
STRAND = "+"
}else{
STRAND = "-"
}
print a[1]"_"a[2]"_"a[3]"_"a[4]"_"a[5]"_"a[6]"_"STRAND"_"$3"_"$5"_orf-"a[7]
}else{
O=gsub("\*","",$0)
print $O
}
}
#!/bin/bash
set -e
#set -x
set -o pipefail
function error_handler() {
echo "Error occurred in script at line: ${1}." >> error.txt
echo "FILE:$2" >> error.txt
echo "Line exited with status: ${3}" >> error.txt
}
trap 'error_handler ${LINENO} ${DIR} $?' ERR
set -o errexit
set -o errtrace
set -o nounset
module load coreutils
declare -r COVERAGE=$1
declare -r DIR=$(readlink -e $2)
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMS=$(basename $DIR _contigs.gt500-mapping)
declare -r OUTDIR=$(pwd)/${NAM}
#INFILES
declare -r CONTIGFILE=${DIR}/${NAM}.fa
declare -r CONTIGCOVERAGE=${DIR}/${NAM}.sorted.markdup.sorted.coverage.percontig
declare -r GENECOVERAGE=${DIR}/${NAM}.bam.bedtools.d.hist
#OUTFILES
declare -r OUTFILT=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.txt
declare -r OUTFILTIDS=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.ids.txt
declare -r OUTFILTFASTA=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.fasta
declare -r OUTFILTLT=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.txt
declare -r OUTFILTLTIDS=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.ids.txt
declare -r NTFILE=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r AAFILE=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r NTFILEFILT=${OUTDIR}/${NAMS}.orfs.nt.contigs-gt${COVERAGE}.fasta
declare -r AAFILEFILT=${OUTDIR}/${NAMS}.orfs.aa.contigs-gt${COVERAGE}.fasta
declare -r ORFIDSFILT=${OUTDIR}/${NAMS}.orfs.ids.coverage-lt${COVERAGE}.txt
declare -r OUTGENECONTIG=${OUTDIR}/${NAMS}.orf-coverage.txt
declare -r OUTGENECONTIGGTCOVERAGE=${OUTDIR}/${NAMS}.orf-coverage.gt${COVERAGE}.txt
[[ -d ${OUTDIR} ]] && echo "Deleting existig folder ${OUTDIR}";rm -rf ${OUTDIR}
mkdir -p ${OUTDIR}
#filter contigs by coverage
echo "Processing coverage file: ${CONTIGCOVERAGE}"
echo "Coverage selected: ${COVERAGE}"
echo "Contig file: ${CONTIGFILE}"
echo "Filtering contigs..."
tail -n+2 ${CONTIGCOVERAGE} | \
sort --parallel=8 -Vk4,4 -k1,1 | \
mawk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}
tail -n+2 ${CONTIGCOVERAGE} | \
sort --parallel=8 -Vk4,4 -k1,1 | \
mawk -v COVERAGE=${COVERAGE} '$4 < COVERAGE' | tee ${OUTFILTLT} | cut -f1 | mawk '{print $0"_"}' > ${OUTFILTLTIDS}
#filter contigs
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${OUTFILTIDS} in=${CONTIGFILE} out=${OUTFILTFASTA} include=t overwrite=t
#rename ORF files
echo "Renaming ORF files..."
mawk -f ~/opt/scripts/rename-genes-bySample.awk ${DIR}/${NAM}.nt.fasta > ${NTFILE}
mawk -f ~/opt/scripts/rename-genes-bySample.awk ${DIR}/${NAM}.aa.fasta > ${AAFILE}
# get ORFs ids
LC_ALL=C grep -F -f ${OUTFILTLTIDS} <(grep '>' ${NTFILE} | tr -d '>') > ${ORFIDSFILT}
echo "Generating hist files..."
bamToBed -i ${DIR}/${NAM}.sorted.markdup.sorted.bam | sort --parallel=8 -k1,1 -k2,2n > ${DIR}/${NAM}.sorted.markdup.sorted.bed
gff2bed < ${DIR}/${NAM}.gff | sort -k1,1 -k2,2n > ${DIR}/${NAM}.bed
bedtools coverage -hist -sorted -b ${DIR}/${NAM}.sorted.markdup.sorted.bed -a ${DIR}/${NAM}.bed > ${GENECOVERAGE}
#Get gene coverage
echo "Getting ORF coverage..."
LC_ALL=C mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${GENECOVERAGE} | sort -V --parallel=8 > ${OUTGENECONTIG}
LC_ALL=C grep -F -f <(mawk '{print $0"_"}' ${OUTFILTIDS}) ${OUTGENECONTIG} > ${OUTGENECONTIGGTCOVERAGE}
#Filter genes by CONTIG COVERAGE
echo "Filtering genes by contig coverage..."
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${NTFILE} out=${NTFILEFILT} include=f overwrite=t
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${AAFILE} out=${AAFILEFILT} include=f overwrite=t
# What we need
#1.Contigs with selected COVERAGE
#2.NT and AA sequences for the genes on selcted contigs
#3.File with coverage for each gene
declare -r OUTCONTIGS=${OUTDIR}/${NAMS}.contigs.fasta
declare -r OUTORFSNT=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r OUTORFSAA=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r OUTORFCOV=${OUTDIR}/${NAMS}.orfs.coverage.txt
mv ${OUTFILTFASTA} ${OUTCONTIGS}
mv ${NTFILEFILT} ${OUTORFSNT}
mv ${AAFILEFILT} ${OUTORFSAA}
mv ${OUTGENECONTIGGTCOVERAGE} ${OUTORFCOV}
# Delete non useful files
rm ${OUTDIR}/*lt${COVERAGE}*
rm ${OUTDIR}/*gt${COVERAGE}*
rm ${OUTGENECONTIG}
#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
if ($0 !~ /^all/){
split($4,a,"_");
b=$1"_"$6"_"$2+1"_"$3"_orf-"a[2]"\t"$13;
c[b]=c[b] + ($11*$14)
}
}END{
for (var in c){
print var"\t"c[var]
}
}
#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
if ($0 i ~ /^all/){
next;
}else{
split($4,a,"_");
b=$1"_"$6"_"$2+1"_"$3"_orf-"a[2]"\t"$13;
c[b]=c[b] + ($11*$14)
}
}END{
for (var in c){
print var"\t"c[var]
}
}
#Read a hist file from beddtools coverage -hist and create a coverage per gene file.
#Coverage per each gene is calculated as: coverage_geneA = sum(depth_of_coverage * fraction_covered)
{
if ($0 ~ /^all/){
next;
}else{
split($9,a,";");
split(a[1],d,"_")
b=$1"_"$7"_"($4)"_"$5"_orf-"d[2]"\t"$12;
c[b]=c[b] + ($10*$13)
}
}END{
for (var in c){
print var"\t"c[var]
}
}
cat ~/opt/scripts/rename_GOS.awk
# Reads file with samples
# Rename sequences:
# >Sample_name-Read_num
{
if(NR == FNR){
## If this is the first file, save the values of $1
## in the array n.
n[$1] = $2
}
## If we have moved on to the 2nd file
else{
if ($0 ~ /^>/){
gsub(">","",$1)
for (i=1;i<=NF;i++){
if ($i ~ /sample_id/){
O=i
gsub("/sample_id=","",$O)
if ($O in n) {
V=n[$O]
}
}
}
print ">"V"-"$1
}else{
print $0
}
}
}
# if($3 in n){
# }
#!/bin/bash
set -e
#set -x
set -o pipefail
function error_handler() {
echo "Error occurred in script at line: ${1}."
echo "Line exited with status: ${2}"
}
trap 'error_handler ${LINENO} $?' ERR
set -o errexit
set -o errtrace
set -o nounset
#module load coreutils
declare -r COVERAGE=$1
declare -r DIR=$(readlink -e $2)
declare -r NAM=$(basename $DIR -mapping)
declare -r NAMS=$(basename $DIR | cut -f1 -d '.' | sed -e 's/_contigs//')
declare -r OUTDIR=$(pwd)/${NAM}
#INFILES
declare -r CONTIGFILE=${DIR}/${NAM}.fa
declare -r CONTIGCOVERAGE=${DIR}/${NAM}.sorted.markdup.sorted.coverage.percontig
declare -r GENECOVERAGE=${DIR}/${NAM}.bam.bedtools.d.hist
#OUTFILES
declare -r OUTFILT=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.txt
declare -r OUTFILTIDS=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.ids.txt
declare -r OUTFILTFASTA=${OUTDIR}/${NAMS}.contigs.gt${COVERAGE}.fasta
declare -r OUTFILTLT=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.txt
declare -r OUTFILTLTIDS=${OUTDIR}/${NAMS}.contigs.lt${COVERAGE}.ids.txt
declare -r NTFILE=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r AAFILE=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r NTFILEFILT=${OUTDIR}/${NAMS}.orfs.nt.contigs-gt${COVERAGE}.fasta
declare -r AAFILEFILT=${OUTDIR}/${NAMS}.orfs.aa.contigs-gt${COVERAGE}.fasta
declare -r ORFIDSFILT=${OUTDIR}/${NAMS}.orfs.ids.coverage-lt${COVERAGE}.txt
declare -r OUTGENECONTIG=${OUTDIR}/${NAMS}.orf-coverage.txt
declare -r OUTGENECONTIGGTCOVERAGE=${OUTDIR}/${NAMS}.orf-coverage.gt${COVERAGE}.txt
[[ -d ${OUTDIR} ]] && echo "Deleting existig folder ${OUTDIR}";rm -rf ${OUTDIR}
mkdir -p ${OUTDIR}
#filter contigs by coverage
echo "Processing coverage file: ${CONTIGCOVERAGE}"
echo "Coverage selected: ${COVERAGE}"
echo "Contig file: ${CONTIGFILE}"
echo "Filtering contigs..."
tail -n+2 ${CONTIGCOVERAGE} | \
sort --parallel=8 -Vk4,4 -k1,1 | \
mawk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}
tail -n+2 ${CONTIGCOVERAGE} | \
sort --parallel=8 -Vk4,4 -k1,1 | \
mawk -v COVERAGE=${COVERAGE} '$4 < COVERAGE' | tee ${OUTFILTLT} | cut -f1 | mawk '{print $0"_"}' > ${OUTFILTLTIDS}
#filter contigs
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${OUTFILTIDS} in=${CONTIGFILE} out=${OUTFILTFASTA} include=t overwrite=t
#rename ORF files
echo "Renaming ORF files..."
mawk -f ~/opt/scripts/rename-genes-coAssembly.awk ${DIR}/${NAM}.nt.fasta > ${NTFILE}
mawk -f ~/opt/scripts/rename-genes-coAssembly.awk ${DIR}/${NAM}.aa.fasta > ${AAFILE}
# get ORFs ids
LC_ALL=C grep -F -f ${OUTFILTLTIDS} <(grep '>' ${NTFILE} | tr -d '>') > ${ORFIDSFILT}
#Get gene coverage
echo "Getting ORF coverage..."
LC_ALL=C mawk -f ~/opt/scripts/get_gene_coverage_from_bam-coAssembly.awk ${GENECOVERAGE} | sort -V --parallel=8 > ${OUTGENECONTIG}
LC_ALL=C grep -F -f <(mawk '{print $0"_"}' ${OUTFILTIDS}) ${OUTGENECONTIG} > ${OUTGENECONTIGGTCOVERAGE}
#Filter genes by CONTIG COVERAGE
echo "Filtering genes by contig coverage..."
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${NTFILE} out=${NTFILEFILT} include=f overwrite=t
/bioinf/software/bbmap/bbmap-35.14/filterbyname.sh names=${ORFIDSFILT} in=${AAFILE} out=${AAFILEFILT} include=f overwrite=t
# What we need
#1.Contigs with selected COVERAGE
#2.NT and AA sequences for the genes on selcted contigs
#3.File with coverage for each gene
declare -r OUTCONTIGS=${OUTDIR}/${NAMS}.contigs.fasta
declare -r OUTORFSNT=${OUTDIR}/${NAMS}.orfs.nt.fasta
declare -r OUTORFSAA=${OUTDIR}/${NAMS}.orfs.aa.fasta
declare -r OUTORFCOV=${OUTDIR}/${NAMS}.orfs.abundance.txt
mv ${OUTFILTFASTA} ${OUTCONTIGS}
mv ${NTFILEFILT} ${OUTORFSNT}
mv ${AAFILEFILT} ${OUTORFSAA}
mv ${OUTGENECONTIGGTCOVERAGE} ${OUTORFCOV}
# Delete non useful files
rm ${OUTDIR}/*lt${COVERAGE}*
rm ${OUTDIR}/*gt${COVERAGE}*
rm ${OUTGENECONTIG}
#!/usr/bin/env python
"""
@author: inodb
This is the same script as gen_input_table.py from CONCOCT. It is used in
metassemble to determine contig coverage from a bam file and a fasta file.
"""
import sys
import os
import argparse
import subprocess
from signal import signal, SIGPIPE, SIG_DFL
from Bio import SeqIO, SeqUtils
def get_gc_and_len_dict(fastafile):
"""Creates a dictionary with the fasta id as key and GC and length as keys
for the inner dictionary."""
out_dict = {}
for rec in SeqIO.parse(fastafile, "fasta"):
out_dict[rec.id] = {}
out_dict[rec.id]["length"] = len(rec.seq)
out_dict[rec.id]["GC"] = SeqUtils.GC(rec.seq)
return out_dict
def get_bedcov_dict(bedcoverage):
"""Uses the BEDTools genomeCoverageBed histogram output to determine mean
coverage and percentage covered for each contig.
Returns dict with fasta id as key and percentage covered and cov_mean as
keys for the inner dictionary."""
out_dict = {}
# Check if given argument is a file, otherwise use the content of the
# variable
if os.path.isfile(bedcoverage):
fh = open(bedcoverage)
else:
fh = bedcoverage.split('\n')[:-1]
for line in fh:
cols = line.split()
try:
d = out_dict[cols[0]]
except KeyError:
d = {}
out_dict[cols[0]] = d
if int(cols[1]) == 0:
d["percentage_covered"] = 100 - float(cols[4]) * 100.0
else:
d["cov_mean"] = d.get("cov_mean", 0) + int(cols[1]) * float(cols[4])
return out_dict
def print_sample_columns(t, header="cov_mean_sample_"):
sys.stdout.write((("\t" + header + "%s") * len(t)) % t)
def print_input_table(fastadict, bedcovdicts, samplenames=None):
"""Writes the input table for Probin to stdout. See hackathon google
docs."""
# Header
sys.stdout.write("contig\tlength\tGC")
if samplenames is None:
# Use index if no sample names given in header
print_sample_columns(tuple(range(len(bedcovdicts))), "cov_mean_sample_")
print_sample_columns(tuple(range(len(bedcovdicts))), "percentage_covered_sample_")
else:
# Use given sample names in header
assert(len(samplenames) == len(bedcovdicts))
print_sample_columns(tuple(samplenames), "cov_mean_sample_")
print_sample_columns(tuple(samplenames), "percentage_covered_sample_")
sys.stdout.write("\n")
# Content
assert(len(fastadict) > 0)
for acc in fastadict:
# fasta stats
sys.stdout.write("%s\t%d\t%f" %
(
acc,
fastadict[acc]['length'],
fastadict[acc]['GC']
)
)
# Print mean
for bcd in bedcovdicts:
try:
# Print cov mean
sys.stdout.write("\t%f" % (bcd[acc]["cov_mean"]))
except KeyError:
# No reads mapped to this contig
sys.stdout.write("\t0")
# Print percentage covered
for bcd in bedcovdicts:
try:
# Print percentage covered
sys.stdout.write("\t%f" % (bcd[acc]["percentage_covered"]))
except KeyError:
if acc in bcd and "cov_mean" in bcd[acc]:
# all reads were covered
sys.stdout.write("\t100")
else:
# No reads mapped to this contig
sys.stdout.write("\t0")
sys.stdout.write("\n")
def gen_contig_cov_per_bam_table(fastafile, bamfiles, samplenames=None, isbedfiles=False):
"""Reads input files into dictionaries then prints everything in the table
format required for running ProBin."""
bedcovdicts = []
# Determine coverage information from bam file using BEDTools
for i, bf in enumerate(bamfiles):
if isbedfiles is False:
p = subprocess.Popen(["genomeCoverageBed", "-ibam", bf], stdout=subprocess.PIPE)
out, err = p.communicate()
if p.returncode != 0:
sys.stderr.write(out)
raise Exception('Error with genomeCoverageBed')
else:
bedcovdicts.append(get_bedcov_dict(out))
else:
bedcovdicts.append(get_bedcov_dict(bf))
print_input_table(get_gc_and_len_dict(fastafile), bedcovdicts, samplenames=samplenames)
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("fastafile", help="Contigs fasta file")
parser.add_argument("bamfiles", nargs='+', help="BAM files with mappings to contigs")
parser.add_argument("--samplenames", default=None, help="File with sample names, one line each. Should be same nr as bamfiles.")
parser.add_argument("--isbedfiles", action='store_true',
help="The bamfiles argument are outputs of genomeCoverageBed, not the actual bam file. Skips running genomeCoverageBed from within this script.")
args = parser.parse_args()
# Get sample names
if args.samplenames is not None:
samplenames = [s[:-1] for s in open(args.samplenames).readlines()]
if len(samplenames) != len(args.bamfiles):
raise Exception("Nr of names in samplenames should be equal to nr of given bamfiles")
else:
samplenames = None
# ignore broken pipe error when piping output
# http://newbebweb.blogspot.pt/2012/02/python-head-ioerror-errno-32-broken.html
signal(SIGPIPE, SIG_DFL)
gen_contig_cov_per_bam_table(args.fastafile, args.bamfiles,
samplenames=samplenames, isbedfiles=args.isbedfiles)
#!/bin/bash
set -e
declare -r COVERAGE=$1
declare -r FILE=$2
declare -r CONTIGS=$3
NAM=$(echo $FILE | cut -f 1,2 -d '.')
echo "Processing coverage file: $FILE"
echo "Coverage selected: $1"
echo "Contig file: $CONTIGS"
declare OUTFILT=${NAM}.contigs.gt${COVERAGE}.txt
declare OUTFILTIDS=${NAM}.contigs.gt${COVERAGE}.ids.txt
declare OUTFILTFASTA=${NAM}.contigs.gt${COVERAGE}.fasta
echo "Filtering contigs..."
tail -n+2 ${FILE} | \
sort --parallel=8 -Vk4,4 -k1,1 | \
awk -v COVERAGE=${COVERAGE} '$4 >= COVERAGE' | tee ${OUTFILT} | cut -f1 > ${OUTFILTIDS}
#filter contigs
/bioinf/software/bbmap/bbmap-34.00/filterbyname.sh names=$OUTFILTIDS in=${CONTIGS} out=${OUTFILTFASTA} include=t overwrite=t
#filter genes from contigs