OSD_TARA_OM_NOGS
#!/bin/bash
set -x
set -e
set -o pipefail
set -o errexit
set -o errtrace
set -o nounset
# Run uproc and extract contigs
declare -r NAM='NOM'
declare -r HMM3=/bioinf/software/hmmer/hmmer-3.1b2/bin/hmmsearch
declare -r NOGDB=/local/biodb/hmmer3/NOG_hmm
declare RES=/bioinf/projects/osd/main/2014/06/analysis-results/NOGs/results/${NAM}
declare -r NOGOUT=${RES}/${NAM}.NOG4.tsv
declare -r NUMNOGS=190648
declare -r NOGLOG=${RES}/hmm.log
declare -r NSLOTS=${NSLOTS}
if [ ! -e ${NOGLOG} ]; then
# create result folder
[[ -d ${RES} ]] && rm -rf ${RES}
mkdir -p ${RES}
else
echo "Resuming NOG search..."
fi
cd ${RES}
# Get read files and QC them with trimmomatic
#declare -r AFILE=/data/oerc-microb3/afernandezguerra/BGC/input/assem-file-name2url.txt
#get read files
#RFILES=$(grep ${NAM} ${AFILE})
#How many files do we have
#NRFILE=$(echo "${RFILES}" | wc -l)
#echo "${RFILES}" | \
# cut -f 2 -d ' ' | \
# while read LINE
# do
# N=$(basename "${LINE}")
# while ! rsync --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" UV2000.mpi-bremen.de:/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} . ;do sleep 5;done
# done
#ME=~/BGC/input/${NAM}_ME_shotgun_workable_merged.fastq.gz
#SE=~/BGC/input/${NAM}_SE_shotgun_workable_merged.fastq.gz
#SR=${RES}/${NAM}.SR.fasta
#~/opt/bbmap/reformat.sh in=${ME} out=${RES}/tmp.me.fasta maxns=0
#~/opt/bbmap/reformat.sh in=${SE} out=${RES}/tmp.se.fasta maxns=0
#cat ${RES}/tmp.me.fasta ${RES}/tmp.se.fasta > ${SR}
#rm ${RES}/tmp.{me,se}.fasta
#ORFS=${RES}/${NAM}.orfs
#${FGS} -genome="${SR}" -out=${ORFS} -complete=0 -train=illumina_10 -thread=${NSLOTS}
#${FGS} -s "${SR}" -m 24000 -o ${ORFS} -e 0 -d 0 -w 0 -r /home/mpi45770/opt/FragGeneScanPlus/train -t illumina_10 -p ${NSLOTS}
ORFS=/bioinf/projects/osd/main/2014/06/analysis-results/NOGs/input/assemblies/${NAM}_spades.orfs.aa.fasta
# BGCs
#${UPROC} -z ${RES}/${NAM}.bgc.gz -p -t ${NSLOTS} ${BGCDB} ${UPROCM} ${ORFS}
# PFAM
#${UPROC} -z ${RES}/${NAM}.pfam27.gz -p -t ${NSLOTS} ${PFAMDB} ${UPROCM} ${ORFS}
# KEGG
#${UPROC} -z ${RES}/${NAM}.kegg_20140317.gz -p -t ${NSLOTS} ${KEGGDB} ${UPROCM} ${ORFS}
# RESFAMS
#NOGNUM=1
if [ -e ${NOGLOG} ]; then
if [ ! -s ${NOGLOG} ]; then
NOGNUM=1
else
NOGNUM=$(cat ${NOGLOG})
fi
else
touch ${NOGLOG}
NOGNUM=1
fi
for ((i=${NOGNUM}; i<=${NUMNOGS}; i++)); do
NOG=$(awk -v L=${i} 'NR==L' /bioinf/projects/osd/main/2014/06/analysis-results/NOGs/input/NOG_profiles.txt)
${HMM3} --cut_ga --cpu ${NSLOTS} --domtblout ${RES}/tmp.nog ${NOGDB}/${NOG} ${ORFS} > ${NOGOUT}.log 2>&1 # 1> /dev/null 2>&1;
cat ${RES}/tmp.nog >> ${NOGOUT}
rm ${RES}/tmp.nog
echo ${i} > ${NOGLOG}
done
# compress genes
gzip ${NOGOUT} &
wait
echo "DONE"
#exit
# run uproc using the models
#zcat ${RES}/${NAM}.bgc.gz | cut -f 2 -d ',' | sort -u > ${RES}/${NAM}.bgc.contigs.id.txt
#${BBMAP} in=${ASS} names=${RES}/${NAM}.bgc.contigs.id.txt out=${RES}/${NAM}.bgc.contigs.fasta.gz include=t
# Combine all runs and filter non-overlapping hits with evalue <= 1e-5 and coverage >= 0.4
# 233992194 hits
# Parsed & filtered using parse_hmmsearch.sh
# OSD2014.NOG4.good.1e-5.c04.txt.gz: 25840271
zcat ../../results/OSD*/*NOG4.tsv.gz | mawk '$0 !~ /#/{print}' > OSD2014.NOG4.txt
bash parse_hmmsearch.sh OSD2014.NOG4.txt 1e-5 0.4 | gzip > OSD2014.NOG4.good.1e-5.c04.txt.gz
## OLD STUFF
# Filter hits with e-value > 1e-5
# Filtered hits: 82629431
mawk '$7 <= 1e-5{print}' OSD2014.NOG4.txt > OSD2014.NOG4.1e-5.txt
# Sort hits by evalue and gene name
~/opt/coreutils/bin/sort -S50% --parallel=8 -k1,1V -k7,7g OSD2014.NOG4.1e-5.txt > OSD2014.NOG4.sorted.1e-5.txt
# Filter the hits that are not at 90% evalue of the best hits and we keep one if there are multiple hits of the same eggNOG, we also add the sample label in the first column
# 5945666 multiple hits
# 5668895 unique combination hits-gene_id
# 4958765 unique gene_ids
mawk -f 1rlca_filter_best_0.9_evalue.awk OSD2014.NOG4.sorted.1e-5.txt | mawk '!a[$1$4]++' | mawk '{split($1,a,"_spades"); print a[1]"\t"$0}' > OSD2014.NOG4.sorted.1e-5.bh.txt
# Get OSD NOGs abundances
LC_ALL=C zgrep -F -f <(awk '{print $2'} <(zcat OSD2014.NOG4.sorted.1e-5.bh.short.txt) | sort --parallel=8 -S20G -u) $OSDANAL/RESISTOME/OSD_orfs_abundances.txt.gz > OSD2014.NOG4.abundance.1e-5.bh.txt
#!/usr/bin/env sh
# Parse hmmsearch output removing overlaping hits and filters by evalue and coverage
# $1 file name
# $2 evalue
# $3 coverage
LC_ALL=C mawk -vE="${2}" -vT="${3}" 'BEGIN{OFS="\t"}{if ($0 !~ "#"){C=($17-$16+1)/$6; if ($13 <= E && C >= T){print $1,$3,$4,$6,$12,$16,$17,$18,$19,C}}}' $1 | \
LC_ALL=C sort --parallel 16 -S20% -k 1,1 -k 8n -k 9n | \
perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}'
# Run HMMER3 on mpi mode
mpirun $MPI_HOSTS ~/opt/hmmer-3.1b2-mpi/bin/hmmsearch --mpi --noali --cut_ga --domtblout /data/oerc-microb3/afernandezguerra/NOGs/results/TARA.NOG4.new.txt /data/oerc-microb3/afernandezguerra/NOGs/input/NOG.hmm.new /data/oerc-microb3/afernandezguerra/NOGs/input/OM-RGC_seq.release.faa > /data/oerc-microb3/afernandezguerra/NOGs/results/all.nog.new.log 2>&1
# Analyse results get the ones from the output
~/opt/mawk/bin/mawk '{if($0 !~ /#/){print $4}}' ../results/TARA.NOG4.txt | ~/opt/coreutils/bin/sort --parallel=16 -u -T . > NOG_processed_out.txt
# Analyse the ones from the logs
LC_ALL=C grep -F 'Query:' ../results/all.nog.log > NOG_processed.log.txt
# Get the ones that failed when reached the queue limit
LC_ALL=C grep -F -f <(sort NOG_processed* | uniq -u) NOG_processed_out.txt
# the ones that had to be repeated and removed from the first run
# NOG.ENOG4111HRN.meta_raw
# NOG.ENOG4111HRP.meta_raw
# NOG.ENOG4111HRQ.meta_raw
# NOG.ENOG4111HRR.meta_raw
# NOG.ENOG4111HRS.meta_raw
# NOG.ENOG4111HRT.meta_raw
# NOG.ENOG4111HRU.meta_raw
# Let's remove them from the first run: all.nog.out.txt
LC_ALL=C grep -v -F -f hmm_to_rm.txt ../results/TARA.NOG4.txt > TARA.NOG4.out.good.txt
# Combine both runs and filter non-overlapping hits with evalue <= 1e-5 and coverage >= 0.4
# 1178615749 all.nog.out.good.txt
# 384015652 all.nog.out.new.txt
# Combined runs: 1562631401 hits
# Parsed & filtered using parse_hmmsearch.sh
# TARA.NOG4.good.1e-5.c04.txt.gz: 100756023
# TARA.NOG4.new.1e-5.c04.txt.gz: 45642297
# Combined: 146398320
bash parse_hmmsearch.sh <(zcat TARA.NOG4.good.txt.gz) 1e-5 0.4 | gzip > TARA.NOG4.good.1e-5.c04.txt.gz
bash parse_hmmsearch.sh <(zcat TARA.NOG4.new.txt.gz) 1e-5 0.4 | gzip > TARA.NOG4.new.1e-5.c04.txt.gz
### OLD stuff
# Filtered hits: 407820829
cat TARA.NOG4.good.txt TARA.NOG4.out.new.txt | ~/opt/mawk/bin/mawk '{if($0 !~ /#/ && $7 <= 1e-5){ print}}' > TARA.NOG4.1e-5.txt
# We sort the results by evalue and gene name
~/opt/coreutils/bin/sort -S90G --parallel=8 -k1,1V -k7,7g TARA.NOG4.1e-5.txt > TARA.NOG4.sorted.1e-5.txt
# Filter the hits that are not at 90% evalue of the best hits and we keep one if there are multiple hits of the same eggNOG
# 30549490 multiple hits
# 28727237 unique combination hit-gene_id
# 25384430 unique gene_ids
mawk -f 1rlca_filter_best_0.9_evalue.awk TARA.NOG4.sorted.1e-5.txt | mawk '!a[$1$4]++' TARA.NOG4.sorted.1e-5.bh.txt > TARA.NOG4.sorted.1e-5.bh.txt
{
if ($1 in array){
F=array[$1]
O=(log($7)/log(10));
if (O <= F){
print $0;
}
}else{
O=0.9*(log($7)/log(10));
array[$1]=O;
print $0;
}
}