genomewalker
2/4/2016 - 1:43 PM

OSD_TARA_OM_NOGS

OSD_TARA_OM_NOGS

#!/bin/bash
set -x
set -e
set -o pipefail
set -o errexit
set -o errtrace
set -o nounset

# Run uproc and extract contigs

declare -r NAM='NOM'
declare -r HMM3=/bioinf/software/hmmer/hmmer-3.1b2/bin/hmmsearch

declare -r NOGDB=/local/biodb/hmmer3/NOG_hmm

declare RES=/bioinf/projects/osd/main/2014/06/analysis-results/NOGs/results/${NAM}

declare -r NOGOUT=${RES}/${NAM}.NOG4.tsv
declare -r NUMNOGS=190648
declare -r NOGLOG=${RES}/hmm.log

declare -r NSLOTS=${NSLOTS}

if [ ! -e ${NOGLOG} ]; then
    # create result folder
    [[ -d ${RES} ]] && rm -rf ${RES}
    mkdir -p ${RES}
else
    echo "Resuming NOG search..."
fi

cd ${RES}

# Get read files and QC them with trimmomatic
#declare -r AFILE=/data/oerc-microb3/afernandezguerra/BGC/input/assem-file-name2url.txt

#get read files
#RFILES=$(grep ${NAM} ${AFILE})

#How many files do we have
#NRFILE=$(echo "${RFILES}" | wc -l)


#echo "${RFILES}" | \
# cut -f 2 -d ' ' | \
# while read LINE
# do
#       N=$(basename "${LINE}")
#       while ! rsync --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" UV2000.mpi-bremen.de:/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} . ;do sleep 5;done
# done


#ME=~/BGC/input/${NAM}_ME_shotgun_workable_merged.fastq.gz
#SE=~/BGC/input/${NAM}_SE_shotgun_workable_merged.fastq.gz

#SR=${RES}/${NAM}.SR.fasta

#~/opt/bbmap/reformat.sh in=${ME} out=${RES}/tmp.me.fasta maxns=0
#~/opt/bbmap/reformat.sh in=${SE} out=${RES}/tmp.se.fasta maxns=0

#cat ${RES}/tmp.me.fasta ${RES}/tmp.se.fasta > ${SR}

#rm ${RES}/tmp.{me,se}.fasta
#ORFS=${RES}/${NAM}.orfs

#${FGS} -genome="${SR}" -out=${ORFS} -complete=0 -train=illumina_10 -thread=${NSLOTS}
#${FGS} -s "${SR}" -m 24000 -o ${ORFS} -e 0 -d 0 -w 0 -r /home/mpi45770/opt/FragGeneScanPlus/train -t illumina_10 -p ${NSLOTS}

ORFS=/bioinf/projects/osd/main/2014/06/analysis-results/NOGs/input/assemblies/${NAM}_spades.orfs.aa.fasta

# BGCs
#${UPROC} -z ${RES}/${NAM}.bgc.gz -p -t ${NSLOTS} ${BGCDB} ${UPROCM} ${ORFS}

# PFAM
#${UPROC} -z ${RES}/${NAM}.pfam27.gz -p -t ${NSLOTS} ${PFAMDB} ${UPROCM} ${ORFS}

# KEGG
#${UPROC} -z ${RES}/${NAM}.kegg_20140317.gz -p -t ${NSLOTS} ${KEGGDB} ${UPROCM} ${ORFS}

# RESFAMS
#NOGNUM=1

if [ -e ${NOGLOG} ]; then
    if [ ! -s ${NOGLOG} ]; then
        NOGNUM=1
    else
        NOGNUM=$(cat ${NOGLOG})
    fi
else
    touch ${NOGLOG}
    NOGNUM=1
fi

for ((i=${NOGNUM}; i<=${NUMNOGS}; i++)); do
    NOG=$(awk -v L=${i} 'NR==L' /bioinf/projects/osd/main/2014/06/analysis-results/NOGs/input/NOG_profiles.txt)
    ${HMM3} --cut_ga --cpu ${NSLOTS} --domtblout ${RES}/tmp.nog ${NOGDB}/${NOG} ${ORFS} > ${NOGOUT}.log 2>&1 # 1> /dev/null 2>&1;
    cat ${RES}/tmp.nog >> ${NOGOUT}
    rm ${RES}/tmp.nog
    echo ${i} > ${NOGLOG}
done

# compress genes
gzip ${NOGOUT} &

wait

echo "DONE"
#exit

# run uproc using the models




#zcat ${RES}/${NAM}.bgc.gz | cut -f 2 -d ',' | sort -u > ${RES}/${NAM}.bgc.contigs.id.txt

#${BBMAP} in=${ASS} names=${RES}/${NAM}.bgc.contigs.id.txt out=${RES}/${NAM}.bgc.contigs.fasta.gz include=t
# Combine all runs and filter non-overlapping hits with evalue <= 1e-5 and coverage >= 0.4
# 233992194 hits
# Parsed & filtered using parse_hmmsearch.sh
# OSD2014.NOG4.good.1e-5.c04.txt.gz: 25840271
zcat ../../results/OSD*/*NOG4.tsv.gz  | mawk '$0 !~ /#/{print}' > OSD2014.NOG4.txt
bash parse_hmmsearch.sh OSD2014.NOG4.txt 1e-5 0.4 | gzip > OSD2014.NOG4.good.1e-5.c04.txt.gz

## OLD STUFF
# Filter hits with e-value > 1e-5
# Filtered hits: 82629431
mawk '$7 <= 1e-5{print}' OSD2014.NOG4.txt > OSD2014.NOG4.1e-5.txt
# Sort hits by evalue and gene name 
~/opt/coreutils/bin/sort -S50% --parallel=8 -k1,1V -k7,7g OSD2014.NOG4.1e-5.txt > OSD2014.NOG4.sorted.1e-5.txt

# Filter the hits that are not at 90% evalue of the best hits and we keep one if there are multiple hits of the same eggNOG, we also add the sample label in the first column
# 5945666 multiple hits
# 5668895 unique combination hits-gene_id
# 4958765 unique gene_ids
mawk -f 1rlca_filter_best_0.9_evalue.awk OSD2014.NOG4.sorted.1e-5.txt | mawk '!a[$1$4]++' | mawk '{split($1,a,"_spades"); print a[1]"\t"$0}' > OSD2014.NOG4.sorted.1e-5.bh.txt

# Get OSD NOGs abundances
LC_ALL=C zgrep -F -f <(awk '{print $2'} <(zcat OSD2014.NOG4.sorted.1e-5.bh.short.txt) | sort --parallel=8 -S20G -u) $OSDANAL/RESISTOME/OSD_orfs_abundances.txt.gz > OSD2014.NOG4.abundance.1e-5.bh.txt
#!/usr/bin/env sh
# Parse hmmsearch output removing overlaping hits and filters by evalue and coverage
# $1 file name
# $2 evalue
# $3 coverage
LC_ALL=C mawk -vE="${2}" -vT="${3}" 'BEGIN{OFS="\t"}{if ($0 !~ "#"){C=($17-$16+1)/$6; if ($13 <= E && C >= T){print $1,$3,$4,$6,$12,$16,$17,$18,$19,C}}}' $1 | \
        LC_ALL=C sort --parallel 16 -S20% -k 1,1 -k 8n -k 9n | \
        perl -e 'while(<>){chomp;@a=split;next if $a[-1]==$a[-2];push(@{$b{$a[2]}},$_);}foreach(sort keys %b){@a=@{$b{$_}};for($i=0;$i<$#a;$i++){@b=split(/\t/,$a[$i]);@c=split(/\t/,$a[$i+1]);$len1=$b[-1]-$b[-2];$len2=$c[-1]-$c[-2];$len3=$b[-1]-$c[-2];if($len3>0 and ($len3/$len1>0.5 or $len3/$len2>0.5)){if($b[4]<$c[4]){splice(@a,$i+1,1);}else{splice(@a,$i,1);}$i=$i-1;}}foreach(@a){print $_."\n";}}'
# Run HMMER3 on mpi mode

mpirun $MPI_HOSTS ~/opt/hmmer-3.1b2-mpi/bin/hmmsearch --mpi --noali --cut_ga --domtblout /data/oerc-microb3/afernandezguerra/NOGs/results/TARA.NOG4.new.txt /data/oerc-microb3/afernandezguerra/NOGs/input/NOG.hmm.new /data/oerc-microb3/afernandezguerra/NOGs/input/OM-RGC_seq.release.faa > /data/oerc-microb3/afernandezguerra/NOGs/results/all.nog.new.log 2>&1

# Analyse results get the ones from the output
~/opt/mawk/bin/mawk '{if($0 !~ /#/){print $4}}' ../results/TARA.NOG4.txt | ~/opt/coreutils/bin/sort --parallel=16 -u -T . > NOG_processed_out.txt

# Analyse the ones from the logs
LC_ALL=C grep -F 'Query:' ../results/all.nog.log > NOG_processed.log.txt

# Get the ones that failed when reached the queue limit
LC_ALL=C grep -F -f <(sort NOG_processed* | uniq -u) NOG_processed_out.txt

# the ones that had to be repeated and removed from the first run
# NOG.ENOG4111HRN.meta_raw
# NOG.ENOG4111HRP.meta_raw
# NOG.ENOG4111HRQ.meta_raw
# NOG.ENOG4111HRR.meta_raw
# NOG.ENOG4111HRS.meta_raw
# NOG.ENOG4111HRT.meta_raw
# NOG.ENOG4111HRU.meta_raw

# Let's remove them from the first run: all.nog.out.txt
LC_ALL=C grep -v -F -f hmm_to_rm.txt ../results/TARA.NOG4.txt > TARA.NOG4.out.good.txt

# Combine both runs and filter non-overlapping hits with evalue <= 1e-5 and coverage >= 0.4
# 1178615749 all.nog.out.good.txt
# 384015652 all.nog.out.new.txt
# Combined runs: 1562631401 hits
# Parsed & filtered using parse_hmmsearch.sh
# TARA.NOG4.good.1e-5.c04.txt.gz: 100756023
# TARA.NOG4.new.1e-5.c04.txt.gz: 45642297
# Combined: 146398320
bash parse_hmmsearch.sh <(zcat TARA.NOG4.good.txt.gz) 1e-5 0.4 | gzip > TARA.NOG4.good.1e-5.c04.txt.gz
bash parse_hmmsearch.sh <(zcat TARA.NOG4.new.txt.gz) 1e-5 0.4 | gzip > TARA.NOG4.new.1e-5.c04.txt.gz

### OLD stuff
# Filtered hits: 407820829
cat TARA.NOG4.good.txt TARA.NOG4.out.new.txt | ~/opt/mawk/bin/mawk '{if($0 !~ /#/ && $7 <= 1e-5){ print}}' > TARA.NOG4.1e-5.txt

# We sort the results by evalue and gene name
~/opt/coreutils/bin/sort -S90G --parallel=8 -k1,1V -k7,7g TARA.NOG4.1e-5.txt > TARA.NOG4.sorted.1e-5.txt

# Filter the hits that are not at 90% evalue of the best hits and we keep one if there are multiple hits of the same eggNOG
# 30549490 multiple hits
# 28727237 unique combination hit-gene_id
# 25384430 unique gene_ids
mawk -f 1rlca_filter_best_0.9_evalue.awk TARA.NOG4.sorted.1e-5.txt | mawk '!a[$1$4]++' TARA.NOG4.sorted.1e-5.bh.txt > TARA.NOG4.sorted.1e-5.bh.txt
{
  if ($1 in array){
         F=array[$1]
         O=(log($7)/log(10));
          if (O <= F){
            print $0;
          }
  }else{
          O=0.9*(log($7)/log(10));
          array[$1]=O;
          print $0;
  }
}