genomewalker
1/25/2017 - 2:02 PM

mmseqs_cog_nog.sh

# Build an MMseqs2 sequence DB for every *.faa found below ../ and search it
# against bactArNOGDB. Paths are passed NUL-delimited and every expansion is
# quoted so file names containing blanks or backslashes survive intact.
find ../ -name '*.faa' -print0 |
  while IFS= read -r -d '' faa; do
    sample=$(basename "$faa" .faa)
    # One fresh scratch directory per sample. If it cannot be created, skip the
    # sample: an empty tmp argument would shift the remaining search arguments.
    tmp_dir=$(mktemp -d -p /scratch/antonio/tara_orfs/input_concat/1/NOG) || {
      printf 'mktemp failed, skipping %s\n' "$faa" >&2
      continue
    }
    # Do not search a DB that was never built.
    mmseqs createdb "$faa" "${sample}_DB" || {
      printf 'createdb failed, skipping %s\n' "$faa" >&2
      continue
    }
    mmseqs search "${sample}_DB" bactArNOGDB "${sample}_eggnog" "$tmp_dir" \
      --max-seqs 50 --threads 64 -a -e 1e-5
    # NOTE(review): the scratch directories are left in place, as before.
  done

# Post-process every search result with eggnog_par.sh, 24 jobs at a time.
find . -name '*_eggnog' -print0 |
  parallel -0 --progress -j24 ~/opt/scripts/eggnog_par.sh {}
# search
# Build an MMseqs2 sequence DB for every *fasta file below ../ncbi_cog/ and
# search it against bactArNOGDB. The sample name is the file name without the
# "_scaffold.aa.fasta" suffix. Paths are read NUL-delimited and all expansions
# are quoted so unusual file names are handled correctly.
find ../ncbi_cog/ -name '*fasta' -print0 |
  while IFS= read -r -d '' fasta; do
    sample=$(basename "$fasta" _scaffold.aa.fasta)
    # One fresh scratch directory per sample. If it cannot be created, skip the
    # sample: an empty tmp argument would shift the remaining search arguments.
    tmp_dir=$(mktemp -d -p /scratch/antonio/tara/) || {
      printf 'mktemp failed, skipping %s\n' "$fasta" >&2
      continue
    }
    # Do not search a DB that was never built.
    mmseqs createdb "$fasta" "${sample}_DB" || {
      printf 'createdb failed, skipping %s\n' "$fasta" >&2
      continue
    }
    mmseqs search "${sample}_DB" bactArNOGDB "${sample}_eggnog" "$tmp_dir" \
      --max-seqs 50 --threads 64 -a -e 1e-5
    # NOTE(review): the scratch directories are left in place, as before.
  done
# Extract BH: keep only the first line of every entry in the result DB
# (presumably the best hit per query -- confirm that entries are sorted).
mmseqs filterdb osd_ass_nog osd_ass_nog_bh --extract-lines 1

# Export the filtered result DB as a BLAST m8 table.
mmseqs convertalis osdAssemDB bactArNOGDB osd_ass_nog_bh osd_ass_nog_bh.m8

# Combine and get coverage:
#   1. take columns 7 and 10 straight from the filtered result DB file
#      (NOTE(review): presumably query and target length in the MMseqs2
#      internal alignment layout -- confirm),
#   2. append them line by line to the m8 table (becoming columns 13 and 14),
#   3. drop empty lines and rows with fewer than 14 columns,
#   4. keep rows whose column 11 is <= 1e-5 and append column 4 / column 13.
# This relies on both files listing the hits in the same line order.
cut -f 7,10 osd_ass_nog_bh \
  | paste osd_ass_nog_bh.m8 - \
  | sed '/^$/d' \
  | awk 'NF >= 14 && $11 <= 1e-5 { print $0 "\t" $4 / $13 }' \
  > osd_ass_nog_bh_1e-5_c0.tsv

# get NOG members
# Reshape NOG.members.tsv.gz into long format: columns 2, 5 and 6 are kept and
# the comma-separated list in column 6 is expanded to one output line per list
# element (col2 <TAB> col5 <TAB> element), sorted on the element column.
# awk and sort are told explicitly that the data is tab-separated, so an empty
# column or a blank inside a column cannot shift the fields, and the list is
# walked by index so elements are emitted in their original order.
zcat NOG.members.tsv.gz \
  | cut -f 2,5,6 \
  | awk -F '\t' -v OFS='\t' '{
      n = split($3, members, ",")
      for (i = 1; i <= n; i++) print $1, $2, members[i]
    }' \
  | sort -t $'\t' -k3,3 --parallel 32 -S25% \
  > NOG.members_long.tsv

# combine results and NOGs
# Join the best-hit table (column 2) with the long NOG member table (column 3).
# NOG.members_long.tsv must already be sorted on its third tab-separated column.
# Both sort and join use TAB as the field separator: the sort key is then
# exactly the field join compares (no leading blanks in the key), and the
# output stays tab-delimited like its inputs.
join -t $'\t' -1 2 -2 3 \
  <(sort -t $'\t' -k2,2 --parallel 32 -S25% osd_ass_nog_bh.m8) \
  NOG.members_long.tsv \
  > osd_ass_nog_bh_combined.m8

# combine results and cogs
# Join the filtered hit table (column 2) with the combined COG table (column 3)
# on tab-separated fields. The sorts use the same TAB separator as join, so the
# sort key is the very field join compares even when another column contains
# blanks.
join -t $'\t' -1 2 -2 3 \
  <(sort -t $'\t' --parallel 16 -S25% -k2,2 osd_all_assem_1e-5_c0.tsv) \
  <(sort -t $'\t' -k3,3 cog2003-2014_combined.tsv) \
  > osd_all_assem_1e-5_c0_cog.tsv