OSD2014 exclusive ORFs to OM-RGC
BEGIN {
RS="//\n";
ORS=""
}
{
if ($0~/CDS/ && $0~/isolation_source/){
split($0,a,"\n");
L=length(a);
for (i = 1; i <= L; i++){
if (a[i] ~ /^VERSION/) {
split(a[i], b, "GI:")
print b[2]
} else if (a[i] ~ "/isolation_source"){
o=i
gsub(" ", "", a[o])
print "\t"a[o]
while (a[o] !~ /"$/){
o++
gsub(" ", "", a[o])
print " "a[o]
}
} else if (a[i] ~ "protein_id"){
gsub(" /", "", a[i])
print "\t"a[i];
} else if (a[i] ~ "db_xref"){
gsub(" /", "", a[i])
print "\t"a[i];
}
}
print "\n";
}
}
# In: /bioinf/projects/osd/main/2014/06/analysis-results/OSD-GC
# Number of ORFs in the OM-RGC
# 40154822
LC_ALL=C grep -F -c '>' OM-RGC_seq.release.fasta
# Number of ORFs in the OSD
# 7015383 orfs
LC_ALL=C grep -F -c '>' OSD-GC.orfs.nt.fasta
# We need to remove OSD10
# 66467 orfs removed
# 6948916 total
grep OSD10_ OSD-GC.orfs.fasta | tr -d '>' > OSD2014toremove.txt
filterbyname.sh in=OSD2014-GC.orfs.fasta out=OSD2014-GC.orfs.marine.fasta names=OSD2014toremove.ids.txt include=f
# To create a RGC we cluster at 95% and filter orfs < 100nt
# 4626720 orfs
cd-hit-para.pl -i OSD2014-GC.orfs.marine.fasta -o OSD2014-GC.orfs.marine.95.fasta --P cd-hit-est --B hosts --S 16 --Q 32 --R OSD2014-GC.orfs.marine.95.restart -T SGE "-c 0.95 -T 4 -M 0 -G 0 -aS 0.9 -g 1 -r 1 -d 0"
# 4468990 orfs
reformat.sh in=OSD2014-GC.orfs.marine.95.fasta out=OSD2014-RGC.fasta.gz minlen=100
# Overlap of OSD and the OM-RGC
# We use cd-hit-2d to incrementally cluster OSD to OM-RGC
./cd-hit-2d-para.pl -i OM-RGC_seq.release.fasta -i2 OSD2014-GC.orfs.marine.fasta -o OSD2014-GC.exclusive --P cd-hit-est --B hosts --S 64 --S2 32 --Q 32 --R OSD2014-GC.exclusive.restart -T SGE "-c 0.95 -T 4 -M 0 -G 0 -aS 0.9 -g 1 -r 1 -d 0"
# 4710976 from 6948916 (67.8%) orfs not included in the OM-RGC
# We cluster at 95% and remove < 100nt
# 3736006 orfs
cd-hit-para.pl -i OSD2014-GC.exclusive.fasta -o OSD2014-GC.exclusive.95.fasta --P cd-hit-est --B hosts --S 16 --Q 32 --R OSD2014-GC.exclusive.95.restart -T SGE "-c 0.95 -T 4 -M 0 -G 0 -aS 0.9 -g 1 -r 1 -d 0"
# 3607637 orfs
reformat.sh in=OSD2014-GC.exclusive.95.fasta out=OSD2014-GC.to.OM-RGC.fasta.gz minlen=100
# combine all results in one file.
# 18529116 results
# Blastp fields -outfmt "6 qseqid sseqid evalue bitscore length sskingdoms"
cat ../../results/OSD-BLAST.aa.*.txt > OSD2014-RGC-BLASTp.aa.nr.exclusive.out
# sort by OSD_gene_id and evalue
LC_ALL=C ~/opt/coreutils/bin/sort --parallel=12 -k1,1V -k3,3g OSD2014-RGC-BLASTp.aa.nr.exclusive.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.out
# Set a log-evalue threshold is set, which by default is equal to 90% of the log-E-value of the best hit for that protein
# From https://www.microb3.eu/sites/default/files/deliverables/MB3_D5_8_PU.pdf
# 10880436 results
~/opt/mawk/bin/mawk -f 1rlca_filter_best_0.9_evalue.awk OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out
# Prepare data for LCA, we will use https://github.com/emepyc/Blast2lca
awk '{print $1"\t"$2"\t0\t0\t0\t0\t0\t0\t0\t0\t"$3"\t"$4}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.forlca.out
# Run blast2lca to get the LCA from our blastp results
gitaxid2bin ../../results/tax_db/
blast2lca --nprocs 8 -savemem -dict gi_taxid.bin -nodes tax_db/nodes.dmp -names tax_db/names.dmp -levels=superkingdom:phylum:class:family:genus OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.forlca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out
# There are some weird sequences (8) that hasn't been parsed properly
# When:
sort <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out) <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.forlca.out| sort -u ) | uniq -u | less
# 07_orf-2
# .1|
# 2014-06-24_3m_NPL022_spades_contig-4224_-_3_500_orf-1
# 7_+_1_531_orf-1
# O
# OSD155_2014-06-21_1m_NPL022_spades
# OSD65_2014-06-20_0.1m_NPL022_spad
# OSD155_2014-06-21_1m_NPL022_spades_contig-8671_+_1_975_orf-1 (Not processed)
# There are 3628796 orfs shared between the unique input orfs and blast2lca output
# We remove the bad ones
# Get the ones from Bacteria domain
# 2987771
~/opt/mawk/bin/mawk 'BEGIN{FS="\t"}$4 ~ /Bacteria;/{print $1"\tBacteria"}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.bacteria.out
# Get the ones from Archaea domain
# 124492
~/opt/mawk/bin/mawk 'BEGIN{FS="\t"}$4 ~ /Archaea;/{print $1"\tArchaea"}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.archaea.out
# Get the ones from Eukaryota domain
# 131418
~/opt/mawk/bin/mawk 'BEGIN{FS="\t"}$4 ~ /Eukaryota;/{print $1"\tEukaryota"}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.eukaryota.out
# Get the ones from Viruses domain
# 263131
~/opt/mawk/bin/mawk 'BEGIN{FS="\t"}$4 ~ /Viruses;/{print $1"\tViruses"}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.viruses.out
# Get the ones not classified
# 121984
LC_ALL=C grep -F -f <(sort --parallel=12 <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.{archaea,bacteria,eukaryota,viruses}.out) <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out) | uniq -u | cut -f1) OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.out > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.unclassified.out
# We will get the best hit for the ones not classified and get the Kingdom from the NR db
export BLASTDB=/data/oerc-microb3/afernandezguerra/BLASTDB:$BLASTDB
~/opt/ncbi-blast-2.2.31+/bin/blastdbcmd -db NR -entry_batch <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++' | cut -f2 -d '|') -outfmt "%g|%K" -target_only | tr "|" $'\t' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out
# We add the results to the existing files
# For Bacteria
# 3013700
G=Bacteria; LC_ALL=C grep -w -F -f <(grep ${G} OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out | cut -f 1) <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++') | awk -v O=${G} '{print $1"\t"O}' >> OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.bacteria.out
# For Archaea
# 128856
G=Archaea; LC_ALL=C grep -w -F -f <(grep ${G} OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out | cut -f 1) <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++') | awk -v O=${G} '{print $1"\t"O}' >> OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.archaea.out
# For Eukaryota
# 199359
G=Eukaryota; LC_ALL=C grep -w -F -f <(grep ${G} OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out | cut -f 1) <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++') | awk -v O=${G} '{print $1"\t"O}' >> OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.eukaryota.out
# For Viruses
# 273340
G=Viruses; LC_ALL=C grep -w -F -f <(grep ${G} OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out | cut -f 1) <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++') | awk -v O=${G} '{print $1"\t"O}' >> OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.viruses.out
# Non classified
# 13541
LC_ALL=C grep -w -F -f <(grep "N/A\|-" OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.gi2kingdom.out | cut -f 1) <(LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.root.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++') | awk -v O=${G} '{print $1"\tunclassified"}' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.unclassified.out
# How many sequences have a hit vs NR database
# Results at /data/oerc-microb3/afernandezguerra/EXCLUSIVE-BLAST
# Number of exclusive genes
# 4710976
grep -c '>' ../../input/OSD-GC.exclusive.aa.fasta
# Split the input file
~/opt/mawk/bin/mawk -vO=10000 'BEGIN {n_seq=0;partid=1;} /^>/ {if(n_seq%O==0){file=sprintf("OSD.exclusive.aa.%d",partid);partid++;} print >> file; n_seq++; next;} { print >> file; }' < OSD-GC.exclusive.aa.fasta
# And BLASTp
~/opt/ncbi-blast-2.2.31+/bin/blastp -query /data/oerc-microb3/afernandezguerra/EXCLUSIVE-BLAST/input/OSD.exclusive.aa.${SLURM_ARRAY_TASK_ID} -out /data/oerc-microb3/afernandezguerra/EXCLUSIVE-BLAST/results/OSD-BLAST.aa.${SLURM_ARRAY_TASK_ID}.txt -db NR -max_target_seqs 5 -num_threads 8 -outfmt "6 qseqid sseqid evalue bitscore length sskingdoms" -evalue 1e-5
# Number of hits
# 3628797
cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.out | sort -u | wc -l
# 77% with a hit
# Get only the archaeal,bacterial and viral fraction
# Total 3415896
# Unique 974821
LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.{archaea,bacteria,viruses}.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | grep -v 'Eukaryota' | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++' | cut -f2 -d '|' | ~/opt/coreutils/bin/sort --parallel=12 -u > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi.uniq
# Create entrez urls for curl
awk 'BEGIN{ORS=","}NR%500==0 {print "\n"} {print}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi.uniq | sed -e 's/,$//g' -e 's/^,//' | while read LINE; do echo "\"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=${LINE}&retmode=text&rettype=gp\""; done | awk '{print $1 " -o gi" NR "_prok.gp"}' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi.url
# Use xargs to get in a parallel fashion
# Don't abuse
# Retrieved 973858
xargs -P 16 curl -s < OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi.url
# Do the same for the eukaryotic
# Total 199359
# Unique 95786
# Retrieved 95063
LC_ALL=C grep -F -f <(cut -f1 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.eukaryota.out) <(cat OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.out) | grep -v 'Viruses\|Bacteria\|Archaea' | ~/opt/coreutils/bin/sort -k1,1V -k3,3g --parallel=12 | awk '!a[$1]++' | cut -f2 -d '|' | ~/opt/coreutils/bin/sort --parallel=12 -u > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi.uniq
awk 'BEGIN{ORS=","}NR%500==0 {print "\n"} {print}' OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi.uniq | sed -e 's/,$//g' -e 's/^,//' | while read LINE; do echo "\"http://eutils.ncbi.nlm.nih.gov/entrez/eutils/efetch.fcgi?db=protein&id=${LINE}&retmode=text&rettype=gp\""; done | awk '{print $1 " -o gi" NR "euk.gp"}' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi.url
xargs -P 16 curl -s < OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi.url
# Create folder for the donwloaded files and concatenate
mkdir GBfiles
mv *gp GBfiles/
cat GBfiles/*_prok.gp > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gp
cat GBfiles/*_euk.gp > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gp
# Linearize and select entries with isolation_source field
# Prok with isolation_source
# 134372 gis with isolation_source
~/opt/mawk/bin/mawk -f filteIS.awk OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gp | awk 'BEGIN{FS="\t"}{ gsub("/isolation_source=","",$2); gsub(/"/,"",$2); print $1"\t"$2}' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi2is.txt
# Get unique isolation_sources
# 2728 unique isolation_sources
cut -f2 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.gi2is.txt | sort -u > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.prok.is.uniq
# Euk with isolation_source
# 6833 gis with isolation_source
~/opt/mawk/bin/mawk -f filteIS.awk OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gp | awk 'BEGIN{FS="\t"}{ gsub("/isolation_source=","",$2); gsub(/"/,"",$2); print $1"\t"$2}' > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi2is.txt
# Get unique isolation_sources
# 203 unique isolation_sources
cut -f2 OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.gi2is.txt | sort -u > OSD2014-RGC-BLASTp.aa.nr.exclusive.sorted.evalue.filtered.1rlca.lca.euk.is.uniq
{
if ($1 in array){
F=array[$1]
O=(log($3)/log(10));
if (O <= F){
print $0;
}
}else{
O=0.9*(log($3)/log(10));
array[$1]=O;
print $0;
}
}