#!/bin/sh
# Download all protein sequences for the taxa listed in intersection_ids.txt.
#
# Usage: sh SCRIPT [INPUT_FILE]
#   INPUT_FILE  defaults to intersection_ids.txt. One taxon per line, written
#               as "<name>;<taxid>"; anything after a second ';' is ignored.
#
# Each taxon is looked up in the NCBI protein database with the EDirect tools
# esearch and efetch. Taxa with fewer than max_threshold proteins are saved as
# FASTA in out/<name>.fasta, where <name> has surrounding whitespace removed
# and inner whitespace runs replaced by '_'. Every taxon that is NOT
# downloaded (too many proteins, count lookup failed, download failed) is
# appended to log.txt as "<name>;<taxid>".
#
# The script is plain POSIX sh, so function variables are global.

max_threshold=20000 # taxa with this many proteins or more are skipped
log_file=log.txt    # taxa that were skipped or could not be downloaded
out_dir=out         # one FASTA file per downloaded taxon

# Print a diagnostic message on stderr.
warn() {
  printf '%s\n' "$*" >&2
}

# Print $1 with surrounding whitespace removed and inner runs turned into '_'.
normalize_name() {
  printf '%s\n' "$1" |
    sed -e 's/^[[:space:]]*//' \
      -e 's/[[:space:]]*$//' \
      -e 's/[[:space:]]\{1,\}/_/g'
}

# Read esearch XML on stdin and print the value of the first <Count> element.
# Prints nothing when no such element is present.
extract_count() {
  sed -n 's:.*<Count>\([0-9][0-9]*\)</Count>.*:\1:p' | head -n 1
}

# Append one "<name>;<taxid>" record to the log of taxa not downloaded.
log_skipped() {
  printf '%s;%s\n' "$1" "$2" >>"$log_file"
}

# Query one taxon and download its proteins unless it is too large.
#   $1 - file-safe taxon name
#   $2 - numeric NCBI taxonomy id
process_taxon() {
  phyla_name=$1
  txid=$2

  echo "processing $phyla_name"

  # stdin is /dev/null so esearch cannot swallow the caller's input file.
  # The result is kept and later piped into efetch, so every taxon costs a
  # single esearch request instead of two.
  search_xml=$(esearch -db protein -query "txid${txid}[Organism:exp]" \
    </dev/null)
  num_found=$(printf '%s\n' "$search_xml" | extract_count)

  echo "checking"
  echo "$num_found"

  # An empty count means esearch failed or returned something unexpected;
  # report that explicitly instead of letting the numeric test error out.
  if [ -z "$num_found" ]; then
    warn "could not determine protein count for $phyla_name (txid $txid), skip"
    log_skipped "$phyla_name" "$txid"
    return 0
  fi

  # Skip this taxon if there are too many proteins.
  if [ "$num_found" -ge "$max_threshold" ]; then
    echo "too many proteins found ($num_found), skip"
    log_skipped "$phyla_name" "$txid"
    return 0
  fi

  echo "downloading..."
  if ! printf '%s\n' "$search_xml" |
    efetch -format fasta >"$out_dir/$phyla_name.fasta"; then
    warn "download failed for $phyla_name (txid $txid)"
    log_skipped "$phyla_name" "$txid"
  fi
}

# Entry point: validate the environment, then process every input line.
#   $1 - optional input file (default: intersection_ids.txt)
# Returns 1 when the input file or the EDirect tools are missing.
main() {
  input_file=${1:-intersection_ids.txt}

  if [ ! -r "$input_file" ]; then
    warn "cannot read input file: $input_file"
    return 1
  fi
  for tool in esearch efetch; do
    if ! command -v "$tool" >/dev/null 2>&1; then
      warn "required EDirect tool not found: $tool"
      return 1
    fi
  done

  rm -f -- "$log_file" # start every run with a fresh log
  mkdir -p -- "$out_dir" || return 1

  # The '|| [ -n ... ]' part keeps a final line without a trailing newline.
  while IFS=';' read -r raw_name raw_txid ignored ||
    [ -n "$raw_name$raw_txid" ]; do
    clean_name=$(normalize_name "$raw_name")
    clean_txid=$(printf '%s' "$raw_txid" | tr -d '[:space:]')

    # Blank lines are skipped silently.
    [ -n "$clean_name$clean_txid" ] || continue

    # Anything without a name and a purely numeric taxid (e.g. a header
    # line) is reported and skipped rather than sent to NCBI as a query.
    case $clean_txid in
      '' | *[!0-9]*)
        warn "skipping malformed line: $raw_name;$raw_txid"
        continue
        ;;
    esac
    if [ -z "$clean_name" ]; then
      warn "skipping line without a name (txid $clean_txid)"
      continue
    fi

    process_taxon "$clean_name" "$clean_txid"
  done <"$input_file"
}

main "$@"