philippmuench
3/6/2018 - 10:58 AM

shell script to download all proteins from taxids listed in intersection_ids.txt

shell script to download all proteins from taxids listed in intersection_ids.txt

#!/bin/sh
#
# Download all protein sequences for the taxa listed in intersection_ids.txt.
#
# Input:   intersection_ids.txt, one "name;taxid" record per line.
# Output:  out/<name>.fasta for every taxon with fewer than max_threshold
#          proteins, where <name> is the record name with runs of blanks
#          replaced by a single "_".
# Log:     log.txt receives the "name;taxid" of every skipped taxon (too many
#          proteins, or the protein count could not be determined).
# Needs:   esearch and efetch (NCBI Entrez Direct), awk, sed, head.
#
# The script sticks to POSIX sh, so helper variables are global (no `local`).

max_threshold=20000             # taxa with this many proteins or more are skipped
input_file=intersection_ids.txt
skip_log=log.txt                # file the skipped taxa are written to

# Print field 1 of a "name;taxid" record as a file-name token: surrounding
# blanks (and a trailing CR) are removed, inner runs of blanks become one "_".
phylum_name_of() {
  printf '%s\n' "$1" | awk -F';' '{
    name = $1
    gsub(/^[ \t]+|[ \t\r]+$/, "", name)
    gsub(/[ \t]+/, "_", name)
    print name
  }'
}

# Print field 2 of a "name;taxid" record with all blanks and CRs removed.
taxid_of() {
  printf '%s\n' "$1" | awk -F';' '{
    id = $2
    gsub(/[ \t\r]/, "", id)
    print id
  }'
}

# Print the number inside the first <Count>N</Count> element of the text
# given as $1; prints nothing when there is no such element.
extract_count() {
  printf '%s\n' "$1" \
    | sed -n 's/.*<Count>\([0-9][0-9]*\)<\/Count>.*/\1/p' \
    | head -n 1
}

# Process every record of $input_file. Returns non-zero when the output
# directory cannot be created or the input file cannot be read.
main() {
  rm -f -- "$skip_log"
  mkdir -p out || return 1

  if [ ! -r "$input_file" ]; then
    printf 'error: cannot read %s\n' "$input_file" >&2
    return 1
  fi

  # `|| [ -n "$line" ]` keeps a final record that lacks a trailing newline.
  while IFS= read -r line || [ -n "$line" ]; do
    phylum_name=$(phylum_name_of "$line")
    txid=$(taxid_of "$line")

    # Silently ignore blank lines.
    if [ -z "$phylum_name" ] && [ -z "$txid" ]; then
      continue
    fi

    # Never query NCBI with a missing name or a non-numeric taxid.
    case $txid in
      '' | *[!0-9]*)
        printf 'warning: skipping malformed line: %s\n' "$line" >&2
        continue
        ;;
    esac
    if [ -z "$phylum_name" ]; then
      printf 'warning: skipping malformed line: %s\n' "$line" >&2
      continue
    fi

    printf '%s\n' "processing $phylum_name"

    # stdin is redirected from /dev/null so esearch cannot swallow the rest of
    # the input file that feeds this loop. The search result is kept so the
    # same search can be handed to efetch instead of being run a second time.
    query="txid${txid}[Organism:exp]"
    search_result=$(esearch -db protein -query "$query" < /dev/null)
    num_found=$(extract_count "$search_result")

    if [ -z "$num_found" ]; then
      printf 'warning: could not determine protein count for %s (taxid %s), skip\n' \
        "$phylum_name" "$txid" >&2
      printf '%s\n' "$phylum_name;$txid" >> "$skip_log"
      continue
    fi

    echo "checking"
    printf '%s\n' "$num_found"

    # Skip this phylum if there are too many proteins.
    if [ "$num_found" -lt "$max_threshold" ]; then
      echo "downloading..."
      if ! printf '%s\n' "$search_result" \
        | efetch -format fasta > "out/${phylum_name}.fasta"; then
        printf 'warning: download failed for %s (taxid %s)\n' \
          "$phylum_name" "$txid" >&2
      fi
    else
      printf '%s\n' "too many proteins found ($num_found), skip"
      printf '%s\n' "$phylum_name;$txid" >> "$skip_log"
    fi
  done < "$input_file"
}

main