genomewalker
9/7/2016 - 9:16 AM

Get COGs from NCBI FTP

Get COGs from NCBI FTP

# Fetch the two COG 2003-2014 release files used by the steps below from the
# NCBI FTP site: the COG table (csv) and the gzipped protein FASTA.
wget \
    ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/cog2003-2014.csv \
    ftp://ftp.ncbi.nih.gov/pub/COG/COG2014/data/prot2003-2014.fa.gz

# Extract the member sequences of Patrick's COGs with filterbyname.sh from the
# BBMap package.
#
# prot2003-2014.ids is read below but no step in these notes creates it.
# NOTE(review): it is presumably the list of FASTA header lines of
# prot2003-2014.fa.gz (the `tr -d '>'` below implies the lines still carry the
# leading '>') -- confirm. It is built here only when missing or empty, so an
# existing list is left untouched.
[ -s prot2003-2014.ids ] || zcat prot2003-2014.fa.gz | LC_ALL=C grep '^>' > prot2003-2014.ids

for COG in COG0085 COG0468 COG0187 COG0226 COG0248 COG0855 COG2326 COG1702; do
    # Column 1 of every csv row that mentions this COG is used as a
    # fixed-string, whole-word pattern against the header list; the matching
    # lines, with '>' removed, become the name list for filterbyname.sh.
    LC_ALL=C grep -F -w -f <(grep "${COG}" cog2003-2014.csv | cut -f1 -d ',') prot2003-2014.ids \
        | tr -d '>' > "${COG}.ids"

    # include=true keeps the listed sequences instead of discarding them.
    ./bbmap/filterbyname.sh -in=prot2003-2014.fa.gz out="${COG}.fasta.gz" include=true names="${COG}.ids"
done

# Create a LAMBDA blastx index for each per-COG protein FASTA.
# Only the FASTA files are globbed: a bare COG* would also hand the COG*.ids
# name lists written by the extraction step (and, on a re-run, the
# COG_patrick_results* files) to lambda_indexer.
for i in COG*.fasta.gz; do
    # Without nullglob an unmatched pattern stays literal; skip it.
    [ -e "${i}" ] || continue
    ~/opt/lambda-v0.9.3/bin/lambda_indexer -p blastx -d "${i}"
done

# Submit the per-sample LAMBDA job script to SGE with qsub. Only the script
# for sample OSD1 is shown here.
# NOTE(review): presumably one such script is submitted for every OSD
# sample -- confirm.
qsub OSD1_2014-06-21_0m_NPL022_lambda_cogs.sh

# Concatenate all per-sample hit tables into one gzipped file, prefixing every
# row with two tab-separated columns: the COG id and the sample name.
for N in COG0085 COG0468 COG0187 COG0226 COG0248 COG0855 COG2326 COG1702; do
    # Match only the final merged outputs (<sample>_<COG>.blastx.m8.gz).
    # A loose "*${N}*" would also pick up leftover uncompressed
    # *_ME.blastx.m8 / *_SE.blastx.m8 intermediates, which zcat cannot read
    # and whose suffix basename would not strip.
    find results/ -name "*_${N}.blastx.m8.gz" | while IFS= read -r LINE; do
        # Sample name = file name without the _<COG>.blastx.m8.gz suffix.
        NAM=$(basename "${LINE}" "_${N}.blastx.m8.gz")
        # awk expands the "\t" escape in a -v assignment into a real tab.
        zcat "${LINE}" | awk -v L="${N}\t${NAM}" '{print L"\t"$0}'
    done
done | gzip -c > COG_patrick_results.txt.gz

# Keep one best hit per value of column 3: order the rows by column 3
# (version sort), then by column 13 in ascending general-numeric order, and
# let awk print only the first row seen for each column-3 value.
# NOTE(review): with the two label columns prepended, columns 3 and 13 are
# presumably the query id and e-value of standard BLAST tabular output --
# confirm.
zcat COG_patrick_results.txt.gz \
    | sort -S50% --parallel=8 -k3,3V -k13,13g \
    | awk '!seen[$3]++' > COG_patrick_results.bh.txt
#!/bin/bash
# Shell options for the job script: trace every command, stop at the first
# failing command or pipeline stage, let functions and subshells inherit the
# ERR trap, and treat references to unset variables as errors.
set -o xtrace
set -o errexit -o errtrace -o nounset -o pipefail

# Settings for the LAMBDA blastx searches of one OSD sample against the
# per-COG databases.

# Sample name; used as prefix of the input and output file names.
declare -r NAM="OSD1_2014-06-21_0m_NPL022"
# Thread count handed to lambda, taken from the environment.
# NOTE(review): presumably exported by SGE for the job -- confirm.
# Falls back to 1 so the script does not abort with an "unbound variable"
# error (nounset) when NSLOTS is absent.
declare -r NSLOTS="${NSLOTS:-1}"
# lambda executable.
declare -r PATHB="/home/mpi45770/opt/lambda-v0.9.3/bin/lambda"
# Directories holding the read files and receiving the hit tables.
declare -r INPUT="/home/mpi45770/COG_patrick/input"
declare -r RESULTS="/home/mpi45770/COG_patrick/results"

# The two read files (ME and SE) of the sample that are searched.
declare -r ME="${INPUT}/${NAM}_ME_shotgun_workable_merged.fastq.gz"
declare -r SE="${INPUT}/${NAM}_SE_shotgun_workable_merged.fastq.gz"

# COGs to search against; one lambda database per entry.
declare -ra COGS=(COG0085 COG0468 COG0187 COG0226 COG0248 COG0855 COG2326 COG1702)

# Search both read files of the sample against every COG database with lambda
# in blastx mode and keep the two hit tables as a single gzipped m8 file.

# Options shared by every search; query, database, output file and thread
# count are appended per call.
declare -a SEARCH_OPTS=(-e 1e-5 -so 5 -p blastx -nm 10)

for COG in "${COGS[@]}"; do
    declare DB="/home/mpi45770/biodb/lambda/${COG}.fasta.gz"
    declare OUT="${RESULTS}/${NAM}_${COG}"

    # Uncompressed, temporary hit tables for the ME and SE read files.
    MERES="${OUT}_ME.blastx.m8"
    SERES="${OUT}_SE.blastx.m8"

    "${PATHB}" "${SEARCH_OPTS[@]}" -q "${ME}" -d "${DB}" -o "${MERES}" -t "${NSLOTS}"
    "${PATHB}" "${SEARCH_OPTS[@]}" -q "${SE}" -d "${DB}" -o "${SERES}" -t "${NSLOTS}"

    # Pack both tables into one file (fastest gzip level), then remove the
    # uncompressed intermediates.
    cat "${MERES}" "${SERES}" | gzip -1c > "${OUT}.blastx.m8.gz"
    rm "${MERES}" "${SERES}"
done