genomewalker
9/16/2016 - 2:18 PM

HUMAnN analysis

HUMAnN analysis

# Update HUMAnN files
# Get module descriptions
curl -S http://rest.kegg.jp/list/module > kegg_modules_description_01092016.txt
# Get pathway descriptions
curl -S http://rest.kegg.jp/list/pathway > kegg_pathway_description_01092016.txt

mkdir KEGG_MODULE
cd KEGG_MODULE
cut -f1 ../kegg_modules_description_01092016.txt | cut -f2 -d ':' | ~/opt/parallel/bin/parallel -j 128 --progress ../getModule.sh
cd ..
for i in KEGG_MODULE/M*.txt; do NAM=$(basename $i .txt); awk -vL=$NAM '{if ($0 ~ /DEFINITION/){sub("DEFINITION  ", "");print L"\t"$0}}' $i ; done  |sed -e 's/-- //g' -e 's/ --//g'> modulep
sed -e 's/[^a-zA-Z0-9]/ /g' -e 's/ \+/\t/g' modulep > modulec
cp ../kegg_modules_description_01092016.txt map_kegg.txt

mkdir KEGG_PATH/
cd KEGG_PATH
cut -f1 ../kegg_pathway_description_01092016.txt | cut -f2 -d ':' | ~/opt/parallel/bin/parallel -j 128 --progress ../getPath.sh
cut -f1 ../kegg_pathway_description_01092016.txt | cut -f2 -d ':' |sed -e 's/map/ko/g' | ~/opt/parallel/bin/parallel -j 128 --progress ../getPath.sh
for i in ko0*txt; do NAM=$(basename $i .txt); sed -e 's/^ \+//g' $i | awk 'BEGIN{ORS="\t"}{if ($0 ~ /^K/){print $1}}' | sed -e 's/\t$//g' | awk -vL=$NAM '{if (NF > 1){print L"\t"$0}}'; done > ../path_kos.txt
mv ../path_kos.txt keggc
cp keggc pathwayc

# Modify keggXsample table to get names that fit HUMAnN expectations
awk 'BEGIN{FS="\t";OFS=FS}{if (NR == 1){for (i = 1; i <= NF; i++){if (i == 1){$i = "#OSD ID"}else{$i = i - 1; $i = "OSD"$i}}print $0}else{print $0}}' ../osd2014_koXsample_filt_raw_counts.tsv > osd2014_koXsample_filt_raw_counts_renamed.tsv
awk 'BEGIN{FS="\t";OFS="\t"}{if (NR == 1){for (i =1; i <= NF; i++){o=$i; $i = i-1; $i = o"\tOSD"$i"\n"}print $0}}' osd2014_koXsample_filt_raw_counts.tsv | tail -n+2 | awk '{print $1"\t"$2}' > conversion_table.txt

# And run HUMAnN
scons -j
#!/bin/bash
MODULE=${1}
curl -s http://rest.kegg.jp/get/${MODULE} > ${MODULE}.txt
#set -x
DATE="${3:-$(date "+%Y_%m_%d")}"
RESULTS="kegg_"${DATE}
DATA=${RESULTS}/data
GENOMES=${RESULTS}/genomes
GENES=${RESULTS}/genes

GENOME_FILE=${DATA}/kegg_genome_list_${DATE}.txt

function ProgressBar {
# Process data
    let _progress=(${1}*100/${2}*100)/100
    let _done=(${_progress}*4)/10
    let _left=40-$_done
# Build progressbar string lengths
    _fill=$(printf "%${_done}s")
    _empty=$(printf "%${_left}s")

# Build progressbar strings and print the ProgressBar line
printf "\r[${_fill// /#}${_empty// /-}] ${_progress}%%"
}

function join_by {
    local IFS="$1"
    shift
    echo "$*"
}

[[ -d ${RESULTS} ]] && printf "Folder ${RESULTS} found. Removing data..."; find ${RESULTS} -delete; printf "done\n"
mkdir -p ${GENOMES} ${GENES} ${DATA}

# Get all genomes list
printf "Retrieving genome list..."
curl -sS http://rest.kegg.jp/list/genome > ${GENOME_FILE}
printf "done.\n"
# Get all genes for each genomes

# read genomes to an array
mapfile -t genomes < <(cut -f1 ${GENOME_FILE})

_start=1
_end=$(echo ${#genomes[@]})
# Proof of concept
#printf "Retrieving all genomes in KEGGdb\n"
for genome in $(seq ${_start} ${_end})
do
    curl -sS http://rest.kegg.jp/list/${genomes[genome]/genome:} > ${GENOMES}/${genomes[genome]/genome:}.txt
    #ProgressBar ${genome} ${_end}
#done
#printf "\n"


#for genome in $(seq ${_start} ${_end})
#do
    printf "Retrieving all genes in ${genomes[genome]/genome:}\n"
    mapfile -t genes < <(cut -f1 ${GENOMES}/${genomes[genome]/genome:}.txt)
    _start=1
    _end=$(echo ${#genes[@]})
    # for i in $(seq 0 ${#genes[@]}); do
    #   curl -sS http://rest.kegg.jp/get/${genes[$i]} >> ${GENES}/${genomes[genome]/genome:}_genes.txt
    #   ProgressBar ${i} ${_end}
    # done

    BLOCK=10
    NGENES=$(echo ${#genes[@]})
    DIV=$(expr ${NGENES} / ${BLOCK})
    MOD=$(expr ${NGENES} % ${BLOCK})

    if [[ ${DIV} > 0 ]]; then
        for d in $(seq 1 ${DIV}); do
            start=$(expr ${d} \* ${BLOCK} \- ${BLOCK})
            SLICE=$(join_by "+" "${genes[@]:${start}:${BLOCK}}")
            curl -sS http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt &
            ProgressBar ${d} ${DIV}
        done
        wait
        start=$(expr ${d} \* ${BLOCK})
        end=$(expr ${MOD})
        SLICE=$(join_by + "${genes[@]:${start}:${end}}")
        curl -sS http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt
    else
        d=0
        SLICE=$(join_by + "${genes[@]:0:${NGENES}}")
                curl -S http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt
    fi
    cat ${GENES}/${genomes[genome]/genome:}_gene_*.txt > ${GENES}/${genomes[genome]/genome:}_genes.txt
    rm ${GENES}/${genomes[genome]/genome:}_gene_*
    printf "\n"
done
#!/bin/bash
PWY=${1}
curl -s http://rest.kegg.jp/get/${PWY} > ${PWY}.txt