HUMAnN analysis
# Update HUMAnN files
# Get module descriptions
curl -S http://rest.kegg.jp/list/module > kegg_modules_description_01092016.txt
# Get pathway descriptions
curl -S http://rest.kegg.jp/list/pathway > kegg_pathway_description_01092016.txt
mkdir KEGG_MODULE
cd KEGG_MODULE
cut -f1 ../kegg_modules_description_01092016.txt | cut -f2 -d ':' | ~/opt/parallel/bin/parallel -j 128 --progress ../getModule.sh
cd ..
for i in KEGG_MODULE/M*.txt; do NAM=$(basename $i .txt); awk -vL=$NAM '{if ($0 ~ /DEFINITION/){sub("DEFINITION ", "");print L"\t"$0}}' $i ; done |sed -e 's/-- //g' -e 's/ --//g'> modulep
sed -e 's/[^a-zA-Z0-9]/ /g' -e 's/ \+/\t/g' modulep > modulec
cp ../kegg_modules_description_01092016.txt map_kegg.txt
mkdir KEGG_PATH/
cd KEGG_PATH
cut -f1 ../kegg_pathway_description_01092016.txt | cut -f2 -d ':' | ~/opt/parallel/bin/parallel -j 128 --progress ../getPath.sh
cut -f1 ../kegg_pathway_description_01092016.txt | cut -f2 -d ':' |sed -e 's/map/ko/g' | ~/opt/parallel/bin/parallel -j 128 --progress ../getPath.sh
for i in ko0*txt; do NAM=$(basename $i .txt); sed -e 's/^ \+//g' $i | awk 'BEGIN{ORS="\t"}{if ($0 ~ /^K/){print $1}}' | sed -e 's/\t$//g' | awk -vL=$NAM '{if (NF > 1){print L"\t"$0}}'; done > ../path_kos.txt
mv ../path_kos.txt keggc
cp keggc pathwayc
# Modify keggXsample table to get names that fit HUMAnN expectations
awk 'BEGIN{FS="\t";OFS=FS}{if (NR == 1){for (i = 1; i <= NF; i++){if (i == 1){$i = "#OSD ID"}else{$i = i - 1; $i = "OSD"$i}}print $0}else{print $0}}' ../osd2014_koXsample_filt_raw_counts.tsv > osd2014_koXsample_filt_raw_counts_renamed.tsv
awk 'BEGIN{FS="\t";OFS="\t"}{if (NR == 1){for (i =1; i <= NF; i++){o=$i; $i = i-1; $i = o"\tOSD"$i"\n"}print $0}}' osd2014_koXsample_filt_raw_counts.tsv | tail -n+2 | awk '{print $1"\t"$2}' > conversion_table.txt
# And run HUMAnN
scons -j
#!/bin/bash
MODULE=${1}
curl -s http://rest.kegg.jp/get/${MODULE} > ${MODULE}.txt
#set -x
DATE="${3:-$(date "+%Y_%m_%d")}"
RESULTS="kegg_"${DATE}
DATA=${RESULTS}/data
GENOMES=${RESULTS}/genomes
GENES=${RESULTS}/genes
GENOME_FILE=${DATA}/kegg_genome_list_${DATE}.txt
function ProgressBar {
# Process data
let _progress=(${1}*100/${2}*100)/100
let _done=(${_progress}*4)/10
let _left=40-$_done
# Build progressbar string lengths
_fill=$(printf "%${_done}s")
_empty=$(printf "%${_left}s")
# Build progressbar strings and print the ProgressBar line
printf "\r[${_fill// /#}${_empty// /-}] ${_progress}%%"
}
function join_by {
local IFS="$1"
shift
echo "$*"
}
[[ -d ${RESULTS} ]] && printf "Folder ${RESULTS} found. Removing data..."; find ${RESULTS} -delete; printf "done\n"
mkdir -p ${GENOMES} ${GENES} ${DATA}
# Get all genomes list
printf "Retrieving genome list..."
curl -sS http://rest.kegg.jp/list/genome > ${GENOME_FILE}
printf "done.\n"
# Get all genes for each genomes
# read genomes to an array
mapfile -t genomes < <(cut -f1 ${GENOME_FILE})
_start=1
_end=$(echo ${#genomes[@]})
# Proof of concept
#printf "Retrieving all genomes in KEGGdb\n"
for genome in $(seq ${_start} ${_end})
do
curl -sS http://rest.kegg.jp/list/${genomes[genome]/genome:} > ${GENOMES}/${genomes[genome]/genome:}.txt
#ProgressBar ${genome} ${_end}
#done
#printf "\n"
#for genome in $(seq ${_start} ${_end})
#do
printf "Retrieving all genes in ${genomes[genome]/genome:}\n"
mapfile -t genes < <(cut -f1 ${GENOMES}/${genomes[genome]/genome:}.txt)
_start=1
_end=$(echo ${#genes[@]})
# for i in $(seq 0 ${#genes[@]}); do
# curl -sS http://rest.kegg.jp/get/${genes[$i]} >> ${GENES}/${genomes[genome]/genome:}_genes.txt
# ProgressBar ${i} ${_end}
# done
BLOCK=10
NGENES=$(echo ${#genes[@]})
DIV=$(expr ${NGENES} / ${BLOCK})
MOD=$(expr ${NGENES} % ${BLOCK})
if [[ ${DIV} > 0 ]]; then
for d in $(seq 1 ${DIV}); do
start=$(expr ${d} \* ${BLOCK} \- ${BLOCK})
SLICE=$(join_by "+" "${genes[@]:${start}:${BLOCK}}")
curl -sS http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt &
ProgressBar ${d} ${DIV}
done
wait
start=$(expr ${d} \* ${BLOCK})
end=$(expr ${MOD})
SLICE=$(join_by + "${genes[@]:${start}:${end}}")
curl -sS http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt
else
d=0
SLICE=$(join_by + "${genes[@]:0:${NGENES}}")
curl -S http://rest.kegg.jp/get/${SLICE} >> ${GENES}/${genomes[genome]/genome:}_gene_${d}.txt
fi
cat ${GENES}/${genomes[genome]/genome:}_gene_*.txt > ${GENES}/${genomes[genome]/genome:}_genes.txt
rm ${GENES}/${genomes[genome]/genome:}_gene_*
printf "\n"
done
#!/bin/bash
PWY=${1}
curl -s http://rest.kegg.jp/get/${PWY} > ${PWY}.txt