genomewalker
2/2/2016 - 9:46 AM

my-mg-traits-2.0

my-mg-traits-2.0

#!/bin/bash -l
set -x
set -e
set -o pipefail
set -o errexit
set -o errtrace
set -o nounset

module load java
module load bbmap
module load pear
module load sortmerna
module load coreutils

# Run uproc and extract contigs

# Add any TARA sample name
# i.e. TARA_076_SRF_<-0.22 
declare -r NAM='NOM'

declare -r WD=/bioinf/projects/megx/TARA/rDNAs/compute
declare RES="${WD}/results/${NAM}"

declare -r MINOV='10'

# create result folder
[[ -d "${RES}" ]] && rm -rf "${RES}"
mkdir -p "${RES}"

cd "${RES}"

# Get read files and QC them with trimmomatic
declare -r AFILE=${WD}/input/assem-file-name2url.txt

#get read files
RFILES=$(grep "${NAM}" ${AFILE})

#How many files do we have
NRFILE=$(echo "${RFILES}" | wc -l)


echo "${RFILES}" | \
    cut -f 2 -d ' ' | \
    while read LINE
    do
        N=$(basename "${LINE}")
#       CMD="wget -c --retry-connrefused "${LINE}""
#       CMD="~/opt/aria2/bin/aria2c --file-allocation=none -c -x 16 "${LINE}""
#       CMD="rsync --bwlimit=50000 --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" arcturus.mpi-bremen.de:/export/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} ."
        #       while ! wget -c --retry-connrefused "${LINE}";do sleep 5;done
        ln -s "/bioinf/projects/megx/TARA/assemblies/FASTQ/${N}" .
    done


PE1="${RES}/${NAM}.R1.fastq"
PE2="${RES}/${NAM}.R2.fastq"
SE="${RES}/${NAM}.SR.fastq"


#we need to combine them
R1=R1.fastq.gz
R2=R2.fastq.gz
cat *_1.fastq.gz > "${R1}" &
cat *_2.fastq.gz > "${R2}" &
wait

rm *_1.fastq.gz &
rm *_2.fastq.gz &
wait

TARAADAP=/bioinf/software/bbmap/bbmap-35.14/resources/tara.fa.gz
#Remove adapters
bbduk.sh in="${R1}" in2="${R2}" out1="${PE1}" out2="${PE2}" outs="${SE}" qin=33 minlen=45 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe maxns=0 threads=${NSLOTS}

rm R1.fastq.gz &
rm R2.fastq.gz &
wait

# We use one third of the memory available
declare MEM=$(free -g | grep Mem | awk '{printf "%dG",$2/3}')
echo ${MEM}

# Run PEAR
/bioinf/software/pear/pear-0.9.8/bin/pear -j ${NSLOTS} -y ${MEM} -v ${MINOV} -f "${PE1}" -r "${PE2}" -o "${NAM}.pear"

rm ${RES}/*discarded.fastq

cat ${RES}/*assembled.fastq >> "${SE}"

SR="${RES}/${NAM}.SR.qc.fasta"

rm "${PE1}" "${PE2}"

PE1="${RES}/${NAM}.R1.fasta"
PE2="${RES}/${NAM}.R2.fasta"

# Quality trim non merged
bbduk.sh in="${RES}/${NAM}.pear.unassembled.forward.fastq" in2="${RES}/${NAM}.pear.unassembled.reverse.fastq" out1="${PE1}" out2="${PE2}" outs="${RES}/tmp.SR.fasta" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe t=${NSLOTS} maxns=0 threads=${NSLOTS}

# Quality trim merged
bbduk.sh in="${SE}" out1="${SR}" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe t=${NSLOTS} maxns=0 threads=${NSLOTS}

cat "${PE1}" "${PE2}" "${RES}/tmp.SR.fasta" >> "${SR}"

rm "${RES}/tmp.SR.fasta" "${SE}" "${PE1}" "${PE2}"
rm ${RES}/*fastq

rRNA="${RES}/${NAM}.rRNA"


#SORT=/home/oerc-microb3/afernandezguerra/opt/sortmerna-2.0/bin/sortmerna
DB=/bioinf/software/sortmerna/sortmerna-2.0
#MEM=$(free -m | grep Mem | awk '{printf "%d",$2/3}')
MEM=4000

${DB}/bin/sortmerna --reads "${SR}" -a ${NSLOTS} --ref ${DB}/rRNA_databases/silva-bac-16s-id90.fasta,${DB}/index/silva-bac-16s-db:${DB}/rRNA_databases/silva-arc-16s-id95.fasta,${DB}/index/silva-arc-16s-db:${DB}/rRNA_databases/silva-euk-18s-id95.fasta,${DB}/index/silva-euk-18s-db --blast 1 --fastx --aligned "${RES}/${NAM}.sortmerna.rDNA" -v --log -m ${MEM} --best 1

bbduk.sh in="${SR}" outm="${RES}/${NAM}.bbduk.rDNA.fasta" k=31 ref=~/opt/ribokmers.fa.gz threads=${NSLOTS}

rm "${SR}"

echo "DONE"
#exit

# run uproc using the models

# Low complexity masking, remove reads with Ns and remove duplicates
#BBMASK
${BBTOOLS}/bbmask.sh in=${WD}/quality-trimming/${NAME}.comb.qc.fastq out=${WD}/quality-trimming/${NAME}.comb.qc.masked.fastq window=80 entropy=0.75 ke=5 > ${OSDRESMG}/log/${OSD}.bbmask.log 2>&1 
${BBTOOLS}/bbduk.sh in=${WD}/quality-trimming/${NAME}.comb.qc.masked.fastq out=${OSDRESMG}/fastq/${NAME}.comb.qc.masked.fasta maxns=1 > ${OSDRESMG}/log/${OSD}.bbduk.log 2>&1

${CDHIT}/cd-hit-dup -d 1 -i ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.fasta -o ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.dedup.fasta > ${OSDRESMG}/log/${OSD}.dedup.log 2>&1

[[ -f ${TRAITSDIR}/${NAME}.comb.qc.masked.dedup.fasta ]] && rm ${TRAITSDIR}/${NAME}.comb.qc.masked.dedup.fasta
cp ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.dedup.fasta ${TRAITSDIR}

# Split fasta file, consider using mawk (will install the module)
awk -vO=$NSEQ 'BEGIN {n_seq=0;partid=1;} /^>/ {if(n_seq%O==0){file=sprintf("05-part-%d.fasta",partid);partid++;} print >> file; n_seq++; next;} { print >> file; }' < $RAW_FASTA


# Here goes fragenescan in an array job
# 2 threads
# 3 threads
# 4 threads
# 6 threads
qsub -t 1-500 -tc 50 -pe threaded 6  
#zcat ${RES}/${NAM}.bgc.gz | cut -f 2 -d ',' | sort -u > ${RES}/${NAM}.bgc.contigs.id.txt

#${BBMAP} in=${ASS} names=${RES}/${NAM}.bgc.contigs.id.txt out=${RES}/${NAM}.bgc.contigs.fasta.gz include=t
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599002/ERR599002_1.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599002/ERR599002_2.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599016/ERR599016_1.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599016/ERR599016_2.fastq.gz
TARA_122_DCM_0.45-0.8 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR594/ERR594301/ERR594301_1.fastq.gz
TARA_122_DCM_0.45-0.8 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR594/ERR594301/ERR594301_2.fastq.gz
...
#!/bin/bash -l
#$ -N TARA-rDNA
#$ -P megx.p
#$ -pe threaded 8
#$ -t 11,18 #,21,25,26
#$ -tc 25
#$ -V
#$ -j y
#$ -cwd

TASKS=/bioinf/projects/megx/TARA/rDNAs/compute/runtime/tasks/TARA*

EXEC=$(ls ${TASKS} | tail -n+${SGE_TASK_ID} | head -n1)
echo ${EXEC}

exec  "${EXEC}"
library(Biostrings)
library(Biobase)
library(plyr)
library(stringr)

# helper constants

RPKOs = c("K02945", "K02967", "K02982", "K02986", "K02988", "K02990", "K02992", "K02994", "K02996", 
          "K02946", "K02948" ,"K02950" ,"K02952" ,"K02954" ,"K02956" ,"K02959" ,"K02961" ,"K02963" , 
          "K02965" , "K02968" , "K02970" , "K02981" , "K02985" , "K02984" , "K02987" , "K02989" , "K02991",
          "K02993" , "K02995" , "K02997" , "K02947" , "K02949" , "K02951" , "K02953" , "K02955" , "K02958" ,
          "K02957" , "K02960" , "K02962" , "K02964" , "K02966" , "K02969" , "K02971" , "K02973" , "K02974" , "K02975" ,
          "K02976" , "K02978" , "K02977" , "K02979" , "K02980" , "K02983" , "K02998" , "K02863" , "K02886" , 
          "K02906" , "K02926" , "K02931" , "K02933" , "K02935" , "K02939" , "K02864" , "K02867" , "K02869" , 
          "K02871" , "K02874" , "K02876" , "K02878" , "K02879" , "K02881" , "K02884" , "K02887" , "K02888" , 
          "K02890" , "K02892" , "K02895" , "K02897" , "K02899" , "K02902" , "K02904" , "K02907" , "K02909" , 
          "K02911" , "K02913" , "K02914" , "K02916" , "K02919" , "K07590" , "K02925" , "K02930" , "K02932" , 
          "K02934" , "K02937" , "K02936" , "K02938" , "K02940" , "K02866" , "K02865" , "K02868" , "K02870" , 
          "K02873" , "K02872" , "K02875" , "K02877" , "K02880" , "K02883" , "K02882" , "K02885" , "K02889" , 
          "K02891" , "K02894" , "K02893" , "K02896" , "K02898" , "K02901" , "K02900" , "K02903" , "K02905" , 
          "K02908" , "K02910" , "K02912" , "K02915" , "K02918" , "K02917" , "K02920" , "K02922" , "K02921" , 
          "K02923" , "K02924" , "K02927" , "K02928" , "K02929" , "K02941" , "K02942" , "K02943" , "K02944" , 
          "K01977" , "K01980" , "K01985" , "K01979" , "K01982" , "K01981"
)

# constants

dummy<-"AAAAACAAGAATACAACCACGACTAGAAGCAGGAGTATAATCATGATTCAACACCAGCATCCACCCCCGCCTCGACGCCGGCGTCTACTCCTGCTTGAAGACGAGGATGCAGCCGCGGCTGGAGGCGGGGGTGTAGTCGTGGTTTAATACTAGTATTCATCCTCGTCTTGATGCTGGTGTTTATTCTTGTTT"

codons<-strsplit(strbreak(dummy, width=3, exdent=0, collapse="|"), split="\\|")[[1]]
stops<-c("TAA", "TAG", "TGA")
nostops<-codons[!(codons %in% stops)]

ctab<-data.frame(
  codon=nostops,
  aa=as.factor(as.character(translate(DNAStringSet(nostops)))),
  codonstr=as.character(nostops), stringsAsFactors=F
)

acnt <- tapply(ctab$codon, ctab$aa, length)
ordaa <- ctab[order(ctab$aa),"aa"]

# functions

cnorm<-function(x) x/sum(x)
cnormm <- function(x) if(is.matrix(x)) x/rowSums(x) else x/x

rcm <- function(x) if(is.matrix(x)) as.integer(!is.nan(rowSums(x))) else as.integer(!is.nan(x))
byaa<-function(x) unlist(tapply(x,ctab$aa,cnorm))

byaas <- function(x) do.call(cbind, sapply(names(acnt), function(y) cnormm(x[,ctab$aa %in% y])))
byrcs <- function(x) sapply(names(acnt), function(y) rcm(x[,ordaa %in% y]))


# optimized MILC calculator
# the only required parameter is the codon usage table set, 
# and MILC will be calculated against the average CU of that set.
# one optional parameter:
# subsets - (named) list of 
#         - logical vectors of the same length as nrow(s), or
#         - additional CU table set(s) that will be processed
# the function returns a data frame of the same row length as nrow(s)
# and ncol equal to the number of subsets plus one (self)

calcMilc<-function(s, subsets=list()) {
  
  # test parameters
  
  if(! inherits(s, "codonTable")) stop("First argument must be a codon usage table!")
  nseq <- nrow(s)
  
  if(!is.list(subsets)) stop(paste("subsets must be a (named) list of logical vectors, each of length", nseq,
                                   "or codonTable objects (of any length)"))
  if(length(subsets) != 0) {
    ok <- sapply(subsets, function(x) {
      all(is.vector(x, mode = "logical"), length(x) == nseq) |
        all(inherits(x, "codonTable"), nrow(x) > 0)
    })
    stopifnot(ok)
    nam <- names(subsets)
    nsubs <- length(subsets)
    if(is.null(nam)) {
      nam <- paste("subset", 1:nsubs, sep = ".")
    } else {
      nam[nam == ""] <- paste("subset", (1:nsubs)[nam == ""], sep = ".")
    }
    names(subsets) <- make.names(nam, unique = TRUE)
  }

  # add a dummy self selection to subsets
  self_set <- rep(TRUE, nseq)
  subsets <- c(list(self = c(self_set)), subsets)

  # strip all unneeded info and convert to matrix
  o<-as.matrix(s[,nostops])
  
  # preprocess all subsets
  gc_list <- sapply(subsets, function(y) {
    if(is.vector(y, mode = "logical")) {
      sel <- o[y,] # this should never give error, beecause we tested for equal length
    } else {
      sel <- as.matrix(y[,nostops])
    }
    sel_sum <- colSums(sel) # add counts per codon
    gc <- byaa(sel_sum) # and normalize synonymous codons to sum = 1
    gc
  }, simplify = FALSE)

  # make fc values from original table
  fc<-byaas(o)
  
  # and correction factor
  corr1 <- t(t(byrcs(fc)) * as.vector(acnt-1))
  l <- s$len
  cf <- rowSums(corr1)/l - 0.5
  
  # now loop through all the gc's and calculate distance for each gene in the original set
  milcs <- sapply(gc_list, function(gc) {

    ma <- 2 * o[,order(ctab$aa)] * log(t(t(fc)/gc))
    MILC <- rowSums(ma, na.rm = TRUE)/l - cf
    MILC
    
  })
  
  return(cbind(s, milcs))
}



readSet <- function(folder = ".", KOs = c(), zipped = FALSE) {
  
  if(length(KOs) == 0) 
    pattern <- "*.fasta"
  else
    pattern <- paste(KOs, sep="|")
  
  if(zipped) {
    where <- tempdir()
    unzip(folder, exdir = where)
    folder <- where
  }
  files <- dir(folder, pattern=pattern)
  
  combined <- llply(files, function(x) {
      aset <- readDNAStringSet(paste(folder, x, sep="/"))
      as.character(aset)    
    }, .progress = "text")
  names(combined) <- files

  if(zipped) unlink(where, recursive = TRUE)
  
  aset <- DNAStringSet(unlist(combined))

  ctable <- oligonucleotideFrequency(aset, width = 3, step = 3)
  
#  KO <- str_replace_all(names(aset), ".*(K\\d{5}).*", "\\1")
#  COG <- str_replace_all(names(aset), ".*(([KCN]|TW)OG\\d{5}).*", "\\1")

  ID <- names(aset)
  KO <- str_extract(ID, "K\\d{5}")
  COG <- str_extract(ID, "([KCN]|TW)OG\\d{5}")
  len.stop <- rowSums(ctable[,codons])
  len <- rowSums(ctable[,nostops])
  problem <- rowSums(ctable) != len.stop

  ccc = data.frame(
    ID = ID,
    ctable,
    KO = KO,
    COG = COG,
    len.stop = len.stop,
    len = len,
    problem = problem
  )
  
  class(ccc) <- c(class(ccc), "codonTable")
  ccc
}



BEGIN;

set search_path to mg_traits;


-- post on /jobs
-- example started(ing) job
-- customer as by authorization or anonymous
INSERT INTO mg_traits_jobs (
    customer, mg_url, sample_label, sample_environment
  ) 
  VALUES (
    'mg-traits-tester', 
    'https://dev-dav.megx.net/test-data/mg-traits-test2.fasta',
    'running job sample', 'marine'
) RETURNING id;

-- get job details on /jobs/mg{id}:{sample_name}
select id, sample_label, time_submitted, time_finished, return_code, error_message from mg_traits.mg_traits_jobs;

-- // get on /mg{id}:{sample_name}/simple-traits
select * from mg_traits_results where id = ?;

-- // get on /mg{id}:{sample_name}/function-table
select sample_label, pfam , id  from mg_traits_pfam where id = ?;

-- variant with additional column for array as string
select sample_label, pfam as pfam_array, array_to_string(pfam, ',') as  pfam_string, id  from mg_traits_pfam where id = ?;

-- // get on /mg{id}:{sample_name}/amino-acid-content
select * from mg_traits.aa where id = ?;

-- // get on /mg{id}:{sample_name}/di-nucleotide-odds-ratio
select * from mg_traits.dinuc where id = ?;



rollback;
cat ../input/assem-file-name2url.txt| while read i; do NOM=$(echo $i | cut -f1 -d ' '); sed -e "s/NOM/${NOM}/g" ../uprocBGCreads.sh > tasks/${NOM}.sortmerna.sh; done && chmod +x tasks/*