my-mg-traits-2.0
#!/bin/bash -l
set -x
set -e
set -o pipefail
set -o errexit
set -o errtrace
set -o nounset
module load java
module load bbmap
module load pear
module load sortmerna
module load coreutils
# Run uproc and extract contigs
# Add any TARA sample name
# i.e. TARA_076_SRF_<-0.22
declare -r NAM='NOM'
declare -r WD=/bioinf/projects/megx/TARA/rDNAs/compute
declare RES="${WD}/results/${NAM}"
declare -r MINOV='10'
# create result folder
[[ -d "${RES}" ]] && rm -rf "${RES}"
mkdir -p "${RES}"
cd "${RES}"
# Get read files and QC them with trimmomatic
declare -r AFILE=${WD}/input/assem-file-name2url.txt
#get read files
RFILES=$(grep "${NAM}" ${AFILE})
#How many files do we have
NRFILE=$(echo "${RFILES}" | wc -l)
echo "${RFILES}" | \
cut -f 2 -d ' ' | \
while read LINE
do
N=$(basename "${LINE}")
# CMD="wget -c --retry-connrefused "${LINE}""
# CMD="~/opt/aria2/bin/aria2c --file-allocation=none -c -x 16 "${LINE}""
# CMD="rsync --bwlimit=50000 --progress --append -h --numeric-ids -e "ssh -T -c arcfour -o Compression=no -x" arcturus.mpi-bremen.de:/export/bioinf/projects/megx/TARA/assemblies/FASTQ/${N} ."
# while ! wget -c --retry-connrefused "${LINE}";do sleep 5;done
ln -s "/bioinf/projects/megx/TARA/assemblies/FASTQ/${N}" .
done
PE1="${RES}/${NAM}.R1.fastq"
PE2="${RES}/${NAM}.R2.fastq"
SE="${RES}/${NAM}.SR.fastq"
#we need to combine them
R1=R1.fastq.gz
R2=R2.fastq.gz
cat *_1.fastq.gz > "${R1}" &
cat *_2.fastq.gz > "${R2}" &
wait
rm *_1.fastq.gz &
rm *_2.fastq.gz &
wait
TARAADAP=/bioinf/software/bbmap/bbmap-35.14/resources/tara.fa.gz
#Remove adapters
bbduk.sh in="${R1}" in2="${R2}" out1="${PE1}" out2="${PE2}" outs="${SE}" qin=33 minlen=45 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe maxns=0 threads=${NSLOTS}
rm R1.fastq.gz &
rm R2.fastq.gz &
wait
# We use one third of the memory available
declare MEM=$(free -g | grep Mem | awk '{printf "%dG",$2/3}')
echo ${MEM}
# Run PEAR
/bioinf/software/pear/pear-0.9.8/bin/pear -j ${NSLOTS} -y ${MEM} -v ${MINOV} -f "${PE1}" -r "${PE2}" -o "${NAM}.pear"
rm ${RES}/*discarded.fastq
cat ${RES}/*assembled.fastq >> "${SE}"
SR="${RES}/${NAM}.SR.qc.fasta"
rm "${PE1}" "${PE2}"
PE1="${RES}/${NAM}.R1.fasta"
PE2="${RES}/${NAM}.R2.fasta"
# Quality trim non merged
bbduk.sh in="${RES}/${NAM}.pear.unassembled.forward.fastq" in2="${RES}/${NAM}.pear.unassembled.reverse.fastq" out1="${PE1}" out2="${PE2}" outs="${RES}/tmp.SR.fasta" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe t=${NSLOTS} maxns=0 threads=${NSLOTS}
# Quality trim merged
bbduk.sh in="${SE}" out1="${SR}" qin=33 minlen=45 qtrim=rl trimq=20 ktrim=r k=25 mink=11 ref=${TARAADAP} hdist=1 tbo tpe t=${NSLOTS} maxns=0 threads=${NSLOTS}
cat "${PE1}" "${PE2}" "${RES}/tmp.SR.fasta" >> "${SR}"
rm "${RES}/tmp.SR.fasta" "${SE}" "${PE1}" "${PE2}"
rm ${RES}/*fastq
rRNA="${RES}/${NAM}.rRNA"
#SORT=/home/oerc-microb3/afernandezguerra/opt/sortmerna-2.0/bin/sortmerna
DB=/bioinf/software/sortmerna/sortmerna-2.0
#MEM=$(free -m | grep Mem | awk '{printf "%d",$2/3}')
MEM=4000
${DB}/bin/sortmerna --reads "${SR}" -a ${NSLOTS} --ref ${DB}/rRNA_databases/silva-bac-16s-id90.fasta,${DB}/index/silva-bac-16s-db:${DB}/rRNA_databases/silva-arc-16s-id95.fasta,${DB}/index/silva-arc-16s-db:${DB}/rRNA_databases/silva-euk-18s-id95.fasta,${DB}/index/silva-euk-18s-db --blast 1 --fastx --aligned "${RES}/${NAM}.sortmerna.rDNA" -v --log -m ${MEM} --best 1
bbduk.sh in="${SR}" outm="${RES}/${NAM}.bbduk.rDNA.fasta" k=31 ref=~/opt/ribokmers.fa.gz threads=${NSLOTS}
rm "${SR}"
echo "DONE"
#exit
# run uproc using the models
# Low complexity masking, remove reads with Ns and remove duplicates
#BBMASK
${BBTOOLS}/bbmask.sh in=${WD}/quality-trimming/${NAME}.comb.qc.fastq out=${WD}/quality-trimming/${NAME}.comb.qc.masked.fastq window=80 entropy=0.75 ke=5 > ${OSDRESMG}/log/${OSD}.bbmask.log 2>&1
${BBTOOLS}/bbduk.sh in=${WD}/quality-trimming/${NAME}.comb.qc.masked.fastq out=${OSDRESMG}/fastq/${NAME}.comb.qc.masked.fasta maxns=1 > ${OSDRESMG}/log/${OSD}.bbduk.log 2>&1
${CDHIT}/cd-hit-dup -d 1 -i ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.fasta -o ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.dedup.fasta > ${OSDRESMG}/log/${OSD}.dedup.log 2>&1
[[ -f ${TRAITSDIR}/${NAME}.comb.qc.masked.dedup.fasta ]] && rm ${TRAITSDIR}/${NAME}.comb.qc.masked.dedup.fasta
cp ${OSDRESMG}/fastq/${NAME}.comb.qc.masked.dedup.fasta ${TRAITSDIR}
# Split fasta file, consider using mawk (will install the module)
awk -vO=$NSEQ 'BEGIN {n_seq=0;partid=1;} /^>/ {if(n_seq%O==0){file=sprintf("05-part-%d.fasta",partid);partid++;} print >> file; n_seq++; next;} { print >> file; }' < $RAW_FASTA
# Here goes fragenescan in an array job
# 2 threads
# 3 threads
# 4 threads
# 6 threads
qsub -t 1-500 -tc 50 -pe threaded 6
#zcat ${RES}/${NAM}.bgc.gz | cut -f 2 -d ',' | sort -u > ${RES}/${NAM}.bgc.contigs.id.txt
#${BBMAP} in=${ASS} names=${RES}/${NAM}.bgc.contigs.id.txt out=${RES}/${NAM}.bgc.contigs.fasta.gz include=t
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599002/ERR599002_1.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599002/ERR599002_2.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599016/ERR599016_1.fastq.gz
TARA_052_DCM_0.22-1.6 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR599/ERR599016/ERR599016_2.fastq.gz
TARA_122_DCM_0.45-0.8 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR594/ERR594301/ERR594301_1.fastq.gz
TARA_122_DCM_0.45-0.8 ftp://ftp.sra.ebi.ac.uk/vol1/fastq/ERR594/ERR594301/ERR594301_2.fastq.gz
...
#!/bin/bash -l
#$ -N TARA-rDNA
#$ -P megx.p
#$ -pe threaded 8
#$ -t 11,18 #,21,25,26
#$ -tc 25
#$ -V
#$ -j y
#$ -cwd
TASKS=/bioinf/projects/megx/TARA/rDNAs/compute/runtime/tasks/TARA*
EXEC=$(ls ${TASKS} | tail -n+${SGE_TASK_ID} | head -n1)
echo ${EXEC}
exec "${EXEC}"
library(Biostrings)
library(Biobase)
library(plyr)
library(stringr)
# helper constants
RPKOs = c("K02945", "K02967", "K02982", "K02986", "K02988", "K02990", "K02992", "K02994", "K02996",
"K02946", "K02948" ,"K02950" ,"K02952" ,"K02954" ,"K02956" ,"K02959" ,"K02961" ,"K02963" ,
"K02965" , "K02968" , "K02970" , "K02981" , "K02985" , "K02984" , "K02987" , "K02989" , "K02991",
"K02993" , "K02995" , "K02997" , "K02947" , "K02949" , "K02951" , "K02953" , "K02955" , "K02958" ,
"K02957" , "K02960" , "K02962" , "K02964" , "K02966" , "K02969" , "K02971" , "K02973" , "K02974" , "K02975" ,
"K02976" , "K02978" , "K02977" , "K02979" , "K02980" , "K02983" , "K02998" , "K02863" , "K02886" ,
"K02906" , "K02926" , "K02931" , "K02933" , "K02935" , "K02939" , "K02864" , "K02867" , "K02869" ,
"K02871" , "K02874" , "K02876" , "K02878" , "K02879" , "K02881" , "K02884" , "K02887" , "K02888" ,
"K02890" , "K02892" , "K02895" , "K02897" , "K02899" , "K02902" , "K02904" , "K02907" , "K02909" ,
"K02911" , "K02913" , "K02914" , "K02916" , "K02919" , "K07590" , "K02925" , "K02930" , "K02932" ,
"K02934" , "K02937" , "K02936" , "K02938" , "K02940" , "K02866" , "K02865" , "K02868" , "K02870" ,
"K02873" , "K02872" , "K02875" , "K02877" , "K02880" , "K02883" , "K02882" , "K02885" , "K02889" ,
"K02891" , "K02894" , "K02893" , "K02896" , "K02898" , "K02901" , "K02900" , "K02903" , "K02905" ,
"K02908" , "K02910" , "K02912" , "K02915" , "K02918" , "K02917" , "K02920" , "K02922" , "K02921" ,
"K02923" , "K02924" , "K02927" , "K02928" , "K02929" , "K02941" , "K02942" , "K02943" , "K02944" ,
"K01977" , "K01980" , "K01985" , "K01979" , "K01982" , "K01981"
)
# constants
dummy<-"AAAAACAAGAATACAACCACGACTAGAAGCAGGAGTATAATCATGATTCAACACCAGCATCCACCCCCGCCTCGACGCCGGCGTCTACTCCTGCTTGAAGACGAGGATGCAGCCGCGGCTGGAGGCGGGGGTGTAGTCGTGGTTTAATACTAGTATTCATCCTCGTCTTGATGCTGGTGTTTATTCTTGTTT"
codons<-strsplit(strbreak(dummy, width=3, exdent=0, collapse="|"), split="\\|")[[1]]
stops<-c("TAA", "TAG", "TGA")
nostops<-codons[!(codons %in% stops)]
ctab<-data.frame(
codon=nostops,
aa=as.factor(as.character(translate(DNAStringSet(nostops)))),
codonstr=as.character(nostops), stringsAsFactors=F
)
acnt <- tapply(ctab$codon, ctab$aa, length)
ordaa <- ctab[order(ctab$aa),"aa"]
# functions
cnorm<-function(x) x/sum(x)
cnormm <- function(x) if(is.matrix(x)) x/rowSums(x) else x/x
rcm <- function(x) if(is.matrix(x)) as.integer(!is.nan(rowSums(x))) else as.integer(!is.nan(x))
byaa<-function(x) unlist(tapply(x,ctab$aa,cnorm))
byaas <- function(x) do.call(cbind, sapply(names(acnt), function(y) cnormm(x[,ctab$aa %in% y])))
byrcs <- function(x) sapply(names(acnt), function(y) rcm(x[,ordaa %in% y]))
# optimized MILC calculator
# the only required parameter is the codon usage table set,
# and MILC will be calculated against the average CU of that set.
# one optional parameter:
# subsets - (named) list of
# - logical vectors of the same length as nrow(s), or
# - additional CU table set(s) that will be processed
# the function returns a data frame of the same row length as nrow(s)
# and ncol equal to the number of subsets plus one (self)
calcMilc<-function(s, subsets=list()) {
# test parameters
if(! inherits(s, "codonTable")) stop("First argument must be a codon usage table!")
nseq <- nrow(s)
if(!is.list(subsets)) stop(paste("subsets must be a (named) list of logical vectors, each of length", nseq,
"or codonTable objects (of any length)"))
if(length(subsets) != 0) {
ok <- sapply(subsets, function(x) {
all(is.vector(x, mode = "logical"), length(x) == nseq) |
all(inherits(x, "codonTable"), nrow(x) > 0)
})
stopifnot(ok)
nam <- names(subsets)
nsubs <- length(subsets)
if(is.null(nam)) {
nam <- paste("subset", 1:nsubs, sep = ".")
} else {
nam[nam == ""] <- paste("subset", (1:nsubs)[nam == ""], sep = ".")
}
names(subsets) <- make.names(nam, unique = TRUE)
}
# add a dummy self selection to subsets
self_set <- rep(TRUE, nseq)
subsets <- c(list(self = c(self_set)), subsets)
# strip all unneeded info and convert to matrix
o<-as.matrix(s[,nostops])
# preprocess all subsets
gc_list <- sapply(subsets, function(y) {
if(is.vector(y, mode = "logical")) {
sel <- o[y,] # this should never give error, beecause we tested for equal length
} else {
sel <- as.matrix(y[,nostops])
}
sel_sum <- colSums(sel) # add counts per codon
gc <- byaa(sel_sum) # and normalize synonymous codons to sum = 1
gc
}, simplify = FALSE)
# make fc values from original table
fc<-byaas(o)
# and correction factor
corr1 <- t(t(byrcs(fc)) * as.vector(acnt-1))
l <- s$len
cf <- rowSums(corr1)/l - 0.5
# now loop through all the gc's and calculate distance for each gene in the original set
milcs <- sapply(gc_list, function(gc) {
ma <- 2 * o[,order(ctab$aa)] * log(t(t(fc)/gc))
MILC <- rowSums(ma, na.rm = TRUE)/l - cf
MILC
})
return(cbind(s, milcs))
}
readSet <- function(folder = ".", KOs = c(), zipped = FALSE) {
if(length(KOs) == 0)
pattern <- "*.fasta"
else
pattern <- paste(KOs, sep="|")
if(zipped) {
where <- tempdir()
unzip(folder, exdir = where)
folder <- where
}
files <- dir(folder, pattern=pattern)
combined <- llply(files, function(x) {
aset <- readDNAStringSet(paste(folder, x, sep="/"))
as.character(aset)
}, .progress = "text")
names(combined) <- files
if(zipped) unlink(where, recursive = TRUE)
aset <- DNAStringSet(unlist(combined))
ctable <- oligonucleotideFrequency(aset, width = 3, step = 3)
# KO <- str_replace_all(names(aset), ".*(K\\d{5}).*", "\\1")
# COG <- str_replace_all(names(aset), ".*(([KCN]|TW)OG\\d{5}).*", "\\1")
ID <- names(aset)
KO <- str_extract(ID, "K\\d{5}")
COG <- str_extract(ID, "([KCN]|TW)OG\\d{5}")
len.stop <- rowSums(ctable[,codons])
len <- rowSums(ctable[,nostops])
problem <- rowSums(ctable) != len.stop
ccc = data.frame(
ID = ID,
ctable,
KO = KO,
COG = COG,
len.stop = len.stop,
len = len,
problem = problem
)
class(ccc) <- c(class(ccc), "codonTable")
ccc
}
BEGIN;
set search_path to mg_traits;
-- post on /jobs
-- example started(ing) job
-- customer as by authorization or anonymous
INSERT INTO mg_traits_jobs (
customer, mg_url, sample_label, sample_environment
)
VALUES (
'mg-traits-tester',
'https://dev-dav.megx.net/test-data/mg-traits-test2.fasta',
'running job sample', 'marine'
) RETURNING id;
-- get job details on /jobs/mg{id}:{sample_name}
select id, sample_label, time_submitted, time_finished, return_code, error_message from mg_traits.mg_traits_jobs;
-- // get on /mg{id}:{sample_name}/simple-traits
select * from mg_traits_results where id = ?;
-- // get on /mg{id}:{sample_name}/function-table
select sample_label, pfam , id from mg_traits_pfam where id = ?;
-- variant with additional column for array as string
select sample_label, pfam as pfam_array, array_to_string(pfam, ',') as pfam_string, id from mg_traits_pfam where id = ?;
-- // get on /mg{id}:{sample_name}/amino-acid-content
select * from mg_traits.aa where id = ?;
-- // get on /mg{id}:{sample_name}/di-nucleotide-odds-ratio
select * from mg_traits.dinuc where id = ?;
rollback;
cat ../input/assem-file-name2url.txt| while read i; do NOM=$(echo $i | cut -f1 -d ' '); sed -e "s/NOM/${NOM}/g" ../uprocBGCreads.sh > tasks/${NOM}.sortmerna.sh; done && chmod +x tasks/*