pipeline OligoMM-Antibiotics
'''
oligomm_antibiotics.snakefile
Philipp Muench
Maps NGS data to OligoMM reference genomes and performs variant calling
-----------------------------------------------------------------------
Requirements:
samtools
http://www.htslib.org/download/
fastp
https://github.com/OpenGene/fastp
kneaddata
https://bitbucket.org/biobakery/kneaddata
Trimmomatic
http://www.usadellab.org/cms/?page=trimmomatic
bowtie2
http://bowtie-bio.sourceforge.net/bowtie2/index.shtml
Rscript
https://cran.r-project.org/mirrors.html
sankeyD3 (R package)
https://github.com/fbreitwieser/sankeyD3
networkD3 (R package)
https://cran.r-project.org/web/packages/networkD3/index.html
bowtie2
http://bowtie-bio.sourceforge.net/bowtie2/
Usage:
snakemake --jobs 10
'''
# Globals ---------------------------------------------------------------------
# list of genomes to be processed (contig names of reference sequence)
GENOMES = ["CP022722.1", "CP021420.1", "CP021421.1", "CP021422.1", "CP022712.1", "CP022713.1"]
# name of sample to be processed
SAMPLES = ["1683d00_S47"]
# process many sample at once
FILES, = glob_wildcards("../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{files}_R1_001.fastq.gz")
# joned reference to map against
REF="/net/metagenomics/projects/pmuench_oligomm_ab/reference_genomes/joined/joined_reference.fasta"
# dependencies
BOWTIE2_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/bowtie2-2.3.5.1-linux-x86_64"
PICARD_PATH="/net/metagenomics/projects/pmuench_oligomm_ab/tools/picard.jar"
TRIMMOMATIC_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/Trimmomatic-0.39"
# DBs
KNEADDATA_DB="/net/metagenomics/projects/pmuench_oligomm_ab/kneadData_ref_genomes"
KRAKEN_DB="/net/metagenomics/projects/pmuench_oligomm_ab/tools/kraken2/kraken2-2.0.7-beta/db/minikraken2_v2_8GB_201904_UPDATE"
# Rules -----------------------------------------------------------------------
rule all:
input:
# expand("snpEff/{sample}/{genome}.vcf", genome=GENOMES, sample=SAMPLES),
# expand("igv_report/{sample}/{genome}.html", genome=GENOMES, sample=SAMPLES)
expand("kraken/{sample}/{sample}.report", sample=FILES)
rule copy_files:
input:
from1="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R1_001.fastq.gz",
from2="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R2_001.fastq.gz"
output:
to1="reads/pe/{sample}_R1_001.fastq.gz",
to2="reads/pe/{sample}_R2_001.fastq.gz"
shell: """
cp {input.from1} {output.to1}
cp {input.from2} {output.to2}
"""
# QC using fastp (https://github.com/OpenGene/fastp), pipeline will use kneaddata
rule fastp_pe:
input:
sample=["reads/pe/{sample}_R1_001.fastq.gz", "reads/pe/{sample}_R2_001.fastq.gz"]
output:
trimmed=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"],
html="report/pe/{sample}.html",
json="report/pe/{sample}.json"
log:
"logs/fastp/pe/{sample}.log"
params:
extra=""
threads: 25
wrapper:
"0.36.0/bio/fastp"
# QC using kneaddata (https://bitbucket.org/biobakery/kneaddata)
rule kneaddata:
input:
fq1=["reads/pe/{sample}_R1_001.fastq.gz"],
fq2=["reads/pe/{sample}_R2_001.fastq.gz"]
output:
un1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_1.fastq"],
un2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_2.fastq"],
trimmed1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq"],
trimmed2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"]
threads: 10
params:
folder=["kneaddata/{sample}"],
trim=TRIMMOMATIC_DIR,
db=KNEADDATA_DB,
bowtie2=BOWTIE2_DIR
log:
"logs/kneaddata/{sample}.log"
shell: """
kneaddata --input {input.fq1} --remove-intermediate-output --input {input.fq2} -db {params.db} --output {params.folder} --trimmomatic {params.trim} --bowtie2 {params.bowtie2} -t {threads} > {log} 2> {log}
"""
# align PE reads to joined reference
rule bwa_mem:
input:
# reads=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq", "kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"]
reads=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"]
output:
"mapped/{sample}.bam"
log:
"logs/bwa_mem/{sample}.log"
params:
index=REF,
extra=r"-R '@RG\tID:{sample}\tSM:{sample}'",
sort="none", # Can be 'none', 'samtools' or 'picard'.
sort_order="queryname", # Can be 'queryname' or 'coordinate'.
sort_extra="" # Extra args for samtools/picard.
threads: 10
wrapper:
"0.36.0/bio/bwa/mem"
# extract unmapped reads
rule extract_unmapped_reads:
input:
bam=["mapped/{sample}.bam"]
output:
unmapped=["unmapped/{sample}.bam"],
sorted=["unmapped/{sample}_sorted.bam"]
shell: """
samtools view -b -f 4 {input.bam} > {output.unmapped}
samtools sort {output.unmapped} -o {output.sorted}
"""
# bam to fast to get unmapped PE reads
rule bam_to_fastq:
input:
bam=["unmapped/{sample}_sorted.bam"]
output:
fq1=["unmapped/fastq/{sample}_1.fastq"],
fq2=["unmapped/fastq/{sample}_2.fastq"]
shell: """
bedtools bamtofastq -i {input.bam} -fq {output.fq1} -fq2 {output.fq2}
"""
# assign taxon to non-mapping reads based on kneaddata output
rule kraken2:
input:
reads1=["unmapped/fastq/{sample}_1.fastq"],
reads2=["unmapped/fastq/{sample}_2.fastq"]
output:
krak=["kraken/{sample}/{sample}.krak"],
report=["kraken/{sample}/{sample}.report"]
params:
db=KRAKEN_DB
threads: 8
shell: """
kraken2 --paired --use-names --db {params.db} --threads 20 --output {output.krak} --report {output.report} {input.reads1} {input.reads2}
"""
# generate html kraken report
rule kraken2Sankey:
input:
report=["kraken/{sample}/{sample}.report"]
output:
html=["kraken/{sample}/{sample}.html"]
shell: """
Rscript scripts/krakenSankey.R {input.report}
"""
rule samtools_flagstat:
input:
"mapped/{sample}.bam"
output:
"mapped/{sample}.bam.flagstat"
wrapper:
"0.36.0/bio/samtools/flagstat"
rule samtools_sort:
input:
"mapped/{sample}.bam"
output:
"mapped/{sample}.sorted.bam"
threads: # Samtools takes additional threads through its option -@
25 # This value - 1 will be sent to -@.
wrapper:
"0.36.0/bio/samtools/sort"
rule mark_duplicates:
input:
bam="mapped/{sample}.sorted.bam"
output:
bam="dedup/{sample}.bam"
params:
jar=PICARD_PATH
log:
"logs/picard/dedup/{sample}.log"
shell: "java -Xmx10G -jar {params.jar} MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT QUIET=true AS=true METRICS_FILE=metrics VERBOSITY=WARNING TMP_DIR=/scratch/pmuench"
rule samtools_index:
input:
"dedup/{sample}.bam"
output:
"dedup/{sample}.bam.bai"
params:
"" # optional params string
wrapper:
"0.36.0/bio/samtools/index"
rule samtools_index_raw:
input:
"mapped/{sample}.sorted.bam"
output:
"mapped/{sample}.sorted.bam.bai"
params:
"" # optional params string
wrapper:
"0.36.0/bio/samtools/index"
# calculate coverage on deduplicated bam files
rule samtools_depth_dedup:
input:
bam="dedup/{sample}.bam"
output:
stat="coverage/{sample}/{sample}_dedup.coverage.txt.gz"
shell: "samtools depth {input.bam} | bgzip > {output.stat}"
# calculate coverage on non-deduplicated bam files
rule samtools_depth:
input:
bam="dedup/{sample}.bam"
output:
stat="coverage/{sample}/{sample}.coverage.txt.gz"
shell: "samtools depth {input.bam} | bgzip > {output.stat}"
# plot depth on dedup coverage information
#rule depth_plot:
# input:
# stat="dedup/{sample}.coverage.txt.gz"
# output:
# pdf="coverage_statistics/{sample}.coverage.txt.gz.pdf"
# script: "scripts/coverage.R"
rule lofreq:
input:
bam="mapped/{sample}.sorted.bam",
bai="mapped/{sample}.sorted.bam.bai",
ref=REF
output:
vcf="lofreq_call/{sample}.vcf"
log:
"logs/lofreq_call/{sample}.log"
threads: 20
shell: "lofreq call-parallel --pp-threads {threads} -f {input.ref} {input.bam} -o {output.vcf} > {log} 2> {log}"
rule bgzip:
input:
vcf="lofreq_call/{sample}.vcf"
output:
vcfgz="lofreq_call/{sample}.vcf.gz"
log:
"logs/bgzip/{sample}.log"
shell:
"bgzip -c {input.vcf} > {output.vcfgz} 2> {log}"
rule index_vcf:
input:
vcfgz="lofreq_call/{sample}.vcf.gz"
output:
vcfsort="lofreq_call/{sample}.vcf.gz.tbi"
log:
"logs/sort_vcf/{sample}.log"
shell:
"tabix -f -p vcf {input.vcfgz}"
rule get_chr_list:
input:
vcf="lofreq_call/{sample}.vcf"
output:
chr="lofreq_call/{sample}.txt"
shell: """cat {input.vcf} | mawk '$1 ~ /^#/ {{next}} {{print $1 | "sort -k1,1 -u"}}' > {output.chr}"""
"""
sorts and gz a vcf file and indexes
"""
rule sort_vcf:
input:
vcf="lofreq_call/{sample}.vcf"
output:
sorted="lofreq_call/{sample}_sorted.vcf.gz"
shell: """
cat {input.vcf} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted}
tabix -p vcf {output.sorted}
"""
"""
extracts a genomes from a sorted gz vcfDD
"""
rule split_vcf:
input:
vcf="lofreq_call/{sample}_sorted.vcf.gz"
output:
single="lofreq_splitted/{sample}/{genome}.vcf",
sorted="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz",
index="lofreq_splitted/{sample}/{genome}.vcf.tbi"
shell: """
tabix -h {input.vcf} {wildcards.genome} > {output.single}
cat {output.single} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted}
tabix -f -p vcf {output.sorted}
cp {output.sorted}.tbi {output.index}
"""
rule split_bam:
input:
bam="dedup/{sample}.bam"
output:
single="dedup_splitted/{sample}/{genome}.bam"
shell: """
samtools view -bh {input.bam} > {output.single}
samtools index {output.single}
"""
rule annotate_vcf:
input:
vcf="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz",
bed="reference_genomes/{genome}.bed.gz"
output:
vcf2="lofreq_annotated/{sample}/{genome}.vcf"
shell: """
bcftools annotate -a {input.bed} -c CHROM,FROM,TO,GENE -h <(echo '##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">') {input.vcf} > {output.vcf2}
"""
rule snpEff:
input:
vcf="lofreq_splitted/{sample}/{genome}.vcf"
params:
ref="{genome}"
output:
vcf="snpEff/{sample}/{genome}.vcf",
bed="snpEff/{sample}/{genome}.bed",
csv="snpEff/{sample}/{genome}.csv",
html="snpEff/{sample}/{genome}.html",
log: "logs/smpEff/{sample}/{genome}.log"
shell: """
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -o bed > {output.bed}
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -csvStats {output.csv} > {output.vcf} 2> {log}
java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -stats {output.html} > {output.vcf} 2> {log}
"""
rule igv_report:
input:
fasta="reference_genomes/{genome}.fasta",
vcf="snpEff/{sample}/{genome}.vcf",
bam="dedup_splitted/{sample}/{genome}.bam",
bed="reference_genomes/{genome}.bed.gz"
output:
path="igv_report/{sample}/{genome}.html"
shell: "create_report {input.vcf} {input.fasta} --info-columns ANN --tracks {input.bam} {input.bed} --output {output.path}"