'''
oligomm_antibiotics.snakefile
Philipp Muench

Maps NGS data to OligoMM reference genomes and performs variant calling
-----------------------------------------------------------------------

Requirements:

samtools
	http://www.htslib.org/download/

fastp
	https://github.com/OpenGene/fastp

kneaddata
	https://bitbucket.org/biobakery/kneaddata

Trimmomatic
	http://www.usadellab.org/cms/?page=trimmomatic

bowtie2
	http://bowtie-bio.sourceforge.net/bowtie2/index.shtml	

Rscript
	https://cran.r-project.org/mirrors.html

sankeyD3 (R package)
	https://github.com/fbreitwieser/sankeyD3

networkD3 (R package)
	https://cran.r-project.org/web/packages/networkD3/index.html

bwa
	http://bio-bwa.sourceforge.net/

kraken2
	https://github.com/DerrickWood/kraken2

bedtools
	https://bedtools.readthedocs.io/

Picard
	https://broadinstitute.github.io/picard/

LoFreq
	https://csb5.github.io/lofreq/

bcftools
	https://samtools.github.io/bcftools/

SnpEff
	http://snpeff.sourceforge.net/

igv-reports
	https://github.com/igvteam/igv-reports

Usage:

	snakemake --jobs 10
'''
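
# Example invocations (a sketch; assumes this file is saved as
# "oligomm_antibiotics.snakefile" in the working directory):
#
#   snakemake -s oligomm_antibiotics.snakefile -n          # dry run: list the jobs that would execute
#   snakemake -s oligomm_antibiotics.snakefile --jobs 10   # run with up to 10 concurrent jobs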

# Globals ---------------------------------------------------------------------

# list of genomes to be processed (contig names in the joined reference sequence)
GENOMES = ["CP022722.1", "CP021420.1", "CP021421.1", "CP021422.1", "CP022712.1", "CP022713.1"]

# name of a single sample to be processed (used by the commented-out snpEff/IGV targets in rule all)
SAMPLES = ["1683d00_S47"]

# process many samples at once
FILES, = glob_wildcards("../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{files}_R1_001.fastq.gz")
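# e.g. ".../19-0125/1683d00_S47_R1_001.fastq.gz" yields the entry "1683d00_S47"
# in FILES; rule all below requests one kraken report per entry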

# joined reference to map against (a single FASTA containing the contigs listed in GENOMES)
REF="/net/metagenomics/projects/pmuench_oligomm_ab/reference_genomes/joined/joined_reference.fasta"

# dependencies
BOWTIE2_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/bowtie2-2.3.5.1-linux-x86_64"
PICARD_PATH="/net/metagenomics/projects/pmuench_oligomm_ab/tools/picard.jar"
TRIMMOMATIC_DIR="/net/metagenomics/projects/pmuench_oligomm_ab/tools/Trimmomatic-0.39"

# DBs
KNEADDATA_DB="/net/metagenomics/projects/pmuench_oligomm_ab/kneadData_ref_genomes"
KRAKEN_DB="/net/metagenomics/projects/pmuench_oligomm_ab/tools/kraken2/kraken2-2.0.7-beta/db/minikraken2_v2_8GB_201904_UPDATE"

# Rules -----------------------------------------------------------------------

rule all:
	input: 
#		expand("snpEff/{sample}/{genome}.vcf", genome=GENOMES, sample=SAMPLES), 
#		expand("igv_report/{sample}/{genome}.html", genome=GENOMES, sample=SAMPLES)
		expand("kraken/{sample}/{sample}.report", sample=FILES)

rule copy_files:
	input: 
		from1="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R1_001.fastq.gz",
		from2="../ftp_reads/genome.gbf.de/HZI_BIFO/bifouser/19-0125/{sample}_R2_001.fastq.gz"
	output:
		to1="reads/pe/{sample}_R1_001.fastq.gz",
		to2="reads/pe/{sample}_R2_001.fastq.gz"
	shell: """
					cp {input.from1} {output.to1}
					cp {input.from2} {output.to2}
					"""

# QC using fastp (https://github.com/OpenGene/fastp); a kneaddata alternative follows,
# but bwa_mem currently maps the fastp output
rule fastp_pe:
		input:
				sample=["reads/pe/{sample}_R1_001.fastq.gz", "reads/pe/{sample}_R2_001.fastq.gz"]
		output:
				trimmed=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"],
				html="report/pe/{sample}.html",
				json="report/pe/{sample}.json"
		log:
				"logs/fastp/pe/{sample}.log"
		params:
				extra=""
		threads: 25
		wrapper:
				"0.36.0/bio/fastp"

# QC using kneaddata (https://bitbucket.org/biobakery/kneaddata)
rule kneaddata:
	input:
		fq1=["reads/pe/{sample}_R1_001.fastq.gz"],
		fq2=["reads/pe/{sample}_R2_001.fastq.gz"]
	output:
		un1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_1.fastq"],
		un2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_unmatched_2.fastq"],
		trimmed1=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq"],
		trimmed2=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"] 
	threads: 10
	params:
		folder=["kneaddata/{sample}"], 
		trim=TRIMMOMATIC_DIR,
		db=KNEADDATA_DB,
		bowtie2=BOWTIE2_DIR		
	log:
		"logs/kneaddata/{sample}.log"
	shell: """
				kneaddata --input {input.fq1} --remove-intermediate-output --input {input.fq2} -db {params.db} --output {params.folder} --trimmomatic {params.trim} --bowtie2 {params.bowtie2} -t {threads} > {log} 2> {log}
				"""

# align PE reads to joined reference
rule bwa_mem:
		input:
		#		reads=["kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_1.fastq", "kneaddata/{sample}/{sample}_R1_001_kneaddata_paired_2.fastq"]
				reads=["trimmed/pe/{sample}.1.fastq.gz", "trimmed/pe/{sample}.2.fastq.gz"]
		output:
				"mapped/{sample}.bam"
		log:
				"logs/bwa_mem/{sample}.log"
		params:
				index=REF,
				extra=r"-R '@RG\tID:{sample}\tSM:{sample}'",
				sort="none",						 # Can be 'none', 'samtools' or 'picard'.
				sort_order="queryname",  # Can be 'queryname' or 'coordinate'.
				sort_extra=""						 # Extra args for samtools/picard.
		threads: 10
		wrapper:
				"0.36.0/bio/bwa/mem"

# extract unmapped reads
rule extract_unmapped_reads:
	input:
		bam=["mapped/{sample}.bam"]
	output:
		unmapped=["unmapped/{sample}.bam"],
		sorted=["unmapped/{sample}_sorted.bam"]
	shell: """
					samtools view -b -f 4 {input.bam} > {output.unmapped} 
					samtools sort {output.unmapped} -o {output.sorted}
					"""

# convert the name-sorted BAM back to FASTQ to recover the unmapped PE reads
rule bam_to_fastq:
	input:
		bam=["unmapped/{sample}_sorted.bam"]
	output:
		fq1=["unmapped/fastq/{sample}_1.fastq"],
		fq2=["unmapped/fastq/{sample}_2.fastq"]
	shell: """
					bedtools bamtofastq -i {input.bam} -fq {output.fq1} -fq2 {output.fq2}
					"""

# assign taxa to the reads that did not map to the joined reference
rule kraken2:
	input:
		reads1=["unmapped/fastq/{sample}_1.fastq"],
		reads2=["unmapped/fastq/{sample}_2.fastq"]
	output:
		krak=["kraken/{sample}/{sample}.krak"],
		report=["kraken/{sample}/{sample}.report"]
	params:
		db=KRAKEN_DB
	threads: 8
	shell: """
				kraken2 --paired --use-names --db {params.db} --threads 20 --output {output.krak} --report {output.report} {input.reads1} {input.reads2}
					"""

# generate html kraken report
rule kraken2Sankey:
		input:
				report=["kraken/{sample}/{sample}.report"]
		output:
				html=["kraken/{sample}/{sample}.html"]
		shell: """
					 Rscript scripts/krakenSankey.R {input.report}
					 """

rule samtools_flagstat:
	input:
		"mapped/{sample}.bam"
	output:
		"mapped/{sample}.bam.flagstat"
	wrapper:
		"0.36.0/bio/samtools/flagstat"

rule samtools_sort:
		input:
				"mapped/{sample}.bam"
		output:
				"mapped/{sample}.sorted.bam"
		threads:	# Samtools takes additional threads through its option -@
				25		 # This value - 1 will be sent to -@.
		wrapper:
				"0.36.0/bio/samtools/sort"

rule mark_duplicates:
		input:
				bam="mapped/{sample}.sorted.bam"
		output:
				bam="dedup/{sample}.bam"
		params:
				jar=PICARD_PATH
		log:
				"logs/picard/dedup/{sample}.log"
		shell: "java -Xmx10G -jar {params.jar} MarkDuplicates INPUT={input.bam} OUTPUT={output.bam} CREATE_INDEX=true VALIDATION_STRINGENCY=LENIENT QUIET=true AS=true METRICS_FILE=metrics VERBOSITY=WARNING TMP_DIR=/scratch/pmuench"

rule samtools_index:
		input:
				"dedup/{sample}.bam"
		output:
				"dedup/{sample}.bam.bai"
		params:
				"" # optional params string
		wrapper:
				"0.36.0/bio/samtools/index"

rule samtools_index_raw:
		input:
				"mapped/{sample}.sorted.bam"
		output:
				"mapped/{sample}.sorted.bam.bai"
		params:
				"" # optional params string
		wrapper:
				"0.36.0/bio/samtools/index"

# calculate coverage on deduplicated bam files
rule samtools_depth_dedup:
		input: 
				 bam="dedup/{sample}.bam"
		output:
				 stat="coverage/{sample}/{sample}_dedup.coverage.txt.gz"
		shell: "samtools depth {input.bam} | bgzip > {output.stat}"

# calculate coverage on non-deduplicated (sorted) bam files
rule samtools_depth:
		input: 
				 bam="mapped/{sample}.sorted.bam"
		output:
				 stat="coverage/{sample}/{sample}.coverage.txt.gz"
		shell: "samtools depth {input.bam} | bgzip > {output.stat}"

# plot depth on dedup coverage information
#rule depth_plot:
#		input:
#				 stat="dedup/{sample}.coverage.txt.gz"
#		output:
#				 pdf="coverage_statistics/{sample}.coverage.txt.gz.pdf"
#		script:  "scripts/coverage.R"

rule lofreq:
		input:
				bam="mapped/{sample}.sorted.bam",
				bai="mapped/{sample}.sorted.bam.bai",
				ref=REF
		output:
				vcf="lofreq_call/{sample}.vcf"
		log:
				"logs/lofreq_call/{sample}.log"
		threads: 20
		shell: "lofreq call-parallel --pp-threads {threads} -f {input.ref} {input.bam} -o {output.vcf} > {log} 2> {log}"

rule bgzip:
		input:
				vcf="lofreq_call/{sample}.vcf"
		output:
				vcfgz="lofreq_call/{sample}.vcf.gz"
		log:
				"logs/bgzip/{sample}.log"
		shell:
				"bgzip -c {input.vcf} > {output.vcfgz} 2> {log}"

rule index_vcf:
		input:
				 vcfgz="lofreq_call/{sample}.vcf.gz"
		output:
				 vcfsort="lofreq_call/{sample}.vcf.gz.tbi"
		log:
				 "logs/sort_vcf/{sample}.log"
		shell:
				 "tabix -f -p vcf {input.vcfgz}"

rule get_chr_list:
		input:
				 vcf="lofreq_call/{sample}.vcf"
		output:
				 chr="lofreq_call/{sample}.txt"
		shell: """cat {input.vcf} | mawk '$1 ~ /^#/ {{next}} {{print $1 | "sort -k1,1 -u"}}' > {output.chr}"""

"""
Sorts a VCF file, compresses it with bgzip, and indexes it with tabix.
"""
rule sort_vcf:
		 input:
				 vcf="lofreq_call/{sample}.vcf"
		 output:
				 sorted="lofreq_call/{sample}_sorted.vcf.gz"
		 shell: """
						cat {input.vcf} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted}
						tabix -p vcf {output.sorted}
						"""

"""
Extracts a single genome's records from the sorted, bgzipped VCF and re-indexes the result.
"""
rule split_vcf:
		 input:
				 vcf="lofreq_call/{sample}_sorted.vcf.gz"
		 output:
				 single="lofreq_splitted/{sample}/{genome}.vcf",
				 sorted="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz",
				 index="lofreq_splitted/{sample}/{genome}.vcf.tbi"
		 shell: """
						tabix -h {input.vcf} {wildcards.genome} > {output.single}
						cat {output.single} | mawk '$1 ~ /^#/ {{print $0;next}} {{print $0 | "sort -k1,1 -k2,2n"}}' | bgzip -c > {output.sorted}	 
						tabix -f -p vcf {output.sorted}
						cp {output.sorted}.tbi {output.index}  
						"""

rule split_bam:
			input:
					bam="dedup/{sample}.bam",
					bai="dedup/{sample}.bam.bai"
			output:
					single="dedup_splitted/{sample}/{genome}.bam"
			shell: """
						 # subset to a single reference contig (needs the index declared above)
						 samtools view -bh {input.bam} {wildcards.genome} > {output.single}
						 samtools index {output.single}
						 """

rule annotate_vcf:
			input:
					 vcf="lofreq_splitted/{sample}/{genome}_sorted.vcf.gz",
					 bed="reference_genomes/{genome}.bed.gz"
			output:
					 vcf2="lofreq_annotated/{sample}/{genome}.vcf"
			shell: """
						 bcftools annotate -a {input.bed} -c CHROM,FROM,TO,GENE -h <(echo '##INFO=<ID=GENE,Number=1,Type=String,Description="Gene name">') {input.vcf} > {output.vcf2}
						 """

rule snpEff:
			input:
					 vcf="lofreq_splitted/{sample}/{genome}.vcf"
			params:
					 ref="{genome}"
			output:
					 vcf="snpEff/{sample}/{genome}.vcf",
					 bed="snpEff/{sample}/{genome}.bed",
					 csv="snpEff/{sample}/{genome}.csv",
					 html="snpEff/{sample}/{genome}.html"
			log: "logs/snpEff/{sample}/{genome}.log"
			shell: """
						 java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -o bed > {output.bed} 2> {log}
						 java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -csvStats {output.csv} > {output.vcf} 2>> {log}
						 java -Xmx4g -jar ../tools/snpEff/snpEff.jar -no-downstream -no-upstream {params.ref} {input.vcf} -stats {output.html} > {output.vcf} 2>> {log}
						 """

rule igv_report:
		input:
				fasta="reference_genomes/{genome}.fasta",
				vcf="snpEff/{sample}/{genome}.vcf",
				bam="dedup_splitted/{sample}/{genome}.bam",
				bed="reference_genomes/{genome}.bed.gz"
		output:
				path="igv_report/{sample}/{genome}.html"
		shell: "create_report {input.vcf} {input.fasta} --info-columns ANN --tracks {input.bam} {input.bed} --output {output.path}"