Pipeline used to call SNP on BS-seq reads (Bismark bam output files)
# Tools used:
# Bismark_v0.19.0
# samtools 1.9
# bcftools 1.9
# htslib 1.9
# vcf-merge from vcftools-v0.1.14-6
# Alignment with Bismark (reference genome fasta file needs to be prepared first
# with bismark_genome_preparation)
bismark --bowtie2 -N 1 --output_dir $output_dir $fasta_reference $fastq_file
# Sort bam files (bam generated by bismark aligner)
for i in *bam; do
samtools sort $i -o $i.sorted.bam
done
# Perform SNP calling
# consensus-caller rather than multi-allelic algorithm
# as it seems to be less stringent and better call heterozygous
# position (empirical observation for one specific dataset)
for i in ../*sorted.bam; do
bcftools mpileup -f ../TAIR10/TAIR10.fasta $i | \
bcftools call --skip-variants indels --consensus-caller \
--variants-only -O z -o ${i}.vcf.gz - && \
awk -v FS="\t" -v OFS="\t" ' $4 != "G" && $5 != "A" {print}' <(bcftools view -H ${i}.vcf.gz) |\
awk -v FS="\t" -v OFS="\t" ' $4 != "C" && $5 != "T" {print}' - > ${i}.vcf.filtered.vcf && bcftools view -h ${i}.vcf.gz |\
cat - ${i}.vcf.filtered.vcf > ${i}.vcf.filtered.with_header.vcf
done
# This loop generates 3 files:
# .bam.vcf.gz, .bam.vcf.filtered.gz, .bam.vcf.filtered.with_header.vcf.gz
# The filtered files contain only SNPs which are not C>T and G>A (most resulting from bisulfite conversion)
# Note that the awk command will also discard lines with multiallelic ALT (e.g. $5 != "A" will remove "A,C"
# and $5 != "T" will remove "T,C")
# The filtered files with header allow further processing with bcftools/vcftools
# Compress and Tabix vcf files
for i in *filtered.with_header.vcf; do bgzip $i && tabix $i.gz; done
# Merge vcf files in one file and tabix file
vcf-merge *filtered.with_header.vcf.gz | bgzip -c > vcf_files_merged.vcf.gz && tabix vcf_files_merged.vcf.gz
# Check that all samples are in there
bcftools query -l vcf_files_merged.vcf.gz