OSD2014 viral analyses

9/15/2016 - 6:13 PM

# Using the workable merged sequences that will be used for the paper I did:

# Transform to FASTA
${BBMAP}/reformat.sh in="${FASTQ}"/"${NAME}"_ME_shotgun_workable_merged.fastq.gz out="${RESULTS}"/"${NAME}".ME.fasta overwrite=true threads="${NSLOTS}"
${BBMAP}/reformat.sh in="${FASTQ}"/"${NAME}"_SE_shotgun_workable_merged.fastq.gz out="${RESULTS}"/"${NAME}".SE.fasta overwrite=true threads="${NSLOTS}"

# Concatenate merged and non-merged sequences
cat "${RESULTS}"/"${NAME}".ME.fasta "${RESULTS}"/"${NAME}".SE.fasta > "${RESULTS}"/"${NAME}".fasta

# Remove each individual FASTA file
rm "${RESULTS}"/"${NAME}".ME.fasta "${RESULTS}"/"${NAME}".SE.fasta

# Fragment recruitment using FR-HIT

# I used as minimal alignment coverage control for the read (-m) the 40% of the average read length of the sample for the local mode of the search. And 80% as minimum identity
L=$(seqstat "${RESULTS}"/"${NAME}".fasta  |awk '$0 ~ /Average/{printf("%.0f\n", $3*0.4)}')

"${FRHIT}"/fr-hit -a "${RESULTS}"/"${NAME}".fasta -d "${GENOME}" -f 1 -r 0 -T "${NSLOTS}" -c 80 -m "${L}" -p 9 -o "${RESULTS}"/"${NAME}"_loc.psl

# Then I transformed from PSL to BED using http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/pslToBed
pslToBed "${RESULTS}"/"${NAME}"_loc.psl "${RESULTS}"/"${NAME}"_loc.bed

# And from BED to BAM using bedtools
bedtools bedtobam -bed12 -i "${RESULTS}"/"${NAME}"_loc.bed  -g "${INPUT}"/VirRefSeq_and_Virsorted_Tara_10kb_lengths.txt > "${RESULTS}"/"${NAME}"_loc.bam

# Transformed your SAM files to BAM, and sorted
for i in *sam; do samtools view -bT VirRefSeq_and_Virsorted_Tara_10kb.fasta "${i}" | samtools sort - -o "${i/sam/bam}"; done

# And for BOWTIE and FR-HIT BAM files I extracted the genomes with a breadth of coverage >= 50%
for i in *bam; do genomeCoverageBed -ibam "${i}" | awk -vC=50 -f get_mean_coverage_from_bam.awk > "${i/.bam/_coverage.txt}"; done

# Create files for R
for i in *breadth_of_coverage_50.txt; do SMP=$(basename "${i}" _Rx_shotgun_workable_breadth_of_coverage_50.txt); awk -vL="${SMP}" '{print L"\t"$0}' "${i}"; done > OSD2014_bowtie_bc50.txt
for i in *breadth_of_coverage_50.txt; do SMP=$(basename "${i}" _breadth_of_coverage_50.txt); awk -vL="${SMP}" '{print L"\t"$0}' "${i}"; done > OSD2014_fr-hit_bc50.txt

get_mean_coverage_from_bam.awk

# Columns in genomeCoverageBed
# 1. Chromosome/Contig/Genome
# 2. Coverage
# 3. bp with coverage in 2
# 4. Chromosome/Contig/Genome bp
# 5. Percentage of Chromosome/Contig/Genome with coverage in 2

BEGIN{
    FS="\t"
}
{
    if ($0 i ~ /^genome/){
        next;
    }else{
        # Get mean coverage
        c[$1]=c[$1] + ($2*$5)
        if ($2 > 0){
            # Get fraction with coverage > 0
            d[$1] = d[$1] + $5
            # Get bp covered
            b[$1] = b[$1] + $3
            g[$1] = $4
        }
    }
}END{
    for (var in c){
        if (d[var]*100 >= C){
            # outputs: Chromosome/Contig/Genome name, Chromosome/Contig/Genome bp, bp with coverage > 0, breadth of coverage, mean coverage
            print var"\t"g[var],"\t",b[var]"\t"d[var]*100"\t"c[var]
        }
    }
}

Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

OSD2014 viral analyses