OSD2014 viral analyses
# Using the workable merged sequences that will be used for the paper I did:
# Transform to FASTA
${BBMAP}/reformat.sh in="${FASTQ}"/"${NAME}"_ME_shotgun_workable_merged.fastq.gz out="${RESULTS}"/"${NAME}".ME.fasta overwrite=true threads="${NSLOTS}"
${BBMAP}/reformat.sh in="${FASTQ}"/"${NAME}"_SE_shotgun_workable_merged.fastq.gz out="${RESULTS}"/"${NAME}".SE.fasta overwrite=true threads="${NSLOTS}"
# Concatenate merged and non-merged sequences
cat "${RESULTS}"/"${NAME}".ME.fasta "${RESULTS}"/"${NAME}".SE.fasta > "${RESULTS}"/"${NAME}".fasta
# Remove each individual FASTA file
rm "${RESULTS}"/"${NAME}".ME.fasta "${RESULTS}"/"${NAME}".SE.fasta
# Fragment recruitment using FR-HIT
# I used as minimal alignment coverage control for the read (-m) the 40% of the average read length of the sample for the local mode of the search. And 80% as minimum identity
L=$(seqstat "${RESULTS}"/"${NAME}".fasta |awk '$0 ~ /Average/{printf("%.0f\n", $3*0.4)}')
"${FRHIT}"/fr-hit -a "${RESULTS}"/"${NAME}".fasta -d "${GENOME}" -f 1 -r 0 -T "${NSLOTS}" -c 80 -m "${L}" -p 9 -o "${RESULTS}"/"${NAME}"_loc.psl
# Then I transformed from PSL to BED using http://hgdownload.cse.ucsc.edu/admin/exe/linux.x86_64/pslToBed
pslToBed "${RESULTS}"/"${NAME}"_loc.psl "${RESULTS}"/"${NAME}"_loc.bed
# And from BED to BAM using bedtools
bedtools bedtobam -bed12 -i "${RESULTS}"/"${NAME}"_loc.bed -g "${INPUT}"/VirRefSeq_and_Virsorted_Tara_10kb_lengths.txt > "${RESULTS}"/"${NAME}"_loc.bam
# Transformed your SAM files to BAM, and sorted
for i in *sam; do samtools view -bT VirRefSeq_and_Virsorted_Tara_10kb.fasta "${i}" | samtools sort - -o "${i/sam/bam}"; done
# And for BOWTIE and FR-HIT BAM files I extracted the genomes with a breadth of coverage >= 50%
for i in *bam; do genomeCoverageBed -ibam "${i}" | awk -vC=50 -f get_mean_coverage_from_bam.awk > "${i/.bam/_coverage.txt}"; done
# Create files for R
for i in *breadth_of_coverage_50.txt; do SMP=$(basename "${i}" _Rx_shotgun_workable_breadth_of_coverage_50.txt); awk -vL="${SMP}" '{print L"\t"$0}' "${i}"; done > OSD2014_bowtie_bc50.txt
for i in *breadth_of_coverage_50.txt; do SMP=$(basename "${i}" _breadth_of_coverage_50.txt); awk -vL="${SMP}" '{print L"\t"$0}' "${i}"; done > OSD2014_fr-hit_bc50.txt
# Columns in genomeCoverageBed
# 1. Chromosome/Contig/Genome
# 2. Coverage
# 3. bp with coverage in 2
# 4. Chromosome/Contig/Genome bp
# 5. Percentage of Chromosome/Contig/Genome with coverage in 2
BEGIN{
FS="\t"
}
{
if ($0 i ~ /^genome/){
next;
}else{
# Get mean coverage
c[$1]=c[$1] + ($2*$5)
if ($2 > 0){
# Get fraction with coverage > 0
d[$1] = d[$1] + $5
# Get bp covered
b[$1] = b[$1] + $3
g[$1] = $4
}
}
}END{
for (var in c){
if (d[var]*100 >= C){
# outputs: Chromosome/Contig/Genome name, Chromosome/Contig/Genome bp, bp with coverage > 0, breadth of coverage, mean coverage
print var"\t"g[var],"\t",b[var]"\t"d[var]*100"\t"c[var]
}
}
}