BGC analysis
#Read a hist file from bedtools coverage -hist and create a coverage per orf file.
#Coverage per each gene is calculated as: coverage_orfA = sum(depth_of_coverage * fraction_covered)
BEGIN{
FS="\t"
}
{
if ($0 i ~ /^all/){
next;
}else{
c[$1"#"$4"#"$8]=c[$1"#"$4"#"$8] + ($7*$9)
}
}END{
for (var in c){
print var"\t"c[var]
}
}
#!/bin/bash
SAMP=$(echo $1 | tr -d '\')
bedtools coverage -hist -sorted -b <(zcat ../../TARA/"${SAMP}".sorted.markdup.sorted.bed.gz) -a "${SAMP}"_bgc-fam.sorted.txt > "${SAMP}".hist
# TARA samples
# File provided by Marley: BGC_combined_TARA.txt (11392 rows/11391 unique)
# There is one duplicate
# TARA_146_SRF_0.22-346_1.cluster019 scaffold117228_1 1 9264 phosphonate
# There are 11375 unique contigs
# There are 11388 combinations contig - BGC-family
# Three duplicates:
# TARA_025_DCM_<-0.22_scaffold52786_1 terpene
# TARA_025_DCM_<-0.22_scaffold52786_1 0 20972 terpene
# TARA_025_DCM_<-0.22_scaffold52786_1 0 20990 terpene
# TARA_037_MES_0.22-1.6_scaffold83544_1 bacteriocin
# TARA_037_MES_0.22-1.6_scaffold83544_1 0 11866 bacteriocin
# TARA_037_MES_0.22-1.6_scaffold83544_1 0 12449 bacteriocin
# TARA_148b_MES_0.22-3_scaffold55397_1 terpene
# TARA_148b_MES_0.22-3_scaffold55397_1 0 20840 terpene
# TARA_148b_MES_0.22-3_scaffold55397_1 0 32089 terpene
# We need to fix some names first
paste <(grep -o -f <(cut -d ' ' -f1 /bioinf/projects/megx/TARA/assem-file-name2url.txt) <(sed -e 's/__/_</g' BGC_combined_TARA.txt)) <(cat BGC_combined_TARA.txt) >BGC_combined_TARA-fixed.txt
# Extract each sample and create a BED file for each sample (0-index)
# Example for TARA_004_DCM_0.22
# TARA_004_DCM_0.22_scaffold1643_1 0 1322 otherks
# TARA_004_DCM_0.22_scaffold2303_1 0 2023 terpene
# TARA_004_DCM_0.22_scaffold42075_1 0 8151 terpene
# TARA_004_DCM_0.22_scaffold64179_1 0 2415 arylpolyene
awk '{print $1"_"$3"\t"$4-1"\t"$5"\t"$2"\t"$6 > "TARA-BGC/"$1"_bgc-fam.txt"}' BGC_combined_TARA-fixed.txt
# Sort all BED files
cut -f1 ../BGC_combined_TARA-fixed.txt | sort -u | while read LINE; do sort -k1,1 -k2,2n "${LINE}"_bgc-fam.txt > "${LINE}"_bgc-fam.sorted.txt; done
# Use the metagenomic mappings already exising to get the coverage between the BGC coordinates
cut -f1 ../BGC_combined_TARA-fixed.txt | sort -u | ~/opt/parallel/bin/parallel -j24 'bash ../get_TARA_coverage.sh "{}"'
# Two failed so we have to repeat
bedtools coverage -hist -sorted -b ../../TARA/TARA_094_SRF_0.22-3.sorted.markdup.sorted.bed.gz -a <(bedtools sort -i TARA_094_SRF_0.22-3_bgc-fam.sorted.txt) > TARA_094_SRF_0.22-3.hist
bedtools coverage -hist -sorted -b ../../TARA/TARA_124_MIX_0.45-0.8.sorted.markdup.sorted.bed.gz -a <(bedtools sort -i TARA_124_MIX_0.45-0.8_bgc-fam.sorted.txt) > TARA_124_MIX_0.45-0.8.hist
# Aggregate the coverage
cat *hist | awk -f ../get_orf_coverage_from_bam.awk | tr '#' $'\t' | sort -k1,1V > ../TARA-BGC.fam.abun.txt
# OSD samples
# File provided by Marley: BGC_combined_OSD.txt (878 rows/875 unique)
# Removed:
# results c00060_OSD153__c1 contig-3383 1 1 1946 arylpolyene
# results c00132_OSD153__c2 contig-9283 1 1 1163 terpene
# results c00445_OSD153__c3 contig-31011 1 1 1061 terpene
# Are identical tp:
# OSD153_2014-06-21_0m_NPL022_spades c00060_OSD153__c1 contig-3383 1 1 1946 arylpolyene
# OSD153_2014-06-21_0m_NPL022_spades c00132_OSD153__c2 contig-9283 1 1 1163 terpene
# OSD153_2014-06-21_0m_NPL022_spades c00445_OSD153__c3 contig-31011 1 1 1061 terpene
# Removed the header
# Extract each sample and create a BED file for each sample (0-index)
# Example for OSD1_2014-06-21_0m_NPL022
# OSD1_2014-06-21_0m_NPL022_spades_contig-2551 0 1101 terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-49351 0 7546 terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-53809 0 21903 terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-67289 0 1940 terpene
awk '{print $1"_"$3"\t"$5-1"\t"$6"\t"$7 > "OSD2014-BGC/"$1"_bgc-fam.txt"}' BGC_combined_OSD.txt
# Sort all BED files
cut -f1 ../BGC_combined_OSD.txt | sort -u | ~/opt/parallel/bin/parallel -j4 'sort -k1,1 -k2,2n "{}"_bgc-fam.txt > "{}"_bgc-fam.sorted.txt'
# Use the metagenomic mappings already exising to get the coverage between the BGC coordinates
cut -f1 ../BGC_combined_OSD.txt | sort -u | sed -e 's/_bgc-fam.sorted.txt//g' | ~/opt/parallel/bin/parallel --progress -j 16 'bedtools coverage -hist -sorted -b ../../OSD/"{}"_contigs.gt500.sorted.markdup.sorted.bed -a "{}"_bgc-fam.sorted.txt > "{}".hist'
# Aggregate the coverage
cat *hist | awk -f ../get_orf_coverage_from_bam.awk | tr '#' $'\t' | sort -k1,1V > ../OSD2014-BGC.fam.abun.txt