genomewalker
3/3/2016 - 10:05 AM

BGC analysis

BGC analysis

#Read a hist file from bedtools coverage -hist and create a coverage per orf file.
#Coverage per each gene is calculated as: coverage_orfA = sum(depth_of_coverage * fraction_covered)

BEGIN{
FS="\t"
}
{
    if ($0 i ~ /^all/){
        next;
    }else{
        c[$1"#"$4"#"$8]=c[$1"#"$4"#"$8] + ($7*$9)
    }
}END{
    for (var in c){
        print var"\t"c[var]
    }
}
#!/bin/bash
SAMP=$(echo $1 | tr -d '\')
bedtools coverage -hist -sorted -b <(zcat ../../TARA/"${SAMP}".sorted.markdup.sorted.bed.gz) -a "${SAMP}"_bgc-fam.sorted.txt > "${SAMP}".hist
# TARA samples
# File provided by Marley: BGC_combined_TARA.txt (11392 rows/11391 unique)
# There is one duplicate
# TARA_146_SRF_0.22-346_1.cluster019      scaffold117228_1        1       9264    phosphonate
# There are 11375 unique contigs
# There are 11388 combinations contig - BGC-family
# Three duplicates:
# TARA_025_DCM_<-0.22_scaffold52786_1     terpene
#   TARA_025_DCM_<-0.22_scaffold52786_1    0       20972   terpene
#   TARA_025_DCM_<-0.22_scaffold52786_1    0       20990   terpene
# TARA_037_MES_0.22-1.6_scaffold83544_1   bacteriocin
#   TARA_037_MES_0.22-1.6_scaffold83544_1        0       11866   bacteriocin
#   TARA_037_MES_0.22-1.6_scaffold83544_1        0       12449   bacteriocin
# TARA_148b_MES_0.22-3_scaffold55397_1    terpene
#   TARA_148b_MES_0.22-3_scaffold55397_1  0       20840   terpene
#   TARA_148b_MES_0.22-3_scaffold55397_1  0       32089   terpene
# We need to fix some names first
paste <(grep -o -f <(cut -d ' ' -f1 /bioinf/projects/megx/TARA/assem-file-name2url.txt) <(sed -e 's/__/_</g' BGC_combined_TARA.txt)) <(cat BGC_combined_TARA.txt) >BGC_combined_TARA-fixed.txt
# Extract each sample and create a BED file for each sample (0-index)
# Example for TARA_004_DCM_0.22
# TARA_004_DCM_0.22_scaffold1643_1        0       1322    otherks
# TARA_004_DCM_0.22_scaffold2303_1        0       2023    terpene
# TARA_004_DCM_0.22_scaffold42075_1       0       8151    terpene
# TARA_004_DCM_0.22_scaffold64179_1       0       2415    arylpolyene
awk '{print $1"_"$3"\t"$4-1"\t"$5"\t"$2"\t"$6 > "TARA-BGC/"$1"_bgc-fam.txt"}' BGC_combined_TARA-fixed.txt
# Sort all BED files
cut -f1  ../BGC_combined_TARA-fixed.txt | sort -u | while read LINE; do sort -k1,1 -k2,2n "${LINE}"_bgc-fam.txt > "${LINE}"_bgc-fam.sorted.txt; done
# Use the metagenomic mappings already exising to get the coverage between the BGC coordinates
cut -f1  ../BGC_combined_TARA-fixed.txt | sort -u | ~/opt/parallel/bin/parallel -j24 'bash ../get_TARA_coverage.sh "{}"'
# Two failed so we have to repeat
bedtools coverage -hist -sorted -b ../../TARA/TARA_094_SRF_0.22-3.sorted.markdup.sorted.bed.gz -a <(bedtools sort -i TARA_094_SRF_0.22-3_bgc-fam.sorted.txt) > TARA_094_SRF_0.22-3.hist
bedtools coverage -hist -sorted -b ../../TARA/TARA_124_MIX_0.45-0.8.sorted.markdup.sorted.bed.gz -a <(bedtools sort -i TARA_124_MIX_0.45-0.8_bgc-fam.sorted.txt) > TARA_124_MIX_0.45-0.8.hist
# Aggregate the coverage
cat *hist | awk -f ../get_orf_coverage_from_bam.awk | tr '#' $'\t' | sort -k1,1V > ../TARA-BGC.fam.abun.txt
# OSD samples
# File provided by Marley: BGC_combined_OSD.txt (878 rows/875 unique)
# Removed:
# results	c00060_OSD153__c1	contig-3383	1	1	1946	arylpolyene
# results	c00132_OSD153__c2	contig-9283	1	1	1163	terpene
# results	c00445_OSD153__c3	contig-31011	1	1	1061	terpene
# Are identical tp:
# OSD153_2014-06-21_0m_NPL022_spades	c00060_OSD153__c1	contig-3383	1	1	1946	arylpolyene
# OSD153_2014-06-21_0m_NPL022_spades	c00132_OSD153__c2	contig-9283	1	1	1163	terpene
# OSD153_2014-06-21_0m_NPL022_spades	c00445_OSD153__c3	contig-31011	1	1	1061	terpene
# Removed the header
# Extract each sample and create a BED file for each sample (0-index)
# Example for OSD1_2014-06-21_0m_NPL022 
# OSD1_2014-06-21_0m_NPL022_spades_contig-2551    0       1101    terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-49351   0       7546    terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-53809   0       21903   terpene
# OSD1_2014-06-21_0m_NPL022_spades_contig-67289   0       1940    terpene
awk '{print $1"_"$3"\t"$5-1"\t"$6"\t"$7 > "OSD2014-BGC/"$1"_bgc-fam.txt"}' BGC_combined_OSD.txt
# Sort all BED files
cut -f1  ../BGC_combined_OSD.txt | sort -u | ~/opt/parallel/bin/parallel -j4 'sort -k1,1 -k2,2n "{}"_bgc-fam.txt > "{}"_bgc-fam.sorted.txt'
# Use the metagenomic mappings already exising to get the coverage between the BGC coordinates
cut -f1  ../BGC_combined_OSD.txt | sort -u | sed -e 's/_bgc-fam.sorted.txt//g' | ~/opt/parallel/bin/parallel --progress -j 16 'bedtools coverage -hist -sorted -b ../../OSD/"{}"_contigs.gt500.sorted.markdup.sorted.bed -a "{}"_bgc-fam.sorted.txt > "{}".hist'
# Aggregate the coverage
cat *hist | awk -f ../get_orf_coverage_from_bam.awk | tr '#' $'\t' | sort -k1,1V > ../OSD2014-BGC.fam.abun.txt