# Author: danielecook
# Date: 12/15/2014 - 8:30 PM
#
# Download the WormBase WS245 annotations, split them into one track per
# feature type, and prepend "CHROMOSOME_" to the start of every record.

# Download the WormBase WS245 C. elegans (PRJNA13758) annotations as a
# gzipped GFF3 file named c_elegans.WS245.annotations.gff3.gz.
#
# The transfer status is checked explicitly: if curl fails, any partial file is
# removed and the script stops, so later steps never run on a truncated archive.
# --fail makes curl return non-zero on server-side HTTP errors as well (it is
# harmless for the FTP URL used here).
if ! curl --fail --output c_elegans.WS245.annotations.gff3.gz \
    'ftp://ftp.wormbase.org/pub/wormbase/releases/WS245/species/c_elegans/PRJNA13758/c_elegans.PRJNA13758.WS245.annotations.gff3.gz'; then
    echo "error: could not download the WS245 annotation file" >&2
    rm -f c_elegans.WS245.annotations.gff3.gz
    exit 1
fi

# NOTE(review): the original note here read "Use gff parallelized tools:
# brew install dmd". Nothing in this script invokes such tools -- confirm
# whether that dependency is still wanted.
#
# Each feature type below is extracted into its own GFF file.
#
# The list was produced by running:
#   gunzip -kfc c_elegans.WS245.annotations.gff3.gz | cut -f 3 | sort | uniq
#
# `types` holds one feature type per line (newline-separated, no trailing
# newline); consumers iterate over it with ordinary word splitting.
types=$(cat <<'EOF'
CDS
DNAseI_hypersensitive_site
G_quartet
PCR_product
RNAi_reagent
SAGE_tag
SL1_acceptor_site
SL2_acceptor_site
SNP
TF_binding_site
TSS_region
antisense_RNA
assembly_component
base_call_error_correction
binding_site
biological_region
complex_substitution
conserved_region
deletion
duplication
enhancer
exon
experimental_result_region
expressed_sequence_match
five_prime_UTR
five_prime_open_reading_frame
gene
histone_binding_site
insertion_site
intron
inverted_repeat
lincRNA
low_complexity_region
mRNA
mRNA_region
miRNA
ncRNA
nc_primary_transcript
nucleotide_match
operon
piRNA
point_mutation
polyA_signal_sequence
polyA_site
possible_base_call_error
pre_miRNA
promoter
protein_coding_primary_transcript
protein_match
pseudogenic_transcript
rRNA
reagent
regulatory_region
repeat_region
scRNA
sequence_alteration
sequence_motif
snRNA
snoRNA
substitution
tRNA
tandem_repeat
three_prime_UTR
transcribed_fragment
transcript_region
transcription_end_site
translated_nucleotide_match
transposable_element
transposable_element_insertion_site
EOF
)

# extract_track TYPE
#
# Reads GFF3 on stdin and writes to stdout only the records whose feature type
# (tab-separated column 3) is exactly TYPE, with "CHROMOSOME_" prepended to the
# start of each emitted line (i.e. to the sequence name in column 1).
#
# This replaces `grep "\t$i\t"`: `\t` is not a tab in POSIX basic regular
# expressions, so whether that pattern matched anything depended on the grep
# implementation, and it was not restricted to the type column. Comment and
# directive lines have no third column and are therefore never emitted.
extract_track() {
    # `wanted ""` forces a string (not numeric) comparison.
    awk -F '\t' -v wanted="$1" '$3 == wanted "" { print "CHROMOSOME_" $0 }'
}

# Literal tab, used as the sort field separator so that columns are split the
# same way GFF defines them (tabs only, never spaces).
tab=$(printf '\t')

# Build one sorted, bgzip-compressed, tabix-indexed track per feature type.
# $types is intentionally unquoted: it is a whitespace-separated list.
for i in $types; do
    # Stream straight from the archive into the compressed track: no
    # intermediate "$i.gff" is written, so a stale file from an earlier run can
    # no longer be appended to. Every emitted record starts with "CHROMOSOME_",
    # so there are no "#" header lines to carry over ahead of the sorted body.
    # Records are ordered by sequence name, then numerically by start position.
    gunzip -c c_elegans.WS245.annotations.gff3.gz \
        | extract_track "$i" \
        | sort -t "$tab" -k1,1 -k4,4n \
        | bgzip > "$i.sorted.gff.gz"
    # Index the track using tabix's GFF column preset.
    tabix -p gff "$i.sorted.gff.gz"
done

# Clean up: delete the downloaded annotation archive, then drop the tracks
# that are unlikely to be used. The trailing glob on each track name also
# matches any companion files sharing that prefix.
rm c_elegans.WS245.annotations.gff3.gz
for unused in expressed_sequence_match intron protein_match; do
    rm "$unused".sorted.gff.gz*
done