chiaravanni
2/25/2016 - 10:51 AM

data_protein.faa_files.R

##download RefSeq data for all complete bacterial genomes##
wget ftp://ftp.ncbi.nlm.nih.gov/genomes/refseq/bacteria/assembly_summary.txt

#List the FTP path (column 20) for the assemblies of interest (here for all the assembly levels)
awk -F "\t" '$12=="Complete Genome" && $11=="latest"{print $20}' assembly_summary.txt > ftpdirpaths-complete
awk -F "\t" '$12=="Scaffold" && $11=="latest"{print $20}' assembly_summary.txt > ftpdirpaths-scaffold
awk -F "\t" '$12=="Contig" && $11=="latest"{print $20}' assembly_summary.txt > ftpdirpaths-contig
awk -F "\t" '$12=="Chromosome" && $11=="latest"{print $20}' assembly_summary.txt > ftpdirpaths-chromosome

#Append the filename of interest, in this case "*protein.faa.gz" to the FTP directory names. 
awk 'BEGIN{FS=OFS="/";filesuffix="protein.faa.gz"}{ftpdir=$0;asm=$6;file=asm"_"filesuffix;print ftpdir,file}' ftpdirpaths-complete > ftpfilepaths-complete
#same command for the other levels (Contig, Scaffold and Chromosome)....
#then download the data file for each FTP path in the list
aria2c -i ../ftpfilepaths-complete -j 16 -x 2 -l ../complete.log -m 10 --retry-wait 1 
#and repet the same command for the other assembly levels
##after downloading the "protein.faa.gz" files as shown in 'download_faa.gz_files' 
##-> extract information about how many proteins and hypothetical proteins are there for each taxa

library(parallel)
library(tidyr)
library(dplyr)
library(data.table)
library("Biostrings")

#Function to count how many hypothetical proteins there are in each .faa.gz file 
Dataset <- function(folder_path) {
  read_faa <- function(file){
    faa_file <- fasta.index(file)
    hypotetical <- faa_file %>%  select(desc) %>% filter(grepl('hypothetical', desc)) %>% summarise(n_hypotetical = sum(n()))
    proteins <- faa_file %>%  select(desc) %>% filter(!grepl('hypothetical', desc)) %>% summarise(n_protein = sum(n()))
    total <- faa_file %>% select(desc) %>% summarise(total = sum(n()))
    df <- cbind(proteins, hypotetical, total)
    return(df)
  }
  files <- list.files(folder_path, pattern="*.faa.gz", full.names=TRUE)
  d <-lapply(files, read_faa)
  dataf <- rbindlist(d, fill = T)
  return(dataf)
}

#Show how many proteins, hypothetical..for each taxa, at each level of assembly
assembly_summary <- read.csv("assembly_summary.txt",
                             sep= "\t",
                             head=F,
                             dec=".")
taxid_ch <- assembly_summary %>% filter(V12 == "Chromosome") %>% select(V6) 
colnames(taxid_ch) <- "taxid"
taxid_compl <- assembly_summary %>% filter(V12 == "Complete Genome") %>% select(V6) 
colnames(taxid_compl) <- "taxid"
taxid_sc <- assembly_summary %>% filter(V12 == "Scaffold") %>% select(V6) 
colnames(taxid_sc) <- "taxid"
taxid_con <- assembly_summary %>% filter(V12 == "Contig") %>% select(V6) 
colnames(taxid_con) <- "taxid"

data_chromosome <- Dataset("chromosome/")
data_complete <- Dataset("complete/")
data_scaffold <- Dataset("scaffold/")
data_contig <- Dataset("contig/")

Summary_ch <- cbind(taxid_ch, data_chromosome)
Summary_compl <- cbind(taxid_compl, data_complete)
Summary_scaf <- cbind(taxid_sc, data_scaffold)
Summary_cont <- cbind(taxid_con, data_contig)