##after downloading the "protein.faa.gz" files as shown in 'download_faa.gz_files' 
##-> extract information about how many proteins and hypothetical proteins are there for each taxa
library(parallel)
library(tidyr)
library(dplyr)
library(data.table)
library("Biostrings")
#Function to count how many hypothetical proteins there are in each .faa.gz file 
Dataset <- function(folder_path) {
  read_faa <- function(file){
    faa_file <- fasta.index(file)
    hypotetical <- faa_file %>%  select(desc) %>% filter(grepl('hypothetical', desc)) %>% summarise(n_hypotetical = sum(n()))
    proteins <- faa_file %>%  select(desc) %>% filter(!grepl('hypothetical', desc)) %>% summarise(n_protein = sum(n()))
    total <- faa_file %>% select(desc) %>% summarise(total = sum(n()))
    df <- cbind(proteins, hypotetical, total)
    return(df)
  }
  files <- list.files(folder_path, pattern="*.faa.gz", full.names=TRUE)
  d <-lapply(files, read_faa)
  dataf <- rbindlist(d, fill = T)
  return(dataf)
}
#Show how many proteins, hypothetical..for each taxa, at each level of assembly
assembly_summary <- read.csv("assembly_summary.txt",
                             sep= "\t",
                             head=F,
                             dec=".")
taxid_ch <- assembly_summary %>% filter(V12 == "Chromosome") %>% select(V6) 
colnames(taxid_ch) <- "taxid"
taxid_compl <- assembly_summary %>% filter(V12 == "Complete Genome") %>% select(V6) 
colnames(taxid_compl) <- "taxid"
taxid_sc <- assembly_summary %>% filter(V12 == "Scaffold") %>% select(V6) 
colnames(taxid_sc) <- "taxid"
taxid_con <- assembly_summary %>% filter(V12 == "Contig") %>% select(V6) 
colnames(taxid_con) <- "taxid"
data_chromosome <- Dataset("chromosome/")
data_complete <- Dataset("complete/")
data_scaffold <- Dataset("scaffold/")
data_contig <- Dataset("contig/")
Summary_ch <- cbind(taxid_ch, data_chromosome)
Summary_compl <- cbind(taxid_compl, data_complete)
Summary_scaf <- cbind(taxid_sc, data_scaffold)
Summary_cont <- cbind(taxid_con, data_contig)