##after downloading the "protein.faa.gz" files as shown in 'download_faa.gz_files'
##-> extract information about how many proteins and hypothetical proteins are there for each taxa
library(parallel)
library(tidyr)
library(dplyr)
library(data.table)
library("Biostrings")
#Function to count how many hypothetical proteins there are in each .faa.gz file
Dataset <- function(folder_path) {
read_faa <- function(file){
faa_file <- fasta.index(file)
hypotetical <- faa_file %>% select(desc) %>% filter(grepl('hypothetical', desc)) %>% summarise(n_hypotetical = sum(n()))
proteins <- faa_file %>% select(desc) %>% filter(!grepl('hypothetical', desc)) %>% summarise(n_protein = sum(n()))
total <- faa_file %>% select(desc) %>% summarise(total = sum(n()))
df <- cbind(proteins, hypotetical, total)
return(df)
}
files <- list.files(folder_path, pattern="*.faa.gz", full.names=TRUE)
d <-lapply(files, read_faa)
dataf <- rbindlist(d, fill = T)
return(dataf)
}
#Show how many proteins, hypothetical..for each taxa, at each level of assembly
assembly_summary <- read.csv("assembly_summary.txt",
sep= "\t",
head=F,
dec=".")
taxid_ch <- assembly_summary %>% filter(V12 == "Chromosome") %>% select(V6)
colnames(taxid_ch) <- "taxid"
taxid_compl <- assembly_summary %>% filter(V12 == "Complete Genome") %>% select(V6)
colnames(taxid_compl) <- "taxid"
taxid_sc <- assembly_summary %>% filter(V12 == "Scaffold") %>% select(V6)
colnames(taxid_sc) <- "taxid"
taxid_con <- assembly_summary %>% filter(V12 == "Contig") %>% select(V6)
colnames(taxid_con) <- "taxid"
data_chromosome <- Dataset("chromosome/")
data_complete <- Dataset("complete/")
data_scaffold <- Dataset("scaffold/")
data_contig <- Dataset("contig/")
Summary_ch <- cbind(taxid_ch, data_chromosome)
Summary_compl <- cbind(taxid_compl, data_complete)
Summary_scaf <- cbind(taxid_sc, data_scaffold)
Summary_cont <- cbind(taxid_con, data_contig)