genomewalker
6/30/2017 - 11:02 AM

ergm_clusters.r

reg_edges <- ergm(net ~ edges)
    
    if (!is.infinite(reg_edges$coef)){
      factors <- c(1,2,4,5)
      steps <- function(i, term) {
        reg <- ergm(as.formula(paste("net ~ edges +",term, sep="")))
        # The following is the point on which I have doubts
        # Basically when the model returns: 
            # "Warning messages:
            # 1: In ergm.mple(Clist, Clist.miss, m, MPLEtype = MPLEtype, init = init,  :
            # glm.fit: fitted probabilities numerically 0 or 1 occurred"
        # it returns "warning" next to the tested factor, that will be discarded from the following steps, 
        # because I've noticed that the resulting estimates are really weird when this happen
        # a problem is that to test it it re-run the models, but I haven't found a better solution
        x <- tryCatch(
          ergm(as.formula(paste("net ~ edges +",term, sep=""))),
          warning=function(w) print("warning"))
        if(any(x=="warning")){
           x
        }else{
           x="ok"
        }
        aic <- reg$glm$aic
        df <- data.frame(aic=aic, factors=i, W=x)
      }
      lm1 <- lapply(factors, steps,
                    term='absdiff(paste("f",i,sep=""))')
      lmdf <- plyr::ldply(lm1,data.frame) 
      min_aic1 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))
      if(dim(min_aic1)[1]==0){
        min_aic1 <- lmdf %>% filter(aic == min(aic))
      }
      factors <- factors[! factors %in% min_aic1$factors ]
      lm2 <- lapply(factors, steps,
                    term='absdiff(paste("f",min_aic1$factors,sep="")) + absdiff(paste("f",i,sep=""))')
      lmdf <- plyr::ldply(lm2,data.frame)
      if(all(lmdf$W=="warning")){
        reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factors,sep="")))
      }else{
        min_aic2 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))
        
        if(min_aic2$aic > min_aic1$aic){
          #return model
          reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factors,sep="")))
        }else{
          factors <- factors[! factors %in% min_aic2$factors ]
          lm3 <- lapply(factors, steps,
                        term='absdiff(paste("f",min_aic1$factors,sep="")) + absdiff(paste("f",min_aic2$factor,sep="")) + absdiff(paste("f",i,sep=""))')
          lmdf <- plyr::ldply(lm3,data.frame)
          if(all(lmdf$W=="warning")){
            reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factors,sep="")) + 
                          absdiff(paste("f",min_aic2$factors,sep="")))
          }else{
            min_aic3 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))
            if(min_aic3$aic > min_aic2$aic){
              #return model
              reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factors,sep="")) + 
                            absdiff(paste("f",min_aic2$factors,sep="")))
            }else{
              factors <- factors[! factors %in% min_aic3$factors ]
              lm4 <- lapply(factors, steps,
                            term='absdiff(paste("f",min_aic1$factors,sep="")) + absdiff(paste("f",min_aic2$factors,sep="")) + absdiff(paste("f",min_aic3$factors,sep="")) + absdiff(paste("f",i,sep=""))')
              lmdf <- plyr::ldply(lm4,data.frame)
              min_aic4 <- lmdf  %>% filter(aic == min(aic))
              if(lmdf$W=="warning" || min_aic4$aic > min_aic3$aic){
                reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factors,sep="")) + 
                              absdiff(paste("f",min_aic2$factors,sep="")) + 
                              absdiff(paste("f",min_aic3$factors,sep="")))
              }else{
                reg <- ergm(net ~ edges + 
                              absdiff(paste("f",min_aic1$factors,sep="")) + 
                              absdiff(paste("f",min_aic2$factors,sep="")) + 
                              absdiff(paste("f",min_aic3$factors,sep="")) + 
                              absdiff(paste("f",min_aic4$factors,sep="")))
              }}}}}
      
      sim_ergm  <- data.frame(summary(reg)$coefs)
library(tidyverse)
library(igraph)
library(assortnet)
library(protr)
library(ergm)
library(intergraph)
library(network)
library(vegan)
library(data.table)
library(HDMD)

args=(commandArgs(TRUE))

print(args)

fasta_files <- as.list(list.files(path = args[1], pattern = "*.fasta",full.names = T))
aln_files <- as.list(list.files(path = args[1], pattern = "*.m8",full.names = T))

ergm_funct <- function(fasta,aln) {
  cat("Processing file...\n")
  clstr <- tools::file_path_sans_ext(basename(fasta))
  pc_seqs <- readFASTA(fasta)
  # Remove sequences with amino acids different than the 20 common types
  pc_seqs_good <- plyr::ldply(pc_seqs, protcheck) %>% filter(V1 == "TRUE")

  pc_seqs <- pc_seqs[pc_seqs_good$.id]


  sim_net <- read.table(aln, header = F)

  if (all(sim_net$V3==1)) {
    sim_ergm <- data_frame(Estimate = Inf, Std..Error = Inf,  MCMC..= Inf, p.value= Inf, Terms="any", Clstr=clstr, Density=NA)
  } else {
    sim_net <- sim_net %>% dplyr::filter(sim_net$V1 %in% pc_seqs_good$.id, sim_net$V2 %in% pc_seqs_good$.id) %>%
      tbl_df
    #Build graph (undirected, weighted and without parallel/self-loops)
    sim_net <- sim_net %>%
      dplyr::select(V1, V2, V3) %>%
      dplyr::rename(node1 = V1, node2 = V2, weight = V3) %>%
      graph_from_data_frame(directed = F) %>%
      igraph::simplify(edge.attr.comb = "max")


    # Remove isolates edges
     sim_net <- igraph::delete.vertices(sim_net, which(degree(sim_net)<1))
    # edge weights
    wg <- unique(sort(E(sim_net)$weight, decreasing = FALSE))
    # seqs length
    l <- plyr::ldply(pc_seqs, function(x){length(unlist(strsplit(x, "")))})
    #check for and remove sequence outlier of length -2 Median absolute deviation (MAD)
    med_ab_dev <- mad(l$V1, center = median(l$V1), constant = 1.4826, na.rm = FALSE,low = FALSE, high = FALSE)
    if(med_ab_dev>0){
      lower_lim <- median(l$V1) - (2*med_ab_dev)
      #filter from l
      l <- filter(l, V1 > lower_lim)
      #filter from pc_seqs
      pc_seqs <- pc_seqs[l$.id]
      #filter from sim_net
      sim_net <- igraph::delete.vertices(sim_net, V(sim_net)[!V(sim_net)$name %in% l$.id])
    }

    #Filtering edges with low weight, because ergms don't work with density=1
    binary_search_filter_1 <- function(g, low, high) {
      x<-g
      if ( high < low ) {

        return(NULL)

      }else {
 mid <- floor((low + high) / 2)
        x <-igraph::delete.edges(g, which(E(g)$weight <= wg[mid]))
        cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", wg[mid], " components:", count_components(x), " edges:", ecount(x), "\n")

        if ((mid - low) == 0){
          x <-igraph::delete.edges(g, which(E(g)$weight <= wg[mid-1]))
          cat("Final: low:", low - 1, " mid:", mid - 1, " high:", high - 1, " mid_weight:", wg[mid - 1], " components:",count_components(x), " edges:", ecount(x), "\n")
          return(list(weight=mid - 1, graph=x))
          exit
        }

        if (((high - low) == 0)){
          x<-igraph::delete.edges(g, which(E(g)$weight <= wg[mid+1]))
          cat("Final: low:", low, " mid:", mid, " high:", high, " mid_weight:", wg[mid], " components:",count_components(x), " edges:", ecount(x), "\n")
          return(list(weight=mid , graph=x))
          exit
        }


        if (!(is.connected(x))){
          binary_search_filter_1(g, low, mid - 1)
        }else if (is.connected(x)){
          binary_search_filter_1(g, mid + 1, high)
        }
      }
 sim_net <- binary_search_filter_1(g = sim_net, low = 1, high = length(wg))$graph
    #plot(sim_net, vertex.label=NA)
    if(any(degree(sim_net)<2)) {
      if (any(E(sim_net)[from(V(sim_net)[which(degree(sim_net)<2)])]$weight < median(wg))){
        sim_net <- igraph::delete.edges(sim_net, E(sim_net)[which(E(sim_net)[from(V(sim_net)[which(degree(sim_net)<2)])]$weight < median(wg))])
        sim_net <- igraph::delete.vertices(sim_net, which(degree(sim_net)<1))
        #plot(sim_net, vertex.label=NA)
      }
    }
    #subsample the sequences
    pc_seqs <- pc_seqs[V(sim_net)$name]
    wg <- unique(sort(E(sim_net)$weight, decreasing = FALSE))
    sim_net <- binary_search_filter_1(g = sim_net, low = 1, high = length(wg))$graph

    #find the graph degree strength and in-between centrality
    #getting the properties from out network sim_net
    #degree <- igraph::degree(sim_net, v = V(sim_net))

    strength <- igraph::strength(sim_net, vids = V(sim_net)) #may want to normalize (between 0 and 1)

    #takes dissimilarity matrix as igraph betweenness calc assumes edge weights are costs, not strengths
    betweenness <- igraph::betweenness(sim_net, v = V(sim_net), normalized = TRUE, weights = (1-E(sim_net)$weight)+1)

    eigen_centrality <- eigen_centrality(sim_net, scale = TRUE)$vector

df <- data.frame(seq_id = V(sim_net)$name, strength = strength, betweenness = betweenness, eigen_centrality=eigen_centrality)

    #rep <- df %>% arrange(desc(betweenness), desc(strength))  %>% head(n=1)

    write.table(df, file=paste("/bioinf/projects/megx/UNKNOWNS/chiara/Clusters_data/2016_11/multi_PFAM/TEST_ERGM/stat/stat_",clstr,".tsv",sep=""), sep="\t",row.names = F, col.names = T)

    #seqs length as attribute
    l <- plyr::ldply(pc_seqs, function(x){length(unlist(strsplit(x, "")))})
    sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "length", index=V(sim_net), value = l$V1)
    #hist(V(sim_net)$length)

    ### retrieve AA descriptors for the nodes(proteins)
    #AA descriptors
    f1 <- unlist(lapply(HDMD::FactorTransform(pc_seqs, Factor = 1), sum))/l$V1
    f2 <- unlist(lapply(HDMD::FactorTransform(pc_seqs, Factor = 2), sum))/l$V1
    #f3 <- unlist(lapply(HDMD::FactorTransform(pc_seqs, Factor = 3), sum))/l$V1
    f4 <- unlist(lapply(HDMD::FactorTransform(pc_seqs, Factor = 4), sum))/l$V1
    f5 <- unlist(lapply(HDMD::FactorTransform(pc_seqs, Factor = 5), sum))/l$V1

    # as node attributes
    sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "f1", index=V(sim_net), value = f1)
    sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "f2", index=V(sim_net), value = f2)
    #sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "f3", index=V(sim_net), value = f3)
    sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "f4", index=V(sim_net), value = f4)
    sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = "f5", index=V(sim_net), value = f5)
      
    density <- graph.density(sim_net)

    net <- asNetwork(sim_net)
    # We test that the edges are different enough to model homophily, if not we will get Inf
    reg_edges <- ergm(net ~ edges)

    if (!is.infinite(reg_edges$coef)){
      factors <- c(1,2,4,5)
      steps <- function(i, term) {
        reg <- ergm(as.formula(paste("net ~ edges +",term, sep="")))
        x <- tryCatch(
          ergm(as.formula(paste("net ~ edges +",term, sep=""))),
          warning=function(w) print("warning"))
        if(any(x=="warning")){
          x
        }else{
          x="ok"
        }
        aic <- reg$glm$aic
        df <- data.frame(aic=aic, factors=i, W=x)
      }
      lm1 <- lapply(factors, steps,
                    term='absdiff(paste("f",i,sep=""))')
lmdf <- plyr::ldply(lm1,data.frame)
      min_aic1 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))
      if(dim(min_aic1)[1]==0){
        min_aic1 <- lmdf %>% filter(aic == min(aic))
      }
      factors <- factors[! factors %in% min_aic1$factor ]
      lm2 <- lapply(factors, steps,
                    term='absdiff(paste("f",min_aic1$factor,sep="")) + absdiff(paste("f",i,sep=""))')
      lmdf <- plyr::ldply(lm2,data.frame)
      if(all(lmdf$W=="warning")){
        reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factor,sep="")))
      }else{
        min_aic2 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))

        if(min_aic2$aic > min_aic1$aic){
          #return model
          reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factor,sep="")))
        }else{
          factors <- factors[! factors %in% min_aic2$factor ]
          lm3 <- lapply(factors, steps,
                        term='absdiff(paste("f",min_aic1$factor,sep="")) + absdiff(paste("f",min_aic2$factor,sep="")) + absdiff(paste("f",i,sep=""))')
          lmdf <- plyr::ldply(lm3,data.frame)
          if(all(lmdf$W=="warning")){
            reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factor,sep="")) +
                          absdiff(paste("f",min_aic2$factors,sep="")))
 }else{
            min_aic3 <- lmdf %>% filter(W=="ok") %>% filter(aic == min(aic))
            if(min_aic3$aic > min_aic2$aic){
              #return model
              reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factor,sep="")) +
                            absdiff(paste("f",min_aic2$factors,sep="")))
            }else{
              factors <- factors[! factors %in% min_aic3$factor ]
              lm4 <- lapply(factors, steps,
                            term='absdiff(paste("f",min_aic1$factor,sep="")) + absdiff(paste("f",min_aic2$factor,sep="")) + absdiff(paste("f",min_aic3$factor,sep="")) + absdiff(paste("f",i,sep=""$
              lmdf <- plyr::ldply(lm4,data.frame)
              min_aic4 <- lmdf  %>% filter(aic == min(aic))
              if(lmdf$W=="warning" || min_aic4$aic > min_aic3$aic){
                reg <- ergm(net ~ edges + absdiff(paste("f",min_aic1$factor,sep="")) +
                              absdiff(paste("f",min_aic2$factors,sep="")) +
                              absdiff(paste("f",min_aic3$factors,sep="")))
              }else{
                reg <- ergm(net ~ edges +
                              absdiff(paste("f",min_aic1$factor,sep="")) +
                              absdiff(paste("f",min_aic2$factors,sep="")) +
                              absdiff(paste("f",min_aic3$factors,sep="")) +
                              absdiff(paste("f",min_aic4$factors,sep="")))
              }}}}}

      sim_ergm  <- data.frame(summary(reg)$coefs)

 sim_ergm$Terms <- rownames(sim_ergm)
      sim_ergm$Clstr <- clstr
      sim_ergm$Density <- density
    } else {
      sim_ergm <- data_frame(Estimate = Inf, Std..Error = Inf,  MCMC..= Inf, p.value= Inf, Terms="any", Clstr=clstr, Density=density)
    }
  }
 return(sim_ergm)
}

library(parallel)
ergm_res <- mcmapply(ergm_funct, fasta=fasta_files, aln=aln_files, mc.cores = getOption("mc.cores", 4L))

#library(pbmcapply)
#proc.time()
#ergm_res <- pbmcmapply(ergm_funct, fasta=fasta_files, aln=aln_files, mc.cores = getOption("mc.cores", 4L))
#proc.time()

my.list <- as.data.frame(ergm_res)
results <- plyr::ldply(my.list, data.frame)
results <- results[,-1]
results$sign <- ifelse(results$p.value < 0.001, "sig", "no-sig")


write.table(results,file=paste(args[2],"/ergm_res.tsv",sep=""), sep = "\t", row.names = F, col.names = F)