ERGM for protein clusters

3/31/2017 - 3:52 PM
library(tidyverse)
library(igraph)
library(assortnet)
library(protr)
library(ergm)
library(intergraph)
library(network)

setwd("~/Downloads/PC_meren/")

# Read files
# Protein clusters
pc <- read.table(file = "protein-clusters.txt", header = T) %>% tbl_df

# PC sequences
pc_seqs <- readFASTA("protein-clusters.fasta")

# Remove sequences with amino acids different than the 20 common types
pc_seqs_good <- plyr::ldply(pc_seqs, protcheck) %>% filter(V1 == "TRUE")

pc_seqs <- pc_seqs[pc_seqs_good$.id]

# Sequence-similarity network (blastp all-vs-all)
sim_net <- read.table(file = "protein-cluster.filt.m8", header = F) %>% 
  filter(V1 %in% pc_seqs_good$.id, V2 %in% pc_seqs_good$.id) %>%
  tbl_df

# Build graph (undirected, weighted and without parallel/self-loops)
sim_net <- sim_net %>% 
  dplyr::select(V1, V2, V3) %>%
  dplyr::rename(node1 = V1, node2 = V2, weight = V3) %>%
  graph_from_data_frame(directed = F) %>%
  simplify(edge.attr.comb = "max") 

# Remove isolates edges
sim_net <- igraph::delete.vertices(sim_net, which(degree(sim_net)<1))

# Check how the graph looks like

dd <- degree.distribution(sim_net, cumulative = T) %>%
  as.data.frame %>%
  mutate(x = 1:length(.)) %>% 
  magrittr::set_colnames(c("y", "x")) %>%
  tbl_df %>% 
  filter(y > 0) %>%
  ggplot(aes(x,y)) + 
  geom_point() + 
  scale_x_log10() + 
  scale_y_log10() + 
  xlab("Degree") + 
  ylab("Cumulative Frequency") +
  theme_bw()

dd

network_description <- function (X) {
  require(igraph)
  cat("NETWORK DESCRIPTION","\n")
  cat("----------------------------------------------","\n")
  #cat("Diameter:",diameter(X),"\n")
  #cat("Girth:",girth(X)$girth,"\n")
  #cat("Average betweenness:",sum(betweenness(X))/vcount(X),"\n")
  #cat("Average closeness:",sum(closeness(X))/vcount(X),"\n")
  #cat("Average path length:",average.path.length(simplify(X)),"\n")
  cat("Transitivity:",transitivity(X),"\n")
  cat("Total number of motifs size 3:",graph.motifs.no(X),"\n")
  cat("Density:",graph.density(X),"\n")
  cat("Average vextex neighbor degree:",sum(graph.knn(X)$knn)/vcount(X),"\n")
  cat("Unique links:",ecount(simplify(X)),"\n")
  cat("Link ratio:",ecount(X)/ecount(simplify(X)),"\n")
  cat("Number of clusters:",no.clusters(X),"\n")
  cat("----------------------------------------------","\n")
}

network_description(sim_net)

mcl_bin <- "mcl"

gx <- sim_net

E(gx)$weight <- E(gx)$weight - min(E(gx)$weight) + 0.001

g_mcl_list <- vector( mode = "list")

options <- paste("--abc -I", 2.5, sep=" ") 
I <- as.character(2.5)

g_mcl_list <- run_mcl(X = "g", graph = gx, options = options, mcl_bin = mcl_bin)
g_mcl_list <- g_mcl_list[match(igraph::get.vertex.attribute(gx, "name"), g_mcl_list$vertex),]
g_mcl_list <- make_clusters(gx, as.numeric(g_mcl_list$com), algorithm = 'MCL', modularity = TRUE)

com_mclX <- paste("com_mcl",I,sep="")
sim_net <- igraph::set.vertex.attribute(graph = sim_net, name = eval(com_mclX), index=V(sim_net),value = as.character(membership(g_mcl_list)) )

dist2origin <- function(x,y){
  m <- rbind(c(0,0), cbind(x, y))
  m <- as.matrix(dist(m, method = "euclidean"))[1,][-1]
}

binary_search_filter <- function(g, low, high) {
  x<-g
  if ( high < low ) {
    return(NULL)
  }else {
    mid <- floor((low + high) / 2)
    x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid])) 
    cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:", count_components(x), " edges:", ecount(x), "\n")
    if ((high - low) == 0){
      cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:",count_components(x), " edges:", ecount(x), "\n")
      x<-igraph::delete.edges(g, which(E(g)$weight < weights[mid+1])) 
      return(list(weight=mid+1, graph=x))
      exit
    }
    if (!(is.connected(x))){
      binary_search_filter(g, low, mid - 1)
    }else if ( is.connected(x) ){
      binary_search_filter(g, mid + 1, high)
    }
  }
}

binary_search_filter_1 <- function(g, low, high) {
  x<-g
  if ( high < low ) {
    
    return(NULL)
    
  }else {
    mid <- floor((low + high) / 2)
    x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid])) 
    cat("low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:", count_components(x), " edges:", ecount(x), "\n")
    
    if ((mid - low) == 0){
      x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid-1])) 
      cat("Final: low:", low - 1, " mid:", mid - 1, " high:", high - 1, " mid_weight:", weights[mid - 1], " components:",count_components(x), " edges:", ecount(x), "\n")
      return(list(weight=mid - 1, graph=x))
      exit
    }
    
    if (((high - low) == 0)){
      x<-igraph::delete.edges(g, which(E(g)$weight <= weights[mid+1])) 
      cat("Final: low:", low, " mid:", mid, " high:", high, " mid_weight:", weights[mid], " components:",count_components(x), " edges:", ecount(x), "\n")
      return(list(weight=mid , graph=x))
      exit
    }
    
    
    if (!(is.connected(x))){
      binary_search_filter_1(g, low, mid - 1)
    }else if (is.connected(x)){
      binary_search_filter_1(g, mid + 1, high)
    }
  }
}

zero_range <- function(x, tol = .Machine$double.eps ^ 0.5) {
  if (length(x) == 1) return(TRUE)
  x <- range(x) / mean(x)
  isTRUE(all.equal(x[1], x[2], tolerance = tol))
}


G <- sim_net
X <- as_data_frame(G , what="vertices")
clmethod <- 'com_mcl2.5'
G.comms <- tapply( X$name, as.numeric(X[,clmethod]), FUN=function(x) { induced.subgraph(G,vids = x ) }  )

com_stats <- vector(mode = "list")
com_ergm <- vector(mode = "list")

for ( i in 1:length(G.comms)) {
  com <- G.comms[[i]]
  if (vcount(com) >=10){
  com_seqs <- pc_seqs[as.character(V(com)$name)]
  com_seqs <- Filter(length, com_seqs)
  den <- graph.density(com)
  if (graph.density(com) == 1){
    weights <- unique(sort(E(com)$weight, decreasing = FALSE))
    com <- binary_search_filter_1(g = com, low = 1, high = length(weights))$graph
  } 
  
  # Protein length
  l <- plyr::ldply(com_seqs, function(x){length(unlist(strsplit(x, "")))})
  length_equal <- zero_range(l$V1)
  com <- igraph::set.vertex.attribute(graph = com, name = "length", index=V(com),value = l$V1) 
  #assortment.continuous(as_adjacency_matrix(com, sparse = F, attr = "weight" ), aac_pc1)
  
  # Acidic-basic ratio
  l <- plyr::ldply(com_seqs, extractAAC)
  row.names(l) <- l$.id
  l <- plyr::adply(l, 1, function(x){
    ab<-(x$D + x$E)/(x$H + x$R + x$K)
    ab <- ifelse(is.infinite(ab), 0, ab)
  })
  ab_ratio_equal <- zero_range(l$V1)
  com <- igraph::set.vertex.attribute(graph = com, name = "ab_ratio", index=V(com),value = l$V1)
  
  # Amino acid composition
  l <- plyr::ldply(com_seqs, extractAAC)
  row.names(l) <- l$.id
  l$.id <- NULL
  aac_equal <- unique(plyr::ldply(l, zero_range)$V1)
  if (aac_equal != TRUE || length(aac_equal) > 1){
    aac_pca <- rda(l, scale = T)
    aac_inertia <- round(cumsum(100*aac_pca$CA$eig/sum(aac_pca$CA$eig)),2)
    if (length(aac_inertia) > 1){
      aac_ed <- dist2origin(aac_pca$CA$u[,1], aac_pca$CA$u[,2])
      com <- igraph::set.vertex.attribute(graph = com, name = "aac_ed", index=V(com),value = aac_ed) 
    }else{
      com <- igraph::set.vertex.attribute(graph = com, name = "aac_ed", index=V(com),value = aac_pca$CA$u[,1])
    }
  } 
  
  # Descriptors: hydrophobicity, normalized van der Waals volume polarity, and polarizability
  l <- plyr::ldply(com_seqs, extractCTDC)
  row.names(l) <- l$.id
  l$.id <- NULL
  ctdc_equal <- unique(plyr::ldply(l, zero_range)$V1)
  if (ctdc_equal != TRUE || length(ctdc_equal) > 1){
    ctdc_pca <- rda(l, scale = T)
    ctdc_inertia <- round(cumsum(100*ctdc_pca$CA$eig/sum(ctdc_pca$CA$eig)),2)
    if (length(ctdc_inertia) > 1){
      ctdc_ed <- dist2origin(ctdc_pca$CA$u[,1], ctdc_pca$CA$u[,2])
      com <- igraph::set.vertex.attribute(graph = com, name = "ctdc_ed", index=V(com), value = ctdc_ed) 
    }else{
      com <- igraph::set.vertex.attribute(graph = com, name = "ctdc_ed", index=V(com), value = ctdc_pca$CA$u[,1]) 
    }
  }
  # Conjoint Triad Descriptors
  # l <- plyr::ldply(com_seqs, extractCTriad)
  # row.names(l) <- l$.id
  # l$.id <- NULL
  # triad_pca <- rda(l, scale = T)
  # triad_inertia <- round(cumsum(100*triad_pca$CA$eig/sum(triad_pca$CA$eig)),2)
  # triad_ed <- dist2origin(triad_pca$CA$u[,1], triad_pca$CA$u[,2])
  # com <- igraph::set.vertex.attribute(graph = com, name = "triad_ed", index=V(com), value = triad_ed) 
  # 
  
  if ((length(length_equal) == 1 & length_equal == TRUE) & 
      (length(aac_equal) == 1 & aac_equal == TRUE) & 
      (length(ctdc_equal) == 1 & ctdc_equal == TRUE)){
    com_ergm[[i]] <- data_frame(Estimate = "equal", Std..Error = "equal",  MCMC..= "equal", p.value= "equal", com = i)
  }else{
      net <- asNetwork(com)
      
      # We test that the edges are different enough to model homophily, if not we will get Inf
      reg_edges <- ergm(net ~ edges)
      
      if (!is.infinite(reg_edges$coef)){
        
        reg <- ergm(net ~ edges + absdiff("ctdc_ed") + absdiff("aac_ed") + 
                      absdiff("length"), verbose = T)
        com_ergm[[i]]  <- data.frame(summary(reg)$coefs, com = i)
        
      }else{
        com_ergm[[i]] <- data_frame(Estimate = Inf, Std..Error = Inf,  MCMC..= Inf, p.value= Inf, com = i)
      }
      
   
    
    com_stats[[i]] <- data.frame(com = i, nodes = V(com)$name, p_len = V(com)$length, 
                                 ctdc_ed = V(com)$ctdc_ed, aac_ed = V(com)$aac_ed,
                                 ab_ratio = V(com)$ab_ratio, den = den)
    }
  }else{
    com_ergm[[i]] <- data_frame(Estimate = NA, Std..Error = NA,  MCMC..= NA, p.value= NA, com = i)
  }
}

p$sign <- ifelse(p$p.value < 0.001, "sig", "no-sig")

p %>% 
  filter(class == "absdiff.ctdc_ed" | class == "absdiff.aac_ed", Estimate <= 100, Estimate >= -100) %>% 
  ggplot(aes(class, Estimate)) + 
  geom_jitter(size = 1, alpha = 0.6, aes(color = sign)) + 
  geom_violin(alpha = 0, color = "red") +
  theme_bw()
Cacher is the code snippet organizer for pro developers

We empower you and your team to get more done, faster

ERGM for protein clusters