genomewalker
5/31/2016 - 9:50 AM

upSet.R

library(UpSetR)
library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)

f <- c("0-0.2", "0.22-3_miTAG", "0.22-3_mOTU", "0.8-5", "5-20","20-180","180-2000")
n<-f[7]
for (n in f ){
fname <- paste("~/Downloads/OTU_relative_abundances.2016_05_18/", n,".txt", sep = "")
otu_df <- read.table(fname, sep = "\t", header = T, check.names = F, stringsAsFactors = T)


basins <- read.table("~/Desktop/forDaniel/data/ocean_regions_mod.txt", header = T, sep = "\t")


atlantic <- basins %>%
  filter(basins == "Atlantic", grepl(pattern = "SUR", Station_Depth)) %>%
  .$Station_Depth %>% as.vector

mediterranean <- basins %>%
  filter(basins == "Mediterranean", grepl(pattern = "SUR", Station_Depth)) %>%
  .$Station_Depth %>% as.vector

#red_sea <- basins %>%
#  filter(basins == "Red Sea", grepl(pattern = "SUR", Station_Depth)) %>%
#  .$Station_Depth %>% as.vector

indic <- basins %>%
  filter(basins == "Indic", grepl(pattern = "SUR", Station_Depth)) %>%
  .$Station_Depth %>% as.vector

antartic <- basins %>%
  filter(basins == "Antartic", grepl(pattern = "SUR", Station_Depth)) %>%
  .$Station_Depth %>% as.vector

pacific <- basins %>%
  filter(basins == "Pacific", grepl(pattern = "SUR", Station_Depth)) %>%
  .$Station_Depth %>% as.vector


#l[colSums(l[x]) > 0][x] <- 1


# na <- c("142_SUR", "143_SUR") 
# gs <- c("146_SUR", "147_SUR", "148_SUR", "149_SUR", "150_SUR", "151_SUR") 
# med <- c("4_SUR", "7_SUR", "18_SUR", "30_SUR")
# sa <- c("72_SUR", "76_SUR")

atlantic_stations <- paste('Atlantic (', otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
mediterranean_stations <- paste('Mediterranean (', otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
#red_sea_stations <- paste('Red Sea (', otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
indic_stations <- paste('Indic (', otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
antartic_stations <- paste('Antartic (', otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
pacific_stations <- paste('Pacific (', otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")


atlantic_df <- otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.))) %>%
  mutate(atlantic = rowSums(.)) %>% dplyr::select(atlantic) %>%
  mutate(atlantic = replace(atlantic, atlantic > 0, 1)) %>%
  set_colnames("Atlantic")
  
mediterranean_df <- otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.))) %>%
  mutate(mediterranean = rowSums(.)) %>% dplyr::select(mediterranean) %>%
  mutate(mediterranean = replace(mediterranean, mediterranean > 0, 1)) %>%
  set_colnames("Mediterranean")

# red_sea_df <- otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.))) %>%
#   mutate(red_sea = rowSums(.)) %>% dplyr::select(red_sea) %>%
#   mutate(red_sea = replace(red_sea, red_sea > 0, 1)) %>%
#   set_colnames("Red_sea")

indic_df <- otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.))) %>%
  mutate(indic = rowSums(.)) %>% dplyr::select(indic) %>%
  mutate(indic = replace(indic, indic > 0, 1)) %>%
  set_colnames("Indian")

antartic_df <- otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.))) %>%
  mutate(antartic = rowSums(.)) %>% dplyr::select(antartic) %>%
  mutate(antartic = replace(antartic, antartic > 0, 1)) %>%
  set_colnames("Antartic")

pacific_df <- otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.))) %>%
  mutate(pacific = rowSums(.)) %>% dplyr::select(pacific) %>%
  mutate(pacific = replace(pacific, pacific > 0, 1)) %>%
  set_colnames("Pacific")



atlantic_df_ab <- otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.))) 
mediterranean_df_ab <- otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.)))
#red_sea_df_ab <- otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.))) 
indic_df_ab <- otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.))) 
antartic_df_ab <- otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.))) 
pacific_df_ab <- otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.))) 

all_ab <- data.frame(atl = atlantic_df_ab, med = mediterranean_df_ab, 
                     ind = indic_df_ab, ant = antartic_df_ab, pac = pacific_df_ab)

abundance <- rowSums(all_ab)/max(rowSums(all_ab))
occupancy <- rowSums(all_ab != 0)/dim(all_ab)[2]

all <- data.frame(md5sum=otu_df[,1], atl = atlantic_df, med = mediterranean_df, 
                  ind = indic_df, ant = antartic_df, pac = pacific_df, abundance = log(abundance), occupancy = (occupancy))

# To have violin plots instead of boxplot. First edit the function upset with fix and modify BoxPlotsPlot for BoxPlotsPlot_mod
#Then uncomment the function below and load it.
# fix(upset)
# 
# BoxPlotsPlot_mod <- function (bdat, att, att_color)
# {
#   yaxis <- as.character(att)
#   col <- match(att, colnames(bdat))
#   colnames(bdat)[col] <- "attribute"
#   upper_xlim <- as.numeric((max(bdat$x) + 1))
#   plot_lims <- as.numeric(0:upper_xlim)
#   bdat$x <- as.factor(bdat$x)
#   boxplots <- ggplotGrob(ggplot() + theme_bw() + ylab(yaxis) +
#                            scale_x_discrete(limits = plot_lims, expand = c(0, 0)) +
#                            theme(plot.margin = unit(c(-0.7, 0, 0, 0), "cm"), axis.title.y = element_text(vjust = -0.8),
#                                  axis.ticks.x = element_blank(), axis.text.x = element_blank(),
#                                  panel.border = element_blank(), panel.grid.minor = element_blank(),
#                                  panel.grid.major = element_blank(), axis.title.x = element_blank()) +
#                            geom_violin(data = bdat, aes_string(x = "x", y = "attribute"),
#                                         fill = att_color, colour = "gray80"))
#   return(boxplots)
# }
# 
# Counter <- function (data, num_sets, start_col, name_of_sets, nintersections, 
#           mbar_color, order_mat, aggregate, cut, empty_intersects, 
#           decrease) 
# {
#   temp_data <- list()
#   Freqs <- data.frame()
#   end_col <- as.numeric(((start_col + num_sets) - 1))
#   for (i in 1:num_sets) {
#     temp_data[i] <- match(name_of_sets[i], colnames(data))
#   }
#   Freqs <- data.frame(count(data[, as.integer(temp_data)]))
#   colnames(Freqs)[1:num_sets] <- name_of_sets
#   if (is.null(empty_intersects) == F) {
#     empty <- rep(list(c(0, 1)), times = num_sets)
#     empty <- data.frame(expand.grid(empty))
#     colnames(empty) <- name_of_sets
#     empty$freq <- 0
#     all <- rbind(Freqs, empty)
#     Freqs <- data.frame(all[!duplicated(all[1:num_sets]), 
#                             ])
#   }
#   Freqs <- Freqs[!(rowSums(Freqs[, 1:num_sets]) == 0), ]
#   if (tolower(aggregate) == "degree") {
#     for (i in 1:nrow(Freqs)) {
#       Freqs$degree[i] <- rowSums(Freqs[i, 1:num_sets])
#     }
#     order_cols <- c()
#     for (i in 1:length(order_mat)) {
#       order_cols[i] <- match(order_mat[i], colnames(Freqs))
#     }
#     for (i in 1:length(order_cols)) {
#       logic <- decrease[i]
#       Freqs <- Freqs[order(Freqs[, order_cols[i]], decreasing = logic), 
#                      ]
#     }
#   }
#   else if (tolower(aggregate) == "sets") {
#     Freqs <- Get_aggregates(Freqs, num_sets, order_mat, cut)
#   }
#   delete_row <- (num_sets + 2)
#   Freqs <- Freqs[, -delete_row]
#   for (i in 1:nrow(Freqs)) {
#     Freqs$x[i] <- i
#     Freqs$color <- mbar_color
#   }
#   Freqs <- Freqs %>% dplyr::mutate(dg = rowSums(Freqs[,1:num_sets])) %>% dplyr::arrange((dg))
#   x <- Freqs$x
#   print(x)
#   x1 <- c(x[1:5], x[length(x)], x[7:length(x)-1])
#   print(x1)
#   print(Freqs)
#   Freqs <- Freqs[match(x1, Freqs$x), ]
#   Freqs <- Freqs[1:nintersections, ] %>% dplyr::select(-dg)
#   print(Freqs)
#   Freqs <- na.omit(Freqs)
#   return(Freqs)
# }


# 
# Make_matrix_plot_mod <- function (Mat_data, Set_size_data, Main_bar_data, point_size, 
#           line_size, name_size, labels, shading_data, shade_color, 
#           shade_alpha) 
# {
#   Matrix_plot <- (ggplot() + theme(panel.background = element_rect(fill = "white"), 
#                                    plot.margin = unit(c(-0.2, 0.5, 0.5, 0.5), "lines"), 
#                                    axis.text.x = element_blank(), axis.ticks.x = element_blank(), 
#                                    axis.ticks.y = element_blank(), axis.text.y = element_text(colour = "gray0", 
#                                                                                               size = name_size, hjust = 0.4), panel.grid.major = element_blank(), 
#                                    panel.grid.minor = element_blank()) + xlab(NULL) + ylab("   ") + 
#                     scale_y_continuous(breaks = c(1:nrow(Set_size_data)), 
#                                        limits = c(0.5, (nrow(Set_size_data) + 0.5)), labels = labels, 
#                                        expand = c(0, 0)) + scale_x_continuous(limits = c(0, 
#                                                                                          (nrow(Main_bar_data) + 1)), expand = c(0, 0)) + geom_rect(data = shading_data, 
#                                                                                                                                                    aes_string(xmin = "min", xmax = "max", ymin = "y_min", 
#                                                                                                                                                               ymax = "y_max"), fill = shade_color, alpha = shade_alpha) + 
#                     geom_point(data = Mat_data, aes_string(x = "x", y = "y"), 
#                                colour = Mat_data$color, size = point_size) + geom_line(data = Mat_data, 
#                                                                                        aes_string(group = "Intersection", x = "x", y = "y", 
#                                                                                                   colour = "color"), size = line_size) + scale_color_identity())
#   Matrix_plot <- ggplot_gtable(ggplot_build(Matrix_plot))
#   return(Matrix_plot)
# }

names(all) <- plyr::mapvalues(names(all),
c('Atlantic', 'Mediterranean', 'Indian', 'Antartic', 'Pacific'),
c(atlantic_stations, mediterranean_stations, indic_stations, antartic_stations, pacific_stations))



oname <- paste("~/Downloads/all_basins_upset_abundance_occupancy_logtrans_",n,"_violin.pdf", sep = "")
pdf(file = oname, height = 10, width = 16)
upset(all, nsets = 5, order.by = c("freq", "degree"), group.by = "degree",
      boxplot.summary = c("abundance", "occupancy"), number.angles = 0, 
      mb.ratio = c(0.45, 0.55))

dev.off()

}

# antartic "#516479"
# med "#F7F740"
# atlantic "#62D414"
# indian "#54C7FF"
# pacific "#FF6666"