library(UpSetR)
library(dplyr)
library(magrittr)
library(tidyr)
library(ggplot2)
f <- c("0-0.2", "0.22-3_miTAG", "0.22-3_mOTU", "0.8-5", "5-20","20-180","180-2000")
n<-f[7]
for (n in f ){
fname <- paste("~/Downloads/OTU_relative_abundances.2016_05_18/", n,".txt", sep = "")
otu_df <- read.table(fname, sep = "\t", header = T, check.names = F, stringsAsFactors = T)
basins <- read.table("~/Desktop/forDaniel/data/ocean_regions_mod.txt", header = T, sep = "\t")
atlantic <- basins %>%
filter(basins == "Atlantic", grepl(pattern = "SUR", Station_Depth)) %>%
.$Station_Depth %>% as.vector
mediterranean <- basins %>%
filter(basins == "Mediterranean", grepl(pattern = "SUR", Station_Depth)) %>%
.$Station_Depth %>% as.vector
#red_sea <- basins %>%
# filter(basins == "Red Sea", grepl(pattern = "SUR", Station_Depth)) %>%
# .$Station_Depth %>% as.vector
indic <- basins %>%
filter(basins == "Indic", grepl(pattern = "SUR", Station_Depth)) %>%
.$Station_Depth %>% as.vector
antartic <- basins %>%
filter(basins == "Antartic", grepl(pattern = "SUR", Station_Depth)) %>%
.$Station_Depth %>% as.vector
pacific <- basins %>%
filter(basins == "Pacific", grepl(pattern = "SUR", Station_Depth)) %>%
.$Station_Depth %>% as.vector
#l[colSums(l[x]) > 0][x] <- 1
# na <- c("142_SUR", "143_SUR")
# gs <- c("146_SUR", "147_SUR", "148_SUR", "149_SUR", "150_SUR", "151_SUR")
# med <- c("4_SUR", "7_SUR", "18_SUR", "30_SUR")
# sa <- c("72_SUR", "76_SUR")
atlantic_stations <- paste('Atlantic (', otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
mediterranean_stations <- paste('Mediterranean (', otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
#red_sea_stations <- paste('Red Sea (', otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
indic_stations <- paste('Indic (', otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
antartic_stations <- paste('Antartic (', otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
pacific_stations <- paste('Pacific (', otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.))) %>% names() %>% length, ')', sep ="")
atlantic_df <- otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.))) %>%
mutate(atlantic = rowSums(.)) %>% dplyr::select(atlantic) %>%
mutate(atlantic = replace(atlantic, atlantic > 0, 1)) %>%
set_colnames("Atlantic")
mediterranean_df <- otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.))) %>%
mutate(mediterranean = rowSums(.)) %>% dplyr::select(mediterranean) %>%
mutate(mediterranean = replace(mediterranean, mediterranean > 0, 1)) %>%
set_colnames("Mediterranean")
# red_sea_df <- otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.))) %>%
# mutate(red_sea = rowSums(.)) %>% dplyr::select(red_sea) %>%
# mutate(red_sea = replace(red_sea, red_sea > 0, 1)) %>%
# set_colnames("Red_sea")
indic_df <- otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.))) %>%
mutate(indic = rowSums(.)) %>% dplyr::select(indic) %>%
mutate(indic = replace(indic, indic > 0, 1)) %>%
set_colnames("Indian")
antartic_df <- otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.))) %>%
mutate(antartic = rowSums(.)) %>% dplyr::select(antartic) %>%
mutate(antartic = replace(antartic, antartic > 0, 1)) %>%
set_colnames("Antartic")
pacific_df <- otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.))) %>%
mutate(pacific = rowSums(.)) %>% dplyr::select(pacific) %>%
mutate(pacific = replace(pacific, pacific > 0, 1)) %>%
set_colnames("Pacific")
atlantic_df_ab <- otu_df %>% dplyr::select(match(intersect(atlantic, names(otu_df)),names(.)))
mediterranean_df_ab <- otu_df %>% dplyr::select(match(intersect(mediterranean, names(otu_df)),names(.)))
#red_sea_df_ab <- otu_df %>% dplyr::select(match(intersect(red_sea, names(otu_df)),names(.)))
indic_df_ab <- otu_df %>% dplyr::select(match(intersect(indic, names(otu_df)),names(.)))
antartic_df_ab <- otu_df %>% dplyr::select(match(intersect(antartic, names(otu_df)),names(.)))
pacific_df_ab <- otu_df %>% dplyr::select(match(intersect(pacific, names(otu_df)),names(.)))
all_ab <- data.frame(atl = atlantic_df_ab, med = mediterranean_df_ab,
ind = indic_df_ab, ant = antartic_df_ab, pac = pacific_df_ab)
abundance <- rowSums(all_ab)/max(rowSums(all_ab))
occupancy <- rowSums(all_ab != 0)/dim(all_ab)[2]
all <- data.frame(md5sum=otu_df[,1], atl = atlantic_df, med = mediterranean_df,
ind = indic_df, ant = antartic_df, pac = pacific_df, abundance = log(abundance), occupancy = (occupancy))
# To have violin plots instead of boxplot. First edit the function upset with fix and modify BoxPlotsPlot for BoxPlotsPlot_mod
#Then uncomment the function below and load it.
# fix(upset)
#
# BoxPlotsPlot_mod <- function (bdat, att, att_color)
# {
# yaxis <- as.character(att)
# col <- match(att, colnames(bdat))
# colnames(bdat)[col] <- "attribute"
# upper_xlim <- as.numeric((max(bdat$x) + 1))
# plot_lims <- as.numeric(0:upper_xlim)
# bdat$x <- as.factor(bdat$x)
# boxplots <- ggplotGrob(ggplot() + theme_bw() + ylab(yaxis) +
# scale_x_discrete(limits = plot_lims, expand = c(0, 0)) +
# theme(plot.margin = unit(c(-0.7, 0, 0, 0), "cm"), axis.title.y = element_text(vjust = -0.8),
# axis.ticks.x = element_blank(), axis.text.x = element_blank(),
# panel.border = element_blank(), panel.grid.minor = element_blank(),
# panel.grid.major = element_blank(), axis.title.x = element_blank()) +
# geom_violin(data = bdat, aes_string(x = "x", y = "attribute"),
# fill = att_color, colour = "gray80"))
# return(boxplots)
# }
#
# Counter <- function (data, num_sets, start_col, name_of_sets, nintersections,
# mbar_color, order_mat, aggregate, cut, empty_intersects,
# decrease)
# {
# temp_data <- list()
# Freqs <- data.frame()
# end_col <- as.numeric(((start_col + num_sets) - 1))
# for (i in 1:num_sets) {
# temp_data[i] <- match(name_of_sets[i], colnames(data))
# }
# Freqs <- data.frame(count(data[, as.integer(temp_data)]))
# colnames(Freqs)[1:num_sets] <- name_of_sets
# if (is.null(empty_intersects) == F) {
# empty <- rep(list(c(0, 1)), times = num_sets)
# empty <- data.frame(expand.grid(empty))
# colnames(empty) <- name_of_sets
# empty$freq <- 0
# all <- rbind(Freqs, empty)
# Freqs <- data.frame(all[!duplicated(all[1:num_sets]),
# ])
# }
# Freqs <- Freqs[!(rowSums(Freqs[, 1:num_sets]) == 0), ]
# if (tolower(aggregate) == "degree") {
# for (i in 1:nrow(Freqs)) {
# Freqs$degree[i] <- rowSums(Freqs[i, 1:num_sets])
# }
# order_cols <- c()
# for (i in 1:length(order_mat)) {
# order_cols[i] <- match(order_mat[i], colnames(Freqs))
# }
# for (i in 1:length(order_cols)) {
# logic <- decrease[i]
# Freqs <- Freqs[order(Freqs[, order_cols[i]], decreasing = logic),
# ]
# }
# }
# else if (tolower(aggregate) == "sets") {
# Freqs <- Get_aggregates(Freqs, num_sets, order_mat, cut)
# }
# delete_row <- (num_sets + 2)
# Freqs <- Freqs[, -delete_row]
# for (i in 1:nrow(Freqs)) {
# Freqs$x[i] <- i
# Freqs$color <- mbar_color
# }
# Freqs <- Freqs %>% dplyr::mutate(dg = rowSums(Freqs[,1:num_sets])) %>% dplyr::arrange((dg))
# x <- Freqs$x
# print(x)
# x1 <- c(x[1:5], x[length(x)], x[7:length(x)-1])
# print(x1)
# print(Freqs)
# Freqs <- Freqs[match(x1, Freqs$x), ]
# Freqs <- Freqs[1:nintersections, ] %>% dplyr::select(-dg)
# print(Freqs)
# Freqs <- na.omit(Freqs)
# return(Freqs)
# }
#
# Make_matrix_plot_mod <- function (Mat_data, Set_size_data, Main_bar_data, point_size,
# line_size, name_size, labels, shading_data, shade_color,
# shade_alpha)
# {
# Matrix_plot <- (ggplot() + theme(panel.background = element_rect(fill = "white"),
# plot.margin = unit(c(-0.2, 0.5, 0.5, 0.5), "lines"),
# axis.text.x = element_blank(), axis.ticks.x = element_blank(),
# axis.ticks.y = element_blank(), axis.text.y = element_text(colour = "gray0",
# size = name_size, hjust = 0.4), panel.grid.major = element_blank(),
# panel.grid.minor = element_blank()) + xlab(NULL) + ylab(" ") +
# scale_y_continuous(breaks = c(1:nrow(Set_size_data)),
# limits = c(0.5, (nrow(Set_size_data) + 0.5)), labels = labels,
# expand = c(0, 0)) + scale_x_continuous(limits = c(0,
# (nrow(Main_bar_data) + 1)), expand = c(0, 0)) + geom_rect(data = shading_data,
# aes_string(xmin = "min", xmax = "max", ymin = "y_min",
# ymax = "y_max"), fill = shade_color, alpha = shade_alpha) +
# geom_point(data = Mat_data, aes_string(x = "x", y = "y"),
# colour = Mat_data$color, size = point_size) + geom_line(data = Mat_data,
# aes_string(group = "Intersection", x = "x", y = "y",
# colour = "color"), size = line_size) + scale_color_identity())
# Matrix_plot <- ggplot_gtable(ggplot_build(Matrix_plot))
# return(Matrix_plot)
# }
names(all) <- plyr::mapvalues(names(all),
c('Atlantic', 'Mediterranean', 'Indian', 'Antartic', 'Pacific'),
c(atlantic_stations, mediterranean_stations, indic_stations, antartic_stations, pacific_stations))
oname <- paste("~/Downloads/all_basins_upset_abundance_occupancy_logtrans_",n,"_violin.pdf", sep = "")
pdf(file = oname, height = 10, width = 16)
upset(all, nsets = 5, order.by = c("freq", "degree"), group.by = "degree",
boxplot.summary = c("abundance", "occupancy"), number.angles = 0,
mb.ratio = c(0.45, 0.55))
dev.off()
}
# antartic "#516479"
# med "#F7F740"
# atlantic "#62D414"
# indian "#54C7FF"
# pacific "#FF6666"