genomewalker
4/11/2017 - 12:44 PM

Halpern interpolation

Halpern interpolation

#================================================================================
#STEP 1: unzip files
#================================================================================
## set dir
setwd("~/Downloads/HALPERN")
## list all files for 2008

files_rescaled_2013 <- list.files(path = "2013_scaled/tif")
print(files_rescaled_2013)


#================================================================================
#STEP 2
#================================================================================
library(proj4)
library(tidyverse)
library(raster)
my_db<- src_postgres(host = "localhost", port = 5432, dbname = "osd_analysis", options = "-c search_path=osd_analysis")
osd2014_amp_mg_intersect <- tbl(my_db, "osd2014_amp_mg_intersect") %>%
  collect(n = Inf)
osd2014_metadata <- read_tsv("~/ownCloud/OSD_paper/OSD-GC/data/osd2014_metadata_18-01-2017.tsv", 
                             col_names = TRUE, trim_ws = TRUE, 
                             col_types = list(mrgid = col_character(), biome_id = col_character(), 
                                              feature_id = col_character(),material_id = col_character()))

osd2014_metadata_mg <- osd2014_metadata %>% filter(label %in% osd2014_amp_mg_intersect$label)
sc <- cbind(long=osd2014_metadata_mg$start_lon, lat=osd2014_metadata_mg$start_lat)                                #(long,lat)


r1 <- raster("~/Downloads/HALPERN/2013_scaled/tif_wgs84/global_cumul_impact_2013_all_layers_wgs84.tif")
r <- raster("~/Downloads/HALPERN/2013_scaled/tif/global_cumul_impact_2013_all_layers.tif")
#change WGS84 to Mollweide
oldproj <- "+proj=longlat +datum=WGS84 +no_defs"                                #WGS84
newproj <- "+proj=moll +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +units=m +no_defs"   #Mollweide

wgs.84    <- "+proj=longlat +datum=WGS84 +no_defs +ellps=WGS84 +towgs84=0,0,0"
# ESRI:54009 world mollweide projection, units = meters
# see http://www.spatialreference.org/ref/esri/54009/
mollweide <- "+proj=moll +lon_0=0 +x_0=0 +y_0=0 +ellps=WGS84 +datum=WGS84 +units=m +no_defs"


sp.points <- SpatialPoints(sc, proj4string=CRS(oldproj))
sc <- spTransform(sp.points,CRS(newproj))

#================================================================================
#STEP 3 : values for 2008
#================================================================================
require(rgdal)
library(raster)

#sink("GlobalMinMaxForVar.txt")
GlobalMinMaxForVar13 = data.frame(id=1:length(files_rescaled_2013), VarNames=gsub(".tif", "", files_rescaled_2013), minValue=rep(0,length(files_rescaled_2013)), maxValue=rep(0,length(files_rescaled_2013)))
head(GlobalMinMaxForVar13)

#OUTPUT FILE
out_rescaled_2013 = data.frame(osd_id=osd2014_metadata_mg$label, long=osd2014_metadata_mg$start_lon, lat=osd2014_metadata_mg$start_lat)
m=ncol(out_rescaled_2013)

#LOOP TO READ TIFF ONE BY ONE
for(i in 1:length(files_rescaled_2013)) {
  
  varName = gsub(".tif", "", files_rescaled_2013[i])
  print(varName); flush.console()
  
  ## Reading in a zip data file without unzipping it
  #r <- raster(list.files$Name[2])
  f <- paste("~/Downloads/HALPERN/2013_scaled/tif/", files_rescaled_2013[i], sep = "")
  r <- raster(f)
  print(r); flush.console()
  
  #Get min and max cell values from raster
  GlobalMinMaxForVar13$minValue[i] = cellStats(r, min)
  GlobalMinMaxForVar13$maxValue[i] = round(cellStats(r, max),3)
  
  #LOOP TO READ TIFF ONE BY ONE
  for(k in 1: nrow(sc@coords)){
    
    # if(k==16){
    # out_rescaled_2013[k, m+1] =  "NA"; colnames(out_rescaled_2013)[m+1] = (paste(varName, "50kms_max", sep="_"))
    # out_rescaled_2013[k, m+2] =  "NA"; colnames(out_rescaled_2013)[m+2] = (paste(varName, "50kms_min", sep="_"))
    # out_rescaled_2013[k, m+3] =  "NA"; colnames(out_rescaled_2013)[m+3] = (paste(varName, "10kms_max", sep="_"))
    # out_rescaled_2013[k, m+4] =  "NA"; colnames(out_rescaled_2013)[m+4] = (paste(varName, "10kms_min", sep="_"))
    #} else{
    ## MEHOD:1
    ## Extract values from Raster objects
    #equivalent to 0.5 degree or 50 kms approx radius
    e0 = extract(r, sc[k,], buffer = 100000, weights = TRUE, small = TRUE) %>% unlist
    
    e1 = extract(r, sc[k,], buffer = 50000, weights = TRUE, small = TRUE) %>% unlist

    #equivalent to 0.1 degree or 10 kms approx
    e2 = extract(r, sc[k,], buffer = 10000, weights = TRUE, small = TRUE) %>% unlist
    
    #equivalent to 0.1 degree or 10 kms approx
    e3 = extract(r, sc[k,], buffer = 5000, weights = TRUE, small = TRUE) %>% unlist

    e4 = extract(r, sc[k,], buffer = 1000, weights = TRUE, small = TRUE) %>% unlist
    
    out_rescaled_2013[k, m+1] =  round(max(e1, na.rm=TRUE),3) ; colnames(out_rescaled_2013)[m+1] = (paste(varName, "2013_50kms_max", sep="_"))
    out_rescaled_2013[k, m+2] =  round(mean(e1, na.rm=TRUE),3)   ; colnames(out_rescaled_2013)[m+2] = (paste(varName, "2013_50kms_mean", sep="_"))
   
    out_rescaled_2013[k, m+3] =  round(max(e2, na.rm=TRUE),3) ; colnames(out_rescaled_2013)[m+3] = (paste(varName, "2013_10kms_max", sep="_"))
    out_rescaled_2013[k, m+4] =  round(mean(e2, na.rm=TRUE),3)   ; colnames(out_rescaled_2013)[m+4] = (paste(varName, "2013_10kms_mean", sep="_"))
    
    out_rescaled_2013[k, m+5] =  round(max(e3, na.rm=TRUE),3) ; colnames(out_rescaled_2013)[m+5] = (paste(varName, "2013_5kms_max", sep="_"))
    out_rescaled_2013[k, m+6] =  round(mean(e3, na.rm=TRUE),3)   ; colnames(out_rescaled_2013)[m+6] = (paste(varName, "2013_5kms_mean", sep="_"))
    
    out_rescaled_2013[k, m+7] =  round(max(e4, na.rm=TRUE),3) ; colnames(out_rescaled_2013)[m+7] = (paste(varName, "2013_1kms_max", sep="_"))
    out_rescaled_2013[k, m+8] =  round(mean(e4, na.rm=TRUE),3)   ; colnames(out_rescaled_2013)[m+8] = (paste(varName, "2013_1kms_mean", sep="_"))
    
    out_rescaled_2013[k, m+9] =  round(max(e0, na.rm=TRUE),3) ; colnames(out_rescaled_2013)[m+9] = (paste(varName, "2013_100kms_max", sep="_"))
    out_rescaled_2013[k, m+10] =  round(mean(e0, na.rm=TRUE),3)   ; colnames(out_rescaled_2013)[m+10] = (paste(varName, "2013_100kms_mean", sep="_"))
    
    
    if(k %in% c(25,50,75,100,125,150,175,200)){cat(k, " | ")}
    flush.console()
    #}
  }
  cat("Next!!! \n")
  m=ncol(out_rescaled_2013)
}


out_rescaled_2013_mean_long <- out_rescaled_2013 %>% 
  dplyr::select(osd_id, contains("mean")) %>%
  dplyr::rename(label = osd_id) %>%
  gather(ohi_variable, mean, -label) %>% tbl_df
library(naturalsort)
out_rescaled_2013_mean_long$ohi_variable <- factor(out_rescaled_2013_mean_long$ohi_variable, levels = unique(out_rescaled_2013_mean_long$ohi_variable) %>% naturalsort())

for (i in 1:13){
  fname <- paste("~/Downloads/halpern_mean_", i, ".pdf", sep = "")
  pdf(fname, width = 14, height = 3)
  p <- ggplot(out_rescaled_2013_mean_long, aes(mean)) +
    geom_density(fill = "#333333", alpha = 0.8) +
    ggforce::facet_wrap_paginate(~ohi_variable, scales = "free", ncol = 5, nrow = 1, page = i) +
    theme_bw() +
    theme(strip.text.x = element_text(size = 6),
          axis.text = element_text(size = 8))
  print(p)
  dev.off()
}

out_rescaled_2013_mean_long %>% 
  filter(is.na(mean)) %>% 
  group_by(ohi_variable) %>% 
  count() %>%
  separate("ohi_variable", c("variable", "buffer"), sep = "_2013_", remove = TRUE) %>%
  mutate(buffer = gsub("s_mean", "", buffer)) %>%
  complete(variable, buffer, fill = list(n = 0)) %>%
  ggplot(aes(buffer, n)) +
  geom_bar(stat = "identity") +
  geom_text(aes(label = n), vjust = -0.2, size = 3) +
  facet_wrap(~variable, scales = "free_x") +
  scale_x_discrete(limits = c("1km", "5km", "10km", "50km", "100km")) +
  theme_bw() +
  xlab("Extraction buffer") +
  ylab("# of NAs")


library(maptools)
data(wrld_simpl)
wrldMoll <- spTransform(wrld_simpl, CRS(newproj))
for(i in 1:length(files_rescaled_2013)) {
  
  varName = gsub(".tif", "", files_rescaled_2013[i])
  print(varName); flush.console()
  
  ## Reading in a zip data file without unzipping it
  #r <- raster(list.files$Name[2])
  f <- paste("~/Downloads/HALPERN/2013_scaled/tif/", files_rescaled_2013[i], sep = "")
  r <- raster(f)
  
  fname <- paste("~/Downloads/halpern_", varName,".pdf", sep = "")
  pdf(fname, width = 7, height = 5)
  colors <- rev(colorRampPalette(c(brewer.pal(11, "Spectral"), "#FFFFFF"))(100))
  plot(r, col = colors, axes = F, box = F)
  plot(wrldMoll, col = "#E5E6E6", border = "#E5E6E6", add = TRUE, box = FALSE)
  points(sc, cex = 0.5)
  dev.off()
}


out_rescaled_2013_mean_long_5km <- out_rescaled_2013_mean_long %>%
  mutate(ohi_variable = gsub("_2013_all_layers", "", ohi_variable)) %>%
  mutate(ohi_variable = gsub("_2013_minus_2008", "diff", ohi_variable))


out_rescaled_2013_mean_long_5km <- out_rescaled_2013_mean_long_5km %>%
  separate("ohi_variable", c("variable", "buffer"), sep = "_2013_", remove = TRUE) %>%
  mutate(buffer = gsub("s_mean", "", buffer))

out_rescaled_2013_mean_long_5km$label <- factor(out_rescaled_2013_mean_long_5km$label, levels = st_100_order_terrestrial)
out_rescaled_2013_mean_long_5km$buffer <- factor(out_rescaled_2013_mean_long_5km$buffer, levels = c("1km", "5km", "10km", "50km", "100km"))

for (i in 1:13){
  fname <- paste("~/Downloads/halpern_mean_", i, "_by_sample.pdf", sep = "")
  pdf(fname, width = 14, height = 3)  
p <- ggplot(out_rescaled_2013_mean_long_5km, aes(label, mean)) +
  geom_bar(stat = "identity") +
  ggforce::facet_grid_paginate(variable~buffer, scales = "free", ncol = 5, nrow = 1, page = i) +
  scale_x_discrete(limits = st_100_order_terrestrial) +
  theme_bw() +
  theme(axis.text.x = element_blank(),
        axis.ticks.x = element_blank()) +
  xlab("") +
  ylab("Average")
print(p)
dev.off()
}


out_rescaled_2013_mean_long_lat_long <- out_rescaled_2013_mean_long %>% 
  filter(is.na(mean)) %>%
  separate("ohi_variable", c("variable", "buffer"), sep = "_2013_", remove = TRUE) %>%
  mutate(buffer = gsub("s_mean", "", buffer)) %>%
  filter(buffer == "10km") %>%
  left_join(osd2014_metadata)
  

wmap<-map_data("world")

pdf("~/Downloads/nas_map_10km.pdf", width = 21, height = 10)
p <- ggplot(wmap, aes(x = long, y = lat)) +
  geom_polygon(aes(group = group), fill = "#E5E6E6") +
  geom_path(aes(group = group), colour = "#E5E6E6") +
  geom_point(data = out_rescaled_2013_mean_long_lat_long, aes(x = start_lon, y = start_lat),
             alpha = 0.8) +
  xlab("") +
  ylab("") +
  coord_equal(ratio = 1)  +
  facet_wrap(~variable) +
  theme(panel.background = element_rect(fill = "white", colour = "white"),
        panel.grid.major = element_blank(),
        panel.grid.minor = element_blank(),
        axis.ticks = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_blank(),
        legend.position = "top")
print(p)
dev.off()



# Very low impact: <1.4
# Low impact: 1.4-4.95
# Medium impact: 4.95-8.47
# Medium high impact: 8.47-12
# High impact: 12-15.52
# Very high impact: >15.52

# Classify samples in impacted and non-impacted
halpern_impact <- out_rescaled_2013_mean_long %>%
  mutate(ohi_variable = gsub("_2013_all_layers", "", ohi_variable)) %>%
  mutate(ohi_variable = gsub("_2013_minus_2008", "diff", ohi_variable)) %>% 
  filter(!is.na(mean)) %>%
  separate("ohi_variable", c("variable", "buffer"), sep = "_2013_", remove = TRUE) %>%
  mutate(buffer = gsub("s_mean", "", buffer)) %>%
  filter(buffer == "5km", variable == "global_cumul_impact") %>%
  mutate(class = ifelse(mean <= 1.4, "non-impacted", "impacted"))





halpern_impact_class_min <- halpern_impact %>% 
  group_by(class) %>%
  count %>%
  dplyr::slice(which.min(n)) %>%
  .$n

sites_non_impact <- halpern_impact %>%
  filter(class == "non-impacted") %>%
  .$label %>%
  droplevels()

sites_rand <- halpern_impact %>%
  filter(class == "impacted") %>%
  .$label %>%
  sample(.,halpern_impact_class_min) %>%
  droplevels()


filt <- osd2014_16s_prevalence %>% 
  filter(total_counts >= 10, prev > 2) %>% .$otu_name %>% droplevels()

sites_rm <- c("OSD63_2014-06-20_0m_NPL022",
              "OSD167_2014-06-21_0.1m_NPL022",
              "OSD128_2014-06-21_1m_NPL022",
              "OSD80_2014-06-21_2m_NPL022")

sites_rm <- sample_names(osd2014_16s_otuXsample_physeq_filt_prev_beta_mg)[!(sample_names(osd2014_16s_otuXsample_physeq_filt_prev_beta_mg) %in% sites_rm)]


osd2014_16s_otuXsample_physeq_filt_prev_beta_filt <- prune_taxa(filt %>% as.vector(), osd2014_16s_otuXsample_physeq_filt_prev_beta_mg)
osd2014_16s_otuXsample_physeq_filt_prev_beta_filt <- prune_samples(sites_rm,osd2014_16s_otuXsample_physeq_filt_prev_beta_filt)

osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css <- cssTrans(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt, norm = T, log = F)

osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m <- t(as(otu_table(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css), "matrix"))

plot(hclust(dist(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m), "ward.D"))


pca <- prcomp((osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m), center=F)

osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m_hel <- decostand(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m, "total")

osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m_hel <- decostand(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m_hel, "hellinger")


mdsplot <- metaMDS(osd2014_16s_otuXsample_physeq_filt_prev_beta_filt_css_m_hel, distance = "bray")


halpern_impact_all <- out_rescaled_2013_mean_long %>%
  mutate(ohi_variable = gsub("_2013_all_layers", "", ohi_variable)) %>%
  mutate(ohi_variable = gsub("_2013_minus_2008", "diff", ohi_variable)) %>% 
  filter(!is.na(mean)) %>%
  separate("ohi_variable", c("variable", "buffer"), sep = "_2013_", remove = TRUE) %>%
  mutate(buffer = gsub("s_mean", "", buffer)) %>%
  filter(buffer == "10km")

test <- melt(scores(pca, display = "sites")) %>%
  dplyr::rename(label = Var1) %>%
  filter(Var2 == "NMDS1") %>%
  left_join(halpern_impact_all) %>%
  dplyr::select(label, value, variable, mean) %>%
  group_by(variable) %>%
  do(fit = Hmisc::rcorr(.$value, .$mean))



test %>% broom::tidy(fit) %>%
  filter(p.value < 0.01)

otutable <- t(as(otu_table(osd2014_16s_otuXsample_physeq_filt_prev_beta), "matrix"))
otutable <- base::as.data.frame(otutable)
otutable <- otutable[,filt]
names_otus <- names(otutable)
names_otus_short <- paste("otu", seq(1:length(names_otus)), sep = "_")
names(otutable) <- names_otus_short
otutable$label <- base::row.names(otutable)
otutable <- otutable %>% filter(label %in% sites_non_impact | label %in% sites_rand)

otutable <- otutable %>% left_join(halpern_impact %>% dplyr::select(label, class))
class_impact <- otutable %>% dplyr::select(label, class)
base::row.names(otutable) <- otutable$label
otutable$label <- NULL
otutable$class <- NULL


library(parallelRandomForest)

set.seed(2)
halpern_classify <- randomForest::randomForest(y = as.factor(class_impact$class), 
                                               x = otutable, ntree=500, importance=TRUE, do.trace=100)

print(halpern_classify)

imp <- importance(halpern_classify)
imp <- data.frame(predictors = rownames(imp), imp)

imp.sort <- arrange(imp, desc(MeanDecreaseGini))
imp.sort$predictors <- factor(imp.sort$predictors, levels = imp.sort$predictors)

# Select the top 10 predictors
imp.20 <- imp.sort[1:20, ]


# ggplot
ggplot(imp.20, aes(x = predictors, y = MeanDecreaseGini)) +
  geom_bar(stat = "identity", fill = "indianred") +
  coord_flip() +
  ggtitle("Most important OTUs for classifying Erie samples\n into nearshore or offshore")