cksun-usc
8/29/2016 - 9:31 PM

MCMCglmm related functions

MCMCglmm related functions

# import generics
# (cheap version of importFrom() in a package)
fixef <- nlme:::fixef
ranef <- nlme:::ranef

# define generic prediction function
# but using my functions/methods
predict2 <- function (object, ...) {
  UseMethod("predict2")
}

# extract fixed and random effect parameter names
# from an MCMCglmm object
paramNames.MCMCglmm <- function(object, ...) {
  fNames <- as.character(attr(terms(object$Fixed$formula), "variables"))[-c(1, 2)]

  rNames <- NULL

  if (!is.null(object$Random$formula)) {
    rNames <- as.character(attr(terms(object$Random$formula), "variables"))[-c(1)]
  }

  if ("(Intercept)" %in% colnames(object[["Sol"]])) {
    fNames <- c("(Intercept)", fNames)
  }

  list(fixed = fNames, random = rNames)
}

# extract the levels of the random effects
# from an MCMCglmm object
ranefLevels <- function(object, data, ...) {
  n <- paramNames.MCMCglmm(object)$random
  res <- lapply(n, function(n) {
    levels(data[, n])
  })
  names(res) <- n
  return(res)
}


# extract the fixed and random effects portions from an MCMCglmm object
# note for the random, these are the estimates themselves
# not the variability in the estimates
# two use options let you get either just the posterior mean
# or all the posterior samples
.extractEffects <- function(object, use = c("all", "mean"),
  which = c("fixed", "random"), ...) {

  use <- match.arg(use)
  which <- match.arg(which)

  b <- object[["Sol"]]

  eff <- switch(which,
    fixed = {
      # this does not work because the intercept term contains
      # special characters (), that screw with the regular expressions
      # cannot use fixed matching because factor variables can
      # expand with arbitrarily named levels
      #unlist(lapply(paramNames.MCMCglmm(object)$fixed, function(n) {
      #  grep(paste0("^", n, ".*$"), colnames(b), value = TRUE)
      #}))
      object$X@Dimnames[[2]]
    },
    random = {
      unlist(lapply(paramNames.MCMCglmm(object)$random, function(n) {
        grep(paste0("^", n, "\\..*$"), colnames(b), value = TRUE)
      }))
    }
  )

  b <- b[, eff]

  switch(use,
    all = t(b),
    mean = as.matrix(colMeans(b)))
}

# methods for fixef and ranef to extract the fixed and random effects
# for an MCMCglmm model object
fixef.MCMCglmm <- function(object, use = c("all", "mean"), ...) {
  .extractEffects(object = object, use = use, which = "fixed", ...)
}

ranef.MCMCglmm <- function(object, use = c("all", "mean"), ...) {
  .extractEffects(object = object, use = use, which = "random", ...)
}

# this is the main workhorse
# a predict function for MCMCglmm objects
# takes a few core arguments
# the model object
# and data matrices, X (fixed effects) and Z (random effects)
# if X and Z are missing, attempts to fill in from the
# model object (which optionally saves them)
# If X and Z are specified or NULL, they are not used
# this is useful either for out of sample predictions
# or to use just the fixed effects, for example
# use can use all posterior samples or the mean
# all is nice because it lets you construct HPD intervals around the predicted
# values, rather than just get an estimate
# the mean is nice because if that is all you care about, it is much much faster
# type is either lp for linear predictor or response for the predicted values
# on the response scale.  Currently this is only meant for ordinal models,
# theoretically could be extended but the code is a pain
# varepsilon is the residual variance (probably could get it from the model object??)
# which I usually fix at 1 for probit/logit models
predict2.MCMCglmm <- function(object, X, Z, use = c("all", "mean"),
  type = c("lp", "response"), varepsilon, ...) {

  use <- match.arg(use)
  type <- match.arg(type)

  if (!is.null(object$X) && missing(X)) {
    X <- object$X
  }

  if (!is.null(object$Z) && missing(Z)) {
    Z <- object$Z
  }

  if (missing(X)) X <- NULL
  if (missing(Z)) Z <- NULL

  Xb <- Zu <- 0L

  if (!is.null(X)) {
    Xb <- X %*% as.matrix(fixef(object, use = use))
  }
  if (!is.null(Z)) {
    Zu <-  Z %*% as.matrix(ranef(object, use = use))
  }

  res <- t(as.matrix(Xb + Zu))
  dimnames(res) <- list(switch(use,
    all = paste0("Rep.", 1:nrow(res)), mean = NULL), NULL)

  # get predicted probabilities for each category from an ordered probit model
  # set the resulting object class to MCMCglmlmPredictedProbs
  # which I wrote a summary method for
  if (type == "response") {
    if (all(object$family %in% c("ordinal"))) {
      if (missing(varepsilon)) {
        varepsilon <- 1
      }

      CP <- switch(use,
        all = object$CP,
        mean = colMeans(object$CP))

      CP <- as.list(as.data.frame(CP))
      CP <- lapply(CP, function(x) {do.call("cbind", rep(list(x), dim(res)[2L]))})
      CP <- c(-Inf, 0, CP, Inf)

      p <- function(x) {pnorm(x - res, 0, sqrt(varepsilon + 1))}

      q <- vector("list", length(CP) - 2)
      for (i in 2:(length(CP) - 1)) {
        q[[i - 1]] <- mcmc(p(CP[[i + 1]]) - p(CP[[i]]))
      }
      q <- c(list(mcmc(1 - Reduce(`+`, q[1:(i - 1)]))), q)
      class(q) <- c("list", "MCMCglmmPredictedProbs")
      res <- q
    } else {
      stop("Function does not support response type for families beside ordinal")
    }
  }
  return(res)
}

# simple little function to generate confusion matrices
# I use it for one column predicted class, one column actual
# then look at percentage correctly classified
confusion <- function(formula, data) {
  res <- xtabs(formula = formula, data = data)
  print(res)
  res/sum(res)
}

# summary method for predicted probabilities
# optionally first marginalizes across all observations
# by taking the row means
# If the predicted values only used the posterior means
# cannot generate intervals, so only the means are returned
# otherwise, calculates the mean predicted probability, as well as
# the HPD interval
# this can either be per observation or marginalized
summary.MCMCglmmPredictedProbs <- function(object,
  marginalize = FALSE, level = .95, ...) {

  if (nrow(object[[1]]) > 1) {
    intervals <- TRUE
  } else {
    intervals <- FALSE
  }
  if (marginalize) {
    object <- lapply(object, rowMeans)
  }

  if (intervals) {
    res <- lapply(object, function(x) {
      if (!is.null(ncol(x))) {
        res <- cbind(colMeans(x), HPDinterval(mcmc(x), prob = level))
      } else {
        res <- cbind(mean(x), HPDinterval(mcmc(x), prob = level))
      }
      colnames(res) <- c("M", "LL", "UL")
      return(res)
    })
  } else {
    res <- lapply(object, mean)
  }
  return(res)
}

# recycler wraps most the previous functions to calculate
# the change in predicted probabilities for a twiddle change in the predictor
# or for discrete predictors, can use values so it is the change from 0 to 1
# or whatever
# the result is itself a MCMCglmmPredictedProbs (of course a difference but still)
# object, so it can be summarized using my method for the summary function
# this gives average marginal recycled predicted probabilities, as well as HPD intervals
recycler <- function(object, index = 2, twiddle, values) {
  L1 <- missing(twiddle)
  L2 <- missing(values)
  if (!L1 && !L2) stop("Please specify either a twiddle value or actual values, not both")
  X1 <- X2 <- object$X
  if (L1 && L2) {
    twiddle <- .01
  }
  if (L2) {
    X2[, index] <- X2[, index] + twiddle
  } else {
    stopifnot(length(values) == 2L)
    X1[, index] <- values[1]
    X2[, index] <- values[2]
    twiddle <- diff(values)
  }

  p1 <- predict2(object, X = X1, use = "all", type = "response", varepsilon = 1)
  p2 <- predict2(object, X = X2, use = "all", type = "response", varepsilon = 1)

  pdelta <- lapply(1:3, function(i) {
    (p2[[i]] - p1[[i]])/twiddle
  })
  class(pdelta) <- c("list", "MCMCglmmPredictedProbs")
  return(pdelta)
}

# sample models I used as test cases for developing the code
if (FALSE) {
require(MCMCglmm)

set.seed(10)
dat <- mtcars[sample(1:32, 1000, replace = TRUE), ]
dat <- within(dat, {
  qsec <- scale(qsec)
  hp <- scale(hp)
  mpg <- scale(mpg)
  disp <- scale(disp)
})
dat$ID <- factor(rep(letters, length.out = 1000))
dat$cyl <- factor(dat$cyl)

m <- MCMCglmm(cyl ~ qsec + mpg + drat, random = ~ ID, family = "ordinal",
  data = dat, prior = list(
  B = list(mu = c(0, 0, 0, 0), V = diag(4) * 1e2),
  R = list(V = 1, fix = 1),
  G = list(G1 = list(V = 1, nu = .002))), pr=TRUE,
  nitt = 510000, thin = 200, burnin = 10000, verbose=TRUE)

## model summary
summary(m)

## predictor means
colMeans(dat[, c("qsec", "mpg", "drat")])
## predictor ranges
lapply(dat[, c("qsec", "mpg", "drat")], range)

## new data for prediction
myX <- as.matrix(data.frame("(Intercept)" = 1, qsec = 0, mpg = 0, drat = seq(from = 2.76, to = 4.93, by = .1), check.names=FALSE))

## get the predicted values (fixed effects only)
pred1 <- predict2(m, X = myX, Z = NULL, use = "all", type = "response", varepsilon = 1)

## summarize them
(spred1 <- summary(pred1))

## can combine predicted probs + HPD intervals with prediction data
## to be able to plot
preddat <- as.data.frame(cbind(do.call(rbind, rep(list(myX), 3)), do.call(rbind, spred1)))
## need an indicator for which level of the outcome
preddat$outcome <- factor(rep(1:3, each = nrow(myX)))

## look at the data
head(preddat)

## plot it
require(ggplot2)
ggplot(preddat, aes(x = drat, y = M, colour = outcome)) +
  geom_ribbon(aes(ymin = LL, ymax = UL, fill = outcome), alpha = .25) +
  geom_line(size=2)


## now get average marginal recycled probabilities
## note that this can be a bit slow
tmpres <- lapply(c("qsec", "mpg", "drat"), function(i) {
  tmp <- recycler(m, index = i, twiddle = .01)
  tmp <- summary(tmp, marginalize = TRUE)
  tmp <- do.call("rbind", tmp)
  (tmp2 <- sprintf("%0.2f [%0.2f, %0.2f]", tmp[, 1], tmp[, 2], tmp[, 3]))
})

## setup final "output" table
finaltable <- matrix(NA, nrow = 3, ncol = 4, dimnames = list(
  NULL, c("Variable", "L1", "L2", "L3")))

finaltable[1, ] <- c("Var1", tmpres[[1]])
finaltable[2, ] <- c("Var2", tmpres[[2]])
finaltable[3, ] <- c("Var3", tmpres[[3]])
finaltable[,1] <- c("qsec", "mpg", "disp")
finaltable[] <- gsub("0\\.", ".", finaltable)

## these are the average changes in probability
## of membership in each group, for a one unit change
## of the predictor, on average across the sample
## with 95% HPD intervals
finaltable

## write to clipboard on Windows (e.g., for collaborators who want in Excel)
write.table(finaltable, file = "clipboard", na = "", sep = "\t", row.names=FALSE, col.names=TRUE)

#### other example models ####
m2 <- MCMCglmm(dv4 ~ hp, random = ~ ID, family = "ordinal",
  data = dat, prior = list(
  B = list(mu = c(0, 0), V = diag(2) * 1e10),
  R = list(V = 1, fix = 1),
  G = list(G1 = list(V = 1, nu = .002))), pr=TRUE,
  nitt = 55000, thin = 100, burnin = 5000, verbose=FALSE)

msimp <- MCMCglmm(cyl ~ qsec, family = "ordinal",
  data = dat, prior = list(
  B = list(mu = c(0, 0), V = diag(2) * 1e10),
  R = list(V = 1, fix = 1)),
  nitt = 55000, thin = 100, burnin = 5000, verbose=FALSE)

malt <- MASS::polr(cyl ~ qsec, data = dat, method = "probit", Hess=TRUE)

}