# R Code for the paper titled JointAI: Joint Analysis and Imputation of
# Incomplete Data in R


## Note: Many of the models in this syntax will return a message saying that no
## MCMC sample will be created when n.iter is set to 0 (the default value for the
## argument n.iter). Specifically, this is the case for models that are used to
## demonstrate how models are specified in JointAI, for which no output or results
## were needed in the paper.  To obtain an MCMC samples for these models, the
## argument n.iter has to be set, for example n.iter = 100.  In addition, some
## models will result in a warning about the usage of orthogonal polynomial
## contrasts ('contr.poly') for incomplete categorical covariates.  These
## warnings/messages are printed to inform the user about the behaviour of JointAI
## and do not indicate problems with the model. They were omitted in the paper.

library("knitr")
library("survival")
library("JointAI")
library("ggplot2")

# ggplot theme
theme_set(theme_gray())
theme_update(panel.grid = element_blank(), panel.background = element_rect(fill = grey(0.98)), 
  panel.border = element_rect(fill = "transparent", color = grey(0.85)))

## ----distrplotNHANES
plot_all(NHANES, ncol = 5)

## ----mdpatternNHANES 
md_pattern(data = NHANES, pattern = FALSE)

## ----trajectories
plotDF <- reshape2::melt(simLong[, c("ID", "age", "hgt", "wgt", "bmi", "hc")], id.vars = c("ID", 
  "age"))

ggplot(plotDF, aes(x = age, y = value, color = factor(ID))) + geom_line(na.rm = TRUE, 
  alpha = 0.7) + facet_wrap("variable", scales = "free", ncol = 4) + scale_color_viridis_d(guide = "none", 
  end = 0.8) + xlab("age in months") + theme(strip.text = element_text(size = 11))


## ----distrplotsimLong
plot_all(simLong, use_level = TRUE, idvar = "ID", ncol = 5)

## ----mdpatternsimLongprep
md1 <- md_pattern(data = subset(simLong, select = -c(bmi, age, hc, hgt, wgt, sleep, 
  time)), pattern = FALSE)
md2 <- md_pattern(data = subset(simLong, select = c(bmi, age, hc, hgt, wgt, sleep, 
  time)), pattern = FALSE)

## ----mdpatternsimLong
cowplot::plot_grid(md2, md1, align = "h", rel_widths = c(7, 9))

## ----mdpatternPBC
md_pattern(PBC)

## ----distrplotPBC
plot_all(PBC, idvars = "id", nclass = 15, ncol = 5)

## ----mod1a ----
mod1a <- glm_imp(educ ~ gender * (age + smoke + creat),
                 data = NHANES, family = binomial())

## ----mod1b ----
mod1b <- glm_imp(educ ~ gender + (age + smoke + creat)^3,
                 data = NHANES, family = binomial())

## ----mod2a ----
mod2a <- lm_imp(SBP ~ age + gender + abs(bili - creat), data = NHANES)

## ----mod2b ----
mod2b <- lm_imp(SBP ~ ns(age, df = 2) + gender + I(bili^2) + I(bili^3),
                data = NHANES)

## ----mod2c ----
mod2c <- lm_imp(SBP ~ age + gender + I(creat/albu^2),
                data = NHANES, trunc = list(albu = c(1e-05, NA)))

## ----mod2d ----
mod2d <- lm_imp(SBP ~ bili + sin(creat) + cos(albu), data = NHANES)

## ----mod2e ----
mod2e <- lm_imp(SBP ~ age + gender + sqrt(exp(creat)/2), data = NHANES)

## ----mod3a ----
mod3a <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat),
                trunc = list(uricacid = c(1e-05, NA)), data = NHANES)

## ----mod3b ----
mod3b <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat),
                models = c(uricacid = "lognorm"), data = NHANES)

## ----mod3c ----
mod3c <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat),
                models = c(uricacid = "glm_gamma_inverse"), data = NHANES)

## ----ilogit
ilogit <- plogis

## ----mod4a ----
mod4a <- lm_imp(SBP ~ age + gender + ilogit(creat), data = NHANES)

## ----listmod2b
list_models(mod2b, priors = FALSE, regcoef = FALSE, otherpars = FALSE)

## ----mod4b ----
mod4b <- lm_imp(SBP ~ age + gender + bili, auxvars = ~I(WC^2),
                data = NHANES)

list_models(mod4b, priors = FALSE, regcoef = FALSE, otherpars = FALSE)

## ----mod5 ----
mod5 <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + ns(age, df = 2),
                random = ~ ns(age, df = 2) | ID, data = simLong)


## ----mod6a ----
mod6a <- survreg_imp(Surv(futime, status != "alive") ~ age + sex +
                     copper + trig, models = c(copper = "lognorm", trig = "lognorm"),
                     data = subset(PBC, day == 0), n.iter = 250)

## ----mod6b ----
mod6b <- coxph_imp(Surv(futime, status != "alive") ~ age + sex + 
                   copper + trig, models = c(copper = "lognorm", trig = "lognorm"),
                   data = subset(PBC, day == 0), n.iter = 250)

## ----mod6c ----
mod6c <- coxph_imp(Surv(futime, status != "alive") ~ age + sex + copper +
                   trig + platelet + (1 | id),
                   models = c(copper = "lognorm", trig = "lognorm"),
                   timevar = "day", data = PBC)

## ----mod6d ----
PBC$logbili <- log(PBC$bili)
mod6d <- JM_imp(
  list(Surv(futime, status != "alive") ~ age + sex + platelet + logbili + 
  (1 | id),
  platelet ~ age + sex + day + logbili + (day | id),
  logbili ~ age + sex + day + (day | id)
  ),
  timevar = "day", data = PBC, n.adapt = 10)


## ----mod7a ----
mod7a <- lm_imp(SBP ~ age + gender + WC + alc + bili + occup + smoke,
                models = c(WC = "glm_gamma_inverse", bili = "lognorm"),
                data = NHANES, n.adapt = 0)

mod7a$models

## ----mod7cmodels ----
mod7b <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + SMOKE + hc +
                 MARITAL + ns(age, df = 2), random = ~ ns(age, df = 2) | ID,
                 data = simLong, no_model = "age", n.adapt = 0)

mod7b$models

## ----mod8a ----
mod8a <- lm_imp(SBP ~ gender + age + occup, auxvars = ~ educ + smoke,
                data = NHANES, n.iter = 100)

## ----listmod8a
list_models(mod8a, priors = FALSE, regcoef = FALSE, otherpars = FALSE,
            refcat = FALSE)

## ---- mod9a ----
mod9a <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke,
                refcats = "largest", data = NHANES)

## ---- mod9b ----
mod9b <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke,
                refcats = list(occup = "not working", race = 3, educ = "largest"),
                data = NHANES)

## ----refsmod9 To re-create the selection from the paper, select option 4 in the
## first menu and then option 2 for 'gender', 3 for 'race', 1 for 'educ', 3 for
## 'occup' and 1 for 'smoke'.
refs_mod9 <- set_refcat(NHANES, formula = formula(mod9b))

## ----mod9c ----
mod9c <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke,
                refcats = refs_mod9, data = NHANES)

## ----mod9d ----
hyp <- default_hyperpars()
hyp$norm["shape_tau_norm"] <- 0.5

mod9d <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke,
                data = NHANES, hyperpars = hyp)

## ----mod10a ----
mod10a <- lm_imp(SBP ~ alc, data = NHANES, n.iter = 100)

## ----summod10a
## capture the output of the summary to be able to print only the relevant parts
## of it
a1 <- capture.output(print(summary(mod10a)))
cat(paste0("[...]\n", paste(a1[18:22], collapse = "\n")))

## ----mod10b ----
mod10b <- lm_imp(SBP ~ alc, data = NHANES, n.adapt = 10, n.iter = 100)

## ----summod10b capture the output of the summary to be able to print only the
## relevant parts of it
a2 <- capture.output(print(summary(mod10b)))
cat(paste0("[...]\n", paste(a2[18:22], collapse = "\n")))

## ----mod10c ----
mod10c <- lm_imp(SBP ~ alc, data = NHANES, n.iter = 500, thin = 10)

## ----summod10c
## capture the output of the summary to be able to print only the relevant parts
## of it
a3 <- capture.output(print(summary(mod10c)))
cat(paste0("[...]\n", paste(a3[18:22], collapse = "\n")))

## ----mod11a ----
mod11a <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES,
                 monitor_params = list(other = c("p_alc[1:3]", "mu_creat[1]")))

## ----initlist
init_list <- lapply(1:3, function(i) {
  list(beta = rnorm(5),
       tau_SBP = rgamma(1, 1, 1))
})

mod12a <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES,
                 inits = init_list)

## ----initfun
inits_fun <- function() {
  list(beta = rnorm(5),
       alpha = rnorm(9))
}

mod12b <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES,
                 inits = inits_fun)

## ---- eval = FALSE---------------------------------------------------------
# library("doFuture")
# doFuture::registerDoFuture()
# plan(multiprocess(workers = 4))

## ---- eval = FALSE---------------------------------------------------------
# plan(sequential)


## ----traceplot13a
mod13a <- lm_imp(SBP ~ gender + WC + alc + creat,
                 data = NHANES, n.iter = 500, seed = 2020)

traceplot(mod13a)

## ----ggtrace13a
traceplot(mod13a, ncol = 4, use_ggplot = TRUE) +
  theme(legend.position = "bottom") + 
  scale_color_viridis_d(end = 0.9)

## ----densplot13a
densplot(mod13a, ncol = 4,
         vlines = list(list(v = coef(mod13a)$SBP, lwd = 2),
                       list(v = confint(mod13a)$SBP[, "2.5%"], lty = 2),
                       list(v = confint(mod13a)$SBP[, "97.5%"], lty = 2)))


## ----ggdens13a
#(Appendix B) fit the complete-case version of the model
mod13a_cc <- lm(formula(mod13a), data = NHANES)


# make a dataset containing the quantiles of the posterior sample and confidence
# intervals from the complete case analysis:
quantDF <- rbind(
  data.frame(variable = names(coef(mod13a)$SBP),
             type = "2.5%",
             model = "JointAI", 
             value = confint(mod13a)$SBP[, c("2.5%")]),
  data.frame(variable = names(coef(mod13a)$SBP), 
             type = "97.5%", 
             model = "JointAI", 
             value = confint(mod13a)$SBP[, c("97.5%")]), 
  data.frame(variable = names(coef(mod13a_cc)),
             type = "2.5%",
             model = "cc",
             value = confint(mod13a_cc)[, "2.5 %"]),
  data.frame(variable = names(coef(mod13a_cc)),
             type = "97.5%", 
             model = "cc",
             value = confint(mod13a_cc)[, "97.5 %"]))


# ggplot version, excluding tau_SBP from the plot:
p13a <- densplot(mod13a,
                 ncol = 3, use_ggplot = TRUE, joined = TRUE) + 
  theme(legend.position = "bottom")


# add vertical lines for the: - confidence intervals from the complete case
# analysis - quantiles of the posterior distribution

p13a + 
  geom_vline(data = quantDF,
             aes(xintercept = value, color = model), lty = 2) + 
  scale_color_manual(name = "CI from model: ", limits = c("JointAI", "cc"), values = c("blue", 
    "red"), labels = c("JointAI", "compl.case"))

## ----summarymod13a
summary(mod13a)

## ----mod13b ----
mod13b <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + ns(age, df = 3), 
                  random = ~ ns(age, df = 3) | ID,
                  data = subset(simLong, !is.na(bmi)),
                  n.iter = 250, seed = 2020)

summary(mod13b, missinfo = TRUE)

## ----tailprob
op <- par(mfrow = c(1, 3), mgp = c(1, 0.6, 0), mar = c(2.5, 1, 2, 1))
mus <- c(0.7, -1.5, -2.5)
for (i in seq_along(mus)) {
  x <- seq(-3.5, 3.5, length = 1000) + mus[i]
  y <- dnorm(x, mean = mus[i])

  plot(x, y, type = "l", yaxt = "n", xaxt = "n", xlab = expression(theta), ylab = "", 
    cex.lab = 1.5, main = paste0("tail prob. = ", round(2 * pnorm(0, abs(mus[i])), 
      3)))

  if (mus[i] > 0) {
    polygon(x = c(x[x < 0], max(x[x < 0])), y = c(y[x < 0], min(y)), col = "#18bc9c", 
      border = NA)
  } else {
    polygon(x = c(x[x > 0], min(x[x > 0])), y = c(y[x > 0], min(y)), col = "#18bc9c", 
      border = NA)
  }
  lines(x, y)
  axis(side = 1, at = 0)
  abline(v = 0, lty = 2)
}
par(mfrow = c(1, 1))

## ----GR13a
GR_crit(mod13a)

## ----MCE13acalc
MC_error(mod13a)

## ----MCE13a
plot(MC_error(mod13a))
plot(MC_error(mod13a, end = 250))

## ----mod13c ----
mod13c <- update(mod13a, monitor_params = c(other_models = TRUE))
summary(mod13c, subset = c(analysis_main = FALSE, other_models = TRUE))

## ----densplot13asimple 
densplot(mod13a, nrow = 1,
         subset = list(analysis_main = FALSE,
                       other = c('beta[2]', 'beta[4]')))

## ----trace13d
mod13d <- update(mod13a,
                 monitor_params = c(imps = TRUE))

## ----sub3 
sub3 <- grep("M_lvlone\\[[[:digit:]]+,4\\]", parameters(mod13d)$coef, value = TRUE)
sub3

traceplot(mod13d, subset = list(other = sub3))

## ----mod13e ----
mod13e <- update(mod13b,
                 monitor_params = c(ranef_main = TRUE))

## ----ranefs
rde <- grep("^b_bmi_ID\\[", colnames(mod13e$MCMC[[1]]), value = TRUE)
traceplot(mod13e,
          subset = list(analysis_main = FALSE,
                        other = sample(rde, size = 12)),
          ncol = 4)

## ----predict13a ----
predict(mod13a, newdata = NHANES[27, ])

## ----pred13b ----
newDF <- predDF(mod13b, vars = ~age)
pred <- predict(mod13b, newdata = newDF)

## ----predvis
matplot(pred$newdata$age, pred$newdata[, c("fit", "2.5%", "97.5%")],
        lty = c(1, 2, 2), type = "l", col = 1,
        xlab = "age in months", ylab = "predicted value")

newDF2 <- predDF(mod13b, vars = ~ age + HEIGHT_M,
                 HEIGHT_M = c(160, 175))

## ----impDF
impDF <- get_MIdat(mod13d, m = 10, seed = 2019)

## ----impcomp
# Note: The function plot_imp_distr() requires the package "ggpubr" to be installed.
# You can install this package using the following syntax:
# install.packages("ggpubr")
plot_imp_distr(impDF, nrow = 1)

## ----defaulthyperpars (Appendix A)
default_hyperpars()