# R Code for the paper titled JointAI: Joint Analysis and Imputation of # Incomplete Data in R ## Note: Many of the models in this syntax will return a message saying that no ## MCMC sample will be created when n.iter is set to 0 (the default value for the ## argument n.iter). Specifically, this is the case for models that are used to ## demonstrate how models are specified in JointAI, for which no output or results ## were needed in the paper. To obtain an MCMC samples for these models, the ## argument n.iter has to be set, for example n.iter = 100. In addition, some ## models will result in a warning about the usage of orthogonal polynomial ## contrasts ('contr.poly') for incomplete categorical covariates. These ## warnings/messages are printed to inform the user about the behaviour of JointAI ## and do not indicate problems with the model. They were omitted in the paper. library("knitr") library("survival") library("JointAI") library("ggplot2") # ggplot theme theme_set(theme_gray()) theme_update(panel.grid = element_blank(), panel.background = element_rect(fill = grey(0.98)), panel.border = element_rect(fill = "transparent", color = grey(0.85))) ## ----distrplotNHANES plot_all(NHANES, ncol = 5) ## ----mdpatternNHANES md_pattern(data = NHANES, pattern = FALSE) ## ----trajectories plotDF <- reshape2::melt(simLong[, c("ID", "age", "hgt", "wgt", "bmi", "hc")], id.vars = c("ID", "age")) ggplot(plotDF, aes(x = age, y = value, color = factor(ID))) + geom_line(na.rm = TRUE, alpha = 0.7) + facet_wrap("variable", scales = "free", ncol = 4) + scale_color_viridis_d(guide = "none", end = 0.8) + xlab("age in months") + theme(strip.text = element_text(size = 11)) ## ----distrplotsimLong plot_all(simLong, use_level = TRUE, idvar = "ID", ncol = 5) ## ----mdpatternsimLongprep md1 <- md_pattern(data = subset(simLong, select = -c(bmi, age, hc, hgt, wgt, sleep, time)), pattern = FALSE) md2 <- md_pattern(data = subset(simLong, select = c(bmi, age, hc, hgt, wgt, sleep, time)), pattern = FALSE) ## ----mdpatternsimLong cowplot::plot_grid(md2, md1, align = "h", rel_widths = c(7, 9)) ## ----mdpatternPBC md_pattern(PBC) ## ----distrplotPBC plot_all(PBC, idvars = "id", nclass = 15, ncol = 5) ## ----mod1a ---- mod1a <- glm_imp(educ ~ gender * (age + smoke + creat), data = NHANES, family = binomial()) ## ----mod1b ---- mod1b <- glm_imp(educ ~ gender + (age + smoke + creat)^3, data = NHANES, family = binomial()) ## ----mod2a ---- mod2a <- lm_imp(SBP ~ age + gender + abs(bili - creat), data = NHANES) ## ----mod2b ---- mod2b <- lm_imp(SBP ~ ns(age, df = 2) + gender + I(bili^2) + I(bili^3), data = NHANES) ## ----mod2c ---- mod2c <- lm_imp(SBP ~ age + gender + I(creat/albu^2), data = NHANES, trunc = list(albu = c(1e-05, NA))) ## ----mod2d ---- mod2d <- lm_imp(SBP ~ bili + sin(creat) + cos(albu), data = NHANES) ## ----mod2e ---- mod2e <- lm_imp(SBP ~ age + gender + sqrt(exp(creat)/2), data = NHANES) ## ----mod3a ---- mod3a <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat), trunc = list(uricacid = c(1e-05, NA)), data = NHANES) ## ----mod3b ---- mod3b <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat), models = c(uricacid = "lognorm"), data = NHANES) ## ----mod3c ---- mod3c <- lm_imp(SBP ~ age + gender + log(uricacid) + exp(creat), models = c(uricacid = "glm_gamma_inverse"), data = NHANES) ## ----ilogit ilogit <- plogis ## ----mod4a ---- mod4a <- lm_imp(SBP ~ age + gender + ilogit(creat), data = NHANES) ## ----listmod2b list_models(mod2b, priors = FALSE, regcoef = FALSE, otherpars = FALSE) ## ----mod4b ---- mod4b <- lm_imp(SBP ~ age + gender + bili, auxvars = ~I(WC^2), data = NHANES) list_models(mod4b, priors = FALSE, regcoef = FALSE, otherpars = FALSE) ## ----mod5 ---- mod5 <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + ns(age, df = 2), random = ~ ns(age, df = 2) | ID, data = simLong) ## ----mod6a ---- mod6a <- survreg_imp(Surv(futime, status != "alive") ~ age + sex + copper + trig, models = c(copper = "lognorm", trig = "lognorm"), data = subset(PBC, day == 0), n.iter = 250) ## ----mod6b ---- mod6b <- coxph_imp(Surv(futime, status != "alive") ~ age + sex + copper + trig, models = c(copper = "lognorm", trig = "lognorm"), data = subset(PBC, day == 0), n.iter = 250) ## ----mod6c ---- mod6c <- coxph_imp(Surv(futime, status != "alive") ~ age + sex + copper + trig + platelet + (1 | id), models = c(copper = "lognorm", trig = "lognorm"), timevar = "day", data = PBC) ## ----mod6d ---- PBC$logbili <- log(PBC$bili) mod6d <- JM_imp( list(Surv(futime, status != "alive") ~ age + sex + platelet + logbili + (1 | id), platelet ~ age + sex + day + logbili + (day | id), logbili ~ age + sex + day + (day | id) ), timevar = "day", data = PBC, n.adapt = 10) ## ----mod7a ---- mod7a <- lm_imp(SBP ~ age + gender + WC + alc + bili + occup + smoke, models = c(WC = "glm_gamma_inverse", bili = "lognorm"), data = NHANES, n.adapt = 0) mod7a$models ## ----mod7cmodels ---- mod7b <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + SMOKE + hc + MARITAL + ns(age, df = 2), random = ~ ns(age, df = 2) | ID, data = simLong, no_model = "age", n.adapt = 0) mod7b$models ## ----mod8a ---- mod8a <- lm_imp(SBP ~ gender + age + occup, auxvars = ~ educ + smoke, data = NHANES, n.iter = 100) ## ----listmod8a list_models(mod8a, priors = FALSE, regcoef = FALSE, otherpars = FALSE, refcat = FALSE) ## ---- mod9a ---- mod9a <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke, refcats = "largest", data = NHANES) ## ---- mod9b ---- mod9b <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke, refcats = list(occup = "not working", race = 3, educ = "largest"), data = NHANES) ## ----refsmod9 To re-create the selection from the paper, select option 4 in the ## first menu and then option 2 for 'gender', 3 for 'race', 1 for 'educ', 3 for ## 'occup' and 1 for 'smoke'. refs_mod9 <- set_refcat(NHANES, formula = formula(mod9b)) ## ----mod9c ---- mod9c <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke, refcats = refs_mod9, data = NHANES) ## ----mod9d ---- hyp <- default_hyperpars() hyp$norm["shape_tau_norm"] <- 0.5 mod9d <- lm_imp(SBP ~ gender + age + race + educ + occup + smoke, data = NHANES, hyperpars = hyp) ## ----mod10a ---- mod10a <- lm_imp(SBP ~ alc, data = NHANES, n.iter = 100) ## ----summod10a ## capture the output of the summary to be able to print only the relevant parts ## of it a1 <- capture.output(print(summary(mod10a))) cat(paste0("[...]\n", paste(a1[18:22], collapse = "\n"))) ## ----mod10b ---- mod10b <- lm_imp(SBP ~ alc, data = NHANES, n.adapt = 10, n.iter = 100) ## ----summod10b capture the output of the summary to be able to print only the ## relevant parts of it a2 <- capture.output(print(summary(mod10b))) cat(paste0("[...]\n", paste(a2[18:22], collapse = "\n"))) ## ----mod10c ---- mod10c <- lm_imp(SBP ~ alc, data = NHANES, n.iter = 500, thin = 10) ## ----summod10c ## capture the output of the summary to be able to print only the relevant parts ## of it a3 <- capture.output(print(summary(mod10c))) cat(paste0("[...]\n", paste(a3[18:22], collapse = "\n"))) ## ----mod11a ---- mod11a <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES, monitor_params = list(other = c("p_alc[1:3]", "mu_creat[1]"))) ## ----initlist init_list <- lapply(1:3, function(i) { list(beta = rnorm(5), tau_SBP = rgamma(1, 1, 1)) }) mod12a <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES, inits = init_list) ## ----initfun inits_fun <- function() { list(beta = rnorm(5), alpha = rnorm(9)) } mod12b <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES, inits = inits_fun) ## ---- eval = FALSE--------------------------------------------------------- # library("doFuture") # doFuture::registerDoFuture() # plan(multiprocess(workers = 4)) ## ---- eval = FALSE--------------------------------------------------------- # plan(sequential) ## ----traceplot13a mod13a <- lm_imp(SBP ~ gender + WC + alc + creat, data = NHANES, n.iter = 500, seed = 2020) traceplot(mod13a) ## ----ggtrace13a traceplot(mod13a, ncol = 4, use_ggplot = TRUE) + theme(legend.position = "bottom") + scale_color_viridis_d(end = 0.9) ## ----densplot13a densplot(mod13a, ncol = 4, vlines = list(list(v = coef(mod13a)$SBP, lwd = 2), list(v = confint(mod13a)$SBP[, "2.5%"], lty = 2), list(v = confint(mod13a)$SBP[, "97.5%"], lty = 2))) ## ----ggdens13a #(Appendix B) fit the complete-case version of the model mod13a_cc <- lm(formula(mod13a), data = NHANES) # make a dataset containing the quantiles of the posterior sample and confidence # intervals from the complete case analysis: quantDF <- rbind( data.frame(variable = names(coef(mod13a)$SBP), type = "2.5%", model = "JointAI", value = confint(mod13a)$SBP[, c("2.5%")]), data.frame(variable = names(coef(mod13a)$SBP), type = "97.5%", model = "JointAI", value = confint(mod13a)$SBP[, c("97.5%")]), data.frame(variable = names(coef(mod13a_cc)), type = "2.5%", model = "cc", value = confint(mod13a_cc)[, "2.5 %"]), data.frame(variable = names(coef(mod13a_cc)), type = "97.5%", model = "cc", value = confint(mod13a_cc)[, "97.5 %"])) # ggplot version, excluding tau_SBP from the plot: p13a <- densplot(mod13a, ncol = 3, use_ggplot = TRUE, joined = TRUE) + theme(legend.position = "bottom") # add vertical lines for the: - confidence intervals from the complete case # analysis - quantiles of the posterior distribution p13a + geom_vline(data = quantDF, aes(xintercept = value, color = model), lty = 2) + scale_color_manual(name = "CI from model: ", limits = c("JointAI", "cc"), values = c("blue", "red"), labels = c("JointAI", "compl.case")) ## ----summarymod13a summary(mod13a) ## ----mod13b ---- mod13b <- lme_imp(bmi ~ GESTBIR + ETHN + HEIGHT_M + ns(age, df = 3), random = ~ ns(age, df = 3) | ID, data = subset(simLong, !is.na(bmi)), n.iter = 250, seed = 2020) summary(mod13b, missinfo = TRUE) ## ----tailprob op <- par(mfrow = c(1, 3), mgp = c(1, 0.6, 0), mar = c(2.5, 1, 2, 1)) mus <- c(0.7, -1.5, -2.5) for (i in seq_along(mus)) { x <- seq(-3.5, 3.5, length = 1000) + mus[i] y <- dnorm(x, mean = mus[i]) plot(x, y, type = "l", yaxt = "n", xaxt = "n", xlab = expression(theta), ylab = "", cex.lab = 1.5, main = paste0("tail prob. = ", round(2 * pnorm(0, abs(mus[i])), 3))) if (mus[i] > 0) { polygon(x = c(x[x < 0], max(x[x < 0])), y = c(y[x < 0], min(y)), col = "#18bc9c", border = NA) } else { polygon(x = c(x[x > 0], min(x[x > 0])), y = c(y[x > 0], min(y)), col = "#18bc9c", border = NA) } lines(x, y) axis(side = 1, at = 0) abline(v = 0, lty = 2) } par(mfrow = c(1, 1)) ## ----GR13a GR_crit(mod13a) ## ----MCE13acalc MC_error(mod13a) ## ----MCE13a plot(MC_error(mod13a)) plot(MC_error(mod13a, end = 250)) ## ----mod13c ---- mod13c <- update(mod13a, monitor_params = c(other_models = TRUE)) summary(mod13c, subset = c(analysis_main = FALSE, other_models = TRUE)) ## ----densplot13asimple densplot(mod13a, nrow = 1, subset = list(analysis_main = FALSE, other = c('beta[2]', 'beta[4]'))) ## ----trace13d mod13d <- update(mod13a, monitor_params = c(imps = TRUE)) ## ----sub3 sub3 <- grep("M_lvlone\\[[[:digit:]]+,4\\]", parameters(mod13d)$coef, value = TRUE) sub3 traceplot(mod13d, subset = list(other = sub3)) ## ----mod13e ---- mod13e <- update(mod13b, monitor_params = c(ranef_main = TRUE)) ## ----ranefs rde <- grep("^b_bmi_ID\\[", colnames(mod13e$MCMC[[1]]), value = TRUE) traceplot(mod13e, subset = list(analysis_main = FALSE, other = sample(rde, size = 12)), ncol = 4) ## ----predict13a ---- predict(mod13a, newdata = NHANES[27, ]) ## ----pred13b ---- newDF <- predDF(mod13b, vars = ~age) pred <- predict(mod13b, newdata = newDF) ## ----predvis matplot(pred$newdata$age, pred$newdata[, c("fit", "2.5%", "97.5%")], lty = c(1, 2, 2), type = "l", col = 1, xlab = "age in months", ylab = "predicted value") newDF2 <- predDF(mod13b, vars = ~ age + HEIGHT_M, HEIGHT_M = c(160, 175)) ## ----impDF impDF <- get_MIdat(mod13d, m = 10, seed = 2019) ## ----impcomp # Note: The function plot_imp_distr() requires the package "ggpubr" to be installed. # You can install this package using the following syntax: # install.packages("ggpubr") plot_imp_distr(impDF, nrow = 1) ## ----defaulthyperpars (Appendix A) default_hyperpars()