##### Section 3.7 ############################################################## ## Use this code to see which super learner candidates are available ## and to display information about how to plug in a user-defined ## candidate library("multiPIM") ?Candidates ##### Section 3.8 ############################################################## ## This snippet just shows the code that is called from within the ## multiPIMboot function when multicore = TRUE in order to set the ## random number generator # RNGkind("L'Ecuyer-CMRG") ##### Section 3.10 ############################################################# num.columns <- 3 num.obs <- 250 set.seed(23) A <- as.data.frame(matrix(rbinom(num.columns*num.obs, 1, .5), nrow = num.obs, ncol = num.columns)) Y <- as.data.frame(matrix(rnorm(num.columns*num.obs), nrow = num.obs, ncol = num.columns)) for(i in 1:num.columns) Y[, i] <- Y[, i] + i * A[, i] names(A) <- paste("A", 1:num.columns, sep = "") names(Y) <- paste("Y", 1:num.columns, sep = "") result <- multiPIM(Y, A) summary(result) ##### Section 4 ################################################################ ## Version information: ## R: 2.15.2 ## multiPIM: 1.3-1 ## lars: 1.1 ## penalized: 0.9-42 ## polspline: 1.1.6 ## MASS: 7.3-22 ## load packages library("MASS") ## use this to generate multivariate normal data (mvrnorm) ## set parameters ns <- round(100*(2500^(1/99))^(0:99)) ## sample sizes ## (100 of them, evenly spaced on log scale, ranging from 100 to 250,000) num.covs <- 4 ## number of covariates error.sd <- 2 ## standard deviation of error term for generation of outcome y1 ## set up covariance matrix for multivariate normal covariates covar <- 0.2 sigma <- matrix(covar, num.covs, num.covs) diag(sigma) <- 1 ## function to generate W as multivariate normal gen.W <- function(n, p, Sigma) { W <- data.frame(mvrnorm(n = n, mu = rep(0, p), Sigma = Sigma)) names(W) <- paste("w", 1:p, sep = "") return(W) } ## function to generate A based on covs (W) gen.A <- function(W) { A.probs <- plogis(0.2*rowSums(as.matrix(W))) A <- data.frame(a1 = rbinom(nrow(W), 1, A.probs)) } ## function to generate Y based on A and W gen.Y <- function(A, W, error.sd) { Y <- data.frame(y1 = W[,1]*W[,2] + W[,3]*W[,4] + rowSums(as.matrix(W)^2) * A[[1]] + rnorm(nrow(W), 0, error.sd)) } ## define estimators to be used estimators <- c("TMLE", "G-COMP") ####### set up storage objects ######## ## list of lists to store resulting objects multiPIM.objects <- vector("list", length = length(ns)) for(i in 1:length(ns)) multiPIM.objects[[i]] <- vector("list", length = length(estimators)) ## matrix to store parameter estimates param.estimates <- matrix(0, length(ns), length(estimators), dimnames = list(ns = ns, estimators = estimators)) ## actual simulation ## set seed set.seed(23) start.time <- Sys.time() for(a in length(ns):1) { ## do the big ns first cat("\n########################################\nStarting on n =", ns[a], "\n########################################\n\n") W <- gen.W(ns[a], num.covs, sigma) A <- gen.A(W) Y <- gen.Y(A, W, error.sd) for(b in 1:length(estimators)) { multiPIM.objects[[a]][[b]] <- multiPIM(Y, A, W, estimator = estimators[b], g.method = "main.terms.logistic", ## will be ignored for g-comp Q.method = "main.terms.linear", return.final.models = FALSE) param.estimates[a, b] <- multiPIM.objects[[a]][[b]]$param.estimates[1] } } end.time = Sys.time() save(param.estimates, multiPIM.objects, file = "sim_stuff.rda") time.taken <- difftime(end.time, start.time, units = "mins") cat("\n########################################\ntime taken is,", time.taken, "minutes\n########################################\n") ## find "true" value of psi truth.n <- 500000 truth.reps <- 100 truth.estimates <- vector("numeric", length = truth.reps) time1 <- Sys.time() for(i in 1:truth.reps) { W <- gen.W(truth.n, num.covs, sigma) A <- gen.A(W) Y <- gen.Y(A, W, error.sd) A.0 <- A A.0[] <- 0 Y.0 <- gen.Y(A.0, W, error.sd) truth.estimates[i] <- mean(Y.0[[1]]) - mean(Y[[1]]) } time2 <- Sys.time() time.taken <- difftime(time2, time1, units = "mins") truth <- mean(truth.estimates) truth.sd <- sd(truth.estimates) save(truth.estimates, truth, truth.sd, file = "truth_stuff.rda") cat("\n######################################\ntime taken for truth stuff is,", time.taken, "minutes\n######################################\n") cat("\n########################################\ntruth =", truth, "\ttruth.sd = ", truth.sd, "\n########################################\n") ## Figure 1 ns <- as.integer(rownames(param.estimates)) x.axis.labs.at <- c(100*10^(0:3), 250*10^(0:3), 500*10^(0:2)) miny <- min(param.estimates) maxy <- max(param.estimates) par(mar = c(4.5, 4, 1, 1)) cex <- 1.3 cex.axis <- 1.2 cols <- c("blue", "red") pchs <- c(3, 1) plot(c(min(ns), max(ns)), c(miny, maxy), type = "n", log = "x", ylab = "Parameter estimate", xlab = "Sample size (log scaling)", xaxt = "n", cex.axis = cex.axis, cex.lab = cex) axis(1, at = x.axis.labs.at, cex.axis = cex.axis) points(ns, param.estimates[, 1], col = cols[1], pch = pchs[1], type = "b") points(ns, param.estimates[, 2], col = cols[2], pch = pchs[2], type = "b") abline(h = truth, col = "black") legend(x = 32000, y = maxy, legend = c("truth = -2.0", "TMLE", "G-Computation"), col = c("black", cols), pch = c(NA, pchs), lty = c(1, NA, NA), cex = cex) ##### Section 5 ################################################################ ## Version information: ## R: 2.15.2 ## multiPIM: 1.3-1 ## lars: 1.1 ## parallel 2.15.2 (part of R distribution) ## penalized: 0.9-42 ## polspline: 1.1.6 ## load data data("wcgs") ## remove 12 observations which have missing cholesterol values wcgs.data.no.na <- na.omit(wcgs) ## cutoff values (in most cases this is upper limit for unexposed group) bmi.cutoff <- 25 chol.cutoff <- 240 dbp.cutoff <- 90 sbp.cutoff <- 140 cigs.cutoff <- 0 ## attach the final data frame attach(wcgs.data.no.na) bmi <- as.integer(bmi0 > bmi.cutoff) chol <- as.integer(chol0 > chol.cutoff) ## subject will be considered as exposed to high BP if either systolic or ## diastolic is elevated high.dbp <- dbp0 > dbp.cutoff high.sbp <- sbp0 > sbp.cutoff highBP <- as.integer(high.dbp | high.sbp) ## exposed is type A typeAB <- dibpat0 ## 1->A and 0->B as required cigs <- as.integer(ncigs0 > cigs.cutoff) A <- data.frame(typeAB=typeAB, bmi=bmi, chol=chol, highBP = highBP, cigs=cigs) W <- data.frame(age=age0) Y <- data.frame(chd=chd69) detach(wcgs.data.no.na) save(Y, A, W, file = "wcgs_YAW.rda") ## this will probably take quite a while... time1 <- Sys.time() boot.result <- multiPIMboot(Y, A, W, times = 2000, multicore = TRUE, mc.num.jobs = 8, verbose = TRUE) time2 <- Sys.time() save(boot.result, file = "boot_result.rda") cat("Time taken is", difftime(time2, time1, units = "hours"), "hours\n") ## tables library("xtable") ## Table 1 res.sum <- summary(boot.result) exposure.order <- order(res.sum$summary.array[, 1, 4]) chi.results <- lapply(A, chisq.test, Y[, 1]) proportion.diseased <- mean(Y[[1]]) summary.vals <- cbind(t(sapply(A, function(x){c(sum(x), mean(x))})), t(sapply(chi.results, function(x) { return(c(x$observed[1, 2] / sum(x$observed[1, ]), x$observed[2, 2] / sum(x$observed[2, ])))}))) summary.vals <- cbind(summary.vals, summary.vals[, 3] - proportion.diseased) asum.tab <- rbind(c("group", "exposed", "with CHD", "with CHD", "attr. risk"), cbind(c("type B", "< 25", "< 240", "D<90 \\& S<140", "< 1/day"), format(summary.vals[, -c(1, 5)], digits = 2, scientific = FALSE), format(summary.vals[, 5], digits = 2))[exposure.order, ]) colnames(asum.tab) <- c("Unexposed", "Proportion", "Prop. unexp.", "Prop. exp.", "Naive") rownames(asum.tab)[1] <- " " align1 <- "lccccc" asum.xtab <- xtable(asum.tab, align = align1) print(asum.xtab, hline.after = c(-1, -1, 1, 6), floating = F, sanitize.text.function = function(str){str}) ## Table 2 cor.tab <- cor(cbind(W, A[, exposure.order], Y)) cor.tab[row(cor.tab) > col(cor.tab)] <- NA align3 <- "lccccccc" cor.xtab <- xtable(cor.tab, align = align3) print(cor.xtab, floating = F, hline.after = c(-1, -1, 0, 7)) ## Table 3 res.tab <- cbind(res.sum$summary.array[, 1, 1], boot.result$plug.in.stand.errs, res.sum$summary.array[, 1, -1]) colnames(res.tab) <- c("$\\hat{\\psi}^{TMLE}$", "$\\hat{\\sigma}^{plug-in}$", "$\\hat{\\sigma}^{boot}$", "$T$", "$p$", "$p^{Bonferroni}$") digits2 <- cbind(matrix(4, nrow = 5, ncol = 4), rep(2, 5), c(-2, -2, -2, 3, 3), c(-2, -2, -2, 3, 1)) align2 <- "lcccccc" res.xtab <- xtable(res.tab[exposure.order, ], digits = digits2, align = align2) print(res.xtab, sanitize.text.function = function(str){str}, floating = F, hline.after = c(-1, -1, 0, 5)) ## Figure 2 par(mfrow = c(5, 1), mar = c(2,3,3,0), oma = c(3, 2, 0, 2), cex.lab = 1.5) grid <- (-25:10/500) get.subset <- function(vec, grid){ increment = grid[2] - grid[1] return((grid > (min(vec) - increment)) & (grid < (max(vec) + increment))) } grid.subsets <- apply(boot.result$boot.param.array[, , 1], 2, get.subset, grid) for(i in exposure.order) { hist(boot.result$boot.param.array[, i, 1], xlim = c(min(grid), max(grid)), breaks = grid[grid.subsets[, i]], yaxt = "n", ylab = NULL, main = NULL, ylim = c(0, 600)) title(main = rownames(boot.result$param)[i], line = 0.6, cex = 1.1) abline(v = boot.result$param[i, 1], lty = 2) axis(2, 0:5*100, labels = c("0", NA, NA, NA, NA, "500")) } cex = 1 mtext("Bootstrap parameter estimates", side = 1, outer = TRUE, adj = .56, line = 1.5, cex = cex) mtext("Frequency", side = 2, outer = TRUE, adj = 0.48, cex = cex, line = 0.6) ## Figure 3 par(mar = c(5, 4, 1, 2)) cumulative <- function(x, fun) { return.vec <- vector("double", length = length(x)) for(i in 1:length(x)) return.vec[i] <- fun(x[1:i]) return(return.vec) } cumulativeSDs <- apply(boot.result$boot.param.array[, , 1], 2, cumulative, fun = sd) plot(c(0, 2000), c(min(cumulativeSDs, na.rm = T), max(cumulativeSDs, na.rm = T)), type = "n", ylab = "Cumulative standard deviation", xlab = "Bootstrap replicate number", cex.lab = 1, cex.axis = 1) for(i in 1:boot.result$num.exposures) { lines(cumulativeSDs[, i]) } ##### Section 6 ################################################################ ## Version information: ## R: 2.15.2 ## multiPIM: 1.3-1 ## rpart: 4.1-0 ## data data("schisto") W <- data.frame(lapply(schisto[, 9:10], as.factor)) A <- schisto[, 2:8] Y <- schisto[, 1, drop = FALSE] ## analysis set.seed(23) num.types <- ncol(A) multiPIM.results <- vector("list", length = num.types) for(i in 1:num.types) { A.current <- A[, i, drop = FALSE] A.current[A.current > 0] <- 1 W.current <- cbind(W, A[, -i]) multiPIM.results[[i]] <- multiPIM(Y, A.current, W.current, estimator = "IPCW", g.method = "rpart.bin") } ## Table 4 results.tab <- t(sapply(multiPIM.results, function(x) summary(x, bf.multiplier = 7)$sum[1,1,])) rownames(results.tab) <- names(A) print(xtable(results.tab, digits = 3), floating = FALSE)