# Simulation and Data Analysis Code library("tmle") # ----- Code chunk 1: Q and g correctly specified ----- set.seed(1960) n <- 250 W <- matrix(rnorm(n * 3), ncol = 3) colnames(W) <- paste("W", 1:3, sep = "") A <- rbinom(n,1, plogis(0.6 * W[,1] + 0.4 * W[,2] + 0.5 * W[,3])) Y <- rbinom(n,1, plogis(A + 0.2 * W[,1] + 0.1 * W[,2] + 0.2 * W[,3]^2)) result.Qcgc <- tmle(Y, A, W, family = "binomial", Qform = Y~ A + W1 + W2 + W3, gform = A ~ W1 + W2 + W3) result.Qcgc # ----- Code chunk 2: Superlearning for Q, regression formula for g ----- result.QSLgc <- tmle(Y, A, W, family="binomial", Q.SL.library = c("SL.glm", "SL.step", "SL.DSA.2"), gform = A ~ W1 + W2 + W3) summary(result.QSLgc) # ----- Code chunk 3: Repeated measures, Q and g correctly specified ----- set.seed(1960) n <- 250 id <- rep(1:n, 2) W <- matrix(rnorm(n * 3), ncol = 3) colnames(W) <- paste("W", 1:3, sep = "") W <- rbind(W,W) A <- rbinom(2*n, 1, plogis(0.6 * W[,1] + 0.4 * W[,2] + 0.5 * W[,3])) Y <- rbinom(2*n, 1, plogis(A + 0.2 * W[,1] + 0.1 * W[,2] + 0.2 * W[,3]^2)) result.Qcgc.repeated <- tmle(Y, A, W, family = "binomial", Qform = Y~ A + W1 + W2 + W3, gform = A ~ W1 + W2 + W3, id = id) result.Qcgc.repeated # ----- Code Chunk 4: EY1 parameter ----- set.seed(1960) n <- 250 W <- matrix(rnorm(n * 3), ncol = 3) colnames(W) <- paste("W",1:3, sep = "") Delta <- rbinom(n, 1, plogis(0.8 + 0.3*W[,1])) Y <- 2 * W[,1] + 4 * W[,2] + 3 * W[,3]+ rnorm(n) Y[Delta == 0] <- NA result.EY1 <- tmle(Y,A = rep(1, n), W, Qform = Y ~ W3, g.Deltaform = Delta ~ W1, Delta = Delta) result.EY1 # ----- Code Chunk 5: ETA violations, logistic and linear fluctuation ----- set.seed(1960) n <- 250 niterations <- 250 gbd <- c(0, 0.01, 0.025, 0.05, 0.1) ngbd <- length(gbd) result.Qmgc <- matrix(NA, nrow = niterations, ncol = 2 * ngbd) for(i in 1:niterations){ W <- matrix(rnorm(n * 3), ncol = 3) colnames(W) <- paste("W", 1:3, sep = "") logitA <- 0.5 + 0.9 * W[,1] + 0.5 * W[,2] + 0.7 * W[,3] A <- rbinom(n,1, plogis(logitA)) Y <- A + 4 * W[,1] + 4 * W[,2] + 3 * W[,3] + rnorm(n) result.Qmgc[i,] <- c(unlist(sapply(gbd, function(x){ tmle(Y, A, W, Qform = Y ~ A, gform = A ~ W1 + W2 + W3, fluctuation = "linear", gbound = x)$estimates$ATE[1]})), unlist(sapply(gbd, function(x){ tmle(Y, A, W, Qform = Y ~ A, gform = A ~ W1+ W2 + W3, fluctuation = "logistic", gbound = x)$estimates$ATE[1]}))) } psi0 <- 1 bias <- colMeans(result.Qmgc - psi0) variance <- apply(result.Qmgc, 2, var) MSE <- colMeans((result.Qmgc -psi0)^2) d <- data.frame(bias.lin = bias[1:5], var.lin = variance[1:5], MSE.lin = MSE[1:5], bias.log = bias[6:10], var.log = variance[6:10], MSE.log = MSE[6:10]) round(d,2) # ----- Code Chunk 6: Controlled Direct Effect ----- set.seed(1960) n <- 1000 W <- matrix(rnorm(n*3), ncol = 3) colnames(W) <- paste("W", 1:3, sep = "") A <- rbinom(n,1, plogis(0.6*W[,1] + 0.4*W[,2] + 0.5*W[,3])) Z <- rbinom(n,1, plogis(0.5 + A)) Y <- A + A*Z+ 0.2*W[,1] + 0.1*W[,2] + 0.2*W[,3]^2 + rnorm(n) Delta <- rbinom(n,1, plogis(Z + A)) pDelta1 <- cbind(rep(plogis(0), n), rep(plogis(1), n), rep(plogis(1), n), rep(plogis(2), n)) colnames(pDelta1) <- c("Z0A0", "Z0A1", "Z1A0", "Z1A1") Y[Delta == 0] <- NA result.Z.missing <- tmle(Y, A, W, Z, Delta = Delta, pDelta1= pDelta1, Qform = Y ~ 1, g.SL.library = "SL.glm") result.Z.missing # ----- Code Chunk 7: AIPTW Estimator ----- calc_aipw <- function(d, Qform, g1W) { Q <- glm(Qform, data = d) QAW.pred <- predict(Q, newdata = d, type ="response") Q1W.pred <- predict(Q, newdata = data.frame(d[,-2], A = 1), type = "response") Q0W.pred <- predict(Q, newdata = data.frame(d[,-2], A = 0), type = "response") h <- d$A/g1W - (1 - d$A)/(1 - g1W) psi <- mean(h*(d$Y - QAW.pred) + Q1W.pred - Q0W.pred) return(psi) } # ----- Code Chunk 8: Simulation Study ----- set.seed(10) n <- 500 niter <- 500 a <- c(.1, 1) b <- c(.25,1.5) est.beta0 <- array(NA, dim = c(2, niter,3), dimnames = list(c("g1", "g2"), NULL, c("IPW", "IPW stabilized","TMLE.MSM"))) est.ATE <- array(NA, dim = c(2, niter, 5), dimnames = list(c("g1", "g2"), NULL, c("IPW", "IPW stabilized","TMLE.MSM", "TMLE", "AIPW"))) for (i in 1:2){ for (j in 1:niter){ U <- runif(n, -10, 10) W <- U/3 + rnorm(n) logitA <- a[i] + b[i]*W A <- rbinom(n, 1, plogis(logitA)) Y <- 2 + 4 * U - 5 * A + rnorm(n) g <- glm(A ~ W, family = "binomial") g1W <- predict(g, type = "response") wt <- A/g1W + (1 - A)/(1 - g1W) wt.stab <- (A * mean(A) + (1 - A) * (1 - mean(A))) * wt ipw.msm <- coef(glm(Y ~ A, weights = wt)) ipw.stab.msm <- coef(glm(Y ~ A, weights = wt.stab)) res.tmleMSM <- tmleMSM(Y, A, as.matrix(W), V = rep(1, n), MSM = "A", Qform = "Y ~ A", g1W = g1W, ub = Inf) res.tmle <- tmle(Y, A, as.matrix(W), Qform ="Y ~ A", g1W = g1W, gbound = c(0,1)) aipw <- calc_aipw(data.frame(Y, A, W), Qform = "Y ~ A", g1W = g1W) est.beta0[i,j,] <- c(ipw.msm[1], ipw.stab.msm[1], res.tmleMSM$psi[1]) est.ATE[i,j,] <- c(ipw.msm[2], ipw.stab.msm[2], res.tmleMSM$psi[2], res.tmle$estimates$ATE$psi, aipw) }} # ----- Data Analysis: FEV dataset ----- data("fev") fev <- fev[fev$age >= 9,] #1. Main terms linear regression to estimate initial Q g.SL.library <- c("SL.glm", "SL.step", "SL.DSA.2", "SL.loess", "SL.caret", "SL.bart", "SL.knn", "SL.knn20", "SL.knn40", "SL.knn60") set.seed(10) smoke.Qmis <- tmle(Y = fev$fev, A = fev$smoke, W = fev[ ,c(1, 3, 4)], Qform = Y ~ ., g.SL.library = g.SL.library) smoke.Qmis # untargeted estimate EY0 <- mean(smoke.Qmis$Qinit$Q[,"Q0W"]) EY1 <- mean(smoke.Qmis$Qinit$Q[,"Q1W"]) EY1 - EY0 #2. Super learning to estimate initial Q SL.glm.int <- function(Y.temp, X.temp, newX.temp, family, ...){ Aint <- paste("A", colnames(X.temp)[-c(1,2)], sep="*") form <- paste("Y.temp~Z+", paste(Aint, collapse="+")) fit.glm <- glm(form, data = data.frame(Y.temp,X.temp), family = family) out <- predict(fit.glm, newdata = newX.temp, type = "response") fit <- list(object = fit.glm) foo <- list(out = out, fit = fit) class(foo$fit) <- c("SL.glm.int") return(foo) } Q.SL.library <- c("SL.glm","SL.glm.int", "SL.DSA.2", "SL.loess", "SL.caret", "SL.bart") smoke.QSL <- tmle(Y = fev$fev, A = fev$smoke, W = fev[, c(1,3,4)], Q.SL.library = Q.SL.library, g1W = smoke.Qmis$g$g1W) smoke.QSL # ----- FAQ: Categorical Treatment ----- set.seed(10) n <- 10^5 A <- sample(1:3, n, replace = TRUE) W1 <- rnorm(n) W2 <- rbinom(n, 1, 0.3) Y <- 2 * A + 10 * W1 - A * W2 + rnorm(n) Delta1 <- as.integer(A == 1) Delta2 <- as.integer(A == 2) Delta3 <- as.integer(A == 3) # Estimate EY1, EY2, EY3 parameters result1 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta1, Qform = "Y ~ A", g.Deltaform = "Delta ~ 1") result2 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta2, Qform = "Y ~ A", g.Deltaform = "Delta ~ 1") result3 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta3, Qform = "Y ~ A", g.Deltaform = "Delta ~ 1") print(c(result1$estimates$EY1$psi, result2$estimates$EY1$psi, result3$estimates$EY1$psi)) # Calculcate ATE for A=1 vs. A=3 print(result1$estimates$EY1$psi - result3$estimates$EY1$psi) # Dose-Response Measure Qstar <- c(result1$Qstar[,1], result2$Qstar[,1], result3$Qstar[,1]) A.intervened <- rep(1:3, each = n) doseResponse <- coef(lm(Qstar ~ A.intervened))[2] print(doseResponse)