# Simulation and Data Analysis Code
library("tmle")

# ----- Code chunk 1: Q and g correctly specified -----
set.seed(1960)
n <- 250
W <- matrix(rnorm(n * 3), ncol = 3)
colnames(W) <- paste("W", 1:3, sep = "")
A <- rbinom(n,1, plogis(0.6 * W[,1] + 0.4 * W[,2] + 0.5 * W[,3]))
Y <- rbinom(n,1, plogis(A + 0.2 * W[,1] + 0.1 * W[,2] + 0.2 * W[,3]^2))

result.Qcgc <- tmle(Y, A, W, family = "binomial",  
               Qform = Y~ A + W1 + W2 + W3, gform = A ~ W1 + W2 + W3)
result.Qcgc


# ----- Code chunk 2: Superlearning for Q, regression formula for g -----
result.QSLgc <- tmle(Y, A, W, family="binomial", 
                  Q.SL.library = c("SL.glm", "SL.step", "SL.DSA.2"),
                  gform = A ~ W1 + W2 + W3)
summary(result.QSLgc)


# ----- Code chunk 3: Repeated measures, Q and g correctly specified -----
set.seed(1960)
n <- 250
id <- rep(1:n, 2)
W <- matrix(rnorm(n * 3), ncol = 3)
colnames(W) <- paste("W", 1:3, sep = "")
W <- rbind(W,W)
A <- rbinom(2*n, 1, plogis(0.6 * W[,1] + 0.4 * W[,2] + 0.5 * W[,3]))
Y <- rbinom(2*n, 1, plogis(A + 0.2 * W[,1] + 0.1 * W[,2] + 0.2 * W[,3]^2))

result.Qcgc.repeated <- tmle(Y, A, W, family = "binomial",  
          Qform = Y~ A + W1 + W2 + W3, gform = A ~ W1 + W2 + W3, id = id)
result.Qcgc.repeated

# ----- Code Chunk 4: EY1 parameter -----
set.seed(1960)
n <- 250
W <- matrix(rnorm(n * 3), ncol = 3)
colnames(W) <- paste("W",1:3, sep = "")
Delta <- rbinom(n, 1, plogis(0.8 + 0.3*W[,1]))
Y <- 2 * W[,1] + 4 * W[,2] + 3 * W[,3]+ rnorm(n)
Y[Delta == 0] <- NA

result.EY1 <- tmle(Y,A = rep(1, n), W, Qform = Y ~ W3, 
              g.Deltaform = Delta ~ W1, Delta = Delta)
result.EY1

# ----- Code Chunk 5: ETA violations, logistic and linear fluctuation -----
set.seed(1960)
n <- 250
niterations <- 250
gbd <- c(0, 0.01, 0.025, 0.05, 0.1)
ngbd <- length(gbd)
result.Qmgc <- matrix(NA, nrow = niterations, ncol = 2 * ngbd)
for(i in 1:niterations){
    W <- matrix(rnorm(n * 3), ncol = 3)
    colnames(W) <- paste("W", 1:3, sep = "")
    logitA <- 0.5 + 0.9 * W[,1] + 0.5 * W[,2] + 0.7 * W[,3]
    A <- rbinom(n,1, plogis(logitA))
    Y <- A + 4 * W[,1] + 4 * W[,2] + 3 * W[,3] + rnorm(n)
    result.Qmgc[i,] <- c(unlist(sapply(gbd, function(x){
       tmle(Y, A, W, Qform = Y ~ A, gform = A ~ W1 + W2 + W3, 
       fluctuation = "linear", gbound = x)$estimates$ATE[1]})),
       unlist(sapply(gbd, function(x){
       tmle(Y, A, W, Qform = Y ~ A, gform = A ~ W1+ W2 + W3,
       fluctuation = "logistic", gbound = x)$estimates$ATE[1]})))
}
psi0 <- 1
bias <- colMeans(result.Qmgc - psi0)
variance <- apply(result.Qmgc, 2, var)
MSE <- colMeans((result.Qmgc -psi0)^2)
d <- data.frame(bias.lin = bias[1:5], var.lin = variance[1:5], MSE.lin = MSE[1:5],
                bias.log = bias[6:10], var.log = variance[6:10], MSE.log = MSE[6:10])
round(d,2)

# ----- Code Chunk 6: Controlled Direct Effect -----
set.seed(1960)
n <- 1000
W <- matrix(rnorm(n*3), ncol = 3)
colnames(W) <- paste("W", 1:3, sep = "")
A <- rbinom(n,1, plogis(0.6*W[,1] + 0.4*W[,2] + 0.5*W[,3]))
Z <- rbinom(n,1, plogis(0.5 + A))
Y <- A + A*Z+ 0.2*W[,1] + 0.1*W[,2] + 0.2*W[,3]^2 + rnorm(n)
Delta <- rbinom(n,1, plogis(Z + A)) 
pDelta1 <- cbind(rep(plogis(0), n), rep(plogis(1), n),
                 rep(plogis(1), n), rep(plogis(2), n))
colnames(pDelta1) <- c("Z0A0", "Z0A1", "Z1A0", "Z1A1")
Y[Delta == 0] <- NA
result.Z.missing <- tmle(Y, A, W, Z, Delta = Delta, pDelta1= pDelta1, 
                         Qform = Y ~ 1, g.SL.library = "SL.glm") 
result.Z.missing

# ----- Code Chunk 7: AIPTW Estimator -----
calc_aipw <- function(d, Qform, g1W) {
    Q <- glm(Qform, data = d)
    QAW.pred <- predict(Q, newdata = d, type ="response")
    Q1W.pred <- predict(Q, newdata = data.frame(d[,-2], A = 1), type = "response")
    Q0W.pred <- predict(Q, newdata = data.frame(d[,-2], A = 0), type = "response")
    h <- d$A/g1W - (1 - d$A)/(1 - g1W)
    psi <- mean(h*(d$Y - QAW.pred) + Q1W.pred - Q0W.pred)  
    return(psi)
}

# ----- Code Chunk 8: Simulation Study -----
set.seed(10)
n <- 500
niter <- 500
a <- c(.1, 1)
b <- c(.25,1.5)
est.beta0 <- array(NA, dim = c(2, niter,3), 
    dimnames = list(c("g1", "g2"), NULL, 
    c("IPW", "IPW stabilized","TMLE.MSM")))
est.ATE <- array(NA, dim = c(2, niter, 5), 
    dimnames = list(c("g1", "g2"), NULL,
    c("IPW", "IPW stabilized","TMLE.MSM", "TMLE", "AIPW")))
for (i in 1:2){
  for (j in 1:niter){
     U <- runif(n, -10, 10)
     W <- U/3 + rnorm(n)
     logitA <- a[i] + b[i]*W
     A <- rbinom(n, 1, plogis(logitA))
     Y <- 2 + 4 * U - 5 * A + rnorm(n)
     g <- glm(A ~ W, family = "binomial")
     g1W <- predict(g, type = "response")
     wt <- A/g1W + (1 - A)/(1 - g1W)
     wt.stab <- (A * mean(A) + (1 - A) * (1 - mean(A))) * wt
     ipw.msm <- coef(glm(Y ~ A, weights = wt))
     ipw.stab.msm <- coef(glm(Y ~ A, weights = wt.stab))		
     res.tmleMSM <- tmleMSM(Y, A, as.matrix(W), V = rep(1, n), MSM = "A", 
         Qform = "Y ~ A", g1W = g1W, ub = Inf)
     res.tmle <- tmle(Y, A, as.matrix(W), Qform ="Y ~ A", g1W = g1W, 
         gbound = c(0,1))
     aipw <- calc_aipw(data.frame(Y, A, W), Qform = "Y ~ A", g1W = g1W)
     est.beta0[i,j,] <- c(ipw.msm[1], ipw.stab.msm[1], res.tmleMSM$psi[1])  
     est.ATE[i,j,] <- c(ipw.msm[2], ipw.stab.msm[2], res.tmleMSM$psi[2], 
         res.tmle$estimates$ATE$psi, aipw) 
}}

# ----- Data Analysis: FEV dataset -----
data("fev")
fev <- fev[fev$age >= 9,]

#1. Main terms linear regression to estimate initial Q
g.SL.library <- c("SL.glm", "SL.step", "SL.DSA.2",
          "SL.loess", "SL.caret", "SL.bart",
          "SL.knn", "SL.knn20", "SL.knn40", "SL.knn60")
          
set.seed(10)
smoke.Qmis <- tmle(Y = fev$fev, A = fev$smoke, W = fev[ ,c(1, 3, 4)], 
           Qform = Y ~ ., g.SL.library = g.SL.library)
smoke.Qmis

# untargeted estimate
EY0 <- mean(smoke.Qmis$Qinit$Q[,"Q0W"])
EY1 <- mean(smoke.Qmis$Qinit$Q[,"Q1W"])
EY1 - EY0

#2. Super learning to estimate initial Q
SL.glm.int <- function(Y.temp, X.temp, newX.temp, family, ...){
	Aint <- paste("A", colnames(X.temp)[-c(1,2)], sep="*")
	form <- paste("Y.temp~Z+", paste(Aint, collapse="+"))
	fit.glm <- glm(form, data = data.frame(Y.temp,X.temp), family = family)
    out <- predict(fit.glm, newdata = newX.temp, type = "response")
    fit <- list(object = fit.glm)
    foo <- list(out = out, fit = fit)
    class(foo$fit) <- c("SL.glm.int")
    return(foo)
} 

Q.SL.library <- c("SL.glm","SL.glm.int", "SL.DSA.2",
           "SL.loess", "SL.caret", "SL.bart")
           
smoke.QSL <- tmle(Y = fev$fev, A = fev$smoke, W = fev[, c(1,3,4)], 
	   Q.SL.library = Q.SL.library, 
	   g1W = smoke.Qmis$g$g1W)
smoke.QSL

# ----- FAQ: Categorical Treatment -----
set.seed(10)
n <- 10^5
A <- sample(1:3, n, replace = TRUE)
W1 <- rnorm(n)
W2 <- rbinom(n, 1, 0.3)
Y <- 2 * A + 10 * W1 - A * W2 + rnorm(n) 
Delta1 <- as.integer(A == 1)
Delta2 <- as.integer(A == 2)
Delta3 <- as.integer(A == 3)

# Estimate EY1, EY2, EY3 parameters
result1 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta1, 
    Qform = "Y ~ A", g.Deltaform = "Delta ~ 1")
result2 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta2, 
    Qform = "Y ~ A", g.Deltaform = "Delta ~ 1")
result3 <- tmle(Y, NULL, cbind(W1, W2), Delta = Delta3, 
    Qform = "Y ~ A", g.Deltaform = "Delta ~ 1")
print(c(result1$estimates$EY1$psi, 
    result2$estimates$EY1$psi, 
    result3$estimates$EY1$psi))
    
# Calculcate ATE for A=1 vs. A=3
print(result1$estimates$EY1$psi - result3$estimates$EY1$psi)

# Dose-Response Measure
Qstar <- c(result1$Qstar[,1], result2$Qstar[,1], result3$Qstar[,1])
A.intervened <- rep(1:3, each = n)
doseResponse <- coef(lm(Qstar ~ A.intervened))[2]
print(doseResponse)