##### Section 3.7 ##############################################################

## Use this code to see which super learner candidates are available
## and to display information about how to plug in a user-defined
## candidate
library("multiPIM")
?Candidates


##### Section 3.8 ##############################################################

## This snippet just shows the code that is called from within the
## multiPIMboot function when multicore = TRUE in order to set the
## random number generator

# RNGkind("L'Ecuyer-CMRG")


##### Section 3.10 #############################################################

num.columns <- 3
num.obs <- 250

set.seed(23)

A <- as.data.frame(matrix(rbinom(num.columns*num.obs, 1, .5),
                          nrow = num.obs, ncol = num.columns))
Y <- as.data.frame(matrix(rnorm(num.columns*num.obs),
                          nrow = num.obs, ncol = num.columns))
for(i in 1:num.columns)
  Y[, i] <- Y[, i] + i * A[, i]
names(A) <- paste("A", 1:num.columns, sep = "")
names(Y) <- paste("Y", 1:num.columns, sep = "")

result <- multiPIM(Y, A)
summary(result)


##### Section 4 ################################################################

## Version information:
## R: 2.15.2
## multiPIM: 1.3-1
## lars: 1.1
## penalized: 0.9-42
## polspline: 1.1.6
## MASS: 7.3-22

## load packages
library("MASS") ## use this to generate multivariate normal data (mvrnorm)

## set parameters
ns <- round(100*(2500^(1/99))^(0:99)) ## sample sizes
## (100 of them, evenly spaced on log scale, ranging from 100 to 250,000)

num.covs <- 4 ## number of covariates
error.sd <- 2 ## standard deviation of error term for generation of outcome y1

## set up covariance matrix for multivariate normal covariates
covar <- 0.2
sigma <- matrix(covar, num.covs, num.covs)
diag(sigma) <- 1

## function to generate W as multivariate normal
gen.W <- function(n, p, Sigma) {

  W <- data.frame(mvrnorm(n = n, mu = rep(0, p), Sigma = Sigma))
  names(W) <- paste("w", 1:p, sep = "")
  return(W)
}

## function to generate A based on covs (W)
gen.A <- function(W) {
  A.probs <- plogis(0.2*rowSums(as.matrix(W)))
  A <- data.frame(a1 = rbinom(nrow(W), 1, A.probs))

}

## function to generate Y based on A and W
gen.Y <- function(A, W, error.sd) {
  Y <- data.frame(y1 = W[,1]*W[,2] + W[,3]*W[,4]
                       + rowSums(as.matrix(W)^2) * A[[1]]
                       + rnorm(nrow(W), 0, error.sd))
}

## define estimators to be used
estimators <- c("TMLE", "G-COMP")

####### set up storage objects ########
## list of lists to store resulting objects
multiPIM.objects <- vector("list", length = length(ns))
for(i in 1:length(ns))
  multiPIM.objects[[i]] <- vector("list", length = length(estimators))

## matrix to store parameter estimates
param.estimates <- matrix(0, length(ns), length(estimators),
                          dimnames = list(ns = ns, estimators = estimators))


## actual simulation
## set seed
set.seed(23)
start.time <- Sys.time()

for(a in length(ns):1) { ## do the big ns first
  cat("\n########################################\nStarting on n =",
      ns[a], "\n########################################\n\n")
  W <- gen.W(ns[a], num.covs, sigma)
  A <- gen.A(W)
  Y <- gen.Y(A, W, error.sd)
  for(b in 1:length(estimators)) {
    multiPIM.objects[[a]][[b]] <-
      multiPIM(Y, A, W, estimator = estimators[b],
               g.method = "main.terms.logistic", ## will be ignored for g-comp
               Q.method = "main.terms.linear",
               return.final.models = FALSE)    
    param.estimates[a, b] <- multiPIM.objects[[a]][[b]]$param.estimates[1]
  }
}
end.time = Sys.time()

save(param.estimates, multiPIM.objects, file = "sim_stuff.rda")
time.taken <- difftime(end.time, start.time, units = "mins")
cat("\n########################################\ntime taken is,", time.taken,
    "minutes\n########################################\n")

## find "true" value of psi
truth.n <- 500000
truth.reps <- 100
truth.estimates <- vector("numeric", length = truth.reps)
time1 <- Sys.time()

for(i in 1:truth.reps) {
  W <- gen.W(truth.n, num.covs, sigma)
  A <- gen.A(W)
  Y <- gen.Y(A, W, error.sd)
  A.0 <- A
  A.0[] <- 0
  Y.0 <- gen.Y(A.0, W, error.sd)
  truth.estimates[i] <- mean(Y.0[[1]]) - mean(Y[[1]])
}

time2 <- Sys.time()
time.taken <- difftime(time2, time1, units = "mins")
truth <- mean(truth.estimates)
truth.sd <- sd(truth.estimates)
save(truth.estimates, truth, truth.sd, file = "truth_stuff.rda")
cat("\n######################################\ntime taken for truth stuff is,",
    time.taken, "minutes\n######################################\n")
cat("\n########################################\ntruth =", truth,
    "\ttruth.sd = ", truth.sd, "\n########################################\n")

## Figure 1
ns <- as.integer(rownames(param.estimates))
x.axis.labs.at <- c(100*10^(0:3), 250*10^(0:3), 500*10^(0:2))
miny <- min(param.estimates)
maxy <- max(param.estimates)

par(mar = c(4.5, 4, 1, 1))
cex <- 1.3
cex.axis <- 1.2
cols <- c("blue", "red")
pchs <- c(3, 1)

plot(c(min(ns), max(ns)), c(miny, maxy), type = "n", log = "x",
     ylab = "Parameter estimate",
     xlab = "Sample size (log scaling)", xaxt = "n",
     cex.axis = cex.axis, cex.lab = cex)
axis(1, at = x.axis.labs.at, cex.axis = cex.axis)
points(ns, param.estimates[, 1], col = cols[1], pch = pchs[1], type = "b")
points(ns, param.estimates[, 2], col = cols[2], pch = pchs[2], type = "b")
abline(h = truth, col = "black")
legend(x = 32000, y = maxy, legend = c("truth = -2.0", "TMLE", "G-Computation"),
       col = c("black", cols), pch = c(NA, pchs), lty = c(1, NA, NA), cex = cex)


##### Section 5 ################################################################

## Version information:
## R: 2.15.2
## multiPIM: 1.3-1
## lars: 1.1
## parallel 2.15.2 (part of R distribution)
## penalized: 0.9-42
## polspline: 1.1.6

## load data
data("wcgs")

## remove 12 observations which have missing cholesterol values
wcgs.data.no.na <- na.omit(wcgs)

## cutoff values (in most cases this is upper limit for unexposed group)
bmi.cutoff <- 25
chol.cutoff <- 240
dbp.cutoff <- 90
sbp.cutoff <- 140
cigs.cutoff <- 0

## attach the final data frame
attach(wcgs.data.no.na)
bmi <- as.integer(bmi0 > bmi.cutoff)
chol <- as.integer(chol0 > chol.cutoff)

## subject will be considered as exposed to high BP if either systolic or
## diastolic is elevated
high.dbp <- dbp0 > dbp.cutoff
high.sbp <- sbp0 > sbp.cutoff
highBP <- as.integer(high.dbp | high.sbp)

## exposed is type A
typeAB <- dibpat0 ## 1->A and 0->B as required
cigs <- as.integer(ncigs0 > cigs.cutoff)
A <- data.frame(typeAB=typeAB, bmi=bmi, chol=chol, highBP = highBP, cigs=cigs)
W <- data.frame(age=age0)
Y <- data.frame(chd=chd69)
detach(wcgs.data.no.na)
save(Y, A, W, file = "wcgs_YAW.rda")

## this will probably take quite a while...
time1 <- Sys.time()
boot.result <- multiPIMboot(Y, A, W, times = 2000, multicore = TRUE,
                            mc.num.jobs = 8, verbose = TRUE)
time2 <- Sys.time()
save(boot.result, file = "boot_result.rda")
cat("Time taken is", difftime(time2, time1, units = "hours"), "hours\n")


## tables
library("xtable")

## Table 1
res.sum <- summary(boot.result)
exposure.order <- order(res.sum$summary.array[, 1, 4])

chi.results <- lapply(A, chisq.test, Y[, 1])
proportion.diseased <- mean(Y[[1]])
summary.vals <- cbind(t(sapply(A, function(x){c(sum(x), mean(x))})),
                      t(sapply(chi.results,
                               function(x) {
                                 return(c(x$observed[1, 2] / sum(x$observed[1, ]),
                                          x$observed[2, 2] / sum(x$observed[2, ])))})))
summary.vals <- cbind(summary.vals, summary.vals[, 3] - proportion.diseased)
                               
asum.tab <- rbind(c("group", "exposed", "with CHD", "with CHD", "attr. risk"),
                  cbind(c("type B", "< 25", "< 240", "D<90 \\& S<140",
                          "< 1/day"),
                        format(summary.vals[, -c(1, 5)], digits = 2,
                               scientific = FALSE), 
                        format(summary.vals[, 5],
                               digits = 2))[exposure.order, ])
colnames(asum.tab) <- c("Unexposed", "Proportion", "Prop. unexp.", "Prop. exp.",
                        "Naive")
rownames(asum.tab)[1] <- " "

align1 <- "lccccc"
asum.xtab <- xtable(asum.tab, align = align1)
print(asum.xtab, hline.after = c(-1, -1, 1, 6), floating = F,
      sanitize.text.function = function(str){str})


## Table 2
cor.tab <- cor(cbind(W, A[, exposure.order], Y))
cor.tab[row(cor.tab) > col(cor.tab)] <- NA
align3 <- "lccccccc"

cor.xtab <- xtable(cor.tab, align = align3)
print(cor.xtab, floating = F, hline.after = c(-1, -1, 0, 7))


## Table 3
res.tab <- cbind(res.sum$summary.array[, 1, 1],
                 boot.result$plug.in.stand.errs,
                 res.sum$summary.array[, 1, -1])
colnames(res.tab) <- c("$\\hat{\\psi}^{TMLE}$", "$\\hat{\\sigma}^{plug-in}$",
                       "$\\hat{\\sigma}^{boot}$", "$T$", "$p$",
                       "$p^{Bonferroni}$")
digits2 <- cbind(matrix(4, nrow = 5, ncol = 4), rep(2, 5), c(-2, -2, -2, 3, 3),
                c(-2, -2, -2, 3, 1))
align2 <- "lcccccc"

res.xtab <- xtable(res.tab[exposure.order, ], digits = digits2, align = align2)

print(res.xtab, sanitize.text.function = function(str){str}, floating = F,
      hline.after = c(-1, -1, 0, 5))


## Figure 2
par(mfrow = c(5, 1), mar = c(2,3,3,0), oma = c(3, 2, 0, 2), cex.lab = 1.5)

grid <- (-25:10/500)
get.subset <- function(vec, grid){
  increment = grid[2] - grid[1]  
  return((grid > (min(vec) - increment)) & (grid < (max(vec) + increment)))
}
grid.subsets <- apply(boot.result$boot.param.array[, , 1], 2, get.subset, grid)

for(i in exposure.order) {
  hist(boot.result$boot.param.array[, i, 1], xlim = c(min(grid), max(grid)),
       breaks = grid[grid.subsets[, i]], yaxt = "n",
       ylab = NULL, main = NULL, ylim = c(0, 600))
  title(main = rownames(boot.result$param)[i], line = 0.6, cex = 1.1)
  abline(v = boot.result$param[i, 1], lty = 2)
  axis(2, 0:5*100, labels = c("0", NA, NA, NA, NA, "500"))
}
cex = 1
mtext("Bootstrap parameter estimates", side = 1, outer = TRUE, adj = .56, line = 1.5, cex = cex)
mtext("Frequency", side = 2, outer = TRUE, adj = 0.48, cex = cex, line = 0.6)


## Figure 3
par(mar = c(5, 4, 1, 2))
cumulative <- function(x, fun) {
  return.vec <- vector("double", length = length(x))
  for(i in 1:length(x)) return.vec[i] <- fun(x[1:i])
  return(return.vec)
}

cumulativeSDs <- apply(boot.result$boot.param.array[, , 1], 2, cumulative, fun = sd)
plot(c(0, 2000), c(min(cumulativeSDs, na.rm = T),
                   max(cumulativeSDs, na.rm = T)),
     type = "n",
     ylab = "Cumulative standard deviation",
     xlab = "Bootstrap replicate number", cex.lab = 1, cex.axis = 1)

for(i in 1:boot.result$num.exposures) {
  lines(cumulativeSDs[, i])
}


##### Section 6 ################################################################

## Version information:
## R: 2.15.2
## multiPIM: 1.3-1
## rpart: 4.1-0

## data
data("schisto")
W <- data.frame(lapply(schisto[, 9:10], as.factor))
A <- schisto[, 2:8]
Y <- schisto[, 1, drop = FALSE]

## analysis
set.seed(23)
num.types <- ncol(A)
multiPIM.results <- vector("list", length = num.types)
for(i in 1:num.types) {
  A.current <- A[, i, drop = FALSE]
  A.current[A.current > 0] <- 1
  W.current <- cbind(W, A[, -i])
  multiPIM.results[[i]] <- multiPIM(Y, A.current, W.current,
                                    estimator = "IPCW",
                                    g.method = "rpart.bin")
}

## Table 4
results.tab <- 
  t(sapply(multiPIM.results,
           function(x) summary(x, bf.multiplier = 7)$sum[1,1,]))
rownames(results.tab) <- names(A)
print(xtable(results.tab, digits = 3), floating = FALSE)