## Run simulation.R before to obtain the results of the simulation ## study which are save in file "subgroup_id_sim_no_rct_results.rda" ## and loaded in this script. ## sim_data_1 ## 1st block library("personalized") # use previous PRNG RNGkind(sample.kind = "Rounding") set.seed(123) n.obs <- 1000 n.vars <- 50 x <- matrix(rnorm(n.obs * n.vars, sd = 3), n.obs, n.vars) # simulate non-randomized treatment assignment xbetat <- 0.5 + 0.25 * x[, 21] - 0.25 * x[, 41] trt.prob <- plogis(xbetat) trt <- rbinom(n.obs, 1, prob = trt.prob) # simulate Delta(x) delta <- (0.5 + x[, 2] - 0.5 * x[, 3] - 1 * x[, 11] + 1 * x[, 1] * x[, 12]) # simulate main effects g(X) xbeta <- x[, 1] + x[, 11] - 2 * x[, 12] ** 2 + x[, 13] + 0.5 * x[, 15] ** 2 # add both main effects and interactions xbeta <- xbeta + delta * (2 * trt - 1) trt <- as.factor(ifelse(trt == 1, "Trt", "Ctrl")) # simulate continuous outcomes y <- xbeta + rnorm(n.obs) # simulate binary outcomes y.binary <- 1 * (xbeta + rnorm(n.obs, sd = 2) > 0) # create time-to-event outcomes surv.time <- exp(-20 - xbeta + rnorm(n.obs, sd = 1)) cens.time <- exp(rnorm(n.obs, sd = 3)) y.time.to.event <- pmin(surv.time, cens.time) status <- 1 * (surv.time <= cens.time) ## change_basesize theme_set(theme_grey(base_size = 9)) ## propens_func_example_glm ## 2nd block propensity.func <- function(x, trt) { # save data in a data.frame data.fr <- data.frame(trt = trt, x) # fit propensity score model using binomial GLM propensity.model <- glm(trt ~ ., family = binomial(), data = data.fr) # create estimated probabilities pi.x <- predict(propensity.model, type = "response") return(pi.x) } propensity.func(x, trt)[101:105] trt[101:105] ## plot_overlap ## 3rd block check.overlap(x, trt, propensity.func) ## const_propensity ## 4th block constant.propensity.func <- function(x, trt) 0.5 ## fit_model2 ## 5th block subgrp.model <- fit.subgroup(x = x, y = y, trt = trt, propensity.func = propensity.func, method = "a_learning", loss = "sq_loss_lasso", nfolds = 10) # option for cv.glmnet summary(subgrp.model) ## create_propensity ## 6th block # create function for fitting propensity score model propensity.func.lasso <- function(x, trt) { # fit propensity score model # with binomial model and lasso penalty. # tuning parameter selected by 10-fold CV propens.model <- cv.glmnet(y = trt, x = x, family = "binomial") pi.x <- predict(propens.model, s = "lambda.min", newx = x, type = "response")[, 1] pi.x } ## propens_func_example_const ## 7th block propensity.func.const <- function(x, trt) 0.5 ## multinom_propens ## 8th block propensity.func.multinom <- function(x, trt) { require("nnet") df <- data.frame(trt = trt, x) mfit <- nnet::multinom(trt ~ . -trt, data = df) # predict returns a matrix of probabilities: # one column for each treatment level propens <- predict(mfit, type = "probs") if (is.factor(trt)) { values <- levels(trt)[trt] } else { values <- trt } # return the probability corresponding to the # treatment that was observed probs <- propens[cbind(1:nrow(propens), match(values, colnames(propens)))] probs } ## multinom_propens2 ## 9th block propensity.func.multinom <- function(x, trt) { require("nnet") df <- data.frame(trt = trt, x) mfit <- multinom(trt ~ . -trt, data = df) # predict returns a matrix of probabilities: # one column for each treatment level propens <- predict(mfit, type = "probs") if (is.factor(trt)) { levels <- levels(trt) } else { levels <- sort(unique(trt)) } # return the probability corresponding to the # treatment that was observed probs <- propens[,match(levels, colnames(propens))] probs } ## efficiency_func_example_b ## 10th block augment.func.simple <- function(x, y) { cvmod <- cv.glmnet(y = y, x = x, nfolds = 10) predictions <- predict(cvmod, newx = x, s = "lambda.min") predictions } ## efficiency_func_example_cc ## 11th block augment.func <- function(x, y, trt) { data <- data.frame(x, y, trt = ifelse(trt == "Trt", 1, -1)) xm <- model.matrix(y~trt*x-1, data = data) cvmod <- cv.glmnet(y = y, x = xm) ## get predictions when trt = 1 data$trt <- 1 xm1 <- model.matrix(y~trt*x-1, data = data) preds_1 <- predict(cvmod, xm1, s = "lambda.min") ## get predictions when trt = -1 data$trt <- -1 xm2 <- model.matrix(y~trt*x-1, data = data) preds_n1 <- predict(cvmod, xm2, s = "lambda.min") ## return predictions averaged over trt return(0.5 * (preds_1 + preds_n1)) } ## efficiency_func_example_binary ## 12th block augment.func.bin <- function(x, y) { cvmod <- cv.glmnet(y = y, x = x, family = "binomial") predict(cvmod, newx = x, s = "lambda.min", type = "link") } ## efficiency_ex ## 13th block subgrp.model.eff <- fit.subgroup(x = x, y = y, trt = trt, propensity.func = propensity.func, loss = "sq_loss_lasso", augment.func = augment.func, nfolds = 10) # option for cv.glmnet summary(subgrp.model.eff) ## expolosscustom ## 14th block fit.expo.loss <- function(x, y, weights, ...) { ## define loss expo.loss <- function(beta, x, y, weights) { sum(weights * y * exp(-drop(x %*% beta))) } ## use optim() to minimize loss function opt <- optim(rep(0, NCOL(x)), fn = expo.loss, x = x, y = y, weights = weights) coefs <- opt$par ## define prediction function which ## inputs a design matrix ## and returns benefit scores pred <- function(x, type = "response") { cbind(1, x) %*% coefs } # return list of required components list(predict = pred, model = opt, coefficients = coefs) } ## fit_model_gbm ## 15th block subgrp.model.gbm <- fit.subgroup(x = x, y = y, trt = trt, propensity.func = propensity.func.lasso, loss = "sq_loss_gbm", shrinkage = 0.025, # options for gbm n.trees = 1000, interaction.depth = 2, cv.folds = 5) ## fit_binary_1 ## 16th block subgrp.bin <- fit.subgroup(x = x, y = y.binary, trt = trt, propensity.func = propensity.func.lasso, loss = "logistic_loss_lasso", nfolds = 10) # option for cv.glmnet ## tte_model_example ## 17th block library("survival") set.seed(123) subgrp.cox <- fit.subgroup(x = x, y = Surv(y.time.to.event, status), trt = trt, propensity.func = propensity.func.lasso, loss = "cox_loss_lasso", nfolds = 10) # option for cv.glmnet ## print_tte_model ## 18th block summary(subgrp.cox) ## summarize_sub ## 19th block comp <- summarize.subgroups(subgrp.cox) ## print_summarize ## 20th block print(comp, p.value = 0.01) ## train_test_ex ## 21st block # check that the object is an object returned by fit.subgroup() class(subgrp.model.eff) validation.eff <- validate.subgroup(subgrp.model.eff, B = 25, # specify the number of replications method = "training_test_replication", benefit.score.quantiles = c(0.5, 0.75, 0.9), train.fraction = 0.75) validation.eff ## print_traintest ## 22nd block print(validation.eff, which.quant = c(2, 3)) ## boot_bias_ex ## 28th block validation.boot <- validate.subgroup(subgrp.model.eff, B = 100, # specify the number of replications method = "boot_bias_correction") validation.boot ## gen_test_data ## 29th block x.test <- matrix(rnorm(10 * n.obs * n.vars, sd = 3), 10 * n.obs, n.vars) # simulate non-randomized treatment xbetat.test <- 0.5 + 0.25 * x.test[, 21] - 0.25 * x.test[, 41] trt.prob.test <- plogis(xbetat.test) trt.test <- rbinom(10 * n.obs, 1, prob = trt.prob.test) # simulate response delta.test <- (0.5 + x.test[, 2] - 0.5 * x.test[, 3] - x.test[, 11] + x.test[, 1] * x.test[, 12]) xbeta.test <- x.test[, 1] + x.test[, 11] - 2 * x.test[, 12] ** 2 + x.test[, 13] + 0.5 * x.test[, 15] ** 2 xbeta.test <- xbeta.test + delta.test * (2 * trt.test - 1) y.test <- xbeta.test + rnorm(10 * n.obs, sd = 2) ## predict_bene_score ## 30th block bene.score.test <- predict(subgrp.model.eff, newx = x.test) ## compare_with_test ## 31st block ## Effect of control among those recommended control mean(y.test[bene.score.test <= 0 & trt.test == 0]) - mean(y.test[bene.score.test <= 0 & trt.test == 1]) quantile(validation.boot$boot.results[[1]][, 1], c(0.025, 0.975), na.rm = TRUE) ## Trt effect among those recommended the treatment mean(y.test[bene.score.test > 0 & trt.test == 1]) - mean(y.test[bene.score.test > 0 & trt.test == 0]) quantile(validation.boot$boot.results[[1]][, 2], c(0.025, 0.975), na.rm = TRUE) ## plot_ex_model_1 ## 32nd block plot(subgrp.model) ## plot_ex_model_2 ## 33rd block plot(subgrp.model, type = "density") ## plot_ex_model_3 ## 34th block plot(subgrp.model, type = "interaction") ## plot_ex_model_4 ## 35th block plot(subgrp.model, type = "conditional") ## train_test_ex2 ## 36th block validation <- validate.subgroup(subgrp.model, B = 100, # specify the number of replications method = "training_test_replication", train.fraction = 0.75) validation ## plot_ex_model_1a ## 37th block plot(validation) ## plot_ex_model_1b ## 38th block plot(validation, type = "density") ## plot_ex_model_1c ## 39th block plot(validation, type = "conditional") ## plot_compare_validation_ex ## 40th block plotCompare(validation, validation.eff) ## sim_three_trt_data ## 41st block set.seed(123) n.obs <- 1000 n.vars <- 100 x <- matrix(rnorm(n.obs * n.vars, sd = 3), n.obs, n.vars) # simulated non-randomized treatment with multiple levels # based off of a multinomial logistic model xbetat_1 <- 0.1 + 0.5 * x[, 21] - 0.25 * x[, 25] xbetat_2 <- 0.1 - 0.5 * x[, 11] + 0.25 * x[, 15] trt.1.prob <- exp(xbetat_1) / (1 + exp(xbetat_1) + exp(xbetat_2)) trt.2.prob <- exp(xbetat_2) / (1 + exp(xbetat_1) + exp(xbetat_2)) trt.3.prob <- 1 - (trt.1.prob + trt.2.prob) prob.mat <- cbind(trt.1.prob, trt.2.prob, trt.3.prob) trt.mat <- apply(prob.mat, 1, function(rr) rmultinom(1, 1, prob = rr)) trt.num <- apply(trt.mat, 2, function(rr) which(rr == 1)) trt <- as.factor(paste0("Trt_", trt.num)) # simulate response # effect of treatment 1 relative to treatment 3 delta1 <- 2 * (0.5 + x[, 2] - 2 * x[, 3] ) # effect of treatment 2 relative to treatment 3 delta2 <- (0.5 + x[, 6] - 2 * x[, 5]) # main covariate effects with nonlinearities xbeta <- x[, 1] + x[, 11] - 2 * x[, 12]^2 + x[, 13] + 0.5 * x[, 15] ^ 2 + 2 * x[, 2] - 3 * x[, 5] # create entire functional form of E(Y|T,X) xbeta <- xbeta + delta1 * ((trt.num == 1) - (trt.num == 3)) + delta2 * ((trt.num == 2) - (trt.num == 3)) # simulate continuous outcomes E(Y|T,X) y <- xbeta + rnorm(n.obs, sd = 2) ## display_mult_trt_vector ## 42nd block trt[1:5] table(trt) ## define_multi_propens ## 43rd block propensity.multinom.lasso <- function(x, trt) { if (!is.factor(trt)) trt <- as.factor(trt) gfit <- cv.glmnet(y = trt, x = x, family = "multinomial") # predict returns a matrix of probabilities: # one column for each treatment level propens <- drop(predict(gfit, newx = x, type = "response", s = "lambda.min")) # return the matrix probability of treatment assignments probs <- propens[,match(levels(trt), colnames(propens))] probs } ## check_overlap_multitreat ## 44th block check.overlap(x = x, trt = trt, propensity.multinom.lasso) ## fit_multi_trt_model ## 45th block set.seed(123) subgrp.multi <- fit.subgroup(x = x, y = y, trt = trt, propensity.func = propensity.multinom.lasso, reference.trt = "Trt_3", loss = "sq_loss_lasso") summary(subgrp.multi) ## plot_multi_trt_model ## 46th block pl <- plot(subgrp.multi) pl + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## validate_multi_trt_model ## 47th block set.seed(123) validation.multi <- validate.subgroup(subgrp.multi, B = 100, # specify the number of replications method = "training_test_replication", train.fraction = 0.5) print(validation.multi, digits = 2, sample.pct = TRUE) ## plotcomparemultivalidated ## 48th block plv <- plot(validation.multi) plv + theme(axis.text.x = element_text(angle = 90, hjust = 1)) ## readsimresults ## Run simulation.R first to create this object and save it. load("subgroup_id_sim_no_rct_results.rda") auc.res <- do.call(rbind, res.list2[[3]]) cor.res <- do.call(rbind, res.list2[[1]]) ## ALL OWL methods are dominated by their FOWL versions, ## so let's only plot FOWL results owls <- c("OWL-Loss-W-Aug", "OWL-Logistic-Loss-W", "OWL-Logistic-Loss-W-Aug", "OWL-Loss-W") cor.res <- cor.res[!(cor.res$Method %in% owls), ] auc.res <- auc.res[!(auc.res$Method %in% owls), ] if (is.factor(cor.res$Method)) { cor.res$Method <- droplevels(cor.res$Method) } else { cor.res$Method <- as.factor(cor.res$Method) } # reorder levels so personalized methods are all last orders <- c(8, 1, 7:6, 9, 10:13, 2:5) cor.res$Method <- factor(cor.res$Method, levels = levels(cor.res$Method)[orders]) if (is.factor(auc.res$Method)) { auc.res$Method <- droplevels(auc.res$Method) } else { auc.res$Method <- as.factor(auc.res$Method) } auc.res$Method <- factor(auc.res$Method, levels = levels(auc.res$Method)[orders]) cols <- c("#53354ACC", "#66CD00", "darkorange", "#55DDE0", "#574AE2FF", "#574AE266", "#D12600", "#D1260066", "#CEFF1AFF", "#CEFF1A66", "#943CB4FF", "#943CB466", "#FE6900FF") levels(auc.res$Method)[c(6:13)] <- c("Sq-A", "Sq-A-Aug", "Sq-W","Sq-W-Aug","FOWL-L-W","FOWL-L-W-Aug", "FOWL-H-W", "FOWL-H-W-Aug") levels(cor.res$Method)[c(6:13)] <- c("Sq-A", "Sq-A-Aug", "Sq-W","Sq-W-Aug","FOWL-L-W","FOWL-L-W-Aug", "FOWL-H-W", "FOWL-H-W-Aug") ## drop model4you-Tree since it is significantly ## and consistently worse than model4you-Forest and ## as such makes it difficult to look at the results cor.res <- cor.res[cor.res$Method != "model4you-Tree", ] auc.res <- auc.res[auc.res$Method != "model4you-Tree", ] cor.res$Method <- droplevels(cor.res$Method) auc.res$Method <- droplevels(auc.res$Method) auc.res$Model <- ifelse(auc.res$main.effect.misspec, "Model~2", "Model~1") auc.res$n <- paste0("n==", auc.res$sample.size) auc.res$nu <- paste0("ME~size:", auc.res$main.effect.size) cor.res$Model <- ifelse(cor.res$main.effect.misspec, "Model~2", "Model~1") cor.res$n <- paste0("n==", cor.res$sample.size) cor.res$nu <- paste0("ME~size:", cor.res$main.effect.size) ## plotsimres2 plotsimres2 <- ggplot(aes(x = Method, y = (Value), group = Method, fill = Method), data = cor.res) + geom_boxplot() + ylab("Correlation") + facet_grid(Model + nu ~ n, scales = "free_y", labeller = label_parsed) + scale_fill_manual(values = cols) + coord_cartesian(ylim = c(0, 1)) + theme(legend.position = "bottom", axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text = element_text(size = 12), axis.title = element_text(size = 16), legend.text = element_text(size = 12), legend.title = element_blank(), strip.text = element_text(size = 12), legend.key.width = unit(2.5,"cm")) + geom_vline(xintercept = 4.5, color = scales::alpha("black", 0.5), linetype = "dashed") plotsimres2 pdf("Figures/plotsimres2-1.pdf", height = 10, width = 9) print(plotsimres2) dev.off() ## plotsimres4 plotsimres4 <- ggplot(aes(x = Method, y = (Value), group = Method, fill = Method), data = auc.res) + geom_boxplot() + ylab("AUC") + facet_grid(Model + nu ~ n, scales = "free_y", labeller = label_parsed) + scale_fill_manual(values = cols) + coord_cartesian(ylim = c(0.48, 1)) + theme(legend.position = "bottom", axis.text.x = element_blank(), axis.ticks.x = element_blank(), axis.text = element_text(size = 12), axis.title = element_text(size = 16), legend.text = element_text(size = 12), legend.title = element_blank(), strip.text = element_text(size = 12), legend.key.width = unit(2.5,"cm")) + geom_vline(xintercept = 4.5, color = scales::alpha("black", 0.5), linetype = "dashed") plotsimres4 pdf("Figures/plotsimres4-1.pdf", height = 10, width = 9) print(plotsimres4) dev.off() ## data_prep ## 49th block data("LaLonde", package = "personalized") # whether an individual had a higher salary # in 1978 than 1975 y <- LaLonde$outcome # treatment assignment (employment training vs not) trt <- LaLonde$treat x.varnames <- c("age", "educ", "black", "hisp", "white", "marr", "nodegr", "log.re75", "u75") # covariates data.x <- LaLonde[, x.varnames] # construct design matrix (with no intercept) x <- model.matrix(~ -1 + ., data = data.x) ## const_propens_func ## 50th block const.propens <- function(x, trt) { mean.trt <- mean(trt == "Trt") rep(mean.trt, length(trt)) } ## fit_model_logreg_nsw ## 51st block set.seed(1) subgrp_fit_w <- fit.subgroup(x = x, y = y, trt = trt, loss = "logistic_loss_lasso", propensity.func = const.propens, type.measure = "auc", nfolds = 10) summary(subgrp_fit_w) ## validate_subgrps ## 52nd block val_subgrp_w <- validate.subgroup(subgrp_fit_w, B = 500, method = "training", train.fraction = 0.80) ## print_results_val ## 53rd block print(val_subgrp_w, digits = 4, sample.pct = TRUE) ## plot ## 54th block plot(val_subgrp_w) pdf("Figures/plot-1.pdf", height = 3.75, width = 6) print(plot(val_subgrp_w)) dev.off() ## boot_validate_subgrps ## 55th block val_subgrp_w_boot <- validate.subgroup(subgrp_fit_w, B = 500, method = "boot") ## print_results_bootval ## 56th block print(val_subgrp_w_boot, digits = 4, sample.pct = TRUE) ## plot_validate_subgrps ## 57th block plotCompare(subgrp_fit_w, val_subgrp_w, val_subgrp_w_boot, type = "int") pdf("Figures/plot_validate_subgrps-1.pdf", height = 3.75, width = 5) print(plotCompare(subgrp_fit_w, val_subgrp_w, val_subgrp_w_boot, type = "int")) dev.off()