library("survival") library("eventglm") ## Note on computational resources #--------------------------------- ## This script contains two long running computations. The timings below ## are based on a 8th gen core i7 laptop with 16GB of RAM. ## - bootstrap replicates starting on line 124 (takes 2-3 minutes) ## - speed and memory benchmark starting on line 208 (take about 30 minutes and lots of memory) ## change the variables below to FALSE if you would like to skip these ## computations. run_bootstrap <- TRUE run_benchmark <- TRUE ## The code for the simulation study in the appendix ## takes about 10 hours to run and is included separately in ## the subdirectory "simulation-study". ### Data analysis -- section 5 ## ---------------------------------------------------------------------------------------------------------------------- library("knitr") render_sweave() options(prompt = "R> ", continue = "+ ", width = 70, useFancyQuotes = FALSE) #library("survival") #library("eventglm") ## ---------------------------------------------------------------------------------------------------------------------- pdf("../Figures/exdata.pdf", width = 7.5, height = 7.5 * .618) par(mfrow = c(1, 2)) colonsfit <- survfit(Surv(time, status) ~ rx, data = colon) colscolon <- c("black", "darkblue", "violetred") plot(colonsfit, col = colscolon, xlab = "days since registration", ylab = "survival") abline(v = 2500, lty = 2) for(i in 3:1){ seg0 <- summary(colonsfit[i], times = colonsfit[i]$time[colonsfit[i]$time < 2500]) rect(c(0, seg0$time), 0, c(seg0$time, 2500), c(1, seg0$surv), border = NA, col = c("grey80", "skyblue", "salmon")[i]) } for(i in 1:3){ lines(colonsfit[i], conf.int = FALSE, col = colscolon[i]) points(x = 2500, y = summary(colonsfit[i], times = 2500)$surv) } legend("bottomleft", fill = c("black", "darkblue", "violetred"), legend = names(colonsfit$strata)) crfit <- survfit(Surv(etime, event) ~ 1, eventglm::mgus2) plot(crfit[2:3], col=1:2, noplot="", lwd=1, xscale=12, xlab="Years post diagnosis", ylab="Cumulative incidence") legend("right", c("pcm", "death"), col=c(1,2),lwd=2) abline(v = 120, lty = 2) seg0b <- summary(crfit[2], times = crfit[2]$time[crfit[2]$time < 120]) rect(c(0, seg0b$time), 0, c(seg0b$time, 120), c(seg0b$pstate), border = NA, col = c("grey80")) points(x = 120, y = summary(crfit[2], times = 120)$pstate) lines(crfit[2:3], col = 1:2, conf.int = FALSE) dev.off() ## ---------------------------------------------------------------------------------------------------------------------- colon.cifit <- cumincglm(Surv(time, status) ~ rx, time = 2500, data = colon) summary(colon.cifit) se.ci <- sqrt(diag(vcov(colon.cifit, type = "robust"))) b.ci <- coefficients(colon.cifit) conf.ci <- confint(colon.cifit) round(cbind(b.ci, conf.ci), 2) ## ---------------------------------------------------------------------------------------------------------------------- colon.smry <- summary(colonsfit, times = 2500, rmean = 2500) cbind(eventglm = b.ci, survfit = c(1 - colon.smry$surv[1], (1 - colon.smry$surv[2:3]) - (1 - rep(colon.smry$surv[1], 2)))) ## ---------------------------------------------------------------------------------------------------------------------- colon.rr <- cumincglm(Surv(time, status) ~ rx, time = 2500, data = colon, link = "log") br.ci <- coefficients(colon.rr) confr.ci <- confint(colon.rr) round(exp(cbind(br.ci, confr.ci)), 2) ## ---------------------------------------------------------------------------------------------------------------------- colon.rmfit <- rmeanglm(Surv(time, status) ~ rx, time = 2500, data = colon) summary(colon.rmfit) se.rm <- sqrt(diag(vcov(colon.rmfit, type = "robust"))) b.rm <- coefficients(colon.rmfit) conf.rm <- confint(colon.rmfit) round(cbind(b.rm, conf.rm), 2) ## ---------------------------------------------------------------------------------------------------------------------- cbind(eventglm = b.rm, survfit = c(colon.smry$table[1, 5], colon.smry$table[2:3, 5] - colon.smry$table[1, 5])) ## ---------------------------------------------------------------------------------------------------------------------- colon.rm.adj <- rmeanglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon) summary(colon.rm.adj) colon.mvt <- cumincglm(Surv(time, status) ~ rx + age + node4, time = c(500, 1000, 2500), data = colon) summary(colon.mvt) colon.mvt2 <- cumincglm(Surv(time, status) ~ tve(rx) + age + node4, time = c(500, 1000, 2500), data = colon) colon.mvt2 ## ---------------------------------------------------------------------------------------------------------------------- cumincglm(Surv(etime, event) ~ sex, cause = "pcm", time = 120, data = mgus2) mgfit1 <- rmeanglm(Surv(etime, event) ~ sex, cause = "pcm", time = 120, data = mgus2) mgfit1 pdf("../Figures/diag-plot.pdf", width = 7.5, height = 7.5) par(mfrow = c(2,2)) plot(mgfit1) dev.off() ## ---------------------------------------------------------------------------------------------------------------------- mgfitrmean <- rmeanglm(Surv(etime, event) ~ sex * age + hgb + I(hgb^2), cause = "pcm", time = 120, data = mgus2) summary(mgfitrmean) ## ---------------------------------------------------------------------------------------------------------------------- if(run_bootstrap) { set.seed(5500) nboot <- 1000 bootests <- matrix(numeric(nboot * 4), nrow = nboot, ncol = 4) for(i in 1:nboot) { mgus.b <- mgus2[sample(1:nrow(mgus2), replace = TRUE), ] mgfitrmean.b <- rmeanglm(Surv(etime, event) ~ sex + age + hgb, cause = "pcm", time = 120, data = mgus.b) bootests[i,] <- coefficients(mgfitrmean.b) } mgfitrmean2 <- rmeanglm(Surv(etime, event) ~ sex + age + hgb, cause = "pcm", time = 120, data = mgus2) se.boot <- sqrt(diag(cov(bootests))) round(cbind(se.boot = se.boot, se.robust = sqrt(diag(vcov(mgfitrmean2))), #se.corrected = sqrt(diag(vcov(mgfitrmean2, type = "corrected"))), se.naive = sqrt(diag(vcov(mgfitrmean2, type = "naive")))), 3) } ## ---------------------------------------------------------------------------------------------------------------------- pdf("../Figures/predhist.pdf", width = 5.5, height = 3.39) hist(predict(mgfitrmean, newdata = mgus2), xlab = "Predicted lifetime lost due to PCM up to 120 days", main = "") dev.off() ## ---------------------------------------------------------------------------------------------------------------------- colon.ci.cen1 <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "stratified", formula.censoring = ~ rx) ## ---------------------------------------------------------------------------------------------------------------------- colon.ci.adj <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "independent", formula.censoring = ~ rx + age + node4) colon.ci.cen2 <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "coxph", formula.censoring = ~ rx + age + node4) colon.ci.cen3 <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "aareg", formula.censoring = ~ rx + age + node4) colon.ci.cen2h <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "coxph", formula.censoring = ~ rx + age + node4, ipcw.method = "hajek") colon.ci.cen3h <- cumincglm(Surv(time, status) ~ rx + age + node4, time = 2500, data = colon, model.censoring = "aareg", formula.censoring = ~ rx + age + node4, ipcw.method = "hajek") round(cbind("indep" = coef(colon.ci.adj), "strat" = coef(colon.ci.cen1), "coxipcw" = coef(colon.ci.cen2), "aalenipcw" = coef(colon.ci.cen3), "coxipcw.hajek" = coef(colon.ci.cen2h), "aalenipcw.hajek" = coef(colon.ci.cen3h)), 3) summary(colon.ci.cen2$ipcw.weights) ## ---------------------------------------------------------------------------------------------------------------------- set.seed(918) subc <- rbinom(nrow(mgus2), size = 1, prob = .2) samp.ind <- subc + (1 - subc) * (mgus2$event == "pcm") * rbinom(nrow(mgus2), size = 1, prob = .9) mgus2.cc <- mgus2[as.logical(samp.ind), ] mgus2.cc$samp.wt <- 1 / ifelse(mgus2.cc$event == "pcm", .2 + .8 * .9, .2) ## ---------------------------------------------------------------------------------------------------------------------- mgfit.cc <- rmeanglm(Surv(etime, event) ~ I(age - 65) + sex + hgb, cause = "pcm", time = 120, data = mgus2.cc, weights = samp.wt) mgfit.full <- rmeanglm(Surv(etime, event) ~ I(age - 65) + sex + hgb, cause = "pcm", time = 120, data = mgus2) mdf <- data.frame(casecohort = summary(mgfit.cc)$coefficients[,1:2], fullsamp = summary(mgfit.full)$coefficients[, 1:2]) colnames(mdf) <- c("case cohort Est", "SE", "full Est", "SE") round(mdf, 3) ### Computation speed comparison ## if(run_benchmark) { library("pseudo") library("prodlim") library("fastpseudo") library("ggplot2") library("patchwork") #' Generate survival data for analysis #' #' Set up the coefficients for generating competing risks data. The coefficients #' correspond to the covariate vector with distribution (constant, binomial(.5), N(0,1), N(0,1)) #' #' @param scenario Character constant determining the scenario #' @param beta.cens Coefficient vector for the association with the censoring distribution #' #' @export #' #' @return A list with components for each of the coefficient vectors/matrices #' switch_scenario <- function(scenario, beta.cens = c(1,0,0,0)) { ## NULL scenario if(scenario == "0") { b1 <- matrix(c(2, 0, 0, 0), nrow = 4) b2 <- matrix(c(2, 0, 0, 0), nrow = 4) g1 <- 2.25 g2 <- 2.25 } else if (scenario == "1") { b1 <- matrix(c(1, .5, 0, 0), nrow = 4) b2 <- matrix(c(1, 0, 0, 0), nrow = 4) g1 <- 1.5 g2 <- 1.5 } else if (scenario == "2") { b1 <- matrix(c(1, 1, .5, 0), nrow = 4) b2 <- matrix(c(1, 0, 0, 0), nrow = 4) g1 <- 1.5 g2 <- 1.5 } else if (scenario == "3") { b1 <- matrix(c(1, .25, -.1, 0), nrow = 4) b2 <- matrix(c(1, .05, 0, .1), nrow = 4) g1 <- 3 g2 <- 3 } else if (scenario == "4") { b1 <- matrix(c(1, .85, -.1, 0), nrow = 4) b2 <- matrix(c(1, -.1, 0, 0), nrow = 4) g1 <- 3.5 g2 <- 3.5 } else if (scenario == "5") { b1 <- matrix(c(1, .75, .5, .5), nrow = 4) b2 <- matrix(c(1, 1, -1, -1), nrow = 4) g1 <- 1.5 g2 <- 1.5 } else stop("Scenario does not exist") list(b1 = b1, b2 = b2, g1 = g1, g2 = g2, beta.cens = beta.cens) } #' Cumulative hazard function under the proportional hazards Weibull model #' #' @param t Time at which to be evaluated (must be >= 0) #' @param Z Covariate matrix #' @param beta Coefficients to be applied to covariate matrix #' @param gamma Shape parameter #' #' @return Cumulative hazards evaluated at t Haz11<-function(t,Z, beta,gamma){ lambda <- exp(Z %*% beta) (t / lambda)^gamma } #' Hazard function under the proportional hazards Weibull model #' #' @param t Time at which to be evaluated (must be >= 0) #' @param Z Covariate matrix #' @param beta Coefficients to be applied to covariate matrix #' @param gamma Shape parameter #' #' @return Hazards evaluated at t haz11 <- function(t, Z, beta, gamma) { lambda <- exp(Z %*% beta) gamma * t^(gamma - 1) * (1/lambda)^gamma } #' Overall survival probability #' #' Overall survival probability under a competing risks model with two causes #' #' @param t Time at which to be evaluated (must be >= 0) #' @param Z Covariate matrix #' @param beta1 Coefficients to be applied to covariate matrix for cause 1 #' @param gamma1 Shape parameter for cause 1 #' @param beta2 Coefficients to be applied to covariate matrix for cause 2 #' @param gamma2 Shape parameter for cause 2 #' #' @return Survival probabilities evaluated at t St <- function(t, Z,beta1,gamma1,beta2,gamma2){ exp(-(Haz11(t,Z, beta1,gamma1)+Haz11(t,Z, beta2,gamma2))) } #' Numeric inverse #' #' @param fn Function to invert #' @param mix_x Lower limit of the range of fn #' @param max_x Upper limit of the range of fn #' #' @return A function corresponding to the inverse of fn inverse <- function(fn, min_x, max_x) { fn_inv = function(y){ uniroot((function(x){fn(x) - y}), lower=min_x, upper=max_x)[1]$root } return(Vectorize(fn_inv)) } #' Sample event times #' #' Overall survival times in the competing risks model using the #' probability integral transform. #' #' @param mat covariate matrix #' @param b1 Coefficients for cause 1 #' @param g1 Shape for cause 1 #' @param b2 Coefficients for cause 2 #' @param g2 Shape for cause 2 #' #' @return A vector of overall survival times #' sampletimesfunc <- function(mat,b1,g1,b2,g2) { n <- dim(mat)[1] UU <- runif(n) sapply(1:n, function(i){ inverse(function(t) 1 - St(t, mat[i,], beta1 = b1,gamma1=g1, beta2=b2, gamma2=g2), 0, 1e8)(UU[i]) }) } #' Generate a dataset #' #' Simulates a dataset under the competing risks model with two causes #' and censoring. #' #' @param n Sample size #' @param beta1 Coefficients for cause 1 #' @param gamma1 Shape for cause 1 #' @param beta2 Coefficients for cause 2 #' @param gamma2 Shape for cause 2 #' @param beta.cens Coefficients for censoring #' @param cens.rate Target censoring rate, used for testing #' @param gamma.cens Shape parameter of weibull for censoring, if 1 we have PH censoring #' #' @return A data frame with the simulated data #' @export #' genset_func<-function(n, beta1, gamma1, beta2, gamma2, beta.cens = c(0,0,0), cens.rate = .2, gamma.cens = 3){ tr<-rbinom(n,1,0.3) X1 <- rnorm(n, mean = 4, sd = 1) X2 <- exp(rnorm(n, mean = 0, sd = 1)) if(!is.matrix(beta1)) { beta100<-c(beta1, rep(0,4-length(beta1))) beta10<-as.matrix(beta100,nrow=4) beta200<-c(beta2, rep(0,4-length(beta2))) beta20<-as.matrix(beta200,nrow=4) } else { beta10 <- beta1 beta20 <- beta2 } matx<-cbind(1,tr,X1, X2) Touts<-sampletimesfunc(matx,beta10,gamma1,beta20,gamma2) cen0 <- uniroot(function(c0) { lambda <- exp(matx %*% c(c0, beta.cens)) Cen<- rweibull(n, scale = lambda, shape = gamma.cens) cens.rate - mean(Cen < Touts) }, c(-1e4, 100))$root lambda <- exp(matx %*% c(cen0, beta.cens)) Cen<- rweibull(n, scale = lambda, shape = gamma.cens) type1<-rbinom(n,1,haz11(Touts,matx,beta10,gamma1)/(haz11(Touts,matx,beta10,gamma1)+ haz11(Touts,matx,beta20,gamma2))) type1<-ifelse(type1 == 1, 1, 2) ID<-1:length(type1) Toutfin<-pmin(Touts, Cen) type1b<-ifelse(Cen 1e5 jack.s <- system.time(prodlim:::jackknife.survival(me.surv, times = tmax)) po <- system.time(pseudo::pseudosurv(n500$Toutfin, n500$type1b == 1, tmax)) } else { jack.s <- po <- NA } mrs <- with(n500, survival::Surv(Toutfin, type1b == 1)) myests <- system.time(eventglm:::jackknife.survival2(sfit.surv, times = tmax, mrs)) res <- rbind(res, data.frame(n = nnn, times = c(jack.s[1], myests[1], po[1]), type = c("prodlim", "eventglm", "pseudo"))) write.table(res, "benchmark-surv.csv", row.names = FALSE, append = TRUE, col.names = FALSE, sep = ",") } res2 <- NULL for(nn in c(500, 1000, 5000, 10000, 50000, 100000)){ n500 <- gen_data(nn) fpm <- system.time(fast_pseudo_mean(round(n500$Toutfin*1000), as.numeric(n500$type1b == 1), 2000) / 1000)[1] if(nn < 50000) { pm <- system.time(pseudomean(n500$Toutfin, as.numeric(n500$type1b == 1), 2))[1] } else { pm <- NA } sfit.surv <- survival::survfit(survival::Surv(Toutfin,type1b == 1) ~ 1, data = n500) mrs <- with(n500, survival::Surv(Toutfin, type1b == 1)) if(nn < 100000) { myests <- system.time(eventglm:::get_pseudo_rmean(sfit.surv, time = tmax,cause = 1, mrs))[1] } else { myests <- NA } res2 <- rbind(res2, data.frame(n = nn, times = c(fpm,pm,myests), type = c("fastpseudo", "pseudo", "eventglm"))) } p1 <- ggplot(res, aes(x = n, y = times, color = type)) + geom_point() + geom_line() + scale_y_log10("running time (seconds)") + scale_x_log10("sample size") + scale_color_brewer("", type = "qual", palette = 2) + theme_bw() + theme(legend.position = "bottom") + ggtitle("Survival probability") p2 <- ggplot(res2, aes(x = n, y = times, color = type)) + geom_point() + geom_line() + scale_y_log10("running time (seconds)") + scale_x_log10("sample size") + scale_color_brewer("", type = "qual", palette = 2) + theme_bw() + theme(legend.position = "bottom") + ggtitle("Restricted mean survival") p1 + p2 ggsave("../Figures/benchmark.pdf", width = 7.5, height = 3.4) } ## extending eventglm eventglm::pseudo_independent pseudo_parametric <- function(formula, time, cause = 1, data, type = c("cuminc", "survival", "rmean"), formula.censoring = NULL, ipcw.method = NULL){ margformula <- update.formula(formula, . ~ 1) mr <- model.response(model.frame(margformula, data = data)) marginal.estimate <- survival::survreg(margformula, data = data, dist = "weibull") theta <- pweibull(time, shape = 1 / marginal.estimate$scale, scale = exp(marginal.estimate$coefficients[1])) theta.i <- sapply(1:nrow(data), function(i) { me <- survival::survreg(margformula, data = data[-i, ], dist = "weibull") pweibull(time, shape = 1 / me$scale, scale = exp(me$coefficients[1])) }) POi <- theta + (nrow(data) - 1) * (theta - theta.i) POi } fitpara <- cumincglm(Surv(time, status) ~ rx + sex + age, time = 2500, model.censoring = pseudo_parametric, data = colon) fitdef <- cumincglm(Surv(time, status) ~ rx + sex + age, time = 2500, model.censoring = "independent", data = colon) sapply(list(parametric = fitpara, default = fitdef), coefficients) fitpara <- cumincglm(Surv(time, status) ~ rx + sex + age, time = 2500, model.censoring = "parametric", data = colon) pseudo_infjack <- function(formula, time, cause = 1, data, type = c("cuminc", "survival", "rmean"), formula.censoring = NULL, ipcw.method = NULL) { marge <- survival::survfit(update.formula(formula, . ~ 1), data = data, influence = TRUE) tdex <- sapply(time, function(x) max(which(marge$time <= x))) pstate <- marge$surv[tdex] ## S(t) + (n)[S(t) -S_{-i}(t)] POi <- matrix(pstate, nrow = marge$n, ncol = length(time), byrow = TRUE) + (marge$n) * (marge$influence.surv[, tdex + 1]) POi } fitinf <- cumincglm(Surv(time, status) ~ rx + sex + age, time = 2500, model.censoring = "infjack", data = colon) fitdefsurv <- cumincglm(Surv(time, status) ~ rx + sex + age, time = 2500, survival = TRUE, data = colon) sapply(list(infjack = fitinf, default = fitdefsurv), coefficients)