### R code from vignette source ### "Z:/Competing_risk/Paper_simDAG/JoSS/article.Rnw" ################################################### ### code chunk number 1: preliminaries ################################################### options(prompt = "R> ", continue = "+ ", useFancyQuotes = FALSE) ################################################### ### code chunk number 2: article.Rnw:117-123 ################################################### # the first simple base R example to simulate data set.seed(43) n <- 100 A <- stats::rnorm(n) B <- stats::rnorm(n) C <- -2 + A * 0.3 + B * -2 + stats::rnorm(n) ################################################### ### code chunk number 3: article.Rnw:213-218 ################################################### # the simDAG version of the previous example library("simDAG") dag <- empty_dag() + node(c("A", "B"), type = "rnorm", mean = 0, sd = 1) + node("C", type = "gaussian", formula = ~ -2 + A * 0.3 + B * -2, error = 1) ################################################### ### code chunk number 3: article.Rnw:213-218 ################################################### # plotting the DAG from above, recreating figure 1 library("igraph") library("ggplot2") library("ggforce") plot(dag, layout = "as_tree", node_size = 0.15, node_fill = "gray", node_text_fontface = "italic") ################################################### ### code chunk number 3: article.Rnw:213-218 ################################################### # printing the structural equations of the DAG above summary(dag) ################################################### ### code chunk number 6: article.Rnw:292-308 ################################################### # recreating the DGP of Denz et al. (2023) using simDAG code dag <- empty_dag() + node(c("X1", "X3"), type = "rbernoulli", p = 0.5, output = "numeric") + node(c("X4", "X6"), type = "rnorm", mean = 0, sd = 1) + node("X2", type = "gaussian", formula = ~0.3 + X3 * 0.1, error = 1) + node("X5", type = "gaussian", formula = ~0.3 + X6 * 0.1, error = 1) + node("Z", type = "binomial", formula = ~-1.2 + I(X2^2) * log(3) + X3 * log(1.5) + X5 * log(1.5) + X6 * log(2), output = "numeric") + node("T", type = "cox", formula = ~X1 * log(1.8) + X2 * log(1.8) + X4 * log(1.8) + I(X5^2) * log(2.3) + Z * -1, surv_dist = "weibull", lambda = 2, gamma = 2.4, cens_dist = "rweibull", cens_args = list(shape = 1, scale = 2)) ################################################### ### code chunk number 6: article.Rnw:292-308 ################################################### plot(dag, node_size = 0.3, node_fill = "gray", node_text_fontface = "italic") ################################################### ### code chunk number 8: article.Rnw:325-326 ################################################### summary(dag) ################################################### ### code chunk number 9: article.Rnw:331-334 ################################################### # simulating data from the dag above library("data.table") dat <- sim_from_dag(dag, n_sim = 500) head(round(dat, 3)) ################################################### ### code chunk number 10: article.Rnw:347-361 ################################################### # recreating the DGP of the simulation study by Gruber et al. (2015) dag <- empty_dag() + node("U", type = "rbernoulli", p = 0.5, output = "numeric") + node("L0", type = "gaussian", formula = ~0.1 + 0.6 * U, error = 1) + node("A0", type = "binomial", formula = ~-0.4 + 0.6 * L0, output = "numeric") + node("Y1", type = "binomial", formula = ~-3.5 + -0.9 * U, output = "numeric") + node("L1", type = "gaussian", formula = ~0.5 + 0.02 * L0 + 0.5 * U + 1.5 * A0, error = 1) + node("A1", type = "binomial", formula = ~-0.4 + 0.02 * L0 + 0.58 * L1, output = "numeric") + node("Y2", type = "binomial", formula = ~-2.5 + 0.9 * U, output = "numeric") ################################################### ### code chunk number 11: dag3 ################################################### plot(dag, node_size = 0.2, node_fill = "gray", node_text_fontface = "italic", layout = "in_circle") ################################################### ### code chunk number 12: article.Rnw:375-376 ################################################### summary(dag) ################################################### ### code chunk number 12: article.Rnw:375-376 ################################################### dat <- sim_from_dag(dag, n_sim = 1000) head(dat) ################################################### ### code chunk number 14: article.Rnw:397-406 ################################################### # code to recreate figure 2 set.seed(129) dag <- empty_dag() + node("Y_0", type = "rnorm") + node("Y_1", type = "binomial", formula = ~1 + Y_0 * 2) + node("Y_2", type = "binomial", formula = ~1 + Y_1 * 3) plot(dag, node_size = 0.3, node_fill = "gray", node_text_fontface = "italic") ################################################### ### code chunk number 15: article.Rnw:436-439 ################################################### # a first very simple example of discrete-time simulation, using only a single # time-dependent outcome dag <- empty_dag() + node_td("Y", type = "time_to_event", prob_fun = 0.01, event_duration = Inf) ################################################### ### code chunk number 16: article.Rnw:444-445 ################################################### # simulating data for the simple DGP given above sim <- sim_discrete_time(dag, n_sim = 1000, max_t = 80) ################################################### ### code chunk number 17: article.Rnw:450-451 ################################################### head(sim$data) # below is the code used to re-create figure 3 library("cowplot") # P_Y(t) base_p <- 0.005 plotdata <- data.frame(time = seq(1, 300), prob = c(rep(base_p, 99), rep(base_p * 3.24, 20), rep(base_p, 181))) p1 <- ggplot(plotdata, aes(x = time, y = prob)) + geom_step() + theme_minimal() + labs(x = "Time in days", y = expression(P[Y](t))) + ylim(c(0, 0.02)) + theme(axis.title.x = element_blank()) p1 # P_A(t) base_p <- 0.01 plotdata <- data.frame(time = seq(1, 300), prob = c(rep(base_p, 99), rep(1, 20), rep(0, 130), rep(base_p, 51))) p2 <- ggplot(plotdata, aes(x = time, y = prob)) + geom_step() + theme_minimal() + labs(x = "Time in days", y = expression(P[A](t))) + scale_y_continuous(labels = function(x) format(x, nsmall = 3)) p2 plot_grid(p1, p2, ncol = 1) ################################################### ### code chunk number 18: article.Rnw:504-513 ################################################### # a slightly more complex example of a discrete-time simulation DGP encoded as # a DAG simulating the effect of a Covid-19 vaccination (A) on presence of a # myocarditis (Y) prob_myoc <- function(data, P_0, RR_A) { fifelse(data$A_event, P_0 * RR_A, P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) ################################################### ### code chunk number 19: article.Rnw:520-521 ################################################### sim <- sim_discrete_time(dag, n_sim = 10000, max_t = 365 * 2) ################################################### ### code chunk number 20: article.Rnw:526-529 ################################################### # converting the simDT object created in the previous chunk to usable data in # the start-stop format dat <- sim2data(sim, to = "start_stop", target_event = "Y", keep_only_first = TRUE, overlap = TRUE) head(dat) ################################################### ### code chunk number 21: article.Rnw:534-537 ################################################### # fitting a simple Cox regression model to the simulated data to validate the # DGP library("survival") mod <- coxph(Surv(start, stop, Y) ~ A, data = dat) summary(mod) ################################################### ### code chunk number 22: article.Rnw:544-554 ################################################### # a slightly more realistic example of the DGP, in which people who are do not # get vaccinated in the first 80 days after onset of a myocarditis event prob_vacc <- function(data, P_0) { fifelse(data$Y_event, 0, P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = prob_vacc, event_duration = 20, immunity_duration = 150, P_0 = 0.01) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24, event_duration = 80) ################################################### ### code chunk number 23: article.Rnw:593-641 ################################################### # A small investigation of how the runtime changes with varying values of n_sim # and max_t. This code takes a few minutes to run because of the large values # used to show the scaling. library("microbenchmark") set.seed(1234) prob_myoc <- function(data, P_0, RR_A) { fifelse(data$A_event, P_0 * RR_A, P_0) } run_example <- function(n, max_t) { dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = n, max_t = max_t) dat <- sim2data(sim, to = "start_stop", target_event = "Y", keep_only_first = TRUE, overlap = TRUE) } n <- c(10, 100, 1000, 10000, 100000) max_t <- c(10, 100, 1000, 10000) params <- data.frame(max_t = rep(max_t, each = length(n)), n = rep(n), time = NA) for (i in seq_len(nrow(params))) { n_i <- params$n[i] max_t_i <- params$max_t[i] bench <- microbenchmark(run_example(n = n_i, max_t = max_t_i), times = 1) params$time[i] <- mean(bench$time / 1000000000) } params <- within(params, { max_t <- factor(max_t) }) ggplot(params, aes(x = n, y = time, color = max_t)) + geom_point() + geom_line() + theme_bw() + labs(x = "n_sim", y = "Runtime in Seconds", color = "max_t") + scale_x_continuous(labels = scales::label_comma(), transform = "log10") + scale_y_log10() ####### Appendix # Below is the code used in the appendix, which gives small and simple examples # on different possible data generation mechanisms supported in the package. ################################################### ### code chunk number 24: article.Rnw:697-711 ################################################### # discrete-time simulation with time-dependent base probabilities prob_myoc <- function(data, P_0, RR_A, sim_time) { P_0 <- P_0 + 0.001 * sim_time fifelse(data$A_event, P_0 * RR_A, P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 25: article.Rnw:720-734 ################################################### # discrete-time simulation with time-dependent effects prob_myoc <- function(data, P_0, RR_A, sim_time) { RR_A <- RR_A + 0.01 * sim_time fifelse(data$A_event, P_0 * RR_A, P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 26: article.Rnw:743-759 ################################################### # discrete-time simulation with non-linear effects prob_myoc <- function(data, P_0, RR_A) { RR_A <- RR_A - 0.1 * data$A_time_since_last fifelse(data$A_event, P_0 * RR_A, P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150, time_since_last = TRUE) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event", "A_time_since_last"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 27: article.Rnw:802-824 ################################################### # discrete-time simulation with multiple interrelated binary time-dependent # variables prob_myoc <- function(data, P_0, RR_A, RR_C) { P_0 * RR_A ^ (data$A_event) * RR_C ^ (data$C_event) } prob_covid <- function(data, P_0, d_immune) { fifelse(data$A_time_since_last < d_immune, 0, P_0, na = P_0) } dag <- empty_dag() + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150, time_since_last = TRUE) + node_td("C", type = "time_to_event", prob_fun = prob_covid, parents = c("A_time_since_last"), event_duration = 14, P_0 = 0.01, d_immune = 120) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event", "A_time_since_last", "C_event"), P_0 = 0.005, RR_A = 3.24, RR_C = 2.5) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 28: article.Rnw:833-851 ################################################### # discrete-time simulation using a time-fixed baseline covariate prob_myoc <- function(data, P_0, RR_A) { fifelse(data$A_event, P_0 * RR_A, P_0) } prob_vacc <- function(data, P_0) { fifelse(data$Sex, P_0 * 2, P_0) } dag <- empty_dag() + node("Sex", type = "rbernoulli", p = 0.4) + node_td("A", type = "time_to_event", prob_fun = 0.01, event_duration = 20, immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 29: article.Rnw:860-882 ################################################### # discrete-time simulation using a time-dependent categorical variable prob_myoc <- function(data, P_0, RR_A) { fifelse(data$A_event > 0, P_0 * RR_A, P_0) } prob_vacc <- function(data) { n <- nrow(data) p_mat <- matrix(c(rep(0.99, n), rep(0.0005, n), rep(0.0005, n)), byrow = FALSE, ncol = 3) return(p_mat) } dag <- empty_dag() + node_td("A", type = "competing_events", prob_fun = prob_vacc, event_duration = c(20, 20), immunity_duration = 150) + node_td("Y", type = "time_to_event", prob_fun = prob_myoc, parents = c("A_event"), P_0 = 0.005, RR_A = 3.24) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200, save_states = "all") data <- sim2data(sim, to = "start_stop") head(data) ################################################### ### code chunk number 30: article.Rnw:893-902 ################################################### # discrete-time simulation using a time-dependent continuous variable dag <- empty_dag() + node("calories", type = "rnorm", mean = 2500, sd = 150) + node_td("calories", type = "gaussian", formula = ~ 1 + calories * 1.1, error = 1) sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200, save_states = "all") data <- sim2data(sim, to = "long") head(data) ################################################### ### code chunk number 31: article.Rnw:911-930 ################################################### # discrete-time simulation with ordered events prob_bachelors <- function(data) { fifelse(data$highschool_event, 0.01, 0) } prob_masters <- function(data) { fifelse(data$bachelors_event, 0.01, 0) } dag <- empty_dag() + node_td("highschool", type = "time_to_event", prob_fun = 0.01, event_duration = Inf) + node_td("bachelors", type = "time_to_event", prob_fun = prob_bachelors, event_duration = Inf, parents = "highschool_event") + node_td("masters", type = "time_to_event", prob_fun = prob_masters, event_duration = Inf, parents = "bachelors_event") sim <- sim_discrete_time(dag, n_sim = 100, max_t = 200) data <- sim2data(sim, to = "start_stop") head(data)