## Script for reproduction of all results in "cutpointr: Improved and tidy ## estimation of optimal cutpoints in R" library("tidyverse") library("cutpointr") library("fANCOVA") library("mgcv") library("ThresholdROC") library("ROCR") library("OptimalCutpoints") library("pROC") library("microbenchmark") ## Code for Figure 1 Function for generating data with a normal, lognormal or ## gamma distribution and two classes generate_data <- function(n, prev = 0.5, dist, mean_h = 100, sd_h = 10, mean_d = 110, sd_d = 10, shape_h = 0.5, rate_h = 1, shape_d = 0.5, rate_d = 1, meanlog_h = 1, sdlog_h = 1, meanlog_d = 1, sdlog_d = 1, mean_h1 = 100, mean_h2 = 120, mean_h3 = 160, sd_h1 = 10, sd_h2 = 10, sd_h3 = 10, mean_d1 = 140, mean_d2 = 180, mean_d3 = 200, sd_d1 = 10, sd_d2 = 10, sd_d3 = 10) { ## If e.g. both groups with n = 7.5, decide randomly which becomes the larger ## one if ((n * prev)%%1 == 0.5) { if (rnorm(1) > 0) { n_d <- ceiling(n * prev) n_h <- n - n_d } else { n_d <- floor(n * prev) n_h <- n - n_d } } else { n_d <- round(n * prev) n_h <- n - n_d } if (dist == "normal") { dat_h <- data.frame(x = rnorm(n = n_h, mean = mean_h, sd = sd_h), group = "h") dat_d <- data.frame(x = rnorm(n = n_d, mean = mean_d, sd = sd_d), group = "d") return(rbind(dat_h, dat_d)) } else if (dist == "gamma") { dat_h <- data.frame(x = rgamma(n = n_h, shape = shape_h, rate = rate_h), group = "h") dat_d <- data.frame(x = rgamma(n = n_d, shape = shape_d, rate = rate_d), group = "d") return(rbind(dat_h, dat_d)) } else if (dist == "lognormal") { dat_h <- data.frame(x = rlnorm(n = n_h, meanlog = meanlog_h, sd = sdlog_h), group = "h") dat_d <- data.frame(x = rlnorm(n = n_d, mean = meanlog_d, sd = sdlog_d), group = "d") return(rbind(dat_h, dat_d)) } } ## Parameter data frames that include the distribution parameters, the optimal ## cutpoint in the population and the value of the Youden-Index in the ## population params_normal <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "normal", class = "factor"), n = c(30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000), m_h = c(100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100), sd_h = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), m_d = c(105.05, 105.05, 105.05, 105.05, 105.05, 105.05, 105.05, 105.05, 105.05, 110.488, 110.488, 110.488, 110.488, 110.488, 110.488, 110.488, 110.488, 110.488, 116.83, 116.83, 116.83, 116.83, 116.83, 116.83, 116.83, 116.83, 116.83, 125.631, 125.631, 125.631, 125.631, 125.631, 125.631, 125.631, 125.631, 125.631), sd_d = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10), actual_cutpoint = c(102.525, 102.525, 102.525, 102.525, 102.525, 102.525, 102.525, 102.525, 102.525, 105.244, 105.244, 105.244, 105.244, 105.244, 105.244, 105.244, 105.244, 105.244, 108.415, 108.415, 108.415, 108.415, 108.415, 108.415, 108.415, 108.415, 108.415, 112.8155, 112.8155, 112.8155, 112.8155, 112.8155, 112.8155, 112.8155, 112.8155, 112.8155), youden = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8)), class = "data.frame", row.names = c(NA, -36L), .Names = c("dist", "n", "m_h", "sd_h", "m_d", "sd_d", "actual_cutpoint", "youden")) params_lognormal <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "lognormal", class = "factor"), n = c(30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000), meanlog_h = c(2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5), sdlog_h = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), meanlog_d = c(2.76, 2.76, 2.76, 2.76, 2.76, 2.76, 2.76, 2.76, 2.76, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.78, 3.78, 3.78, 3.78, 3.78, 3.78, 3.78, 3.78, 3.78), sdlog_d = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), actual_cutpoint = c(13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297), youden = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8)), class = "data.frame", row.names = c(NA, -36L), .Names = c("dist", "n", "meanlog_h", "sdlog_h", "meanlog_d", "sdlog_d", "actual_cutpoint", "youden")) params_gamma <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gamma", class = "factor"), n = c(30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000), shape_h = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), rate_h = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), shape_d = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), rate_d = c(0.344, 0.344, 0.344, 0.344, 0.344, 0.344, 0.344, 0.344, 0.344, 0.233, 0.233, 0.233, 0.233, 0.233, 0.233, 0.233, 0.233, 0.233, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723), actual_cutpoint = c(4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815), youden = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8)), class = "data.frame", row.names = c(NA, -36L), .Names = c("dist", "n", "shape_h", "rate_h", "shape_d", "rate_d", "actual_cutpoint", "youden")) ## Main simulation script that takes the above data frames of parameters and ## returns the estimated cutpoints per method run_sim <- function(paramgrid, reps, pos_class = "d", direction = ">=", boot_runs, break_ties = mean) { library("foreach") library("cutpointr") simdat <- purrr::map_df(1:nrow(paramgrid), function(i) { cat("\n") print(i) tempp <- paramgrid[i, ] exports <- c("generate_data", "pos_class", "direction", "break_ties", "boot_runs") pb <- txtProgressBar(max = reps, style = 3) progress <- function(n) setTxtProgressBar(pb, n) opts <- list(progress = progress) foreach(j = 1:reps, .export = exports, .packages = c("cutpointr", "dplyr", "tibble"), .combine = dplyr::bind_rows, .options.snow = opts) %dorng% { data <- generate_data(n = tempp$n, prev = 0.5, dist = tempp$dist, mean_h = tempp$m_h, sd_h = tempp$sd_h, mean_d = tempp$m_d, sd_d = tempp$sd_d, shape_h = tempp$shape_h, rate_h = tempp$rate_h, shape_d = tempp$shape_d, rate_d = tempp$rate_d, meanlog_h = tempp$meanlog_h, sdlog_h = tempp$sdlog_h, meanlog_d = tempp$meanlog_d, sdlog_d = tempp$sdlog_d, mean_h1 = tempp$mean_h1, mean_h2 = tempp$mean_h2, mean_h3 = tempp$mean_h3, sd_h1 = tempp$sd_h1, sd_h2 = tempp$sd_h2, sd_h3 = tempp$sd_h3, mean_d1 = tempp$mean_d1, mean_d2 = tempp$mean_d2, mean_d3 = tempp$mean_d3, sd_d1 = tempp$sd_d1, sd_d2 = tempp$sd_d2, sd_d3 = tempp$sd_d3) res_emp <- cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = maximize_metric, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs, use_midpoints = TRUE) res_normal <- cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = oc_youden_normal, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs) res_loess <- try(suppressWarnings(cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = maximize_loess_metric, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs, use_midpoints = TRUE)), silent = TRUE) if (class(res_loess) == "try-error") res_loess <- tibble(optimal_cutpoint = NA, boot = list(NA)) res_boot <- cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = maximize_boot_metric, boot_cut = 50, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs, use_midpoints = TRUE) res_spline <- try(cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = maximize_spline_metric, spar = 1, use_midpoints = TRUE, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs), silent = TRUE) if (class(res_spline) == "try-error") res_spline <- tibble(optimal_cutpoint = NA, boot = list(NA)) res_kernel <- try(cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = oc_youden_kernel, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs), silent = TRUE) if (class(res_kernel) == "try-error") res_kernel <- tibble(optimal_cutpoint = NA, boot = list(NA)) res_gam <- try(cutpointr(data, "x", "group", pos_class = pos_class, direction = direction, method = maximize_gam_metric, metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs, use_midpoints = TRUE), silent = TRUE) if (class(res_gam) == "try-error") res_gam <- tibble(optimal_cutpoint = NA, boot = list(NA)) if (boot_runs > 0) { res_i <- bind_rows(tibble(method = "emp", cutpoints = res_emp$optimal_cutpoint, boot = res_emp$boot), tibble(method = "normal", cutpoints = res_normal$optimal_cutpoint, boot = res_normal$boot), tibble(method = "loess", cutpoints = res_loess$optimal_cutpoint, boot = res_loess$boot), tibble(method = "boot", cutpoints = res_boot$optimal_cutpoint, boot = res_boot$boot), tibble(method = "spline", cutpoints = res_spline$optimal_cutpoint, boot = res_spline$boot), tibble(method = "kernel", cutpoints = res_kernel$optimal_cutpoint, boot = res_kernel$boot), tibble(method = "gam", cutpoints = res_gam$optimal_cutpoint, boot = res_gam$boot)) } else { res_i <- bind_rows(tibble(method = "emp", cutpoints = res_emp$optimal_cutpoint), tibble(method = "normal", cutpoints = res_normal$optimal_cutpoint), tibble(method = "loess", cutpoints = res_loess$optimal_cutpoint), tibble(method = "boot", cutpoints = res_boot$optimal_cutpoint), tibble(method = "spline", cutpoints = res_spline$optimal_cutpoint), tibble(method = "kernel", cutpoints = res_kernel$optimal_cutpoint), tibble(method = "gam", cutpoints = res_gam$optimal_cutpoint)) } res_i$sim_nr <- i res_i$actual_cutpoint <- tempp$actual_cutpoint res_i$youden <- tempp$youden return(res_i) } }) return(simdat) } ## Run simulation in parallel LONG RUNTIME: The results are included in the ## supplementary file plotdat.RData library("doSNOW") library("doRNG") cl <- makeCluster(parallel::detectCores() - 1) registerDoSNOW(cl) set.seed(123) simdat_normal <- run_sim(params_normal, reps = 10000, boot_runs = 0) ## Optionally save the result # save(simdat_normal, file = "simdat_normal.RData") set.seed(234) simdat_lognormal <- run_sim(params_lognormal, 10000, boot_runs = 0) ## Optionally save the result # save(simdat_lognormal, file = "simdat_lognormal.RData") set.seed(345) simdat_gamma <- run_sim(params_gamma, 10000, boot_runs = 0) ## Optionally save the result # save(simdat_gamma, file = "simdat_gamma.RData") ## Function for summarizing the above simulation results summarise_simdat <- function(simdat, params, interval) { library("tidyverse") simdat <- simdat %>% group_by(sim_nr, method) %>% summarise(cutpoints = list(cutpoints), youden = unique(youden), actual_cutpoint = mean(actual_cutpoint)) params$sim_nr <- 1:nrow(params) simdat <- full_join(simdat, params %>% select(-actual_cutpoint, -youden), by = "sim_nr") simdat <- simdat %>% mutate(residuals = purrr::map2(cutpoints, actual_cutpoint, function(cp, ac) cp - ac), abs_residuals = purrr::map2(cutpoints, actual_cutpoint, function(cp, ac) abs(cp - ac)), rmse = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) sqrt(mean((cp[is.finite(cp)] - ac)^2))), mean_err = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) { mean(cp[is.finite(cp)] - ac) }), median_abs_err = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) { median(abs(cp[is.finite(cp)] - ac)) }), median_err = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) { median(cp[is.finite(cp)] - ac) }), mape = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) { median(abs((cp[is.finite(cp)] - ac)/ac)) }), perc_in_interval = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) { mean((cp > (ac - interval)) & (cp < (ac + interval)), na.rm = TRUE) })) return(simdat) } simdat_normal2 <- summarise_simdat(simdat_normal, params_normal, interval = 1) simdat_normal2$method <- as.factor(simdat_normal2$method) simdat_lognormal2 <- summarise_simdat(simdat_lognormal, params_lognormal, interval = 1.5) simdat_lognormal2$method <- as.factor(simdat_lognormal2$method) simdat_gamma2 <- summarise_simdat(simdat_gamma, params_gamma, interval = 1.5) simdat_gamma2$method <- as.factor(simdat_gamma2$method) # save(simdat_gamma2, file = "data/simdat_gamma2.RData") # save(simdat_normal2, file = "data/simdat_normal2.RData") # save(simdat_lognormal2, file = "data/simdat_lognormal2.RData") plotdat <- simdat_normal2 %>% select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval) %>% dplyr::bind_rows(simdat_lognormal2 %>% select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval)) %>% dplyr::bind_rows(simdat_gamma2 %>% select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval)) plotdat$method <- as.factor(plotdat$method) plotdat <- plotdat %>% mutate(method = recode(method, boot = "Bootstrap", emp = "Empirical", gam = "GAM", kernel = "Kernel", loess = "LOESS", normal = "Parametric Normal", spline = "Spline")) plotdat <- rename(plotdat, Method = method) ## Load supplementary file if the above simulation was not run ## load("plotdat.RData") ## Figure 1 ggplot(plotdat, aes(x = n, y = median_abs_err, color = Method, shape = Method)) + geom_point(size = 2) + geom_line() + facet_grid(youden ~ dist, scales = "fixed", space = "fixed") + scale_shape_manual(values = 1:nlevels(plotdat$Method)) + scale_x_log10(breaks = c(30, 50, 75, 100, 150, 250, 500, 750, 1000)) + scale_y_log10(breaks = c(0.25, 0.5, 1, 2, 4, 6)) + ylab("Median Absolute Error (log scale)") + xlab("Sample Size (log scale)") + theme_bw() + theme(legend.position = "bottom") ## Code for Section 6 Example: suicide dataset library("cutpointr") opt_cut <- cutpointr(suicide, dsi, suicide) opt_cut cutpointr(suicide, dsi, suicide, direction = ">=", pos_class = "yes", neg_class = "no", method = maximize_metric, metric = sum_sens_spec) ## Reproducible Bootstrapping and parallelization library("doSNOW") library("tidyverse") cl <- makeCluster(2) ## 2 cores registerDoSNOW(cl) set.seed(100) opt_cut_b <- cutpointr(suicide, dsi, suicide, boot_runs = 1000, silent = TRUE, allowParallel = TRUE) stopCluster(cl) opt_cut_b opt_cut_b %>% select(optimal_cutpoint, data, boot) summary(opt_cut_b) plot(opt_cut_b) ## Figure 3 plot_metric(opt_cut_b, conf_lvl = 0.95) + ylab("Sensitivity + Specificity") + theme_bw() ## Figure 4 ## Subgroups set.seed(100) opt_cut_b_g <- cutpointr(suicide, dsi, suicide, gender, boot_runs = 1000, boot_stratify = TRUE, pos_class = "yes", direction = ">=") opt_cut_b_g %>% select(subgroup, optimal_cutpoint, sum_sens_spec, data) summary(opt_cut_b_g) plot(opt_cut_b_g) ## Figure 5 set.seed(100) opt_cut_k_b_g <- cutpointr(suicide, dsi, suicide, gender, method = oc_youden_kernel, metric = sum_sens_spec, boot_runs = 1000) summary(opt_cut_k_b_g) # Metric functions set.seed(100) opt_cut_c_g_b <- cutpointr(suicide, dsi, suicide, gender, method = minimize_metric, metric = misclassification_cost, cost_fp = 1, cost_fn = 10, pos_class = "yes", direction = ">=", boot_runs = 1000) opt_cut_c_g_b %>% select(subgroup, optimal_cutpoint, misclassification_cost) ## Return cutpoint that maximizes the sum of sensitivity and specificiy ROCR ## package rocr_sensspec <- function(x, class) { pred <- ROCR::prediction(x, class) perf <- ROCR::performance(pred, "sens", "spec") sens <- slot(perf, "y.values")[[1]] spec <- slot(perf, "x.values")[[1]] cut <- slot(perf, "alpha.values")[[1]] cut[which.max(sens + spec)] } ## pROC package proc_sensspec <- function(x, class) { r <- pROC::roc(class, x, algorithm = 2, levels = c(0, 1), direction = "<") pROC::coords(r, "best", ret = "threshold", best.method = "youden", transpose = FALSE)[1] } ## Start benchmark library("cutpointr") library("OptimalCutpoints") library("ThresholdROC") n <- 100 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) x_pos <- dat$x[dat$y == 1] x_neg <- dat$x[dat$y == 0] bench_100 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y), proc_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y", tag.healthy = 0, methods = "Youden", data = dat), thres2(k1 = x_neg, k2 = x_pos, rho = 0.5, method = "empirical", ci = FALSE), times = 100, unit = "ms") n <- 1000 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) x_pos <- dat$x[dat$y == 1] x_neg <- dat$x[dat$y == 0] bench_1000 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y), proc_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y", tag.healthy = 0, methods = "Youden", data = dat), thres2(k1 = x_neg, k2 = x_pos, rho = 0.5, method = "empirical", ci = FALSE), times = 100, unit = "ms") n <- 10000 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) x_pos <- dat$x[dat$y == 1] x_neg <- dat$x[dat$y == 0] bench_10000 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean, silent = TRUE), rocr_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y", tag.healthy = 0, methods = "Youden", data = dat), proc_sensspec(dat$x, dat$y), thres2(k1 = x_neg, k2 = x_pos, rho = 0.5, method = "empirical", ci = FALSE), times = 100) n <- 1e+05 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e5 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y), proc_sensspec(dat$x, dat$y), times = 100, unit = "ms") n <- 1e+06 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e6 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y), proc_sensspec(dat$x, dat$y), times = 30, unit = "ms") n <- 1e+07 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e7 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y), proc_sensspec(dat$x, dat$y), times = 30, unit = "ms") results <- rbind(data.frame(time = summary(bench_100)$median, Solution = summary(bench_100)$expr, n = 100), data.frame(time = summary(bench_1000)$median, Solution = summary(bench_1000)$expr, n = 1000), data.frame(time = summary(bench_10000)$median, Solution = summary(bench_10000)$expr, n = 10000), data.frame(time = summary(bench_1e5)$median, Solution = summary(bench_1e5)$expr, n = 1e+05), data.frame(time = summary(bench_1e6)$median, Solution = summary(bench_1e6)$expr, n = 1e+06), data.frame(time = summary(bench_1e7)$median, Solution = summary(bench_1e7)$expr, n = 1e+07)) results$Solution <- as.character(results$Solution) results$Solution[grep(pattern = "cutpointr", x = results$Solution)] <- "cutpointr" results$Solution[grep(pattern = "rocr", x = results$Solution)] <- "ROCR" results$Solution[grep(pattern = "optimal", x = results$Solution)] <- "OptimalCutpoints" results$Solution[grep(pattern = "proc", x = results$Solution)] <- "pROC" results$Solution[grep(pattern = "thres", x = results$Solution)] <- "ThresholdROC" results$task <- "Cutpoint Estimation" # These are the original results on our system # results <- structure(list(time = # c(8.186, 2.1413, 0.7965, 2.70435, 1.2973, 8.4256, 2.58245, 1.13005, 57.8441, # 42.1741, 10.5344, 6.2411, 3411.51285, 4.23885, 2369.47925, 30.85465, # 45.91525, 38.56395, 318.4059, 766.6403, 684.2322, 3896.7509, 8765.75865, # 8298.36255 ), Solution = c("cutpointr", "ROCR", "pROC", "OptimalCutpoints", # "ThresholdROC", "cutpointr", "ROCR", "pROC", "OptimalCutpoints", # "ThresholdROC", "cutpointr", "ROCR", "OptimalCutpoints", "pROC", # "ThresholdROC", "cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC", # "cutpointr", "ROCR", "pROC"), n = c(100, 100, 100, 100, 100, 1000, 1000, # 1000, 1000, 1000, 10000, 10000, 10000, 10000, 10000, 1e+05, 1e+05, 1e+05, # 1e+06, 1e+06, 1e+06, 1e+07, 1e+07, 1e+07), task = c("Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation", # "Cutpoint Estimation", "Cutpoint Estimation")), row.names = c(NA, -24L), # class = "data.frame") library("ggplot2") ## Figure 6 ggplot(results, aes(x = n, y = time, col = Solution, shape = Solution)) + geom_point(size = 3) + geom_line() + scale_y_log10(breaks = c(3, 5, 10, 25, 100, 250, 1000, 5000, 10000, 15000)) + scale_x_log10(breaks = c(1000, 10000, 1e+05, 1e+06, 1e+07)) + ylab("Median Time (milliseconds, log scale)") + xlab("Sample Size (log scale)") + theme_bw() ## Benchmark for ROC curve only ## ROCR package rocr_roc <- function(x, class) { pred <- ROCR::prediction(x, class) ROCR::performance(pred, "sens", "spec") } ## pROC package proc_roc <- function(x, class) { pROC::roc(class, x, algorithm = 2, levels = c(0, 1), direction = "<") } n <- 100 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_100 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 100, unit = "ms") n <- 1000 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1000 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 100, unit = "ms") n <- 10000 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_10000 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 100, unit = "ms") n <- 1e+05 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e5 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 100, unit = "ms") n <- 1e+06 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e6 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 30, unit = "ms") n <- 1e+07 set.seed(123) dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE)) bench_1e7 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1, neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y), times = 30, unit = "ms") results_roc <- rbind(data.frame(time = summary(bench_100)$median, Solution = summary(bench_100)$expr, n = 100), data.frame(time = summary(bench_1000)$median, Solution = summary(bench_1000)$expr, n = 1000), data.frame(time = summary(bench_10000)$median, Solution = summary(bench_10000)$expr, n = 10000), data.frame(time = summary(bench_1e5)$median, Solution = summary(bench_1e5)$expr, n = 1e+05), data.frame(time = summary(bench_1e6)$median, Solution = summary(bench_1e6)$expr, n = 1e+06), data.frame(time = summary(bench_1e7)$median, Solution = summary(bench_1e7)$expr, n = 1e+07)) results_roc$Solution <- as.character(results_roc$Solution) results_roc$Solution[grep(pattern = "cutpointr", x = results_roc$Solution)] <- "cutpointr" results_roc$Solution[grep(pattern = "rocr", x = results_roc$Solution)] <- "ROCR" results_roc$Solution[grep(pattern = "proc", x = results_roc$Solution)] <- "pROC" results_roc$task <- "ROC curve calculation" # Our results # results_roc <- structure(list(time = c(2.2162, 2.07115, 0.50925, # 2.4053, 2.5222, 0.80225, 3.40185, 6.2865, 3.7495, 12.9862, 45.9087, 36.20245, # 190.1973, 901.79005, 836.9555, 2353.80215, 8797.38765, 8075.3996 ), Solution # = c("cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC", "cutpointr", # "ROCR", "pROC", "cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC", # "cutpointr", "ROCR", "pROC"), n = c(100, 100, 100, 1000, 1000, 1000, 10000, # 10000, 10000, 1e+05, 1e+05, 1e+05, 1e+06, 1e+06, 1e+06, 1e+07, 1e+07, 1e+07), # task = c("ROC curve calculation", "ROC curve calculation", "ROC curve # calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve # calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve # calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve # calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve # calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve # calculation")), row.names = c(NA, -18L), class = "data.frame") ggplot(results_roc, aes(x = n, y = time, col = Solution, shape = Solution)) + geom_point(size = 3) + geom_line() + scale_y_log10(breaks = c(3, 5, 10, 25, 100, 250, 1000, 5000, 10000, 15000)) + scale_x_log10(breaks = c(1000, 10000, 1e+05, 1e+06, 1e+07)) + ylab("Median Time (milliseconds, log scale)") + xlab("Sample Size (log scale)") + theme_bw() results_all <- dplyr::bind_rows(results, results_roc) ggplot(results_all, aes(x = n, y = time, linetype = Solution, shape = Solution)) + geom_point(size = 3) + geom_line() + scale_y_log10(breaks = c(0.5, 1, 2, 3, 5, 10, 25, 100, 250, 1000, 5000, 10000, 15000)) + scale_x_log10(breaks = c(1000, 10000, 1e+05, 1e+06, 1e+07)) + ylab("Median Time (milliseconds, log scale)") + xlab("Sample Size (log scale)") + theme_bw() + facet_grid(~task)