## Script for reproduction of all results in "cutpointr: Improved and tidy
## estimation of optimal cutpoints in R"

library("tidyverse")
library("cutpointr")
library("fANCOVA")
library("mgcv")
library("ThresholdROC")
library("ROCR")
library("OptimalCutpoints")
library("pROC")
library("microbenchmark")

## Code for Figure 1 Function for generating data with a normal, lognormal or
## gamma distribution and two classes
generate_data <- function(n, prev = 0.5, dist, mean_h = 100, sd_h = 10, mean_d = 110,
  sd_d = 10, shape_h = 0.5, rate_h = 1, shape_d = 0.5, rate_d = 1, meanlog_h = 1,
  sdlog_h = 1, meanlog_d = 1, sdlog_d = 1, mean_h1 = 100, mean_h2 = 120, mean_h3 = 160,
  sd_h1 = 10, sd_h2 = 10, sd_h3 = 10, mean_d1 = 140, mean_d2 = 180, mean_d3 = 200,
  sd_d1 = 10, sd_d2 = 10, sd_d3 = 10) {
  ## If e.g. both groups with n = 7.5, decide randomly which becomes the larger
  ## one
  if ((n * prev)%%1 == 0.5) {
    if (rnorm(1) > 0) {
      n_d <- ceiling(n * prev)
      n_h <- n - n_d
    } else {
      n_d <- floor(n * prev)
      n_h <- n - n_d
    }
  } else {
    n_d <- round(n * prev)
    n_h <- n - n_d
  }
  if (dist == "normal") {
    dat_h <- data.frame(x = rnorm(n = n_h, mean = mean_h, sd = sd_h), group = "h")
    dat_d <- data.frame(x = rnorm(n = n_d, mean = mean_d, sd = sd_d), group = "d")
    return(rbind(dat_h, dat_d))
  } else if (dist == "gamma") {
    dat_h <- data.frame(x = rgamma(n = n_h, shape = shape_h, rate = rate_h),
      group = "h")
    dat_d <- data.frame(x = rgamma(n = n_d, shape = shape_d, rate = rate_d),
      group = "d")
    return(rbind(dat_h, dat_d))
  } else if (dist == "lognormal") {
    dat_h <- data.frame(x = rlnorm(n = n_h, meanlog = meanlog_h, sd = sdlog_h),
      group = "h")
    dat_d <- data.frame(x = rlnorm(n = n_d, mean = meanlog_d, sd = sdlog_d),
      group = "d")
    return(rbind(dat_h, dat_d))
  }
}

## Parameter data frames that include the distribution parameters, the optimal
## cutpoint in the population and the value of the Youden-Index in the
## population
params_normal <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "normal", class = "factor"), n = c(30,
  50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000,
  30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750,
  1000), m_h = c(100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
  100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100, 100,
  100, 100, 100, 100, 100, 100, 100), sd_h = c(10, 10, 10, 10, 10, 10, 10, 10,
  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  10, 10, 10, 10, 10, 10, 10, 10), m_d = c(105.05, 105.05, 105.05, 105.05, 105.05,
  105.05, 105.05, 105.05, 105.05, 110.488, 110.488, 110.488, 110.488, 110.488,
  110.488, 110.488, 110.488, 110.488, 116.83, 116.83, 116.83, 116.83, 116.83, 116.83,
  116.83, 116.83, 116.83, 125.631, 125.631, 125.631, 125.631, 125.631, 125.631,
  125.631, 125.631, 125.631), sd_d = c(10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10, 10,
  10, 10, 10, 10, 10, 10), actual_cutpoint = c(102.525, 102.525, 102.525, 102.525,
  102.525, 102.525, 102.525, 102.525, 102.525, 105.244, 105.244, 105.244, 105.244,
  105.244, 105.244, 105.244, 105.244, 105.244, 108.415, 108.415, 108.415, 108.415,
  108.415, 108.415, 108.415, 108.415, 108.415, 112.8155, 112.8155, 112.8155, 112.8155,
  112.8155, 112.8155, 112.8155, 112.8155, 112.8155), youden = c(0.2, 0.2, 0.2,
  0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.4, 0.6,
  0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8,
  0.8)), class = "data.frame", row.names = c(NA, -36L), .Names = c("dist", "n",
  "m_h", "sd_h", "m_d", "sd_d", "actual_cutpoint", "youden"))

params_lognormal <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "lognormal", class = "factor"),
  n = c(30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500,
    750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150,
    250, 500, 750, 1000), meanlog_h = c(2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5,
    2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5,
    2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5, 2.5), sdlog_h = c(0.5,
    0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
    0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
    0.5, 0.5, 0.5, 0.5, 0.5), meanlog_d = c(2.76, 2.76, 2.76, 2.76, 2.76, 2.76,
    2.76, 2.76, 2.76, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.02, 3.34,
    3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.34, 3.78, 3.78, 3.78, 3.78, 3.78,
    3.78, 3.78, 3.78, 3.78), sdlog_d = c(0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
    0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
    0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5), actual_cutpoint = c(13.8753268909238,
    13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238, 13.8753268909238,
    13.8753268909238, 13.8753268909238, 13.8753268909238, 15.8062732945954, 15.8062732945954,
    15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954, 15.8062732945954,
    15.8062732945954, 15.8062732945954, 18.5360654023089, 18.5360654023089, 18.5360654023089,
    18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089, 18.5360654023089,
    18.5360654023089, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297,
    23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297, 23.110954563297),
  youden = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4,
    0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8,
    0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8)), class = "data.frame", row.names = c(NA,
  -36L), .Names = c("dist", "n", "meanlog_h", "sdlog_h", "meanlog_d", "sdlog_d",
  "actual_cutpoint", "youden"))

params_gamma <- structure(list(dist = structure(c(1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L,
  1L, 1L, 1L, 1L, 1L, 1L, 1L, 1L), .Label = "gamma", class = "factor"), n = c(30,
  50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750, 1000,
  30, 50, 75, 100, 150, 250, 500, 750, 1000, 30, 50, 75, 100, 150, 250, 500, 750,
  1000), shape_h = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
  2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), rate_h = c(0.5, 0.5, 0.5, 0.5,
  0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5,
  0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5, 0.5),
  shape_d = c(2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2,
    2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2, 2), rate_d = c(0.344, 0.344, 0.344,
    0.344, 0.344, 0.344, 0.344, 0.344, 0.344, 0.233, 0.233, 0.233, 0.233, 0.233,
    0.233, 0.233, 0.233, 0.233, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428, 0.1428,
    0.1428, 0.1428, 0.1428, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723, 0.0723,
    0.0723, 0.0723), actual_cutpoint = c(4.79444155190761, 4.79444155190761,
    4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761, 4.79444155190761,
    4.79444155190761, 4.79444155190761, 5.71962280791379, 5.71962280791379, 5.71962280791379,
    5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379, 5.71962280791379,
    5.71962280791379, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681,
    7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681, 7.01659041722681,
    9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815,
    9.04271203767815, 9.04271203767815, 9.04271203767815, 9.04271203767815),
  youden = c(0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.2, 0.4, 0.4, 0.4, 0.4, 0.4,
    0.4, 0.4, 0.4, 0.4, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.6, 0.8, 0.8,
    0.8, 0.8, 0.8, 0.8, 0.8, 0.8, 0.8)), class = "data.frame", row.names = c(NA,
  -36L), .Names = c("dist", "n", "shape_h", "rate_h", "shape_d", "rate_d", "actual_cutpoint",
  "youden"))


## Main simulation script that takes the above data frames of parameters and
## returns the estimated cutpoints per method
run_sim <- function(paramgrid, reps, pos_class = "d", direction = ">=", boot_runs,
  break_ties = mean) {
  library("foreach")
  library("cutpointr")
  simdat <- purrr::map_df(1:nrow(paramgrid), function(i) {
    cat("\n")
    print(i)
    tempp <- paramgrid[i, ]
    exports <- c("generate_data", "pos_class", "direction", "break_ties", "boot_runs")
    pb <- txtProgressBar(max = reps, style = 3)
    progress <- function(n) setTxtProgressBar(pb, n)
    opts <- list(progress = progress)
    foreach(j = 1:reps, .export = exports, .packages = c("cutpointr", "dplyr",
      "tibble"), .combine = dplyr::bind_rows, .options.snow = opts) %dorng%
      {
        data <- generate_data(n = tempp$n, prev = 0.5, dist = tempp$dist,
          mean_h = tempp$m_h, sd_h = tempp$sd_h, mean_d = tempp$m_d, sd_d = tempp$sd_d,
          shape_h = tempp$shape_h, rate_h = tempp$rate_h, shape_d = tempp$shape_d,
          rate_d = tempp$rate_d, meanlog_h = tempp$meanlog_h, sdlog_h = tempp$sdlog_h,
          meanlog_d = tempp$meanlog_d, sdlog_d = tempp$sdlog_d, mean_h1 = tempp$mean_h1,
          mean_h2 = tempp$mean_h2, mean_h3 = tempp$mean_h3, sd_h1 = tempp$sd_h1,
          sd_h2 = tempp$sd_h2, sd_h3 = tempp$sd_h3, mean_d1 = tempp$mean_d1,
          mean_d2 = tempp$mean_d2, mean_d3 = tempp$mean_d3, sd_d1 = tempp$sd_d1,
          sd_d2 = tempp$sd_d2, sd_d3 = tempp$sd_d3)

        res_emp <- cutpointr(data, "x", "group", pos_class = pos_class, direction = direction,
          method = maximize_metric, metric = youden, break_ties = break_ties,
          silent = TRUE, boot_runs = boot_runs, use_midpoints = TRUE)
        res_normal <- cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = oc_youden_normal, metric = youden,
          break_ties = break_ties, silent = TRUE, boot_runs = boot_runs)
        res_loess <- try(suppressWarnings(cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = maximize_loess_metric, metric = youden,
          break_ties = break_ties, silent = TRUE, boot_runs = boot_runs,
          use_midpoints = TRUE)), silent = TRUE)
        if (class(res_loess) == "try-error")
          res_loess <- tibble(optimal_cutpoint = NA, boot = list(NA))
        res_boot <- cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = maximize_boot_metric, boot_cut = 50,
          metric = youden, break_ties = break_ties, silent = TRUE, boot_runs = boot_runs,
          use_midpoints = TRUE)
        res_spline <- try(cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = maximize_spline_metric, spar = 1,
          use_midpoints = TRUE, metric = youden, break_ties = break_ties,
          silent = TRUE, boot_runs = boot_runs), silent = TRUE)
        if (class(res_spline) == "try-error")
          res_spline <- tibble(optimal_cutpoint = NA, boot = list(NA))
        res_kernel <- try(cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = oc_youden_kernel, metric = youden,
          break_ties = break_ties, silent = TRUE, boot_runs = boot_runs),
          silent = TRUE)
        if (class(res_kernel) == "try-error")
          res_kernel <- tibble(optimal_cutpoint = NA, boot = list(NA))
        res_gam <- try(cutpointr(data, "x", "group", pos_class = pos_class,
          direction = direction, method = maximize_gam_metric, metric = youden,
          break_ties = break_ties, silent = TRUE, boot_runs = boot_runs,
          use_midpoints = TRUE), silent = TRUE)
        if (class(res_gam) == "try-error")
          res_gam <- tibble(optimal_cutpoint = NA, boot = list(NA))
        if (boot_runs > 0) {
          res_i <- bind_rows(tibble(method = "emp", cutpoints = res_emp$optimal_cutpoint,
          boot = res_emp$boot), tibble(method = "normal", cutpoints = res_normal$optimal_cutpoint,
          boot = res_normal$boot), tibble(method = "loess", cutpoints = res_loess$optimal_cutpoint,
          boot = res_loess$boot), tibble(method = "boot", cutpoints = res_boot$optimal_cutpoint,
          boot = res_boot$boot), tibble(method = "spline", cutpoints = res_spline$optimal_cutpoint,
          boot = res_spline$boot), tibble(method = "kernel", cutpoints = res_kernel$optimal_cutpoint,
          boot = res_kernel$boot), tibble(method = "gam", cutpoints = res_gam$optimal_cutpoint,
          boot = res_gam$boot))
        } else {
          res_i <- bind_rows(tibble(method = "emp", cutpoints = res_emp$optimal_cutpoint),
          tibble(method = "normal", cutpoints = res_normal$optimal_cutpoint),
          tibble(method = "loess", cutpoints = res_loess$optimal_cutpoint),
          tibble(method = "boot", cutpoints = res_boot$optimal_cutpoint),
          tibble(method = "spline", cutpoints = res_spline$optimal_cutpoint),
          tibble(method = "kernel", cutpoints = res_kernel$optimal_cutpoint),
          tibble(method = "gam", cutpoints = res_gam$optimal_cutpoint))
        }
        res_i$sim_nr <- i
        res_i$actual_cutpoint <- tempp$actual_cutpoint
        res_i$youden <- tempp$youden
        return(res_i)
      }
  })
  return(simdat)
}

## Run simulation in parallel LONG RUNTIME: The results are included in the
## supplementary file plotdat.RData

library("doSNOW")
library("doRNG")
cl <- makeCluster(parallel::detectCores() - 1)
registerDoSNOW(cl)
set.seed(123)
simdat_normal <- run_sim(params_normal, reps = 10000, boot_runs = 0)
## Optionally save the result
# save(simdat_normal, file = "simdat_normal.RData")
set.seed(234)
simdat_lognormal <- run_sim(params_lognormal, 10000, boot_runs = 0)
## Optionally save the result
# save(simdat_lognormal, file = "simdat_lognormal.RData")
set.seed(345)
simdat_gamma <- run_sim(params_gamma, 10000, boot_runs = 0)
## Optionally save the result
# save(simdat_gamma, file = "simdat_gamma.RData")

## Function for summarizing the above simulation results
summarise_simdat <- function(simdat, params, interval) {
  library("tidyverse")
  simdat <- simdat %>%
    group_by(sim_nr, method) %>%
    summarise(cutpoints = list(cutpoints), youden = unique(youden), actual_cutpoint = mean(actual_cutpoint))
  params$sim_nr <- 1:nrow(params)
  simdat <- full_join(simdat, params %>%
    select(-actual_cutpoint, -youden), by = "sim_nr")
  simdat <- simdat %>%
    mutate(residuals = purrr::map2(cutpoints, actual_cutpoint, function(cp, ac) cp -
      ac), abs_residuals = purrr::map2(cutpoints, actual_cutpoint, function(cp,
      ac) abs(cp - ac)), rmse = purrr::map2_dbl(cutpoints, actual_cutpoint,
      function(cp, ac) sqrt(mean((cp[is.finite(cp)] - ac)^2))), mean_err = purrr::map2_dbl(cutpoints,
      actual_cutpoint, function(cp, ac) {
        mean(cp[is.finite(cp)] - ac)
      }), median_abs_err = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp,
      ac) {
      median(abs(cp[is.finite(cp)] - ac))
    }), median_err = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp,
      ac) {
      median(cp[is.finite(cp)] - ac)
    }), mape = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp, ac) {
      median(abs((cp[is.finite(cp)] - ac)/ac))
    }), perc_in_interval = purrr::map2_dbl(cutpoints, actual_cutpoint, function(cp,
      ac) {
      mean((cp > (ac - interval)) & (cp < (ac + interval)), na.rm = TRUE)
    }))
  return(simdat)
}

simdat_normal2 <- summarise_simdat(simdat_normal, params_normal, interval = 1)
simdat_normal2$method <- as.factor(simdat_normal2$method)
simdat_lognormal2 <- summarise_simdat(simdat_lognormal, params_lognormal, interval = 1.5)
simdat_lognormal2$method <- as.factor(simdat_lognormal2$method)
simdat_gamma2 <- summarise_simdat(simdat_gamma, params_gamma, interval = 1.5)
simdat_gamma2$method <- as.factor(simdat_gamma2$method)
# save(simdat_gamma2, file = "data/simdat_gamma2.RData")
# save(simdat_normal2, file = "data/simdat_normal2.RData")
# save(simdat_lognormal2, file = "data/simdat_lognormal2.RData")

plotdat <- simdat_normal2 %>%
  select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval) %>%
  dplyr::bind_rows(simdat_lognormal2 %>%
    select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval)) %>%
  dplyr::bind_rows(simdat_gamma2 %>%
    select(sim_nr, method, youden, actual_cutpoint, dist, n, rmse, mean_err:perc_in_interval))
plotdat$method <- as.factor(plotdat$method)
plotdat <- plotdat %>%
  mutate(method = recode(method, boot = "Bootstrap", emp = "Empirical", gam = "GAM",
    kernel = "Kernel", loess = "LOESS", normal = "Parametric Normal", spline = "Spline"))
plotdat <- rename(plotdat, Method = method)

## Load supplementary file if the above simulation was not run
## load("plotdat.RData")

## Figure 1
ggplot(plotdat, aes(x = n, y = median_abs_err, color = Method, shape = Method)) +
  geom_point(size = 2) + geom_line() + facet_grid(youden ~ dist, scales = "fixed",
  space = "fixed") + scale_shape_manual(values = 1:nlevels(plotdat$Method)) + scale_x_log10(breaks = c(30,
  50, 75, 100, 150, 250, 500, 750, 1000)) + scale_y_log10(breaks = c(0.25, 0.5,
  1, 2, 4, 6)) + ylab("Median Absolute Error (log scale)") + xlab("Sample Size (log scale)") +
  theme_bw() + theme(legend.position = "bottom")


## Code for Section 6 Example: suicide dataset
library("cutpointr")
opt_cut <- cutpointr(suicide, dsi, suicide)
opt_cut

cutpointr(suicide, dsi, suicide, direction = ">=", pos_class = "yes", neg_class = "no",
  method = maximize_metric, metric = sum_sens_spec)

## Reproducible Bootstrapping and parallelization
library("doSNOW")
library("tidyverse")
cl <- makeCluster(2)  ## 2 cores
registerDoSNOW(cl)
set.seed(100)
opt_cut_b <- cutpointr(suicide, dsi, suicide, boot_runs = 1000, silent = TRUE, allowParallel = TRUE)
stopCluster(cl)
opt_cut_b
opt_cut_b %>%
  select(optimal_cutpoint, data, boot)
summary(opt_cut_b)
plot(opt_cut_b)  ## Figure 3
plot_metric(opt_cut_b, conf_lvl = 0.95) + ylab("Sensitivity + Specificity") + theme_bw()  ## Figure 4

## Subgroups
set.seed(100)
opt_cut_b_g <- cutpointr(suicide, dsi, suicide, gender, boot_runs = 1000, boot_stratify = TRUE,
  pos_class = "yes", direction = ">=")
opt_cut_b_g %>%
  select(subgroup, optimal_cutpoint, sum_sens_spec, data)
summary(opt_cut_b_g)
plot(opt_cut_b_g)  ## Figure 5

set.seed(100)
opt_cut_k_b_g <- cutpointr(suicide, dsi, suicide, gender, method = oc_youden_kernel,
  metric = sum_sens_spec, boot_runs = 1000)
summary(opt_cut_k_b_g)

# Metric functions
set.seed(100)
opt_cut_c_g_b <- cutpointr(suicide, dsi, suicide, gender, method = minimize_metric,
  metric = misclassification_cost, cost_fp = 1, cost_fn = 10, pos_class = "yes",
  direction = ">=", boot_runs = 1000)
opt_cut_c_g_b %>%
  select(subgroup, optimal_cutpoint, misclassification_cost)


## Return cutpoint that maximizes the sum of sensitivity and specificiy ROCR
## package
rocr_sensspec <- function(x, class) {
  pred <- ROCR::prediction(x, class)
  perf <- ROCR::performance(pred, "sens", "spec")
  sens <- slot(perf, "y.values")[[1]]
  spec <- slot(perf, "x.values")[[1]]
  cut <- slot(perf, "alpha.values")[[1]]
  cut[which.max(sens + spec)]
}

## pROC package
proc_sensspec <- function(x, class) {
  r <- pROC::roc(class, x, algorithm = 2, levels = c(0, 1), direction = "<")
  pROC::coords(r, "best", ret = "threshold", best.method = "youden", transpose = FALSE)[1]
}

## Start benchmark
library("cutpointr")
library("OptimalCutpoints")
library("ThresholdROC")
n <- 100
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
x_pos <- dat$x[dat$y == 1]
x_neg <- dat$x[dat$y == 0]
bench_100 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0,
  direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y),
  proc_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y", tag.healthy = 0,
    methods = "Youden", data = dat), thres2(k1 = x_neg, k2 = x_pos, rho = 0.5,
    method = "empirical", ci = FALSE), times = 100, unit = "ms")

n <- 1000
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
x_pos <- dat$x[dat$y == 1]
x_neg <- dat$x[dat$y == 0]
bench_1000 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1,
  neg_class = 0, direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x,
  dat$y), proc_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y",
  tag.healthy = 0, methods = "Youden", data = dat), thres2(k1 = x_neg, k2 = x_pos,
  rho = 0.5, method = "empirical", ci = FALSE), times = 100, unit = "ms")

n <- 10000
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
x_pos <- dat$x[dat$y == 1]
x_neg <- dat$x[dat$y == 0]
bench_10000 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1,
  neg_class = 0, direction = ">=", metric = youden, break_ties = mean, silent = TRUE),
  rocr_sensspec(dat$x, dat$y), optimal.cutpoints(X = "x", status = "y", tag.healthy = 0,
    methods = "Youden", data = dat), proc_sensspec(dat$x, dat$y), thres2(k1 = x_neg,
    k2 = x_pos, rho = 0.5, method = "empirical", ci = FALSE), times = 100)

n <- 1e+05
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e5 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0,
  direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y),
  proc_sensspec(dat$x, dat$y), times = 100, unit = "ms")

n <- 1e+06
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e6 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0,
  direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y),
  proc_sensspec(dat$x, dat$y), times = 30, unit = "ms")

n <- 1e+07
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e7 <- microbenchmark::microbenchmark(cutpointr(dat, x, y, pos_class = 1, neg_class = 0,
  direction = ">=", metric = youden, break_ties = mean), rocr_sensspec(dat$x, dat$y),
  proc_sensspec(dat$x, dat$y), times = 30, unit = "ms")

results <- rbind(data.frame(time = summary(bench_100)$median, Solution = summary(bench_100)$expr,
  n = 100), data.frame(time = summary(bench_1000)$median, Solution = summary(bench_1000)$expr,
  n = 1000), data.frame(time = summary(bench_10000)$median, Solution = summary(bench_10000)$expr,
  n = 10000), data.frame(time = summary(bench_1e5)$median, Solution = summary(bench_1e5)$expr,
  n = 1e+05), data.frame(time = summary(bench_1e6)$median, Solution = summary(bench_1e6)$expr,
  n = 1e+06), data.frame(time = summary(bench_1e7)$median, Solution = summary(bench_1e7)$expr,
  n = 1e+07))
results$Solution <- as.character(results$Solution)
results$Solution[grep(pattern = "cutpointr", x = results$Solution)] <- "cutpointr"
results$Solution[grep(pattern = "rocr", x = results$Solution)] <- "ROCR"
results$Solution[grep(pattern = "optimal", x = results$Solution)] <- "OptimalCutpoints"
results$Solution[grep(pattern = "proc", x = results$Solution)] <- "pROC"
results$Solution[grep(pattern = "thres", x = results$Solution)] <- "ThresholdROC"

results$task <- "Cutpoint Estimation"

# These are the original results on our system
# results <- structure(list(time =
# c(8.186, 2.1413, 0.7965, 2.70435, 1.2973, 8.4256, 2.58245, 1.13005, 57.8441,
# 42.1741, 10.5344, 6.2411, 3411.51285, 4.23885, 2369.47925, 30.85465,
# 45.91525, 38.56395, 318.4059, 766.6403, 684.2322, 3896.7509, 8765.75865,
# 8298.36255 ), Solution = c("cutpointr", "ROCR", "pROC", "OptimalCutpoints",
# "ThresholdROC", "cutpointr", "ROCR", "pROC", "OptimalCutpoints",
# "ThresholdROC", "cutpointr", "ROCR", "OptimalCutpoints", "pROC",
# "ThresholdROC", "cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC",
# "cutpointr", "ROCR", "pROC"), n = c(100, 100, 100, 100, 100, 1000, 1000,
# 1000, 1000, 1000, 10000, 10000, 10000, 10000, 10000, 1e+05, 1e+05, 1e+05,
# 1e+06, 1e+06, 1e+06, 1e+07, 1e+07, 1e+07), task = c("Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation", "Cutpoint Estimation",
# "Cutpoint Estimation", "Cutpoint Estimation")), row.names = c(NA, -24L),
# class = "data.frame")

library("ggplot2")
## Figure 6
ggplot(results, aes(x = n, y = time, col = Solution, shape = Solution)) + geom_point(size = 3) +
  geom_line() + scale_y_log10(breaks = c(3, 5, 10, 25, 100, 250, 1000, 5000, 10000,
  15000)) + scale_x_log10(breaks = c(1000, 10000, 1e+05, 1e+06, 1e+07)) + ylab("Median Time (milliseconds, log scale)") +
  xlab("Sample Size (log scale)") + theme_bw()

## Benchmark for ROC curve only

## ROCR package
rocr_roc <- function(x, class) {
  pred <- ROCR::prediction(x, class)
  ROCR::performance(pred, "sens", "spec")
}
## pROC package
proc_roc <- function(x, class) {
  pROC::roc(class, x, algorithm = 2, levels = c(0, 1), direction = "<")
}

n <- 100
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_100 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 100, unit = "ms")
n <- 1000
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1000 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 100, unit = "ms")
n <- 10000
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_10000 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 100, unit = "ms")
n <- 1e+05
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e5 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 100, unit = "ms")
n <- 1e+06
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e6 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 30, unit = "ms")
n <- 1e+07
set.seed(123)
dat <- data.frame(x = rnorm(n), y = sample(c(0:1), size = n, replace = TRUE))
bench_1e7 <- microbenchmark::microbenchmark(cutpointr::roc(dat, "x", "y", pos_class = 1,
  neg_class = 0, direction = ">="), rocr_roc(dat$x, dat$y), proc_roc(dat$x, dat$y),
  times = 30, unit = "ms")

results_roc <- rbind(data.frame(time = summary(bench_100)$median, Solution = summary(bench_100)$expr,
  n = 100), data.frame(time = summary(bench_1000)$median, Solution = summary(bench_1000)$expr,
  n = 1000), data.frame(time = summary(bench_10000)$median, Solution = summary(bench_10000)$expr,
  n = 10000), data.frame(time = summary(bench_1e5)$median, Solution = summary(bench_1e5)$expr,
  n = 1e+05), data.frame(time = summary(bench_1e6)$median, Solution = summary(bench_1e6)$expr,
  n = 1e+06), data.frame(time = summary(bench_1e7)$median, Solution = summary(bench_1e7)$expr,
  n = 1e+07))
results_roc$Solution <- as.character(results_roc$Solution)
results_roc$Solution[grep(pattern = "cutpointr", x = results_roc$Solution)] <- "cutpointr"
results_roc$Solution[grep(pattern = "rocr", x = results_roc$Solution)] <- "ROCR"
results_roc$Solution[grep(pattern = "proc", x = results_roc$Solution)] <- "pROC"
results_roc$task <- "ROC curve calculation"

# Our results
# results_roc <- structure(list(time = c(2.2162, 2.07115, 0.50925,
# 2.4053, 2.5222, 0.80225, 3.40185, 6.2865, 3.7495, 12.9862, 45.9087, 36.20245,
# 190.1973, 901.79005, 836.9555, 2353.80215, 8797.38765, 8075.3996 ), Solution
# = c("cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC", "cutpointr",
# "ROCR", "pROC", "cutpointr", "ROCR", "pROC", "cutpointr", "ROCR", "pROC",
# "cutpointr", "ROCR", "pROC"), n = c(100, 100, 100, 1000, 1000, 1000, 10000,
# 10000, 10000, 1e+05, 1e+05, 1e+05, 1e+06, 1e+06, 1e+06, 1e+07, 1e+07, 1e+07),
# task = c("ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation", "ROC curve calculation", "ROC curve calculation", "ROC curve
# calculation")), row.names = c(NA, -18L), class = "data.frame")

ggplot(results_roc, aes(x = n, y = time, col = Solution, shape = Solution)) + geom_point(size = 3) +
  geom_line() + scale_y_log10(breaks = c(3, 5, 10, 25, 100, 250, 1000, 5000, 10000,
  15000)) + scale_x_log10(breaks = c(1000, 10000, 1e+05, 1e+06, 1e+07)) +
  ylab("Median Time (milliseconds, log scale)") + xlab("Sample Size (log scale)") +
  theme_bw()

results_all <- dplyr::bind_rows(results, results_roc)

ggplot(results_all, aes(x = n, y = time, linetype = Solution, shape = Solution)) +
  geom_point(size = 3) + geom_line() + scale_y_log10(breaks = c(0.5, 1, 2, 3, 5,
  10, 25, 100, 250, 1000, 5000, 10000, 15000)) + scale_x_log10(breaks = c(1000,
  10000, 1e+05, 1e+06, 1e+07)) + ylab("Median Time (milliseconds, log scale)") +
  xlab("Sample Size (log scale)") + theme_bw() + facet_grid(~task)