# setup parallel computation to speed-up the process and load all required pkgs
library("future.apply")

stopifnot(
  requireNamespace("stats"),
  requireNamespace("drc"),
  requireNamespace("DoseFinding"),
  requireNamespace("nplr"),
  requireNamespace("drda")
)

if (Sys.info()["sysname"] != "Windows") {
  plan(multicore, workers = availableCores(omit = 1))
} else {
  plan(multisession, workers = availableCores(omit = 1))
}

# define the directory where to save the simulation results
dir_path <- "."

# general functions
benchmark_packages <- function(Z) {
  # x: vector of doses
  # y: vector of observed responses
  # p: vector of four estimated model parameters
  dosef_rss <- function(x, y, p) {
    mu <- p[1] + p[2] * x^p[4] / (p[3]^p[4] + x^p[4])
    sum((y - mu)^2)
  }

  drc_rss <- function(x, y, p) {
    mu <- p[2] + (p[3] - p[2]) / (1 + exp(p[1] * log(x / p[4])))
    sum((y - mu)^2)
  }

  drda_rss <- function(x, y, p) {
    mu <- p[1] + p[2] * x^p[3] / (p[4]^p[3] + x^p[3])
    sum((y - mu)^2)
  }

  nplr_rss <- function(x, y, p) {
    p <- as.numeric(p)
    mu <- p[1] + (p[2] - p[1]) / (1 + 10^(p[4] * (p[3] - log10(x))))
    sum((y - mu)^2)
  }

  nls_rss <- function(x, y, p) {
    mu <- p[1] + (p[2] - p[1]) / (1 + exp((p[3] - x) / p[4]))
    sum((y - mu)^2)
  }

  results <- rep(NA_real_, 15)
  names(results) <- paste(
    rep(c("dosef", "drda", "drc", "nplr", "nls"), 3),
    rep(c("converged", "rss", "time"), each = 5),
    sep = "_"
  )

  # by default we initialize packages to no convergence
  results[seq_len(5)] <- 0

  # DoseFinding
  w <- DoseFinding::defBnds(max(Z$x))$sigEmax
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_dosef <- DoseFinding::fitMod(
              Z$x, Z$y, model = "sigEmax", bnds = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time)) {
    # fitMod always returns a result, but we don't know if it really converged
    results[1] <- 1
    results[6] <- dosef_rss(Z$x, Z$y, fit_dosef$coefs)
    results[11] <- fn_time["elapsed"]
  }

  # drda
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_drda <- drda::drda(
              formula = y ~ x, data = Z, mean_function = "loglogistic4",
              max_iter = 10000
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time) && fit_drda$converged) {
    results[2] <- 1
    results[7] <- drda_rss(Z$x, Z$y, fit_drda$coefficients)
    results[12] <- fn_time["elapsed"]
  }

  # drc
  w <- drc::drmc(
    maxIt = 10000, method = "BFGS", errorm = FALSE, noMessage = TRUE,
    otrace = TRUE
  )
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_drc <- drc::drm(
              formula = y ~ x, fct = drc::LL.4(), data = Z, control = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (
    !is.null(fn_time) &&
      (is.null(fit_drc$convergence) || fit_drc$convergence)
  ) {
    results[3] <- 1
    results[8] <- drc_rss(Z$x, Z$y, fit_drc$coefficients)
    results[13] <- fn_time["elapsed"]
  }

  # nplr
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_nplr <- nplr::nplr(
              Z$x, Z$y, LPweight = 0, npars = 4, silent = TRUE
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time)) {
    # nplr always returns a result, but we don't know if it really converged
    results[4] <- 1
    results[9] <- nplr_rss(Z$x, Z$y, fit_nplr@pars)
    results[14] <- fn_time["elapsed"]
  }

  # nls
  w <- nls.control(maxiter = 10000, warnOnly = TRUE)
  Z$l <- log(Z$x)
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_nls <- nls(
              y ~ SSfpl(l, A, B, xmid, scal), data = Z, control = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time) && fit_nls$convInfo$isConv) {
    results[5] <- 1
    results[10] <- nls_rss(Z$l, Z$y, coef(fit_nls))
    results[15] <- fn_time["elapsed"]
  }

  # compute RSS relative errors
  res_rss <- results[6:10]
  min_rss <- min(res_rss, na.rm = TRUE)
  err_rss <- abs(1 - res_rss / min_rss)
  names(err_rss) <- paste(
    c("dosef", "drda", "drc", "nplr", "nls"), "rss_rel_err", sep = "_"
  )

  c(results, err_rss)
}
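# Optional sanity check (not part of the original benchmark): on a small
# synthetic dataset, `benchmark_packages()` should return a named numeric
# vector of length 20 (convergence flags, RSS values, timings, and RSS
# relative errors). The dataset below is purely illustrative.
if (FALSE) {
  x_test <- rep(c(0.01, 0.1, 1, 10, 100), each = 3)
  Z_test <- data.frame(
    x = x_test,
    # 4-parameter logistic mean with alpha = 0.2, delta = 0.6, eta = 1, phi = 1
    y = 0.2 + 0.6 * x_test / (x_test + 1) + rnorm(length(x_test), sd = 0.05)
  )
  benchmark_packages(Z_test)
}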
compute_statistics <- function(x) {
  q <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
  names(q) <- NULL

  c(
    "Mean" = mean(x, na.rm = TRUE),
    "Min." = q[1],
    "1st Qu." = q[2],
    "Median" = q[3],
    "3rd Qu." = q[4],
    "Max." = q[5]
  )
}

format_number <- function(x) {
  ifelse(x < 1.0e-10,
    # anything below 10^(-10) is approximated to zero
    "0.000",
    ifelse(x < 1.0e-03, {
        # anything in [10^(-10), 10^(-3)) will use scientific notation
        str <- format(x, scientific = TRUE, digits = 3)
        # was the number approximated to 10^(-3)?
        ifelse(!grepl("e-03", str, fixed = TRUE), {
            # convert into LaTeX form by first removing the zero if present
            str <- sub("e-0(\\d)$", "e-\\1", str, perl = TRUE)
            str <- sub("e-(\\d+)$", "\\\\,10^{-\\1}", str, perl = TRUE)
            paste0("\\(", str, "\\)")
          },
          as.character(round(x, digits = 3))
        )
      },
      # anything above 10^(-3) will be printed in fixed notation
      as.character(round(x, digits = 3))
    )
  )
}

format_row <- function(pct, x, pkg) {
  last <- if (pkg == "nls") {
    " \\\\ \\bottomrule"
  } else {
    " \\\\"
  }

  z <- as.character(round(100 * pct, digits = 2))

  if (pkg == "dosef") {
    pkg <- "DoseFinding"
    z <- "-"
  } else if (pkg == "nplr") {
    z <- "-"
  }

  paste0(
    " \\pkg{", pkg, "} & ", z, " & ",
    paste(format_number(x), collapse = " & "),
    last
  )
}
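# Quick illustrative check (toy values, not from the benchmark):
# `compute_statistics()` ignores missing values, so for c(1, 2, NA, 4) the
# mean is 7/3 and the quartiles are computed from the three observed values.
if (FALSE) {
  compute_statistics(c(1, 2, NA, 4))
  # expected: Mean = 2.33, Min. = 1, 1st Qu. = 1.5, Median = 2, 3rd Qu. = 3,
  # Max. = 4
}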
# ------- #
# Table 1 #
# ------- #

do_benchmark <- function(a) {
  D <- apply(a$y, 2, function(w) data.frame(x = a$x, y = w))
  l <- vapply(D, benchmark_packages, numeric(20))

  p <- a$p
  names(p) <- NULL

  results <- data.frame(
    alpha = p[1], delta = p[2], eta = p[3], phi = p[4], sigma = p[5], t(l)
  )

  write.csv(x = results, file = a$fp, na = "", row.names = FALSE)

  0
}

combine_results <- function(dir_path, npar, nsim) {
  fp <- list.files(path = dir_path, pattern = "\\.csv$", full.names = TRUE)

  m <- length(fp)

  if (m != npar) {
    stop("Found wrong number of simulations", call. = FALSE)
  }

  models_results <- array(0, dim = c(20, nsim, npar))
  params <- matrix(0, nrow = m, ncol = 5)

  for (i in seq_len(m)) {
    tmp <- as.matrix(read.csv(file = fp[i]))
    params[i, ] <- tmp[1, seq_len(5)]
    models_results[, , i] <- t(tmp[, -seq_len(5)])
  }

  ord <- order(
    params[, 1], params[, 2], params[, 3], params[, 4], params[, 5]
  )

  params <- params[ord, ]
  models_results <- models_results[, , ord]

  lbl <- apply(
    params, 1, function(p) {
      paste(
        paste(c("alpha", "delta", "eta", "phi", "sigma"), p, sep = " = "),
        collapse = ", "
      )
    }
  )

  dimnames(models_results) <- list(colnames(tmp)[-seq_len(5)], NULL, lbl)

  models_results
}

# number of replicates for each tested dose
n <- 3

# number of datasets for each parameter vector
nsim <- 100

# tested doses are fixed
dose <- rep(c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), each = n)

# these are the "true" parameters to simulate
alpha_set <- c(0.0, 0.2, 0.45)
delta_set <- c(-0.95, 0.3, 1.2)
eta_set <- c(0.1, 2, 5)
phi_set <- c(0.0001, 1, 100)
sigma_set <- c(0.05, 0.1)

params <- as.matrix(
  expand.grid(sigma_set, phi_set, eta_set, delta_set, alpha_set)
)[, c(5, 4, 3, 2, 1)]
colnames(params) <- c("alpha", "delta", "eta", "phi", "sigma")

npar <- nrow(params)

# set seed here for reproducibility
set.seed(93302902)

# generate all datasets and map our benchmark function on each one of them
obj <- vector("list", npar)
for (i in seq_len(npar)) {
  p <- params[i, ]

  # this is the mean function
  f <- p[1] + p[2] * dose^p[3] / (dose^p[3] + p[4]^p[3])

  # convert the matrix into data.frames and append them to our list
  obj[[i]] <- list(
    p = round(p, digits = 4),
    x = dose,
    # generate the data from a normal distribution with mean `f` and standard
    # deviation `p[5]`
    y = matrix(rep(f, nsim) + p[5] * rnorm(nsim * length(dose)), ncol = nsim),
    fp = tempfile(
      pattern = "models_results_", tmpdir = dir_path, fileext = ".csv"
    )
  )
}

# we save the datasets in case we want to manually re-do the fitting
save(obj, file = file.path(dir_path, "obj.RData"))

# simulate! Warning: this step takes a long time.
models_results <- future_vapply(obj, do_benchmark, 0, future.seed = 24832919)
models_results <- combine_results(dir_path, npar, nsim)

save(models_results, file = file.path(dir_path, "models_results.RData"))

# compute overall statistics
stats <- apply(models_results, 1, compute_statistics)
stats[is.nan(stats)] <- NA_real_

# generate Table 1
i <- 0
table_str <- character(15)

table_str[(i <- i + 1)] <- "\\begin{table}[!t]"
table_str[(i <- i + 1)] <- " \\centering"
table_str[(i <- i + 1)] <- " \\resizebox{\\textwidth}{!}{%"
table_str[(i <- i + 1)] <- " \\begin{tabular}{@{}llllllll@{}}"
table_str[(i <- i + 1)] <- " \\toprule"
table_str[(i <- i + 1)] <- paste(
  " Package & \\% Convergence & Mean & Minimum & First quartile &",
  "Median & Third quartile & Maximum \\\\ \\midrule"
)

for (pkg in c("dosef", "drc", "drda", "nplr", "nls")) {
  table_str[(i <- i + 1)] <- format_row(
    stats["Mean", paste(pkg, "converged", sep = "_")],
    stats[, paste(pkg, "rss_rel_err", sep = "_")],
    pkg
  )
}

table_str[(i <- i + 1)] <- " \\end{tabular}%"
table_str[(i <- i + 1)] <- " }"
table_str[(i <- i + 1)] <- paste0(
  " \\caption{",
  "Summary statistics of the RSS relative error for simulated models. ",
  "A total of 162 parameter vectors were analysed. ",
  "For each parameter vector, 100 datasets with 7 doses and 3 replicates per ",
  "dose were simulated.}",
  "\\label{tab:l4sim}"
)
table_str[(i <- i + 1)] <- "\\end{table}"

# finally write the table
writeLines(text = table_str, con = file.path(dir_path, "table_1.tex"))
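# Optional (assumption: not part of the original workflow): the objects saved
# above can be reloaded later to inspect or re-analyse the simulation without
# running it again.
if (FALSE) {
  load(file.path(dir_path, "obj.RData"))            # simulated datasets
  load(file.path(dir_path, "models_results.RData")) # 20 x nsim x npar array
  str(models_results)
}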
# what's the overall convergence rate?
print(round(100 * stats[1, 1:5], digits = 2))

# what's the mean execution time?
print(round(stats[1, 11:15], digits = 3))

# ------- #
# Table 2 #
# ------- #

# download the data from the open repository
ctrpv2_file <- file.path(dir_path, "CTRPv2_2015.rds")

if (!file.exists(ctrpv2_file)) {
  download.file(
    url = "https://zenodo.org/record/3905470/files/CTRPv2.rds?download=1",
    destfile = ctrpv2_file, quiet = TRUE, mode = "wb"
  )
}

CTRPv2 <- readRDS(ctrpv2_file)

# CTRPv2 contains a 395263-by-29-by-2 array of raw sensitivity data
#
# raw[i, , ] is a numeric matrix corresponding to a particular cell-line/drug
# combination. The matrix might contain rows full of NAs when the number of
# tested doses is less than 29.
#
# we convert the array into a list of data.frames so that we can later apply
# our parallel `vapply`.

# This function converts a data matrix into a format suitable for us
#
# D: 29-by-2 numeric matrix where D[, 1] is the vector of doses and D[, 2] is
#    the vector of responses
process_data <- function(D) {
  # discard rows with only NAs
  idx <- apply(D, 1, function(y) !all(is.na(y)))

  # for cell-line/drug pairs that were not tested, all rows are NAs
  if (all(!idx)) {
    return(NULL)
  }

  data.frame(
    x = round(D[idx, 1], digits = 7),
    # we use sensitivities on the 0-1 scale
    y = D[idx, 2] / 100
  )
}
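# Illustrative example (toy matrix, not real CTRPv2 data): rows that are all
# NA are dropped and responses are rescaled from percentages to the 0-1 scale.
if (FALSE) {
  D_test <- rbind(
    c(0.1, 80),
    c(1, 55),
    c(NA, NA)
  )
  process_data(D_test)
  # expected: a 2-row data.frame with x = c(0.1, 1) and y = c(0.80, 0.55)
}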
CTRPv2_data <- future_apply(
  CTRPv2@sensitivity$raw, 1, process_data, simplify = FALSE
)

# drop the NULL elements
idx <- which(vapply(CTRPv2_data, is.null, FALSE))
CTRPv2_data[idx] <- NULL

# save the list for subsequent analysis
save(CTRPv2_data, file = file.path(dir_path, "CTRPv2_data.RData"))

# set seed here for reproducibility
set.seed(19264367)

# CTRPv2_data is made of 379_533 data.frames
# we sample at random 15_000 data.frames to speed up the process
idx <- sample(seq_len(length(CTRPv2_data)), 15000L)

CTRPv2_results <- future_vapply(
  CTRPv2_data[idx], benchmark_packages, numeric(20), future.seed = 60350074
)

save(CTRPv2_results, file = file.path(dir_path, "CTRPv2_results.RData"))

# compute overall statistics
stats <- apply(CTRPv2_results, 1, compute_statistics)
stats[is.nan(stats)] <- NA_real_

# generate Table 2
i <- 0
table_str <- character(15)

table_str[(i <- i + 1)] <- "\\begin{table}[!t]"
table_str[(i <- i + 1)] <- " \\centering"
table_str[(i <- i + 1)] <- " \\resizebox{\\textwidth}{!}{%"
table_str[(i <- i + 1)] <- " \\begin{tabular}{@{}llllllll@{}}"
table_str[(i <- i + 1)] <- " \\toprule"
table_str[(i <- i + 1)] <- paste(
  " Package & \\% Convergence & Mean & Minimum & First quartile &",
  "Median & Third quartile & Maximum \\\\ \\midrule"
)

for (pkg in c("dosef", "drc", "drda", "nplr", "nls")) {
  table_str[(i <- i + 1)] <- format_row(
    stats["Mean", paste(pkg, "converged", sep = "_")],
    stats[, paste(pkg, "rss_rel_err", sep = "_")],
    pkg
  )
}

table_str[(i <- i + 1)] <- " \\end{tabular}%"
table_str[(i <- i + 1)] <- " }"
table_str[(i <- i + 1)] <- paste0(
  " \\caption{",
  "Summary statistics of the RSS relative error as measured on the CTRPv2 ",
  "data.}\\label{tab:ctrpv2}"
)
table_str[(i <- i + 1)] <- "\\end{table}"

# finally write the table
writeLines(text = table_str, con = file.path(dir_path, "table_2.tex"))

# what's the overall convergence rate?
print(round(100 * stats[1, 1:5], digits = 3))

# what's the mean execution time?
print(round(stats[1, 11:15], digits = 3))

# how many times was drda better?
# rows of `rss_rel_err` are ordered dosef, drda, drc, nplr, nls (drda is row 2)
rss_rel_err <- CTRPv2_results[16:20, ]

is_best <- apply(rss_rel_err, 2, function(x) all(x[-2] >= x[2], na.rm = TRUE))
print(round(100 * mean(is_best), digits = 3))

# what if we relax the comparison a bit?
is_best <- rss_rel_err <= 0.01
print(
  apply(is_best, 1, function(x) round(100 * mean(x, na.rm = TRUE), digits = 3))
)

# how about drda vs all other packages?
drda_dosef <- (rss_rel_err[1, ] - rss_rel_err[2, ]) > 0.01
drda_drc <- (rss_rel_err[3, ] - rss_rel_err[2, ]) > 0.01
drda_nplr <- (rss_rel_err[4, ] - rss_rel_err[2, ]) > 0.01
drda_nls <- (rss_rel_err[5, ] - rss_rel_err[2, ]) > 0.01

print(round(100 * mean(drda_dosef, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_drc, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nplr, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nls, na.rm = TRUE), digits = 3))

# and the other way around: how often was each package better than drda?
drda_dosef <- (rss_rel_err[2, ] - rss_rel_err[1, ]) > 0.01
drda_drc <- (rss_rel_err[2, ] - rss_rel_err[3, ]) > 0.01
drda_nplr <- (rss_rel_err[2, ] - rss_rel_err[4, ]) > 0.01
drda_nls <- (rss_rel_err[2, ] - rss_rel_err[5, ]) > 0.01

print(round(100 * mean(drda_dosef, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_drc, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nplr, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nls, na.rm = TRUE), digits = 3))
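# Optional cleanup (assumption: not part of the original script): release the
# parallel workers registered at the top of the script.
plan(sequential)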