# setup parallel computation to speed-up the process and load all required pkgs
library("future.apply")

stopifnot(
  requireNamespace("stats"),
  requireNamespace("drc"),
  requireNamespace("DoseFinding"),
  requireNamespace("nplr"),
  requireNamespace("drda")
)

if (Sys.info()["sysname"] != "Windows") {
  plan(multicore, workers = availableCores(omit = 1))
} else {
  plan(multisession, workers = availableCores(omit = 1))
}

# define the directory where to save the simulation results
dir_path <- "."

# general functions
benchmark_packages <- function(Z) {
  # x: vector of doses
  # y: vector of observed responses
  # p: vector of four estimated model parameters
  dosef_rss <- function(x, y, p) {
    mu <- p[1] + p[2] * x^p[4] / (p[3]^p[4] + x^p[4])
    sum((y - mu)^2)
  }

  drc_rss <- function(x, y, p) {
    mu <- p[2] + (p[3] - p[2]) / (1 + exp(p[1] * log(x / p[4])))
    sum((y - mu)^2)
  }

  drda_rss <- function(x, y, p) {
    mu <- p[1] + p[2] * x^p[3] / (p[4]^p[3] + x^p[3])
    sum((y - mu)^2)
  }

  nplr_rss <- function(x, y, p) {
    p <- as.numeric(p)
    mu <- p[1] + (p[2] - p[1]) / (1 + 10^(p[4] * (p[3] - log10(x))))
    sum((y - mu)^2)
  }

  nls_rss <- function(x, y, p) {
    mu <- p[1] + (p[2] - p[1]) / (1 + exp((p[3] - x) / p[4]))
    sum((y - mu)^2)
  }

  results <- rep(NA_real_, 15)
  names(results) <- paste(
    rep(c("dosef", "drda", "drc", "nplr", "nls"), 3),
    rep(c("converged", "rss", "time"), each = 5),
    sep = "_"
  )

  # by default we initialize packages to no convergence
  results[seq_len(5)] <- 0

  # DoseFinding
  w <- DoseFinding::defBnds(max(Z$x))$sigEmax
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_dosef <- DoseFinding::fitMod(
              Z$x, Z$y, model = "sigEmax", bnds = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time)) {
    # fitMod always returns a result, but we don't know if it really converged
    results[1] <- 1
    results[6] <- dosef_rss(Z$x, Z$y, fit_dosef$coefs)
    results[11] <- fn_time["elapsed"]
  }

  # drda
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_drda <- drda::drda(
              formula = y ~ x, data = Z, mean_function = "loglogistic4",
              max_iter = 10000
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time) && fit_drda$converged) {
    results[2] <- 1
    results[7] <- drda_rss(Z$x, Z$y, fit_drda$coefficients)
    results[12] <- fn_time["elapsed"]
  }

  # drc
  w <- drc::drmc(
    maxIt = 10000, method = "BFGS", errorm = FALSE, noMessage = TRUE,
    otrace = TRUE
  )
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_drc <- drc::drm(
              formula = y ~ x, fct = drc::LL.4(), data = Z, control = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (
    !is.null(fn_time) &&
      (is.null(fit_drc$convergence) || fit_drc$convergence)
  ) {
    results[3] <- 1
    results[8] <- drc_rss(Z$x, Z$y, fit_drc$coefficients)
    results[13] <- fn_time["elapsed"]
  }

  # nplr
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_nplr <- nplr::nplr(
              Z$x, Z$y, LPweight = 0, npars = 4, silent = TRUE
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time)) {
    # nplr always returns a result, but we don't know if it really converged
    results[4] <- 1
    results[9] <- nplr_rss(Z$x, Z$y, fit_nplr@pars)
    results[14] <- fn_time["elapsed"]
  }

  # nls
  w <- nls.control(maxiter = 10000, warnOnly = TRUE)
  Z$l <- log(Z$x)
  fn_time <- suppressMessages(
    suppressWarnings(
      tryCatch(
        {
          system.time({
            fit_nls <- nls(
              y ~ SSfpl(l, A, B, xmid, scal), data = Z, control = w
            )
          })
        },
        error = function(e) NULL
      )
    )
  )

  if (!is.null(fn_time) && fit_nls$convInfo$isConv) {
    results[5] <- 1
    results[10] <- nls_rss(Z$l, Z$y, coef(fit_nls))
    results[15] <- fn_time["elapsed"]
  }

  # compute RSS relative errors
  res_rss <- results[6:10]
  min_rss <- min(res_rss, na.rm = TRUE)
  err_rss <- abs(1 - res_rss / min_rss)
  names(err_rss) <- paste(
    c("dosef", "drda", "drc", "nplr", "nls"), "rss_rel_err", sep = "_"
  )

  c(results, err_rss)
}
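# Optional sanity check (not part of the original benchmark): on a small
# synthetic dataset, `benchmark_packages()` should return a named numeric
# vector of length 20 (convergence flags, RSS values, timings, and RSS
# relative errors). The dataset below is purely illustrative.
if (FALSE) {
  x_test <- rep(c(0.01, 0.1, 1, 10, 100), each = 3)
  Z_test <- data.frame(
    x = x_test,
    # 4-parameter logistic mean with alpha = 0.2, delta = 0.6, eta = 1, phi = 1
    y = 0.2 + 0.6 * x_test / (x_test + 1) + rnorm(length(x_test), sd = 0.05)
  )
  benchmark_packages(Z_test)
}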
compute_statistics <- function(x) {
  q <- quantile(x, probs = c(0, 0.25, 0.5, 0.75, 1), na.rm = TRUE)
  names(q) <- NULL

  c(
    "Mean" = mean(x, na.rm = TRUE),
    "Min." = q[1],
    "1st Qu." = q[2],
    "Median" = q[3],
    "3rd Qu." = q[4],
    "Max." = q[5]
  )
}

format_number <- function(x) {
  ifelse(x < 1.0e-10,
    # anything below 10^(-10) is approximated to zero
    "0.000",
    ifelse(x < 1.0e-03, {
        # anything in [10^(-10), 10^(-3)) will use scientific notation
        str <- format(x, scientific = TRUE, digits = 3)
        # was the number approximated to 10^(-3)?
        ifelse(!grepl("e-03", str, fixed = TRUE), {
            # convert into LaTeX form by first removing the zero if present
            str <- sub("e-0(\\d)$", "e-\\1", str, perl = TRUE)
            str <- sub("e-(\\d+)$", "\\\\,10^{-\\1}", str, perl = TRUE)
            paste0("\\(", str, "\\)")
          },
          as.character(round(x, digits = 3))
        )
      },
      # anything above 10^(-3) will be printed in fixed notation
      as.character(round(x, digits = 3))
    )
  )
}

format_row <- function(pct, x, pkg) {
  last <- if (pkg == "nls") {
    " \\\\ \\bottomrule"
  } else {
    " \\\\"
  }

  z <- as.character(round(100 * pct, digits = 2))

  if (pkg == "dosef") {
    pkg <- "DoseFinding"
    z <- "-"
  } else if (pkg == "nplr") {
    z <- "-"
  }

  paste0(
    " \\pkg{", pkg, "} & ", z, " & ",
    paste(format_number(x), collapse = " & "),
    last
  )
}
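# Quick illustrative check (toy values, not from the benchmark):
# `compute_statistics()` ignores missing values, so for c(1, 2, NA, 4) the
# mean is 7/3 and the quartiles are computed from the three observed values.
if (FALSE) {
  compute_statistics(c(1, 2, NA, 4))
  # expected: Mean = 2.33, Min. = 1, 1st Qu. = 1.5, Median = 2, 3rd Qu. = 3,
  # Max. = 4
}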
# ------- #
# Table 1 #
# ------- #

do_benchmark <- function(a) {
  D <- apply(a$y, 2, function(w) data.frame(x = a$x, y = w))
  l <- vapply(D, benchmark_packages, numeric(20))

  p <- a$p
  names(p) <- NULL

  results <- data.frame(
    alpha = p[1], delta = p[2], eta = p[3], phi = p[4], sigma = p[5], t(l)
  )

  write.csv(x = results, file = a$fp, na = "", row.names = FALSE)

  0
}

combine_results <- function(dir_path, npar, nsim) {
  fp <- list.files(path = dir_path, pattern = "\\.csv$", full.names = TRUE)

  m <- length(fp)

  if (m != npar) {
    stop("Found wrong number of simulations", call. = FALSE)
  }

  models_results <- array(0, dim = c(20, nsim, npar))
  params <- matrix(0, nrow = m, ncol = 5)

  for (i in seq_len(m)) {
    tmp <- as.matrix(read.csv(file = fp[i]))
    params[i, ] <- tmp[1, seq_len(5)]
    models_results[, , i] <- t(tmp[, -seq_len(5)])
  }

  ord <- order(
    params[, 1], params[, 2], params[, 3], params[, 4], params[, 5]
  )

  params <- params[ord, ]
  models_results <- models_results[, , ord]

  lbl <- apply(
    params, 1, function(p) {
      paste(
        paste(c("alpha", "delta", "eta", "phi", "sigma"), p, sep = " = "),
        collapse = ", "
      )
    }
  )

  dimnames(models_results) <- list(colnames(tmp)[-seq_len(5)], NULL, lbl)

  models_results
}

# number of replicates for each tested dose
n <- 3

# number of datasets for each parameter vector
nsim <- 100

# tested doses are fixed
dose <- rep(c(0.0001, 0.001, 0.01, 0.1, 1, 10, 100), each = n)

# these are the "true" parameters to simulate
alpha_set <- c(0.0, 0.2, 0.45)
delta_set <- c(-0.95, 0.3, 1.2)
eta_set <- c(0.1, 2, 5)
phi_set <- c(0.0001, 1, 100)
sigma_set <- c(0.05, 0.1)

params <- as.matrix(
  expand.grid(sigma_set, phi_set, eta_set, delta_set, alpha_set)
)[, c(5, 4, 3, 2, 1)]
colnames(params) <- c("alpha", "delta", "eta", "phi", "sigma")

npar <- nrow(params)

# set seed here for reproducibility
set.seed(93302902)

# generate all datasets and map our benchmark function on each one of them
obj <- vector("list", npar)
for (i in seq_len(npar)) {
  p <- params[i, ]

  # this is the mean function
  f <- p[1] + p[2] * dose^p[3] / (dose^p[3] + p[4]^p[3])

  # convert the matrix into data.frames and append them to our list
  obj[[i]] <- list(
    p = round(p, digits = 4),
    x = dose,
    # generate the data from a normal distribution with mean `f` and standard
    # deviation `p[5]`
    y = matrix(rep(f, nsim) + p[5] * rnorm(nsim * length(dose)), ncol = nsim),
    fp = tempfile(
      pattern = "models_results_", tmpdir = dir_path, fileext = ".csv"
    )
  )
}

# we save the datasets in case we want to manually re-do the fitting
save(obj, file = file.path(dir_path, "obj.RData"))

# simulate! Warning: this step takes a long time.
models_results <- future_vapply(obj, do_benchmark, 0, future.seed = 24832919)
models_results <- combine_results(dir_path, npar, nsim)

save(models_results, file = file.path(dir_path, "models_results.RData"))

# compute overall statistics
stats <- apply(models_results, 1, compute_statistics)
stats[is.nan(stats)] <- NA_real_

# generate Table 1
i <- 0
table_str <- character(15)

table_str[(i <- i + 1)] <- "\\begin{table}[!t]"
table_str[(i <- i + 1)] <- " \\centering"
table_str[(i <- i + 1)] <- " \\resizebox{\\textwidth}{!}{%"
table_str[(i <- i + 1)] <- " \\begin{tabular}{@{}llllllll@{}}"
table_str[(i <- i + 1)] <- " \\toprule"
table_str[(i <- i + 1)] <- paste(
  " Package & \\% Convergence & Mean & Minimum & First quartile &",
  "Median & Third quartile & Maximum \\\\ \\midrule"
)

for (pkg in c("dosef", "drc", "drda", "nplr", "nls")) {
  table_str[(i <- i + 1)] <- format_row(
    stats["Mean", paste(pkg, "converged", sep = "_")],
    stats[, paste(pkg, "rss_rel_err", sep = "_")],
    pkg
  )
}

table_str[(i <- i + 1)] <- " \\end{tabular}%"
table_str[(i <- i + 1)] <- " }"
table_str[(i <- i + 1)] <- paste0(
  " \\caption{",
  "Summary statistics of the RSS relative error for simulated models. ",
  "A total of 162 parameter vectors were analysed. ",
  "For each parameter vector, 100 datasets with 7 doses and 3 replicates per ",
  "dose were simulated.}",
  "\\label{tab:l4sim}"
)
table_str[(i <- i + 1)] <- "\\end{table}"

# finally write the table
writeLines(text = table_str, con = file.path(dir_path, "table_1.tex"))
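# Optional (assumption: not part of the original workflow): the objects saved
# above can be reloaded later to inspect or re-analyse the simulation without
# running it again.
if (FALSE) {
  load(file.path(dir_path, "obj.RData"))            # simulated datasets
  load(file.path(dir_path, "models_results.RData")) # 20 x nsim x npar array
  str(models_results)
}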
# what's the overall convergence rate?
print(round(100 * stats[1, 1:5], digits = 2))

# what's the mean execution time?
print(round(stats[1, 11:15], digits = 3))

# ------- #
# Table 2 #
# ------- #

# download the data from the open repository
ctrpv2_file <- file.path(dir_path, "CTRPv2_2015.rds")

if (!file.exists(ctrpv2_file)) {
  download.file(
    url = "https://zenodo.org/record/3905470/files/CTRPv2.rds?download=1",
    destfile = ctrpv2_file, quiet = TRUE, mode = "wb"
  )
}

CTRPv2 <- readRDS(ctrpv2_file)

# CTRPv2 contains a 395263-by-29-by-2 array of raw sensitivity data
#
# raw[i, , ] is a numeric matrix corresponding to a particular cell-line/drug
# combination. The matrix might contain rows full of NAs when the number of
# tested doses is less than 29.
#
# we convert the array into a list of data.frames so that we can later apply
# our parallel `vapply`.

# This function converts a data matrix into a format suitable for us
#
# D: 29-by-2 numeric matrix where D[, 1] is the vector of doses and D[, 2] is
#    the vector of responses
process_data <- function(D) {
  # discard rows with only NAs
  idx <- apply(D, 1, function(y) !all(is.na(y)))

  # for cell-line/drug pairs that were not tested, all rows are NAs
  if (all(!idx)) {
    return(NULL)
  }

  data.frame(
    x = round(D[idx, 1], digits = 7),
    # we use sensitivities on the 0-1 scale
    y = D[idx, 2] / 100
  )
}
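# Illustrative example (toy matrix, not real CTRPv2 data): rows that are all
# NA are dropped and responses are rescaled from percentages to the 0-1 scale.
if (FALSE) {
  D_test <- rbind(
    c(0.1, 80),
    c(1, 55),
    c(NA, NA)
  )
  process_data(D_test)
  # expected: a 2-row data.frame with x = c(0.1, 1) and y = c(0.80, 0.55)
}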
CTRPv2_data <- future_apply(
  CTRPv2@sensitivity$raw, 1, process_data, simplify = FALSE
)

# drop the NULL elements
idx <- which(vapply(CTRPv2_data, is.null, FALSE))
CTRPv2_data[idx] <- NULL

# save the list for subsequent analysis
save(CTRPv2_data, file = file.path(dir_path, "CTRPv2_data.RData"))

# set seed here for reproducibility
set.seed(19264367)

# CTRPv2_data is made of 379_533 data.frames
# we sample at random 15_000 data.frames to speed up the process
idx <- sample(seq_len(length(CTRPv2_data)), 15000L)

CTRPv2_results <- future_vapply(
  CTRPv2_data[idx], benchmark_packages, numeric(20), future.seed = 60350074
)

save(CTRPv2_results, file = file.path(dir_path, "CTRPv2_results.RData"))

# compute overall statistics
stats <- apply(CTRPv2_results, 1, compute_statistics)
stats[is.nan(stats)] <- NA_real_

# generate Table 2
i <- 0
table_str <- character(15)

table_str[(i <- i + 1)] <- "\\begin{table}[!t]"
table_str[(i <- i + 1)] <- " \\centering"
table_str[(i <- i + 1)] <- " \\resizebox{\\textwidth}{!}{%"
table_str[(i <- i + 1)] <- " \\begin{tabular}{@{}llllllll@{}}"
table_str[(i <- i + 1)] <- " \\toprule"
table_str[(i <- i + 1)] <- paste(
  " Package & \\% Convergence & Mean & Minimum & First quartile &",
  "Median & Third quartile & Maximum \\\\ \\midrule"
)

for (pkg in c("dosef", "drc", "drda", "nplr", "nls")) {
  table_str[(i <- i + 1)] <- format_row(
    stats["Mean", paste(pkg, "converged", sep = "_")],
    stats[, paste(pkg, "rss_rel_err", sep = "_")],
    pkg
  )
}

table_str[(i <- i + 1)] <- " \\end{tabular}%"
table_str[(i <- i + 1)] <- " }"
table_str[(i <- i + 1)] <- paste0(
  " \\caption{",
  "Summary statistics of the RSS relative error as measured on the CTRPv2 ",
  "data.}\\label{tab:ctrpv2}"
)
table_str[(i <- i + 1)] <- "\\end{table}"

# finally write the table
writeLines(text = table_str, con = file.path(dir_path, "table_2.tex"))

# what's the overall convergence rate?
print(round(100 * stats[1, 1:5], digits = 3))

# what's the mean execution time?
print(round(stats[1, 11:15], digits = 3))

# how many times was drda better?
# rows of `rss_rel_err` are ordered dosef, drda, drc, nplr, nls (drda is row 2)
rss_rel_err <- CTRPv2_results[16:20, ]

is_best <- apply(rss_rel_err, 2, function(x) all(x[-2] >= x[2], na.rm = TRUE))
print(round(100 * mean(is_best), digits = 3))

# what if we relax the comparison a bit?
is_best <- rss_rel_err <= 0.01
print(
  apply(is_best, 1, function(x) round(100 * mean(x, na.rm = TRUE), digits = 3))
)

# how about drda vs all other packages?
drda_dosef <- (rss_rel_err[1, ] - rss_rel_err[2, ]) > 0.01
drda_drc <- (rss_rel_err[3, ] - rss_rel_err[2, ]) > 0.01
drda_nplr <- (rss_rel_err[4, ] - rss_rel_err[2, ]) > 0.01
drda_nls <- (rss_rel_err[5, ] - rss_rel_err[2, ]) > 0.01

print(round(100 * mean(drda_dosef, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_drc, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nplr, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nls, na.rm = TRUE), digits = 3))

# and the other way around: how often was each package better than drda?
drda_dosef <- (rss_rel_err[2, ] - rss_rel_err[1, ]) > 0.01
drda_drc <- (rss_rel_err[2, ] - rss_rel_err[3, ]) > 0.01
drda_nplr <- (rss_rel_err[2, ] - rss_rel_err[4, ]) > 0.01
drda_nls <- (rss_rel_err[2, ] - rss_rel_err[5, ]) > 0.01

print(round(100 * mean(drda_dosef, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_drc, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nplr, na.rm = TRUE), digits = 3))
print(round(100 * mean(drda_nls, na.rm = TRUE), digits = 3))
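# Optional cleanup (assumption: not part of the original script): release the
# parallel workers registered at the top of the script.
plan(sequential)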