# R script file to be distributed with article: # Monotone Regression: A Simple and Fast O(n) PAVA Implementation # Author: Frank M.T.A. Busing, Leiden University # This file: Replication script for running all Monte Carlo simulations and plots # Accompanying files: In v102c01-replication.zip. # For full replication, run without accompanying .rda files in working directory # load packages library("monotone") library("microbenchmark") library("multcomp") library("Iso") library("UniIsoRegression") library("smacof") library("spsurvey") library("fdrtool") library("isotone") library("ggplot2") library("scatterplot3d") library("xtable") # load dynamic libraries libs <- c("wmrmnh.f", "amalgm.f", "pav.f", "mdpava.c") system2("R", c("CMD", "SHLIB", libs, "-o LIB")) dyn.load("LIB") source("isotonic.R") # set global variables seed <- 19121945 # birthday seed maxr <- 100 # number of replications maxt <- 100 # microbenchmark times # simulation study 1 ========================================================================================================= # what is the speed difference between all PAVA implementations? # what implementations are O(n) and what implementations O(n^2)? # running all available implementations mc_pava_all <- function(x) { return( microbenchmark( "fitm()" = { y <- legacy(x, number = 1) }, "wmrmnh()" = { y <- legacy(x, number = 2) }, "amalgm()" = { y <- legacy(x, number = 3) }, "pav()" = { y <- legacy(x, number = 4) }, "isoreg()" = { y <- legacy(x, number = 5) }, "isopava()" = { y <- legacy(x, number = 6) }, "isotonic()" = { y <- legacy(x, number = 7) }, "isomean()" = { y <- legacy(x, number = 8) }, "pooledpava()" = { y <- legacy(x, number = 9) }, "linearpava()" = { y <- legacy(x, number = 10) }, "inplacepava()" = { y <- legacy(x, number = 11) }, "mdpava()" = { y <- legacy(x, number = 12) }, "reg1dl2()" = { y <- legacy(x, number = 13) }, "jbkpava()" = { y <- legacy(x, number = 14) }, "monotone()" = { y <- monotone(x) }, times = maxt, control = list(warmup = 2) ) ) } # up and down data for simulation study 1 set_data_vector <- function(n) { half <- n / 2 return(c(1:half, half:1)) } # function for actual simulation study 1 mc_study_1 <- function() { n <- 100 x <- set_data_vector(n) res <- mc_pava_all(x) res$time <- res$time / 1000 tab3 <- aggregate(x = res$time, by = list(res$expr), FUN = mean) colnames(tab3) <- c("implementation", "n=100") for (p in 3:5) { n <- 10^p x <- set_data_vector(n) res <- mc_pava_all(x) res$time <- res$time / 1000 tmp <- aggregate(x = res$time, by = list(res$expr), FUN = mean)[2] tab3 <- cbind(tab3, round(tmp, 3)) colnames(tab3)[p] <- paste("n=", n, sep = "") } return(tab3) } if (!file.exists("mc1.rda")) { mc1 <- mc_study_1() save(mc1, file = "mc1.rda") } else { load("mc1.rda") } print(mc1) # simulation study 2 ========================================================================================================= # what is the speed difference between O(n) monotone regression implementations? # function for scaling data vector mc_scale_data <- function(y) { y <- y - min(y) y <- y / max(y) return(10 * y) } # function for generating data vectors mc_data <- function(n, type) { if (type == 1) { y <- mc_scale_data(1:n) } else if (type == 2) { y <- mc_scale_data((1:n) + (n / 5) * sin(10 * (1:n) / n)) } else if (type == 3) { y <- rep(5, n) } else if (type == 4) { y <- mc_scale_data(n - ((1:n) + (n / 5) * sin(10 * (1:n) / n))) } else if (type == 5) { y <- mc_scale_data(n:1) } y <- mc_scale_data(y + rnorm(n, 0, 0.5)) return(y) } # running all O(n) implementations mc_pava_On <- function(x) { return( microbenchmark( "fitm()" = { y <- legacy(x, number = 1) }, "isomean()" = { y <- legacy(x, number = 8) }, "pooledpava()" = { y <- legacy(x, number = 9) }, "inplacepava()" = { y <- legacy(x, number = 11) }, "mdpava()" = { y <- legacy(x, number = 12) }, "reg1dl2()" = { y <- legacy(x, number = 13) }, "jbkpava()" = { y <- legacy(x, number = 14) }, "monotone()" = { y <- monotone(x) }, times = maxt, control = list(warmup = 2) ) ) } # function for actual simulation study 2 mc_study_2 <- function() { ntype <- 5 nsize <- 4 mctype <- vector("list", ntype) mcsize <- vector("list", nsize) set.seed(seed) for (type in 1:ntype) { for (size in 1:nsize) { n <- 10 * 10^size res <- NULL for (r in 1:maxr) { x <- mc_data(n, type) tmp <- mc_pava_On(x) res <- rbind.data.frame(res, tmp) } res$time <- res$time / 1000 mcsize[[size]] <- rbind.data.frame(mcsize[[size]], res) res$time <- 1000 * res$time / n mctype[[type]] <- rbind.data.frame(mctype[[type]], res) } } # table with different sizes tab5 <- data.frame(as.character(levels(tmp$expr))) col <- 1 colnames(tab5)[col] <- "implementation" for (size in 1:nsize) { n <- 10 * 10^size tmp5 <- aggregate(x = mcsize[[size]]$time, by = list(mcsize[[size]]$expr), FUN = mean)[2] tab5 <- cbind(tab5, round(tmp5, 3)) col <- col + 1 colnames(tab5)[col] <- paste("mean n=", n, sep = "") amod <- aov(time ~ expr, data = mcsize[[size]]) glht(amod, linfct = mcp(expr = "Tukey")) tuk <- glht(amod, linfct = mcp(expr = "Tukey")) tab5 <- cbind(tab5, cld(tuk)$mcletters$Letters) col <- col + 1 colnames(tab5)[col] <- paste("cld n=", n, sep = "") } tab5 <- subset(tab5, select = -1) # table with different data variants tab6 <- data.frame(as.character(levels(tmp$expr))) col <- 1 colnames(tab6)[col] <- "implementation" overall <- aggregate(x = mctype[[1]]$time, by = list(mctype[[1]]$expr), FUN = mean)[2] overall$x <- 0 for (type in 1:ntype) { tmp6 <- aggregate(x = mctype[[type]]$time, by = list(mctype[[type]]$expr), FUN = mean)[2] overall$x <- overall$x + tmp6$x tab6 <- cbind(tab6, round(tmp6, 3)) col <- col + 1 colnames(tab6)[col] <- paste("mean ", type, sep = "") amod <- aov(time ~ expr, data = mctype[[type]]) glht(amod, linfct = mcp(expr = "Tukey")) tuk <- glht(amod, linfct = mcp(expr = "Tukey")) tab6 <- cbind(tab6, cld(tuk)$mcletters$Letters) col <- col + 1 colnames(tab6)[col] <- paste("cld ", type, sep = "") } tab6 <- cbind(tab6, round(overall$x / ntype, 3)) col <- col + 1 colnames(tab6)[col] <- "overall" tab6 <- cbind(tab6, rank(tab6[col])) col <- col + 1 colnames(tab6)[col] <- "rank" tab6 <- subset(tab6, select = -1) return(list(table5 = tab5, table6 = tab6)) } if (!file.exists("mc2.rda")) { mc2 <- mc_study_2() save(mc2, file = "mc2.rda") } else { load("mc2.rda") } print(mc2$table5) print(mc2$table6) # simulation study 3 ========================================================================================================= # what is the speed difference between unimodal monotone regression implementations? # example, first test unimodal functions for equality x <- c(0.00, 0.34, 0.67, 1.00, 1.34, 1.67, 2.00, 2.50, 3.00, 3.50, 4.00, 4.50, 5.00, 5.50, 6.00, 8.00, 12.00, 16.00, 24.00) y <- c(0.0, 61.9, 183.3, 173.7, 250.6, 238.1, 292.6, 293.8, 268.0, 285.9, 258.8, 297.4, 217.3, 226.4, 170.1, 74.2, 59.8, 4.1, 6.1) print(ufit(y, x = x, type = "b")) print(unimonotone(y)) # function for actual simulation study 3 mc_study_3 <- function() { set.seed(seed) n <- 500 tab <- NULL for (r in 1:maxr) { xl <- mc_data(n, 2) xr <- mc_data(n, 4) x <- c(xl, xr) res <- microbenchmark( "ufit()" = { y <- Iso::ufit(x)$y }, "uniisoregression()" = { y <- UniIsoRegression::reg_1d(x, rep(1, length(x)), 2, unimodal = TRUE) }, "unimonotone()" = { y <- unimonotone(x) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) tab <- rbind.data.frame(tab, res) } return(tab) } if (!file.exists("mc3.rda")) { mc3 <- mc_study_3() save(mc3, file = "mc3.rda") } else { load("mc3.rda") } print(mc3) # simulation study 4 ========================================================================================================= # what is the speed difference between bivariate monotone regression implementations? # example, first test bivariate functions for equality G <- matrix(c(1, 5.2, 0.1, 0.1, 5, 0, 6, 2, 3, 5.2, 5, 7, 4, 5.5, 6, 6), 4, 4) print(G) print(biviso(G)) print(reg_2d(G, matrix(1, 4, 4), metric = 2)) print(bimonotone(G, matrix(1, 4, 4))) y <- matrix(c(8, 4, 8, 2, 2, 0, 8), 1, 7) print(bimonotone(y, matrix(1, 1, 7))) y <- matrix(c(8, 4, 8, 2, 2, 0, 8), 7, 1) print(bimonotone(y, matrix(1, 7, 1))) # function for actual simulation study 4 mc_study_4 <- function() { set.seed(seed) n <- 32 tab <- NULL for (r in 1:maxr) { sqrtn <- sqrt(n) G <- matrix(0, n, n) for (i in 1:n) { for (j in 1:n) { G[i, j] <- i + j + runif(1, -i, j) } } res <- microbenchmark( "biviso()" = { y <- matrix(biviso(G), n, n) }, "bimonotone(default)" = { y <- bimonotone(G) }, "bimonotone(eps=1.0e-5)" = { y <- bimonotone(G, eps = 1.0e-5) }, times = maxt, control = list(warmup = 2) ) tab <- rbind.data.frame(tab, res) } return(res) } if (!file.exists("mc4.rda")) { mc4 <- mc_study_4() save(mc4, file = "mc4.rda") } else { load("mc4.rda") } print(mc4) # simulation study 5 ========================================================================================================= # what is the speed difference between the original and the legacy function? # function for comparing wmrmnh implementations wmrmnh <- function(x, w = rep(1, length(x))) { return(.Fortran("wmrmnh", as.numeric(x), as.numeric(w), as.integer(length(x)))[[1]]) } wmonreg <- function(x, w = rep(1, length(x))) { return(.C("wmonreg", as.numeric(x), as.numeric(w), as.integer(length(x)))[[1]]) } mc_wmrmnh <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original (Fortran)" = { y <- wmrmnh(x) }, "original (C)" = { y <- wmonreg(x) }, "legacy" = { y <- legacy(x, number = 2) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_wmrmnh.rda")) { raw_wmrmnh <- mc_wmrmnh() save(raw_wmrmnh, file = "raw_wmrmnh.rda") } else { load("raw_wmrmnh.rda") } print(raw_wmrmnh) # function for comparing amalgm implementations amalgm <- function(x, w = rep(1, length(x))) { n <- 1000 xx <- rep(1, 1000) ww <- rep(1, 1000) xa <- rep(1, 1000) ifault <- 0 return(.Fortran("amalgm", as.integer(n), as.numeric(x), as.numeric(w), as.numeric(xx), as.numeric(ww), as.numeric(xa), as.integer(ifault))[[6]]) } mc_amalgm <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- amalgm(x) }, "legacy" = { y <- legacy(x, number = 3) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_amalgm.rda")) { raw_amalgm <- mc_amalgm() save(raw_amalgm, file = "raw_amalgm.rda") } else { load("raw_amalgm.rda") } print(raw_amalgm) # function for comparing pav implementations pav <- function(x, w = rep(1, length(x))) { n <- 1000 iorder <- 1 finalx <- rep(1, 1000) nw <- rep(1, 1000) fx <- rep(1, 1000) pw <- rep(1, 1000) w1 <- rep(1, 1000) wt <- rep(1, 1000) return(.Fortran("pav", as.integer(n), as.numeric(x), as.integer(iorder), as.numeric(w), as.numeric(finalx), as.integer(nw), as.numeric(fx), as.numeric(pw), as.numeric(w1), as.numeric(wt))[[5]]) } mc_pav <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- pav(x) }, "legacy" = { y <- legacy(x, number = 4) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_pav.rda")) { raw_pav <- mc_pav() save(raw_pav, file = "raw_pav.rda") } else { load("raw_pav.rda") } print(raw_pav) # function for comparing isoreg implementations mc_isoreg <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- isoreg(x)$yf }, "legacy" = { y <- legacy(x, number = 5) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_isoreg.rda")) { raw_isoreg <- mc_isoreg() save(raw_isoreg, file = "raw_isoreg.rda") } else { load("raw_isoreg.rda") } print(raw_isoreg) # function for comparing iso_pava implementations mc_isopava <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- Iso::pava(x) }, "legacy" = { y <- legacy(x, number = 6) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_isopava.rda")) { raw_isopava <- mc_isopava() save(raw_isopava, file = "raw_isopava.rda") } else { load("raw_isopava.rda") } print(raw_isopava) # function for comparing isotonic implementations mc_isotonic <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- isotonic(x, -.Machine$double.xmax, .Machine$double.xmax) }, "legacy" = { y <- legacy(x, number = 7) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_isotonic.rda")) { raw_isotonic <- mc_isotonic() save(raw_isotonic, file = "raw_isotonic.rda") } else { load("raw_isotonic.rda") } print(raw_isotonic) # function for comparing isomean implementations C_isomean <- function(x, w = rep(1, length(x))) { ghat <- rep(1, length(x)) return(.C("C_isomean", as.double(x), as.double(w), as.integer(1000), ghat = as.double(ghat), PACKAGE = "fdrtool")$ghat) } mc_isomean <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original (R)" = { y <- fdrtool::monoreg(x)$yf }, "original (C)" = { C_isomean(x) }, "legacy" = { y <- legacy(x, number = 8) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_isomean.rda")) { raw_isomean <- mc_isomean() save(raw_isomean, file = "raw_isomean.rda") } else { load("raw_isomean.rda") } print(raw_isomean) # function for comparing md_pava implementations mdpava <- function(x, w = rep(1, length(x))) { return(.C("mdpava", x = as.double(x), as.integer(length(x)))$x) } mc_mdpava <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- mdpava(x) }, "legacy" = { y <- legacy(x, number = 12) }, times = maxt, check = "equivalent", control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_mdpava.rda")) { raw_mdpava <- mc_mdpava() save(raw_mdpava, file = "raw_mdpava.rda") } else { load("raw_mdpava.rda") } print(raw_mdpava) # function for comparing reg_1d_l2 implementations mc_reg1dl2 <- function() { set.seed(seed) res <- NULL for (type in 1:5) { for (r in 1:maxr) { x <- mc_data(1000, type) res <- rbind.data.frame( res, microbenchmark( "original" = { y <- UniIsoRegression::reg_1d(x, rep(1, 1000), 2) }, "legacy" = { y <- legacy(x, number = 13) }, times = maxt, control = list(warmup = 2) ) ) } } return(res) } if (!file.exists("raw_reg1dl2.rda")) { raw_reg1dl2 <- mc_reg1dl2() save(raw_reg1dl2, file = "raw_reg1dl2.rda") } else { load("raw_reg1dl2.rda") } print(raw_reg1dl2) # additional stuff =========================================================================================================== # functions for creating clarifying plots for snippet # set global variables n <- 100 set.seed(seed) # create plots variants 1 and 2: order value <- c(mc_data(n,1),mc_data(n,2)) element <- c(1:n,1:n) variant <- rep(c(1,2),c(n,n)) order <- data.frame(cbind(element,value,variant)) ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none") # create plot variant 3: random value <- c(mc_data(n,3)) element <- c(1:n) variant <- rep(c(3),c(n)) order <- data.frame(cbind(element,value,variant)) ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none") # create plots variants 4 and 5: disorder value <- c(mc_data(n,4),mc_data(n,5)) element <- c(1:n,1:n) variant <- rep(c(5,4),c(n,n)) disorder <- data.frame(cbind(element,value,variant)) ggplot(disorder, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none") # create plot unimodal xl <- mc_data(n,2) xr <- mc_data(n,4) value <- c( xl, xr ) element <- c(1:(2*n)) variant <- rep(c(3),c(2*n)) order <- data.frame(cbind(element,value,variant)) ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none") # create plot bivariate n <- 10 G <- matrix(0,n,n) for ( i in 1:n ) { for ( j in 1:n ) { G[i,j] = i + j + runif(1, -i, j ) } } x <- as.vector(matrix(1:n,n,n,byrow=TRUE)) y <- as.vector(matrix(1:n,n,n,byrow=FALSE)) z <- as.vector(G) scatterplot3d(x, y, z, highlight.3d=TRUE, xlab="row", ylab = "column", zlab = "value", col.axis="blue", col.grid="lightblue", grid=TRUE, angle = 65, type = "h", main=NULL, pch=20, cex.axis = 1.5, cex.lab = 1.8)