# R script file to be distributed with article:
# Monotone Regression: A Simple and Fast O(n) PAVA Implementation
# Author: Frank M.T.A. Busing, Leiden University
# This file: Replication script for running all Monte Carlo simulations and plots
# Accompanying files: In v102c01-replication.zip.
# For full replication, run without accompanying .rda files in working directory

# load packages
library("monotone")
library("microbenchmark")
library("multcomp")
library("Iso")
library("UniIsoRegression")
library("smacof")
library("spsurvey")
library("fdrtool")
library("isotone")
library("ggplot2")
library("scatterplot3d")
library("xtable")

# load dynamic libraries
libs <- c("wmrmnh.f", "amalgm.f", "pav.f", "mdpava.c")
system2("R", c("CMD", "SHLIB", libs, "-o LIB"))
dyn.load("LIB")
source("isotonic.R")

# set global variables
seed <- 19121945  # birthday seed
maxr <- 100       # number of replications
maxt <- 100       # microbenchmark times

# simulation study 1 =========================================================================================================
# what is the speed difference between all PAVA implementations?
# what implementations are O(n) and what implementations O(n^2)?

# running all available implementations
mc_pava_all <- function(x) {
  return(
    microbenchmark(
      "fitm()" = { y <- legacy(x, number = 1) },
      "wmrmnh()" = { y <- legacy(x, number = 2) },
      "amalgm()" = { y <- legacy(x, number = 3) },
      "pav()" = { y <- legacy(x, number = 4) },
      "isoreg()" = { y <- legacy(x, number = 5) },
      "isopava()" = { y <- legacy(x, number = 6) },
      "isotonic()" = { y <- legacy(x, number = 7) },
      "isomean()" = { y <- legacy(x, number = 8) },
      "pooledpava()" = { y <- legacy(x, number = 9) },
      "linearpava()" = { y <- legacy(x, number = 10) },
      "inplacepava()" = { y <- legacy(x, number = 11) },
      "mdpava()" = { y <- legacy(x, number = 12) },
      "reg1dl2()" = { y <- legacy(x, number = 13) },
      "jbkpava()" = { y <- legacy(x, number = 14) },
      "monotone()" = { y <- monotone(x) },
      times = maxt,
      control = list(warmup = 2)
    )
  )
}

# up and down data for simulation study 1
set_data_vector <- function(n) {
  half <- n / 2
  return(c(1:half, half:1))
}

# function for actual simulation study 1
mc_study_1 <- function() {
  n <- 100
  x <- set_data_vector(n)
  res <- mc_pava_all(x)
  res$time <- res$time / 1000
  tab3 <- aggregate(x = res$time, by = list(res$expr), FUN = mean)
  colnames(tab3) <- c("implementation", "n=100")
  for (p in 3:5) {
    n <- 10^p
    x <- set_data_vector(n)
    res <- mc_pava_all(x)
    res$time <- res$time / 1000
    tmp <- aggregate(x = res$time, by = list(res$expr), FUN = mean)[2]
    tab3 <- cbind(tab3, round(tmp, 3))
    colnames(tab3)[p] <- paste("n=", n, sep = "")
  }
  return(tab3)
}
if (!file.exists("mc1.rda")) {
  mc1 <- mc_study_1()
  save(mc1, file = "mc1.rda")
} else {
  load("mc1.rda")
}
print(mc1)

# simulation study 2 =========================================================================================================
# what is the speed difference between O(n) monotone regression implementations?

# function for scaling data vector
mc_scale_data <- function(y) {
  y <- y - min(y)
  y <- y / max(y)
  return(10 * y)
}

# function for generating data vectors
mc_data <- function(n, type) {
  if (type == 1) {
    y <- mc_scale_data(1:n)
  } else if (type == 2) {
    y <- mc_scale_data((1:n) + (n / 5) * sin(10 * (1:n) / n))
  } else if (type == 3) {
    y <- rep(5, n)
  } else if (type == 4) {
    y <- mc_scale_data(n - ((1:n) + (n / 5) * sin(10 * (1:n) / n)))
  } else if (type == 5) {
    y <- mc_scale_data(n:1)
  }
  y <- mc_scale_data(y + rnorm(n, 0, 0.5))
  return(y)
}

# running all O(n) implementations
mc_pava_On <- function(x) {
  return(
    microbenchmark(
      "fitm()" = { y <- legacy(x, number = 1) },
      "isomean()" = { y <- legacy(x, number = 8) },
      "pooledpava()" = { y <- legacy(x, number = 9) },
      "inplacepava()" = { y <- legacy(x, number = 11) },
      "mdpava()" = { y <- legacy(x, number = 12) },
      "reg1dl2()" = { y <- legacy(x, number = 13) },
      "jbkpava()" = { y <- legacy(x, number = 14) },
      "monotone()" = { y <- monotone(x) },
      times = maxt,
      control = list(warmup = 2)
    )
  )
}

# function for actual simulation study 2
mc_study_2 <- function() {
  ntype <- 5
  nsize <- 4
  mctype <- vector("list", ntype)
  mcsize <- vector("list", nsize)
  set.seed(seed)
  for (type in 1:ntype) {
    for (size in 1:nsize) {
      n <- 10 * 10^size
      res <- NULL
      for (r in 1:maxr) {
        x <- mc_data(n, type)
        tmp <- mc_pava_On(x)
        res <- rbind.data.frame(res, tmp)
      }
      res$time <- res$time / 1000
      mcsize[[size]] <- rbind.data.frame(mcsize[[size]], res)
      res$time <- 1000 * res$time / n
      mctype[[type]] <- rbind.data.frame(mctype[[type]], res)
    }
  }

  # table with different sizes
  tab5 <- data.frame(as.character(levels(tmp$expr)))
  col <- 1
  colnames(tab5)[col] <- "implementation"
  for (size in 1:nsize) {
    n <- 10 * 10^size
    tmp5 <- aggregate(x = mcsize[[size]]$time, by = list(mcsize[[size]]$expr), FUN = mean)[2]
    tab5 <- cbind(tab5, round(tmp5, 3))
    col <- col + 1
    colnames(tab5)[col] <- paste("mean n=", n, sep = "")
    amod <- aov(time ~ expr, data = mcsize[[size]])
    glht(amod, linfct = mcp(expr = "Tukey"))
    tuk <- glht(amod, linfct = mcp(expr = "Tukey"))
    tab5 <- cbind(tab5, cld(tuk)$mcletters$Letters)
    col <- col + 1
    colnames(tab5)[col] <- paste("cld n=", n, sep = "")
  }
  tab5 <- subset(tab5, select = -1)

  # table with different data variants
  tab6 <- data.frame(as.character(levels(tmp$expr)))
  col <- 1
  colnames(tab6)[col] <- "implementation"
  overall <- aggregate(x = mctype[[1]]$time, by = list(mctype[[1]]$expr), FUN = mean)[2]
  overall$x <- 0
  for (type in 1:ntype) {
    tmp6 <- aggregate(x = mctype[[type]]$time, by = list(mctype[[type]]$expr), FUN = mean)[2]
    overall$x <- overall$x + tmp6$x
    tab6 <- cbind(tab6, round(tmp6, 3))
    col <- col + 1
    colnames(tab6)[col] <- paste("mean ", type, sep = "")
    amod <- aov(time ~ expr, data = mctype[[type]])
    glht(amod, linfct = mcp(expr = "Tukey"))
    tuk <- glht(amod, linfct = mcp(expr = "Tukey"))
    tab6 <- cbind(tab6, cld(tuk)$mcletters$Letters)
    col <- col + 1
    colnames(tab6)[col] <- paste("cld ", type, sep = "")
  }
  tab6 <- cbind(tab6, round(overall$x / ntype, 3))
  col <- col + 1
  colnames(tab6)[col] <- "overall"
  tab6 <- cbind(tab6, rank(tab6[col]))
  col <- col + 1
  colnames(tab6)[col] <- "rank"
  tab6 <- subset(tab6, select = -1)

  return(list(table5 = tab5, table6 = tab6))
}
if (!file.exists("mc2.rda")) {
  mc2 <- mc_study_2()
  save(mc2, file = "mc2.rda")
} else {
  load("mc2.rda")
}
print(mc2$table5)
print(mc2$table6)

# simulation study 3 =========================================================================================================
# what is the speed difference between unimodal monotone regression implementations?

# example, first test unimodal functions for equality
x <- c(0.00, 0.34, 0.67, 1.00, 1.34, 1.67, 2.00, 2.50, 3.00, 3.50, 4.00, 4.50, 5.00, 5.50, 6.00, 8.00, 12.00, 16.00, 24.00)
y <- c(0.0, 61.9, 183.3, 173.7, 250.6, 238.1, 292.6, 293.8, 268.0, 285.9, 258.8, 297.4, 217.3, 226.4, 170.1, 74.2, 59.8, 4.1, 6.1)
print(ufit(y, x = x, type = "b"))
print(unimonotone(y))

# function for actual simulation study 3
mc_study_3 <- function() {
  set.seed(seed)
  n <- 500
  tab <- NULL
  for (r in 1:maxr) {
    xl <- mc_data(n, 2)
    xr <- mc_data(n, 4)
    x <- c(xl, xr)
    res <- microbenchmark(
      "ufit()" = { y <- Iso::ufit(x)$y },
      "uniisoregression()" = { y <- UniIsoRegression::reg_1d(x, rep(1, length(x)), 2, unimodal = TRUE) },
      "unimonotone()" = { y <- unimonotone(x) },
      times = maxt,
      check = "equivalent",
      control = list(warmup = 2)
    )
    tab <- rbind.data.frame(tab, res)
  }
  return(tab)
}
if (!file.exists("mc3.rda")) {
  mc3 <- mc_study_3()
  save(mc3, file = "mc3.rda")
} else {
  load("mc3.rda")
}
print(mc3)

# simulation study 4 =========================================================================================================
# what is the speed difference between bivariate monotone regression implementations?

# example, first test bivariate functions for equality
G <- matrix(c(1, 5.2, 0.1, 0.1, 5, 0, 6, 2, 3, 5.2, 5, 7, 4, 5.5, 6, 6), 4, 4)
print(G)
print(biviso(G))
print(reg_2d(G, matrix(1, 4, 4), metric = 2))
print(bimonotone(G, matrix(1, 4, 4)))
y <- matrix(c(8, 4, 8, 2, 2, 0, 8), 1, 7)
print(bimonotone(y, matrix(1, 1, 7)))
y <- matrix(c(8, 4, 8, 2, 2, 0, 8), 7, 1)
print(bimonotone(y, matrix(1, 7, 1)))

# function for actual simulation study 4
mc_study_4 <- function() {
  set.seed(seed)
  n <- 32
  tab <- NULL
  for (r in 1:maxr) {
    sqrtn <- sqrt(n)
    G <- matrix(0, n, n)
    for (i in 1:n) {
      for (j in 1:n) {
        G[i, j] <- i + j + runif(1, -i, j)
      }
    }
    res <- microbenchmark(
      "biviso()" = { y <- matrix(biviso(G), n, n) },
      "bimonotone(default)" = { y <- bimonotone(G) },
      "bimonotone(eps=1.0e-5)" = { y <- bimonotone(G, eps = 1.0e-5) },
      times = maxt,
      control = list(warmup = 2)
    )
    tab <- rbind.data.frame(tab, res)
  }
  return(res)
}
if (!file.exists("mc4.rda")) {
  mc4 <- mc_study_4()
  save(mc4, file = "mc4.rda")
} else {
  load("mc4.rda")
}
print(mc4)

# simulation study 5 =========================================================================================================
# what is the speed difference between the original and the legacy function?

# function for comparing wmrmnh implementations
wmrmnh <- function(x, w = rep(1, length(x))) {
  return(.Fortran("wmrmnh", as.numeric(x), as.numeric(w), as.integer(length(x)))[[1]])
}
wmonreg <- function(x, w = rep(1, length(x))) {
  return(.C("wmonreg", as.numeric(x), as.numeric(w), as.integer(length(x)))[[1]])
}
mc_wmrmnh <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original (Fortran)" = { y <- wmrmnh(x) },
          "original (C)" = { y <- wmonreg(x) },
          "legacy" = { y <- legacy(x, number = 2) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_wmrmnh.rda")) {
  raw_wmrmnh <- mc_wmrmnh()
  save(raw_wmrmnh, file = "raw_wmrmnh.rda")
} else {
  load("raw_wmrmnh.rda")
}
print(raw_wmrmnh)

# function for comparing amalgm implementations
amalgm <- function(x, w = rep(1, length(x))) {
  n <- 1000
  xx <- rep(1, 1000)
  ww <- rep(1, 1000)
  xa <- rep(1, 1000)
  ifault <- 0
  return(.Fortran("amalgm", as.integer(n), as.numeric(x), as.numeric(w), as.numeric(xx), as.numeric(ww), as.numeric(xa), as.integer(ifault))[[6]])
}
mc_amalgm <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- amalgm(x) },
          "legacy" = { y <- legacy(x, number = 3) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_amalgm.rda")) {
  raw_amalgm <- mc_amalgm()
  save(raw_amalgm, file = "raw_amalgm.rda")
} else {
  load("raw_amalgm.rda")
}
print(raw_amalgm)

# function for comparing pav implementations
pav <- function(x, w = rep(1, length(x))) {
  n <- 1000
  iorder <- 1
  finalx <- rep(1, 1000)
  nw <- rep(1, 1000)
  fx <- rep(1, 1000)
  pw <- rep(1, 1000)
  w1 <- rep(1, 1000)
  wt <- rep(1, 1000)
  return(.Fortran("pav", as.integer(n), as.numeric(x), as.integer(iorder),
    as.numeric(w), as.numeric(finalx), as.integer(nw), as.numeric(fx),
    as.numeric(pw), as.numeric(w1), as.numeric(wt))[[5]])
}
mc_pav <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- pav(x) },
          "legacy" = { y <- legacy(x, number = 4) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_pav.rda")) {
  raw_pav <- mc_pav()
  save(raw_pav, file = "raw_pav.rda")
} else {
  load("raw_pav.rda")
}
print(raw_pav)

# function for comparing isoreg implementations
mc_isoreg <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- isoreg(x)$yf },
          "legacy" = { y <- legacy(x, number = 5) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_isoreg.rda")) {
  raw_isoreg <- mc_isoreg()
  save(raw_isoreg, file = "raw_isoreg.rda")
} else {
  load("raw_isoreg.rda")
}
print(raw_isoreg)

# function for comparing iso_pava implementations
mc_isopava <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- Iso::pava(x) },
          "legacy" = { y <- legacy(x, number = 6) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_isopava.rda")) {
  raw_isopava <- mc_isopava()
  save(raw_isopava, file = "raw_isopava.rda")
} else {
  load("raw_isopava.rda")
}
print(raw_isopava)

# function for comparing isotonic implementations
mc_isotonic <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- isotonic(x, -.Machine$double.xmax, .Machine$double.xmax) },
          "legacy" = { y <- legacy(x, number = 7) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_isotonic.rda")) {
  raw_isotonic <- mc_isotonic()
  save(raw_isotonic, file = "raw_isotonic.rda")
} else {
  load("raw_isotonic.rda")
}
print(raw_isotonic)

# function for comparing isomean implementations
C_isomean <- function(x, w = rep(1, length(x))) {
  ghat <- rep(1, length(x))
  return(.C("C_isomean", as.double(x), as.double(w), as.integer(1000), ghat = as.double(ghat), PACKAGE = "fdrtool")$ghat)
}
mc_isomean <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original (R)" = { y <- fdrtool::monoreg(x)$yf },
          "original (C)" = { C_isomean(x) },
          "legacy" = { y <- legacy(x, number = 8) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_isomean.rda")) {
  raw_isomean <- mc_isomean()
  save(raw_isomean, file = "raw_isomean.rda")
} else {
  load("raw_isomean.rda")
}
print(raw_isomean)

# function for comparing md_pava implementations
mdpava <- function(x, w = rep(1, length(x))) {
  return(.C("mdpava", x = as.double(x), as.integer(length(x)))$x)
}
mc_mdpava <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- mdpava(x) },
          "legacy" = { y <- legacy(x, number = 12) },
          times = maxt,
          check = "equivalent",
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_mdpava.rda")) {
  raw_mdpava <- mc_mdpava()
  save(raw_mdpava, file = "raw_mdpava.rda")
} else {
  load("raw_mdpava.rda")
}
print(raw_mdpava)

# function for comparing reg_1d_l2 implementations
mc_reg1dl2 <- function() {
  set.seed(seed)
  res <- NULL
  for (type in 1:5) {
    for (r in 1:maxr) {
      x <- mc_data(1000, type)
      res <- rbind.data.frame(
        res,
        microbenchmark(
          "original" = { y <- UniIsoRegression::reg_1d(x, rep(1, 1000), 2) },
          "legacy" = { y <- legacy(x, number = 13) },
          times = maxt,
          control = list(warmup = 2)
        )
      )
    }
  }
  return(res)
}
if (!file.exists("raw_reg1dl2.rda")) {
  raw_reg1dl2 <- mc_reg1dl2()
  save(raw_reg1dl2, file = "raw_reg1dl2.rda")
} else {
  load("raw_reg1dl2.rda")
}
print(raw_reg1dl2)

# additional stuff ===========================================================================================================
# functions for creating clarifying plots for snippet

# set global variables
n <- 100
set.seed(seed)

# create plots variants 1 and 2: order
value <- c(mc_data(n,1),mc_data(n,2))
element <- c(1:n,1:n)
variant <- rep(c(1,2),c(n,n))
order <- data.frame(cbind(element,value,variant))
ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none")

# create plot variant 3: random
value <- c(mc_data(n,3))
element <- c(1:n)
variant <- rep(c(3),c(n))
order <- data.frame(cbind(element,value,variant))
ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none")

# create plots variants 4 and 5: disorder
value <- c(mc_data(n,4),mc_data(n,5))
element <- c(1:n,1:n)
variant <- rep(c(5,4),c(n,n))
disorder <- data.frame(cbind(element,value,variant))
ggplot(disorder, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none")

# create plot unimodal
xl <- mc_data(n,2)
xr <- mc_data(n,4)
value <- c( xl, xr )
element <- c(1:(2*n))
variant <- rep(c(3),c(2*n))
order <- data.frame(cbind(element,value,variant))
ggplot(order, aes(x=element, y=value)) + geom_point(aes(color = factor(variant))) + theme(text = element_text(size = 28)) + theme(legend.position = "none")

# create plot bivariate
n <- 10
G <- matrix(0,n,n)
for ( i in 1:n ) {
  for ( j in 1:n ) {
    G[i,j] = i + j + runif(1, -i, j )
  }
}
x <- as.vector(matrix(1:n,n,n,byrow=TRUE))
y <- as.vector(matrix(1:n,n,n,byrow=FALSE))
z <- as.vector(G)
scatterplot3d(x, y, z, highlight.3d=TRUE, xlab="row", ylab = "column", zlab = "value", col.axis="blue", col.grid="lightblue", grid=TRUE, angle = 65, type = "h", main=NULL, pch=20, cex.axis = 1.5, cex.lab = 1.8)