###################################################
### code chunk number 1: options
###################################################
## Download packages from the CRAN cloud mirror over HTTPS rather than
## plain HTTP, so that package downloads are encrypted in transit.
options(repos = "https://cloud.r-project.org")
## Console prompt used when the code is shown as a transcript.
options(prompt = "R> ")

### Write down what package versions work with your R code, and
### attempt to download and load those packages. The first argument is
### the version of R that you used, e.g. "3.0.2" and then the rest of
### the arguments are package versions. For
### CRAN/Bioconductor/R-Forge/etc packages, write
### e.g. RColorBrewer = "1.0.5" and if RColorBrewer is not installed
### then we use install.packages to get the most recent version, and
### warn if the installed version is not the indicated version. For
### GitHub packages, write "user/repo@commit"
### e.g. "tdhock/animint@f877163cd181f390de3ef9a38bb8bdd0396d08a4" and
### we use install_github to get it, if necessary.
works_with_R <- function(Rvers, ...) {
  ## Warn, without stopping, when the version we have is not one of
  ## the versions in `ok`. `ok` must be a character vector.
  pkg_ok_have <- function(pkg, ok, have) {
    stopifnot(is.character(ok))
    version.matches <- as.character(have) %in% ok
    if (!version.matches) {
      warning("works with ", pkg, " version ",
              paste(ok, collapse = " or "),
              ", have ", have)
    }
  }
  pkg_ok_have("R", Rvers, getRversion())
  requested <- list(...)
  requested.names <- names(requested)
  if (is.null(requested.names)) {
    ## No argument was named at all, so every entry is treated as a
    ## GitHub specification.
    requested.names <- rep("", length(requested))
  }
  for (idx in seq_along(requested)) {
    spec <- requested[[idx]]
    pkg.name <- requested.names[[idx]]
    if (pkg.name != "") {
      ## Named argument: package from a CRAN-like repository. Install
      ## only if it cannot be attached, then check the version.
      if (!suppressWarnings(require(pkg.name, character.only = TRUE))) {
        install.packages(pkg.name)
      }
      pkg_ok_have(pkg.name, spec, packageVersion(pkg.name))
      library(pkg.name, character.only = TRUE)
    } else {
      ## Unnamed argument: "user/repo@commit" handled by requireGitHub.
      ## suppressWarnings is quieter than quiet.
      if (!suppressWarnings(require(requireGitHub))) {
        ## requireGitHub itself is installed via devtools, which is
        ## installed first if it is missing.
        if (!suppressWarnings(require(devtools))) {
          install.packages("devtools")
          require(devtools)
        }
        install_github("tdhock/requireGitHub")
        require(requireGitHub)
      }
      requireGitHub(spec)
    }
  }
}
# works_with_R(
#   "4.1.2", 
#   data.table = "1.14.2", 
#   namedCapture = "2020.4.1", 
#   directlabels = "2021.1.13", 
#   geometry = "0.4.5", 
#   penaltyLearning = "2020.5.13", 
#   ggplot2 = "3.3.5", 
#   future.apply = "1.8.1", 
#   PeakSegOptimal = "2018.5.25", 
#   PeakSegDisk = "2020.8.13", 
#   tikzDevice = "0.12.3.1")

library("data.table")
library("namedCapture")
library("directlabels")
library("geometry")
library("penaltyLearning")
library("ggplot2")
library("future.apply")
library("PeakSegOptimal")
library("PeakSegDisk")
library("tikzDevice")


###################################################
### Help functions for Figures
###################################################
library("grid")
## to use jss.cls with tikzDevice, need to copy it to a place which is
## findable from a temp dir (which is where it does the metrics
## computation), so here we copy the files to the user-specific
## texmf/tex directory.
# tex.dir <- file.path("~", "texmf", "tex")
# dir.create(tex.dir, showWarnings = FALSE, recursive = TRUE)
# from.vec <- c("jss.cls", "jsslogo.jpg")
# to.vec <- file.path(tex.dir, from.vec)
# file.copy(from.vec, to.vec)
# options(
#   tikzDocumentDeclaration = paste(
#     ##"\\documentclass[article, 11pt]{jss}", 
#     "\\documentclass[11pt]{article}", 
#     "\\usepackage{amsmath, amssymb, amsthm}"), 
#   tikzMetricsDictionary = "tikzMetricsJSS")
#' Text labels drawn on a filled box
#'
#' Layer constructor that pairs with the `GeomL` ggproto object.
#'
#' @param mapping,data,stat,position,show.legend,inherit.aes Passed on
#'   to `layer()` unchanged.
#' @param ... Extra entries appended to the layer's `params` list.
#' @param parse Passed on as the `parse` layer parameter.
#' @param nudge_x,nudge_y When either is supplied, `position` is
#'   replaced by `position_nudge(nudge_x, nudge_y)`. Supplying a nudge
#'   together with an explicit `position` is an error.
#' @param label.padding Amount of padding around label. Defaults to
#'   0.25 lines.
#' @param label.r Radius of rounded corners. Defaults to 0.15 lines.
#' @param label.size Passed on as the `label.size` layer parameter.
#' @param na.rm Passed on as the `na.rm` layer parameter.
#' @export
geom_l <- function(mapping = NULL, data = NULL,
                   stat = "identity", position = "identity",
                   ...,
                   parse = FALSE,
                   nudge_x = 0,
                   nudge_y = 0,
                   label.padding = unit(0.25, "lines"),
                   label.r = unit(0.15, "lines"),
                   label.size = 0.25,
                   na.rm = FALSE,
                   show.legend = NA,
                   inherit.aes = TRUE) {
  nudge.supplied <- !(missing(nudge_x) && missing(nudge_y))
  if (nudge.supplied && !missing(position)) {
    stop("Specify either `position` or `nudge_x`/`nudge_y`", call. = FALSE)
  }
  if (nudge.supplied) {
    position <- position_nudge(nudge_x, nudge_y)
  }
  layer(
    geom = GeomL,
    stat = stat,
    data = data,
    mapping = mapping,
    position = position,
    inherit.aes = inherit.aes,
    show.legend = show.legend,
    params = list(
      parse = parse,
      label.padding = label.padding,
      label.r = label.r,
      label.size = label.size,
      na.rm = na.rm,
      ...
    )
  )
}
#' ggproto object that draws the labels for `geom_l()`
#'
#' Requires the `x`, `y` and `label` aesthetics. Each row of the layer
#' data becomes one `lGrob()`: text on a filled box whose border is
#' drawn white with zero line width. The `label.size` parameter is
#' accepted but is not used by `draw_panel`.
#'
#' @format NULL
#' @usage NULL
#' @export
GeomL <- ggproto("GeomL", Geom,
                 required_aes = c("x", "y", "label"),
                 default_aes = aes(
                   colour = "black", fill = "white", size = 3.88, angle = 0,
                   hjust = 0.5, vjust = 0.5, alpha = NA, family = "", fontface = 1,
                   lineheight = 1.2
                 ),
                 draw_panel = function(self, data, panel_params, coord, parse = FALSE,
                                       na.rm = FALSE,
                                       label.padding = unit(0.25, "lines"),
                                       label.r = unit(0.15, "lines"),
                                       label.size = 0.25) {
                   lab <- data$label
                   if (parse) {
                     lab <- parse(text = as.character(lab))
                   }
                   data <- coord$transform(data, panel_params)
                   ## compute_just is internal to ggplot2 (not exported), so it
                   ## must be reached with ':::' like ggname and draw_key_label
                   ## below; an unqualified call fails with "could not find
                   ## function" as soon as hjust/vjust is given as a string.
                   if (is.character(data$vjust)) {
                     data$vjust <- ggplot2:::compute_just(data$vjust, data$y)
                   }
                   if (is.character(data$hjust)) {
                     data$hjust <- ggplot2:::compute_just(data$hjust, data$x)
                   }
                   ## seq_len() yields an empty sequence for zero rows, whereas
                   ## 1:nrow(data) would yield c(1, 0).
                   grobs <- lapply(seq_len(nrow(data)), function(i) {
                     row <- data[i, , drop = FALSE]
                     lGrob(lab[i],
                           x = unit(row$x, "native"),
                           y = unit(row$y, "native"),
                           just = c(row$hjust, row$vjust),
                           padding = label.padding,
                           r = label.r,
                           text.gp = gpar(
                             col = row$colour,
                             fontsize = row$size * .pt,
                             fontfamily = row$family,
                             fontface = row$fontface,
                             lineheight = row$lineheight
                           ),
                           rect.gp = gpar(
                             col = "white",
                             fill = alpha(row$fill, row$alpha),
                             lwd = 0
                           )
                     )
                   })
                   class(grobs) <- "gList"
                   ggplot2:::ggname("geom_l", grobTree(children = grobs))
                 },
                 draw_key = ggplot2:::draw_key_label
)
## Build the (not yet drawn) label grob used by GeomL.
##
## label: a single label (length 1 is enforced).
## x, y: position; plain numbers are converted to units of
##   `default.units`.
## just, padding, r, text.gp, rect.gp, name, vp: stored on the gTree
##   as-is.
##
## Returns a gTree of class "lgrob". The class has to match the
## makeContent.lgrob method defined in this file: with the previous
## class "labelgrob" that method was never dispatched, and drawing
## depended on ggplot2 providing its own method for "labelgrob".
lGrob <- function(label, x = unit(0.5, "npc"), y = unit(0.5, "npc"),
                  just = "center", padding = unit(0.25, "lines"), r = unit(0.1, "snpc"),
                  default.units = "npc", name = NULL,
                  text.gp = gpar(), rect.gp = gpar(fill = "white"), vp = NULL) {
  stopifnot(length(label) == 1)
  if (!is.unit(x)) {
    x <- unit(x, default.units)
  }
  if (!is.unit(y)) {
    y <- unit(y, default.units)
  }
  gTree(label = label, x = x, y = y, just = just, padding = padding, r = r,
        name = name, text.gp = text.gp, rect.gp = rect.gp, vp = vp, cl = "lgrob")
}
#' @export
makeContent.lgrob <- function(x) {
  ## Resolve the stored justification to horizontal and vertical parts.
  h.just <- resolveHJust(x$just, NULL)
  v.just <- resolveVJust(x$just, NULL)
  just.pair <- c(h.just, v.just)
  pad <- x$padding
  ## The text is shifted by up to one padding width inside the box,
  ## depending on the justification.
  text.x <- x$x + 2 * (0.5 - h.just) * pad
  text.y <- x$y + 2 * (0.5 - v.just) * pad
  label.text <- textGrob(
    x$label, text.x, text.y,
    just = just.pair,
    gp = x$text.gp,
    name = "text"
  )
  ## The box is the text extent plus padding on every side.
  box.width <- grobWidth(label.text) + 2 * pad
  box.height <- grobHeight(label.text) + 2 * pad
  box <- roundrectGrob(
    x$x, x$y,
    default.units = "native",
    width = box.width,
    height = box.height,
    just = just.pair,
    r = x$r,
    gp = x$rect.gp,
    name = "box"
  )
  ## Box first, so that the text is drawn on top of it.
  setChildren(x, gList(box, label.text))
}


###################################################
### code chunk number 2: loadData
###################################################

## Load the example data set shipped with PeakSegDisk and auto-print
## its coverage element.
library("PeakSegDisk")
data(Mono27ac, package = "PeakSegDisk")
Mono27ac$coverage


###################################################
### code chunk number 3: saveData
###################################################

## Write the coverage table as a tab-separated file without header,
## row names or quotes, inside a directory named after the genomic
## region. NOTE(review): the directory name contains ':', which may
## not be a valid path component on every file system.
data.dir <- file.path("Mono27ac", "chr11:60000-580000")
dir.create(data.dir, showWarnings = FALSE, recursive = TRUE)
write.table(Mono27ac$coverage, file.path(data.dir, "coverage.bedGraph"), 
            col.names = FALSE, row.names = FALSE, quote = FALSE, sep = "\t")


###################################################
### code chunk number 4: problemPeakSegFPOP
###################################################

## Run the solver on the directory written above. The second argument
## is given as a character string; presumably it is the penalty --
## confirm against the PeakSegDisk documentation.
fit <- PeakSegDisk::PeakSegFPOP_dir(data.dir, "10000")


###################################################
### code chunk number 5: fitLoss
###################################################

## Auto-prints the summary of the fit.
summary(fit)
# fit$loss


###################################################
### code chunk: Figure 3
###################################################

## Coverage drawn as a grey step function with the fitted segment
## means as green horizontal segments, zoomed to x in [2e5, 3e5].
library("ggplot2")
gg <- ggplot() + theme_bw() + 
  geom_step(aes(chromStart, count), color = "grey50", data = Mono27ac$coverage) + 
  geom_segment(aes(chromStart, mean, xend = chromEnd, yend = mean), 
               color = "green", size = 1, data = fit$segments) + 
  coord_cartesian(xlim = c(2e5, 3e5))
print(gg)


###################################################
### code chunk: Figure 4
###################################################

## Same kind of fit, but starting from the in-memory data frame; `fit`
## is overwritten. The class is auto-printed, then the object's plot
## method is used to build the figure.
fit <- PeakSegDisk::PeakSegFPOP_df(Mono27ac$coverage, 999.9)
class(fit)

gg <- plot(fit)
print(gg)


###################################################
### code chunk: Figure 5
###################################################

## The Figure 4 plot again, zoomed to x in [205000, 210000].
print(gg + ggplot2::coord_cartesian(xlim = c(205000, 210000)))


###################################################
### code chunk number 7: seq-search
###################################################

## Sequential search on the data directory; the second argument 17L is
## presumably the desired number of peaks -- confirm against the
## PeakSegDisk documentation. `fit` is overwritten again.
fit <- PeakSegDisk::sequentialSearch_dir(data.dir, 17L)


###################################################
### code chunk number 8: jss-paper.Rnw:1128 - 1131
###################################################

## Auto-prints selected columns of the `others` table of the result.
fit$others[, list(iteration, under, over, penalty, peaks, total.loss)]

##############################################################
###  Figures 6 (concave G function we maximize to find the model with P
###  peaks) and 15 (time complexity of computing sqrt N peaks).
##############################################################
## Benchmark results, read from files expected in the working directory.
bench.models <- fread("jss.bench.models.csv")
bench.models[, gigabytes := megabytes/1024]
inf.evaluations <- readRDS("jss.evaluations.rds")
## Keep only the evaluations with a finite penalty.
jss.evaluations <- inf.evaluations[others.penalty != Inf]
## Auto-prints the rows (if any) where the two peak counts disagree.
jss.evaluations[peaks != loss.peaks, list(bedGraph.lines, peaks, loss.peaks)]
## The data set with 66031 bedGraph lines illustrates the search.
not.found <- inf.evaluations[bedGraph.lines == 66031]
## An infinite penalty cannot be drawn, so it is placed at x = inf.pen.
inf.pen <- 25000
not.found[, x := ifelse(others.penalty == Inf, inf.pen, others.penalty)]
not.found[, y := ifelse(
  others.penalty == Inf,
  others.total.loss - inf.pen*peaks,
  others.total.loss + (others.peaks - peaks)*others.penalty)]
## Only iterations up to biggest.it get a text label.
biggest.it <- 4
it.text <- not.found[others.iteration <= biggest.it]
it.points <- not.found[biggest.it < others.iteration]
## Hand-tuned label justification, one value per row of it.text (the
## vectors below assume it.text has exactly five rows).
o <- 0.05
it.text[, vjust := c(0, 1, 0.5, 1.2, 0.9)]
it.text[, hjust := c(0 - o, 1 + o, 0 - o, 0 - o, 0 - o)]
## An earlier draft of this plot was assigned to `gg` here and then
## overwritten without ever being printed; it has been removed. The
## value of `gg` after this section is unchanged.
gg <- ggplot() +
  ggtitle("All 13 iterations") +
  theme_bw() +
  coord_cartesian(
    xlim = c(-1000, inf.pen),
    ylim = c(-1600000, -10000),
    expand = FALSE) +
  scale_x_continuous(
    "Penalty $\\lambda$",
    breaks = seq(0, 20000, by = 5000)) +
  ylab("$G(\\lambda) = F(\\lambda) - \\lambda P^*$") +
  ## One grey line per evaluated penalty.
  geom_abline(aes(
    slope = others.peaks - peaks,
    intercept = others.total.loss),
    color = "grey50",
    data = not.found) +
  geom_l(aes(
    x, y, hjust = hjust, vjust = vjust,
    label = sprintf("it = %d, $P = %d$", others.iteration, others.peaks)),
    size = 3,
    alpha = 0.7,
    color = "red",
    data = it.text) +
  geom_point(aes(
    x, y),
    shape = 1,
    color = "red",
    data = not.found)

gg

# tikz("jss-figure-evaluations-concave.tex", width = 3, height = 2)
# print(gg)
# dev.off()

## Zoomed version of the previous figure: only the iterations after
## biggest.it are drawn.
biggest.it <- 8
it.text <- not.found[others.iteration <= biggest.it]
it.points <- not.found[biggest.it < others.iteration]
## Hand-tuned label justification, one value per row of it.points (the
## vectors below assume it.points has exactly five rows).
o <- 0.05
it.points[, hjust := c(0 - o, 1 + o, 0 - o, 1 + o, 0 - o)]
it.points[, vjust := c(1 + o, 0 - o, 1 + o, 1 + o, 0 - o)]
## Two earlier drafts of this plot (geom_text only, then geom_label
## plus geom_text) were assigned to `gg.zoom` here and overwritten
## without being printed; they have been removed. The value of
## `gg.zoom` after this section is unchanged.
gg.zoom <- ggplot() +
  ggtitle(sprintf(
    "Zoom to iterations %d--%d",
    min(it.points$others.iteration),
    max(it.points$others.iteration)
  )) +
  xlab("Penalty $\\lambda$") +
  scale_y_continuous(
    "$G(\\lambda) = F(\\lambda) - \\lambda P^*$",
    limits = c(NA, -71830)
  ) +
  theme_bw() +
  geom_abline(aes(
    slope = others.peaks - peaks,
    intercept = others.total.loss),
    color = "grey50",
    data = it.points) +
  geom_l(aes(
    others.penalty,
    others.total.loss + others.peaks*others.penalty - peaks*others.penalty,
    vjust = vjust,
    hjust = hjust,
    label = sprintf(
      "it = %d, $P = %d$", others.iteration, others.peaks)),
    size = 3,
    alpha = 0.7,
    color = "red",
    data = it.points) +
  geom_point(aes(
    others.penalty,
    others.total.loss + others.peaks*others.penalty - peaks*others.penalty),
    shape = 1,
    color = "red",
    data = it.points)

gg.zoom

# tikz("jss-figure-evaluations-concave-zoom.tex", width = 3, height = 2)
# print(gg.zoom)
# dev.off()

## Convert to minutes / gigabytes and reshape to one row per
## (evaluation, resource).
jss.evaluations[, others.minutes := others.seconds/60]
jss.evaluations[, others.gigabytes := others.megabytes/1024]
others.tall <- melt(
  jss.evaluations,
  measure.vars = c("others.minutes", "others.gigabytes"))
## fixed = TRUE: strip the literal prefix "others." (as a regular
## expression the "." would match any character).
others.tall[, var := sub("others.", "", variable, fixed = TRUE)]
## Per-problem summaries; OP is the number of rows in each group.
prob.stats <- others.tall[, list(
  OP = .N,
  sum = sum(value),
  median = median(value),
  max = max(value),
  q95 = quantile(value, 0.95),
  q05 = quantile(value, 0.05)
), by = list(var, bedGraph.lines, segments = peaks*2 + 1, peaks)]
## Summaries by target data size.
target.stats <- others.tall[, list(
  sum = sum(value),
  median = median(value),
  q95 = quantile(value, 0.95),
  q05 = quantile(value, 0.05)
), by = list(var, target.N)]
## "find model" is the total minutes and the maximum gigabytes per
## problem; "solve one" is every individual evaluation.
both.points <- rbind(
  prob.stats[var == "minutes", data.table(
    bedGraph.lines, value = sum, var, algorithm = "find model")],
  prob.stats[var == "gigabytes", data.table(
    bedGraph.lines, value = max, var, algorithm = "find model")],
  others.tall[, data.table(bedGraph.lines, value, var, algorithm = "solve one")])
algo.key <- c(
  peaks = "O(sqrt N) peaks\nin zero-error model",
  SN = "Segment\nNeighborhood\nO(sqrt N)\niterations",
  OP = "Optimal\nPartitioning\nO(log N)\niterations")
abbrev.colors <- c(
  "#E41A1C", #red
  "#377EB8", #blue
  OP = "#4DAF4A", #green
  "#984EA3", #purple
  "#FF7F00", #orange
  "#FFFF33", #yellow
  "#A65628", #brown
  "#F781BF", #pink
  SN = "#999999")#grey
op.color <- abbrev.colors[["OP"]]
## Rename the colours by display label; unnamed colours get NA names.
algo.colors <- structure(abbrev.colors, names = algo.key[names(abbrev.colors)])
## Shorthand for one row of hand-placed annotation text.
d <- function(bedGraph.lines, y, algorithm, var, hjust, vjust) {
  data.table(bedGraph.lines, y, algorithm, var, hjust, vjust)
}
## Hand-placed annotation positions. A text.dt computed from
## both.points used to be assigned just before this and was
## overwritten without being used; it has been removed.
text.dt <- rbind(
  d(3e3, 3e3, "find model", "gigabytes", 0, 1),
  d(1e7, 0.002, "solve one", "gigabytes", 1, 0),
  d(3e3, 500, "find model", "minutes", 0, 1),
  d(1e7, 0.01, "solve one", "minutes", 1, 0))
gg <- ggplot() +
  theme_bw() +
  theme(panel.spacing = grid::unit(0, "lines")) +
  facet_grid(var ~ ., scales = "free") +
  geom_text(aes(
    bedGraph.lines, y, hjust = hjust, vjust = vjust, label = {
      l <- ifelse(
        algorithm == "solve one",
        "Solve for one penalty\nO(N log N) time",
        "Find zero-error model\nwith O(sqrt N) peaks\nO(N(logN)^2) time")
      ## The gigabytes panel talks about space instead of time.
      ifelse(var == "gigabytes", sub("time", "space", l), l)
    }),
    size = 3,
    color = op.color,
    data = text.dt) +
  geom_ribbon(aes(
    target.N, ymin = q05, ymax = q95),
    alpha = 0.5,
    fill = op.color,
    data = target.stats) +
  geom_line(aes(
    target.N, median),
    size = 1,
    color = op.color,
    data = target.stats) +
  geom_point(aes(
    bedGraph.lines, value),
    shape = 1,
    color = op.color,
    data = both.points[algorithm == "find model"]) +
  scale_x_log10(
    "N = data to segment (log scale)"
  ) +
  scale_y_log10(
    "Computational resources to find
zero-error model with O(sqrt N) peaks
via Optimal Partitioning (log scales)",
    labels = paste)

gg

# pdf("jss-figure-evaluations-computation.pdf", 3.5, 3)
# print(gg)
# dev.off()

## Number of dynamic programming iterations per problem: OP is the
## observed count from prob.stats, SN is segments - 1.
evals.dt <- prob.stats[var == "minutes"]
evals.dt[, SN := segments - 1]
evals.tall <- melt(
  evals.dt,
  measure.vars = c(
    "SN",
    ##"peaks",
    "OP"
  ),
  variable.name = "algo",
  value.name = "evaluations")

evals.tall[, algorithm := algo.key[paste(algo)] ]
## Grid of data sizes for the reference curves. The argument is
## spelled out as length.out instead of relying on partial matching
## of `l`.
N.data <- 10^seq(4, 7, length.out = 100)
fun.list <- list(
  N = identity,
  "log(N)" = log,
  "loglog(N)" = function(x)log(log(x)),
  "sqrt(N)" = sqrt)

## For each algorithm: the y value at which every reference curve
## starts, and which curves to draw.
ref.line.list <- list(
  OP = list(y = 9, lines = c("log(N)", "sqrt(N)", "loglog(N)")),
  SN = list(y = 60, lines = c("N", "log(N)", "sqrt(N)")))
ref.tall.list <- list()

for (ref.name in names(ref.line.list)) {
  ref.info <- ref.line.list[[ref.name]]
  for (fun.name in ref.info$lines) {
    fun <- fun.list[[fun.name]]
    ## Rescale so that the curve equals ref.info$y at the smallest N.
    first.y <- fun(min(N.data))
    ref.tall.list[[paste(fun.name, ref.name)]] <- data.table(
      N.data,
      ref.name,
      fun.name,
      value = fun(N.data)/first.y*ref.info$y)
  }
}
ref.tall <- do.call(rbind, ref.tall.list)
leg <- ggplot() +
  theme_bw() +
  scale_color_manual(values = algo.colors) +
  geom_point(aes(
    bedGraph.lines, evaluations, color = algorithm),
    data = evals.tall) +
  scale_x_log10(
    "N = data to segment (log scale)",
    limits = c(NA, 10^8.5),
    breaks = 10^seq(4, 7, by = 1)
  ) +
  scale_y_log10(
    "Number of O(N log N) dynamic
programming iterations to find
model with O(sqrt N) peaks (log scale)",
    limits = c(NA, 2000)
  )
## Positioning method list handed to direct.label().
m <- list(
  cex = 0.75,
  dl.trans(x = x + 0.1),
  "last.points", "calc.boxes",
  "reduce.cex.lr",
  function(d, ...) {
    d$h <- d$h * 1.5
    d
  },
  "calc.borders",
  qp.labels("y", "bottom", "top", make.tiebreaker("x", "y"), ylimits),
  "calc.borders")
dl <- direct.label(leg, m)
## The directly labelled plot with the reference curves and their
## names added at the largest N. It is assigned but not printed in
## this section.
dl.ref <- dl +
  geom_line(aes(
    N.data, value, group = paste(ref.name, fun.name)),
    data = ref.tall) +
  geom_text(aes(
    N.data, value, label = fun.name),
    hjust = 0,
    data = ref.tall[N.data == max(N.data)])
## LaTeX (tikz) version of the computation-time figure.
##
## This section used to start with an earlier draft that assigned
## algo.key, abbrev.colors, op.color, algo.colors, text.dt, d and gg.
## Every one of them was reassigned below before being used, and that
## gg was never printed, so the draft has been removed. A text.dt
## computed from both.points and overwritten straight away has been
## removed too. The objects left after this section are unchanged.
algo.key <- c(
  peaks = "$O(\\sqrt N)$ peaks\nin zero-error model",
  SN = "Segment\nNeighborhood\nGPDPA\n$O(\\sqrt N)$\niterations",
  OP = "Optimal\nPartitioning\nGFPOP\n$O(\\log N)$\niterations")
abbrev.colors <- c(
  "#E41A1C", # red
  "#377EB8", # blue
  OP = "#4DAF4A", # green
  "#984EA3", # purple
  "#FF7F00", # orange
  "#FFFF33", # yellow
  "#A65628", # brown
  "#F781BF", # pink
  SN = "#999999") # grey
op.color <- abbrev.colors[["OP"]]
## Rename the colours by display label; unnamed colours get NA names.
algo.colors <- structure(abbrev.colors, names = algo.key[names(abbrev.colors)])
## Shorthand for one row of hand-placed annotation text.
d <- function(bedGraph.lines, y, algorithm, var, hjust, vjust) {
  data.table(bedGraph.lines, y, algorithm, var, hjust, vjust)
}
## Hand-placed annotation positions; only the "minutes" rows are
## drawn below.
text.dt <- rbind(
  d(3e3, 1e5, "find model", "gigabytes", 0, 1),
  d(1e7, 0.0005, "solve one", "gigabytes", 1, 0),
  d(3e3, 300, "find model", "minutes", 0, 1),
  d(1e7, 0.03, "solve one", "minutes", 1, 0))
## Powers of ten for the x axis, labelled with LaTeX math.
exp.vec <- seq(4, 7, by = 1)
breaks.vec <- 10^exp.vec
labels.vec <- sprintf("$10^%d$", exp.vec)
gg <- ggplot() +
  theme_bw() +
  theme(panel.spacing = grid::unit(0, "lines")) +
  geom_text(aes(
    bedGraph.lines, y, hjust = hjust, vjust = vjust, label = {
      l <- ifelse(
        algorithm == "solve one",
        "Solve for one penalty\n$O(N \\log N)$ time",
        "Sequential search for model\nwith $O(\\sqrt N)$ peaks\n$O(N(\\log N)^2)$ time")
      ifelse(var == "gigabytes", sub("time", "space", l), l)
    }),
    size = 3,
    color = op.color,
    data = text.dt[var == "minutes"]) +
  geom_ribbon(aes(
    target.N, ymin = q05, ymax = q95),
    alpha = 0.5,
    fill = op.color,
    data = target.stats[var == "minutes"]) +
  geom_line(aes(
    target.N, median),
    size = 1,
    color = op.color,
    data = target.stats[var == "minutes"]) +
  geom_point(aes(
    bedGraph.lines, sum),
    shape = 1,
    color = op.color,
    data = prob.stats[var == "minutes"]) +
  scale_x_log10(
    "$N$ = data to segment
(log scale)",
    breaks = breaks.vec,
    labels = labels.vec
  ) +
  scale_y_log10(
    "Time (minutes) to compute
     model with $O(\\sqrt N)$ peaks
     via GFPOP (log scale)",
    breaks = 10^seq(-1, 2),
    labels = paste)

gg

# tikz("jss-figure-evaluations-computation.tex", width = 3.1, height = 2.6)
# print(gg)
# dev.off()

## Rebuild the iteration counts so that the `algorithm` labels come
## from the algo.key in effect at this point: OP is the count taken
## from prob.stats, SN is segments - 1.
evals.dt <- prob.stats[var == "minutes"]
evals.dt[, SN := segments - 1]
evals.tall <- melt(
  evals.dt, 
  measure.vars = c(
    "SN", 
    ##"peaks", 
    "OP"
  ), 
  variable.name = "algo", 
  value.name = "evaluations")
evals.tall[, algorithm := algo.key[paste(algo)] ]
## Scatter plot of iterations against data size, both on log scales;
## colours are looked up in algo.colors by label.
leg <- ggplot() + 
  theme_bw() + 
  scale_color_manual(values = algo.colors) + 
  geom_point(aes(
    bedGraph.lines, evaluations, color = algorithm), 
    data = evals.tall) + 
  scale_x_log10(
    "$N$ = data to segment
(log scale)", 
    limits = c(NA, 10^8.5), 
    breaks = breaks.vec, 
    labels = labels.vec
  ) + 
  scale_y_log10(
    "Number of $O(N \\log N)$
DP iterations to compute
model with $O(\\sqrt N)$
peaks (log scale)", 
    limits = c(NA, 2000)
  )
## Positioning method list handed to direct.label(); the anonymous
## function adds 0.3 to the h column of the label table.
m <- list(
  cex = 0.8, 
  dl.trans(x = x + 0.1), 
  "last.points", "calc.boxes", 
  "reduce.cex.lr", 
  function(d, ...) {
    d$h <- d$h  + 0.3
    d
  }, 
  "calc.borders", 
  qp.labels("y", "bottom", "top", make.tiebreaker("x", "y"), ylimits), 
  "calc.borders")
dl <- direct.label(leg, m)

## Auto-prints the directly labelled plot.
dl

# tikz("jss-figure-evaluations.tex", width = 3.1, height = 2.6)
# print(dl)
# dev.off()


#########################################################
## Figure 7 (PeakSegFPOP can be used to compute more likely models
## than the default MACS2 model).
#########################################################
## Load the coverage counts from an .RData file in chunk.dir. The
## code below uses an object named `counts` straight after load(), so
## the file presumably defines it -- confirm.
chunk.dir <- "jss-figure-more-likely-models"
counts.RData <- file.path(chunk.dir, "counts.RData")
load(counts.RData)
counts.dt <- data.table(counts)
## Sample numbers to analyse; only 104 is active, the others are
## commented out.
sample.num.vec <- c(#101, 
  ## 322, 
  ## 4, 91, 
  104
)
## Sample ids are "McGill" followed by the zero-padded 4-digit number.
sample.id.vec <- sprintf("McGill%04d", sample.num.vec)
some.counts <- counts.dt[sample.id %in% sample.id.vec]
## Default MACS2 peaks: the first element of the object `peaks`, which
## is presumably defined by the loaded file -- confirm.
peaks.RData <- file.path(chunk.dir, "macs.default.RData")
load(peaks.RData)
peaks.dt <- data.table(peaks[[1]])
some.peaks <- peaks.dt[sample.id %in% sample.id.vec]
## Poisson log-likelihood of piecewise-constant segment means on
## weighted coverage data.
##
## data.dt: data.table with columns chromStart, chromEnd and coverage.
## seg.dt.in: table with columns segStart and segEnd, one row per
##   segment.
##
## Returns a one-row data.table with total.logLik (sum over data rows
## of the log Poisson density at the segment mean, times the number
## of bases) and total.weight (total number of bases).
##
## Stops if the segments do not cover exactly dataEnd - dataStart
## bases of the data.
PoissonLogLik <- function(data.dt, seg.dt.in) {
  dataStart <- data.dt$chromStart[1]
  dataEnd <- data.dt[.N, chromEnd]
  ## `:=` and setkey() modify a data.table by reference, so work on
  ## private copies; otherwise the caller's table would gain a
  ## chromStart1 column and be re-sorted as a side effect.
  data.dt <- copy(data.dt)
  seg.dt <- data.table(seg.dt.in)
  ## foverlaps() joins on closed intervals, so shift the starts by one
  ## to turn the half-open (start, end] intervals into [start + 1, end].
  seg.dt[, segStart1 := segStart + 1]
  data.dt[, chromStart1 := chromStart + 1]
  setkey(seg.dt, segStart1, segEnd)
  setkey(data.dt, chromStart1, chromEnd)
  over.dt <- foverlaps(data.dt, seg.dt, nomatch = 0L)
  ## Clip each data interval to the segment it overlaps.
  over.dt[chromStart < segStart, chromStart := segStart]
  over.dt[segEnd < chromEnd, chromEnd := segEnd]
  over.dt[, weight := chromEnd - chromStart]
  stopifnot(dataEnd - dataStart == sum(over.dt$weight))
  ## Weighted mean coverage of each segment.
  seg.means <- over.dt[, list(
    mean = sum(coverage*weight)/sum(weight)
  ), by = list(segStart, segEnd)]
  ## Segments alternate background, peak, background, ...
  seg.means[, status := rep(c("background", "peak"), length.out = .N)]
  data.means <- seg.means[over.dt, on = list(segStart, segEnd)]
  data.means[, list(
    total.logLik = sum(dpois(coverage, mean, log = TRUE)*weight),
    total.weight = sum(weight)
  )]
}
## Results are accumulated in named lists and row-bound after the loop.
segs.list <- list()
loss.list <- list()
logLik.list <- list()
chrom <- "chr11"
## Numbers of peaks for which an optimal model is computed. An
## earlier assignment of 5:1 was overwritten immediately and has been
## removed.
n.peaks.vec <- as.integer(c(5, 3))
## Upper bound on consecutive failed attempts of sequentialSearch_dir.
## Failed runs are still retried, but a persistent failure now stops
## with the underlying error instead of looping forever.
max.tries <- 100L
for (sample.i in seq_along(sample.id.vec)) {
  sample.id <- sample.id.vec[[sample.i]]
  cat(sprintf("%4d / %4d %s\n", sample.i, length(sample.id.vec), sample.id))
  s.dt <- data.table(sample.id)
  sample.counts <- counts.dt[s.dt, on = list(sample.id)]
  sample.peaks <- peaks.dt[s.dt, on = list(sample.id)]
  problemStart <- sample.counts$chromStart[1]
  problemEnd <- sample.counts[.N, chromEnd]
  problem <- data.table(chrom, problemStart, problemEnd)
  ## Write problem.bed and coverage.bedGraph into a directory named
  ## after the sample and the region.
  problem.dir <- file.path(
    "jss-data", sample.id,
    sprintf("%s-%d-%d", chrom, problemStart, problemEnd))
  dir.create(problem.dir, showWarnings = FALSE, recursive = TRUE)
  problem.bed <- file.path(problem.dir, "problem.bed")
  fwrite(problem, problem.bed, sep = "\t", col.names = FALSE)
  coverage.bedGraph <- file.path(problem.dir, "coverage.bedGraph")
  coverage.dt <- sample.counts[, data.table(
    chrom, chromStart, chromEnd, coverage)]
  fwrite(coverage.dt, coverage.bedGraph, sep = "\t", col.names = FALSE)
  ## MACS2 model: the segment limits are the problem start, every
  ## peak start and end in order, and the problem end.
  seg.limit.vec <- c(
    problemStart,
    sample.peaks[, rbind(chromStart, chromEnd)],
    problemEnd)
  n.limits <- length(seg.limit.vec)
  seg.dt <- data.table(
    segStart = seg.limit.vec[-n.limits],
    segEnd = seg.limit.vec[-1])
  ## foverlaps() joins on closed intervals, so shift the starts by one.
  seg.dt[, segStart1 := segStart + 1]
  sample.counts[, chromStart1 := chromStart + 1]
  setkey(seg.dt, segStart1, segEnd)
  setkey(sample.counts, chromStart1, chromEnd)
  over.dt <- foverlaps(sample.counts, seg.dt, nomatch = 0L)
  ## Clip each data interval to the segment it overlaps.
  over.dt[chromStart < segStart, chromStart := segStart]
  over.dt[segEnd < chromEnd, chromEnd := segEnd]
  over.dt[, weight := chromEnd - chromStart]
  stopifnot(problemEnd - problemStart == sum(over.dt$weight))
  seg.means <- over.dt[, list(
    mean = sum(coverage*weight)/sum(weight)
  ), by = list(segStart, segEnd)]
  ## Segments alternate background, peak, background, ...
  seg.means[, status := rep(c("background", "peak"), length.out = .N)]
  data.means <- seg.means[over.dt, on = list(segStart, segEnd)]
  logLik.macs <- data.means[, sum(dpois(coverage, mean, log = TRUE)*weight)]
  ## Number of adjacent segments with exactly equal means.
  equality.constraints <- seg.means[, sum(diff(mean) == 0)]
  total.loss.macs <- data.means[, PeakSegOptimal::PoissonLoss(
    coverage, mean, weight)]
  loss.list[[paste(sample.id, "macs2")]] <- data.table(
    sample.id,
    model = "macs2",
    equality.constraints,
    total.loss = total.loss.macs,
    peaks = nrow(sample.peaks))
  segs.list[[paste(sample.id, "macs2")]] <- data.table(
    sample.id, model = "macs2", seg.means)
  ## Optimal models with the requested numbers of peaks.
  for (n.peaks in n.peaks.vec) {
    better.list <- NULL
    n.tries <- 0L
    while (is.null(better.list)) {
      n.tries <- n.tries + 1L
      better.list <- tryCatch({
        PeakSegDisk::sequentialSearch_dir(problem.dir, n.peaks)
      }, error = function(e) {
        if (max.tries <= n.tries) {
          stop(sprintf(
            "sequentialSearch_dir failed %d times for %s with %d peaks; last error: %s",
            n.tries, problem.dir, n.peaks, conditionMessage(e)),
            call. = FALSE)
        }
        print("trying again")
        NULL
      })
    }
    loss.list[[paste(sample.id, n.peaks)]] <- data.table(
      sample.id,
      model = n.peaks,
      better.list$loss[, .(equality.constraints, total.loss, peaks)])
    segs.list[[paste(sample.id, n.peaks)]] <- data.table(
      sample.id,
      model = n.peaks,
      better.list$segments[, .(
        segStart = chromStart,
        segEnd = chromEnd,
        mean,
        status
      )]
    )
  }
}
## Stack the per-sample, per-model results accumulated in segs.list
## and loss.list into one table each.
segs <- do.call(rbind, segs.list)
loss <- do.call(rbind, loss.list)
## One row per (sample.id, model): the output of PoissonLogLik for the
## segments of that group, plus peaks computed as (.N - 1)/2 where .N
## is the number of segments in the group.
## NOTE(review): sample.counts is not a column of segs, so inside j it
## resolves to the variable left over from the preceding loop --
## confirm that it holds the right coverage data for every sample.id
## group (otherwise every group is scored against the same counts).
logLik <- segs[, {
  data.table(
    PoissonLogLik(sample.counts, .SD), 
    peaks = (.N - 1)/2
  )
}, by = list(sample.id, model)]
## One row per change between consecutive segments (in segStart
## order): position is the start of the later segment, and constraint
## is "equality" when the two adjacent segment means are identical,
## "inequality" otherwise.
changes <- segs[order(segStart), data.table(
  position = segStart[-1], 
  constraint = ifelse(diff(mean) == 0, "equality", "inequality")
), by = list(sample.id, model)]
## Largest coverage value of each sample; joined to logLik so that the
## text labels of the plots below can be placed at that height.
max.dt <- some.counts[, list(
  max = max(coverage)
), by = list(sample.id)]
max.logLik <- logLik[max.dt, on = list(sample.id)]
## Model identifiers in display order: the MACS2 model first, then
## the peak counts in n.peaks.vec.
possible.vec <- c("macs2", n.peaks.vec)
## Facet label of each model; every space is replaced by a newline so
## that each word of the label is on its own line.
lab.vec <- local({
  is.default <- possible.vec == "macs2"
  optimal.lab <- paste0("Optimal ", possible.vec, " - peak model")
  gsub(" ", "\n", ifelse(is.default, "Default MACS2 model", optimal.lab))
})
## Convert model identifiers to a factor with levels possible.vec and
## labels lab.vec (both are looked up when the function is called).
mfactor <- function(val) {
  factor(val, levels = possible.vec, labels = lab.vec)
}
## Add the facet factor, by reference, to every table that is plotted.
max.logLik[, model.fac := mfactor(model)]
segs[, model.fac := mfactor(model)]
changes[, model.fac := mfactor(model)]
## Overview figure with one row of panels per sample and one column
## per model. Positions are divided by 1e3 so the x axis is in kb.
gg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(
    sample.id ~ model.fac, 
    scales = "free", 
    labeller = function(label.df) {
      ## Sample strips lose their "McGill0" prefix; model strips are
      ## converted to plain character labels.
      if (is.element("sample.id", names(label.df))) {
        label.df$sample.id <- sub("McGill0", "", label.df$sample.id)
      } else {
        label.df$model.fac <- paste(label.df$model.fac)
      }
      label.df
    }) + 
  xlab("position on chromosome (kb)") + 
  ylab("aligned read coverage") + 
  ## Log-likelihood and peak count, anchored at the maximum coverage
  ## of the sample.
  geom_text(
    data = max.logLik, 
    mapping = aes(
      x = 118120, 
      y = max, 
      label = sprintf(
        "logLik = %.1f\n%d peak%s", 
        total.logLik, peaks, ifelse(peaks == 1, "", "s"))), 
    color = "deepskyblue", 
    hjust = 1, 
    vjust = 1) + 
  ## Coverage profile.
  geom_step(
    data = some.counts, 
    mapping = aes(x = chromEnd/1e3, y = coverage), 
    color = "grey") + 
  ## Peak segments drawn along y = 0, with a circle at each start.
  geom_segment(
    data = segs[status == "peak"], 
    mapping = aes(
      x = segStart/1e3, y = 0, 
      xend = segEnd/1e3, yend = 0), 
    color = "deepskyblue", 
    size = 1) + 
  geom_point(
    data = segs[status  ==  "peak"], 
    mapping = aes(x = segStart/1e3, y = 0), 
    color = "deepskyblue", 
    shape = 1) + 
  ## Mean of every segment of every model.
  geom_segment(
    data = segs, 
    mapping = aes(
      x = segStart/1e3, y = mean, 
      xend = segEnd/1e3, yend = mean), 
    color = "green", 
    size = 0.5)
## Keep only the rows of dt that belong to sample McGill0322.
one <- function(dt) {
  dt[sample.id == "McGill0322"]
}
## Single-sample figure with one panel per model and positions in
## bases. The start and end position of every peak are written next to
## its segment.
gg <- ggplot() + 
  geom_text(aes(
    118120000, max, label = sprintf(
      "logLik = %.1f\n%d peak%s", 
      total.logLik, peaks, ifelse(peaks == 1, "", "s"))), 
    hjust = 1, 
    vjust = 1, 
    color = "deepskyblue", 
    data = one(max.logLik)) + 
  geom_step(aes(
    chromEnd, coverage), 
    color = "grey", 
    data = one(some.counts)) + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(model.fac ~ .) + 
  ## Peak segments along y = 0, with a circle at each peak start.
  geom_segment(aes(
    segStart, 0, 
    xend = segEnd, yend = 0), 
    data = one(segs[status == "peak"]), 
    color = "deepskyblue", 
    size = 1) + 
  geom_point(aes(
    segStart, 0), 
    data = one(segs[status == "peak"]), 
    shape = 1, 
    color = "deepskyblue") + 
  ## Segment means.
  geom_segment(aes(
    segStart, mean, 
    xend = segEnd, yend = mean), 
    color = "green", 
    size = 1, 
    data = one(segs)) + 
  ## Peak boundaries as text. big.mark is the thousands separator, so
  ## it must be "," (a separator of ", " prints "118, 120, 000").
  geom_text(aes(
    segStart, mean, label = format(segStart, big.mark = ",")), 
    color = "green", 
    hjust = 1, 
    data = one(segs[status == "peak"])) + 
  geom_text(aes(
    segEnd, mean, label = format(segEnd, big.mark = ",")), 
    color = "green", 
    hjust = 0, 
    data = one(segs[status == "peak"])) + 
  xlab("position on chromosome") + 
  ylab("aligned read coverage")
## Keep only the rows of dt that belong to sample McGill0104.
one <- function(dt) {
  dt[sample.id == "McGill0104"]
}
## x limits (in bases) of the zoomed version of the figure.
xmin <- 118122000
xmax <- 118124000
## Single-sample figure with one panel per model: coverage, peaks,
## segment means, and a vertical line at every changepoint whose
## linetype encodes the constraint (equality or inequality).
gg <- ggplot() + 
  geom_text(aes(
    118120000, max, label = sprintf(
      "logLik = %.1f\n%d peak%s", 
      total.logLik, peaks, ifelse(peaks == 1, "", "s"))), 
    hjust = 1, 
    vjust = 1, 
    color = "deepskyblue", 
    data = one(max.logLik)) + 
  geom_step(aes(
    chromEnd, coverage), 
    color = "grey", 
    data = one(some.counts)) + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(model.fac ~ .) + 
  ## Peak segments along y = 0, with a circle at each peak start.
  geom_segment(aes(
    segStart, 0, 
    xend = segEnd, yend = 0), 
    data = one(segs[status == "peak"]), 
    color = "deepskyblue", 
    size = 1) + 
  geom_point(aes(
    segStart, 0), 
    data = one(segs[status == "peak"]), 
    shape = 1, 
    color = "deepskyblue") + 
  ## Segment means.
  geom_segment(aes(
    segStart, mean, 
    xend = segEnd, yend = mean), 
    color = "green", 
    alpha = 0.5, 
    size = 1, 
    data = one(segs)) + 
  xlab("position on chromosome") + 
  scale_linetype_manual(values = c(
    equality = "solid", 
    inequality = "dotted")) + 
  ## Changepoints. The table is filtered with one() like every other
  ## layer; the unfiltered table would also draw the changepoints of
  ## the other samples in these panels.
  geom_vline(aes(
    xintercept = position, 
    linetype = constraint), 
    data = one(changes), 
    alpha = 0.5, 
    size = 0.4, 
    color = "green") + 
  ylab("aligned read coverage")

gg
 
# png("jss-figure-more-likely-models-three-peaks.png", 
#    units = "in", res = 300, width = 4, height = 3)
# print(gg + guides(linetype = "none"))
# dev.off()

## Zoomed copy of the previous figure: the view is restricted to
## [xmin, xmax] without dropping data, x axis breaks are removed, and
## the legend moves below the panels with smaller text.
gg.zoom <- gg + 
  ggtitle("Zoom to right peak") + 
  coord_cartesian(xlim = c(xmin, xmax)) + 
  scale_x_continuous(breaks = NULL) + 
  theme(
    text = element_text(size = 8), 
    legend.position = "bottom")

gg.zoom

# png("jss-figure-more-likely-models-three-peaks-zoom.png", 
#    units = "in", res = 300, width = 2.5, height = 3)
# print(gg.zoom)
# dev.off()


##################################################################
## Figure 8 (up-down constrained changepoint model is robust to
## spatial correlation).
## Load data set with one row for every genomic region with a
## unique aligned read, and compute mean read size in bases.
#################################################################
## Load ChIPreads into the current environment. The code below uses
## its columns experiment, chrom, chromStart and chromEnd.
data("ChIPreads", envir = environment())
## Per experiment: mean and median of chromEnd - chromStart, and the
## smallest chromStart.
experiments <- ChIPreads[, {
  read.bases <- chromEnd - chromStart
  list(
    mean.bases = mean(read.bases), 
    median.bases = median(read.bases), 
    chromStart = min(chromStart))
}, by = "experiment"]
## Number of rows of ChIPreads per distinct read end position.
## (Original note: this ignores dup reads, sum(count) would not.)
end.counts <- ChIPreads[, .(
  count = .N
), by = .(experiment, chrom, chromEnd)]
## Two representations of the aligned reads, stacked in one table:
## "each" counts a read at every base it covers (count 1 per row,
## ignoring duplicate reads), "end" counts it only at its last base.
aligned.dt <- rbind(
  ChIPreads[, list(
    bases.counted = "each", 
    experiment, 
    chrom, 
    chromStart, 
    chromEnd, 
    count = 1)], 
  end.counts[, list(
    bases.counted = "end", 
    experiment, 
    chrom, 
    chromStart = chromEnd - 1L, 
    chromEnd, 
    count)])
## Compute the count profile of each (bases.counted, experiment,
## chrom) group as a run-length table: one row per interval
## (chromStart, chromEnd] on which the count is constant.
seq.dt <- aligned.dt[, {
  ## Every input row adds count at its first base (chromStart + 1)
  ## and removes it just after its last base (chromEnd + 1).
  event.dt <- rbind(
    data.table(count, pos = chromStart + 1L), 
    data.table(count = -count, pos = chromEnd + 1L))
  ## 100 evenly spaced positions from the first to the last event,
  ## truncated to integers; they delimit the bins numbered by bin.i.
  ## (length.out is spelled out rather than partially matched by l.)
  edge.vec <- event.dt[, {
    as.integer(seq(min(pos), max(pos), length.out = 100))
  }]
  ## Zero-count events at the bin edges guarantee that every edge is
  ## a row of total.dt without changing the running sum.
  event.bins <- rbind(
    event.dt, 
    data.table(count = 0L, pos = edge.vec))
  total.dt <- event.bins[, .(
    count = sum(count)
  ), by = list(pos)][order(pos)]
  total.dt[, cum := cumsum(count)]
  total.dt[, bin.i := cumsum(pos %in% edge.vec)]
  ## it is somewhat confusing because total.dt pos is the first base
  ## with cum, and cum goes all the way up to but not including the
  ## pos of the next row.
  total.dt[, data.table(
    chromStart = pos[-.N] - 1L, 
    chromEnd = pos[-1] - 1L, 
    count = cum[-.N], 
    bin.i = bin.i[-.N])]
}, by = list(bases.counted, experiment, chrom)]
## Base figure: the exact count profile, with one row of panels per
## counting method and one column per experiment. The color scale
## already lists the data types of the layers added later.
gg.data <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(
    bases.counted ~ experiment, 
    labeller = label_both, 
    scales = "free") + 
  scale_color_manual(values = c(
    exact = "black", 
    bins = "red", 
    model = "deepskyblue")) + 
  scale_x_continuous("Position on hg19 chrom (kb = kilo bases)") + 
  geom_step(
    data = data.table(seq.dt, data.type = "exact"), 
    mapping = aes(
      x = chromStart/1e3, 
      y = count, 
      color = data.type))
## Summarize the profile in each bin: start, end, number of bases and
## the mean count weighted by the number of bases of each interval.
bin.dt <- seq.dt[, {
  width.vec <- chromEnd - chromStart
  total.bases <- sum(width.vec)
  list(
    binStart = min(chromStart), 
    binEnd = max(chromEnd), 
    mean.count = sum(count*width.vec)/total.bases, 
    bases = total.bases)
}, by = .(bases.counted, experiment, bin.i)]
## Overlay the binned means on the exact profile, with a log10 y axis.
gg.bins <- gg.data + 
  scale_y_log10("Aligned DNA sequence reads (log scale)") + 
  geom_step(
    data = data.table(bin.dt, data.type = "bins"), 
    mapping = aes(
      x = binStart/1e3, 
      y = mean.count, 
      color = data.type), 
    size = 0.7, 
    alpha = 0.75)
## Compute optimal segmentation model with 2 peaks, separately for
## each (bases.counted, experiment) profile.
segs.dt <- seq.dt[, {
  ## Each profile gets its own directory below
  ## figure-spatial-correlation/ (created if necessary).
  data.dir <- file.path("figure-spatial-correlation", bases.counted, experiment)
  dir.create(data.dir, showWarnings = FALSE, recursive = TRUE)
  ## Write the profile as a tab-separated file without header, with
  ## columns chrom, chromStart, chromEnd, count.
  coverage.bedGraph <- file.path(data.dir, "coverage.bedGraph")
  fwrite(
    .SD[, .(chrom, chromStart, chromEnd, count)], 
    coverage.bedGraph, 
    sep = "\t", 
    quote = FALSE, 
    col.names = FALSE)
  ## Run the solver on that directory with 2L as the requested number
  ## of peaks, and keep the segments of the result.
  fit <- PeakSegDisk::sequentialSearch_dir(data.dir, 2L, verbose = 1)
  data.table(fit$segments, data.type = "model")
}, by = list(bases.counted, experiment)]
## All segments except the first row of each group; the figure below
## draws a vertical line at chromEnd of each of these rows.
## NOTE(review): this treats chromEnd of rows 2..n as the changepoint
## positions, which depends on the row order of fit$segments -- confirm.
changes.dt <- segs.dt[, {
  .SD[-1]
}, by = list(bases.counted, experiment, data.type)]
## Add the segmentation model to the figure: a horizontal segment at
## the mean of each segment, and a vertical line at chromEnd of each
## row of changes.dt.
gg.model <- gg.bins + 
  geom_segment(
    data = segs.dt, 
    mapping = aes(
      x = chromStart/1e3, 
      y = mean, 
      xend = chromEnd/1e3, 
      yend = mean, 
      color = data.type)) + 
  geom_vline(
    data = changes.dt, 
    mapping = aes(
      xintercept = chromEnd/1e3, 
      color = data.type))
## Compute difference between peak positions of two models.
peaks.dt <- segs.dt[status == "peak"]
## Number the peaks 1, 2, 1, 2, ... in row order.
## NOTE(review): this assumes every (bases.counted, experiment) group
## contributes exactly two consecutive peak rows -- confirm.
## (length.out is spelled out rather than partially matched by l.)
peaks.dt[, peak.i := rep(1:2, length.out = .N)]
## One row per peak boundary (chromStart or chromEnd), then one column
## per counting method so that the two positions can be subtracted.
peak.pos.tall <- melt(
  peaks.dt, 
  measure.vars = c("chromStart", "chromEnd"))
peak.pos.wide <- dcast(
  peak.pos.tall, 
  experiment + variable + peak.i ~ bases.counted)
peak.pos.wide[, diff.bases := abs(each - end)]
## Placement of the text annotations. The y positions are first looked
## up as the largest count of the target panel and then overridden
## with Inf, which is the value actually used by the figure.
read.size.panel <- "each"
bases.max.dt <- seq.dt[, .(max.count = max(count)), by = list(bases.counted)]
read.size.y <- bases.max.dt[
  read.size.panel, max.count, on = list(bases.counted)]
read.size.y <- Inf
diff.panel <- "end"
diff.y <- bases.max.dt[
  diff.panel, max.count, on = list(bases.counted)]
diff.y <- Inf
diff.vjust <- 1.1
text.size <- 3
## Final figure: the model plot plus three text annotations.
gg.text <- gg.model + 
  ## Median read size of each experiment, in the read.size.panel row.
  geom_text(
    data = data.table(experiments, bases.counted = read.size.panel), 
    mapping = aes(
      x = chromStart/1e3, 
      y = read.size.y, 
      label = sprintf(
        "Median read size:\n%.0f bases", 
        median.bases)), 
    size = text.size, 
    hjust = 0, 
    vjust = 1.1) + 
  ## Difference in bases between the two models, written at the
  ## position found by the "end" model, in the diff.panel row.
  geom_text(
    data = data.table(
      bases.counted = diff.panel, 
      data.type = "model", 
      peak.pos.wide), 
    mapping = aes(
      x = end/1e3, 
      y = diff.y, 
      label = diff.bases, 
      color = data.type), 
    size = text.size, 
    hjust = 0, 
    vjust = diff.vjust) + 
  ## Caption for the differences, only in the H3K36me3 panel.
  geom_text(
    data = data.table(
      data.type = "model", 
      bases.counted = diff.panel, 
      experiments["H3K36me3", on = list(experiment)]), 
    mapping = aes(
      x = chromStart/1e3, 
      y = diff.y, 
      label = "Difference = ", 
      color = data.type), 
    size = text.size, 
    hjust = 0, 
    vjust = diff.vjust)

gg.text

# png("jss-figure-spatial-correlation.png", 8, 3.2, units = "in", res = 300)
# print(gg.text)
# dev.off()


######################################################################
## Figure 9 (number of intervals is log N and time/space complexity is
## N log N).
######################################################################
## Benchmark results, one row per fitted model. Columns used below
## include prob.dir, bedGraph.lines, penalty, seconds, megabytes,
## mean.intervals and max.intervals.
bench.models <- fread("jss.bench.models.csv")
bench.models[, gigabytes := megabytes/1024]
## prob.dir of the row that maximizes each criterion.
prob.dir.vec <- with(bench.models, c(
  ## "Most bases" = prob.dir[which.max(bases)], 
  "Most weighted data" = prob.dir[which.max(bedGraph.lines)], 
  "Largest mean intervals" = prob.dir[which.max(mean.intervals)], 
  "Largest max intervals" = prob.dir[which.max(max.intervals)], 
  ## "Most computation time" = prob.dir[which.max(seconds)], 
  "Most megabytes stored" = prob.dir[which.max(megabytes)]))
## Same information as a table, with one word per line in the label.
prob.label.dt <- data.table(
  prob.label = gsub(" ", "\n", names(prob.dir.vec)), 
  prob.dir = prob.dir.vec)
one.prob <- bench.models[prob.dir == prob.dir.vec[["Largest max intervals"]] ]
## For each N in 10^4, ..., 10^7, the benchmark rows whose
## bedGraph.lines is closest to N.
near.tens <- data.table(N = 10^(4:7))[, {
  gap.vec <- abs(N - bench.models$bedGraph.lines)
  bench.models[gap.vec == min(gap.vec)]
}, by = "N"]
## One row per distinct (prob.dir, bedGraph.lines) pair.
uniq.lines <- unique(bench.models[, .(prob.dir, bedGraph.lines)])
## Histogram of data set sizes, one count per row of uniq.lines.
gg <- ggplot() + 
  theme_bw() + 
  scale_x_log10(name = "N = data to segment (log scale)") + 
  scale_y_continuous(
    name = "Number of data sets of size N
in chipseq benchmark", 
    limits = c(0, 600), 
    breaks = seq(0, 600, by = 200)) + 
  geom_histogram(
    data = uniq.lines, 
    mapping = aes(x = bedGraph.lines))
## Round data size and penalty to the nearest power of ten, and
## average the computation time in each (size, penalty) cell.
bench.models[, round.lines :=  10^round(log10(bedGraph.lines))]
bench.models[, round.penalty :=  10^round(log10(penalty))]
bench.tiles <- bench.models[, list(
  mean.seconds = mean(seconds)
), by = list(round.lines, round.penalty)]
## Linear model of log time as a function of log size and log penalty.
fit <- lm(log10(seconds)~log10(bedGraph.lines) + log10(penalty), bench.models)
## Mean and max number of intervals for the selected problem, in tall
## format, keeping only rows with non-zero storage.
one.prob.intervals <- melt(
  one.prob[0 < megabytes], measure.vars = c("mean.intervals", "max.intervals"))
## stat is "mean" or "max". fixed = TRUE removes the literal suffix;
## as a regular expression the dot would match any character.
one.prob.intervals[, stat := sub(".intervals", "", variable, fixed = TRUE)]
leg <- ggplot() + 
  geom_point(aes(
    log(penalty), log10(value), color = stat), 
    shape = 1, 
    data = one.prob.intervals) + 
  ylab("log10(Intervals stored in the functional cost)") + 
  scale_x_continuous("log(penalty)", limits = c(NA, 10.5))
## Benchmark rows of the problems listed in prob.label.dt.
some.probs <- bench.models[prob.label.dt, on = list(prob.dir)]
## Mean and max intervals in tall format (rows with non-zero storage).
some.probs.intervals <- melt(
  some.probs[0 < megabytes], 
  measure.vars = c("mean.intervals", "max.intervals"))
## stat is "mean" or "max". fixed = TRUE removes the literal suffix;
## as a regular expression the dot would match any character.
some.probs.intervals[, stat := sub(".intervals", "", variable, fixed = TRUE)]
some.probs[, minutes := seconds/60]
some.probs[, hours := minutes/60]
some.probs[, gigabytes := megabytes/1024]
## Other quantities to display, in tall format.
some.probs.other <- melt(
  some.probs, 
  measure.vars = c(
    ##"minutes", 
    "gigabytes"))
## Intervals and the other quantities in one table with the same
## columns, so that they can be shown in different facet rows.
some.probs.both <- rbind(
  some.probs.intervals[, list(
    prob.label, penalty, variable = "intervals", value, stat)], 
  some.probs.other[, list(
    prob.label, penalty, variable, value, stat = "total")])
## One column of panels per problem, one row per quantity; models
## with an infinite penalty are not shown.
gg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  geom_point(aes(
    log(penalty), value, color = stat), 
    shape = 1, 
    data = some.probs.both[penalty<Inf]) + 
  scale_y_log10("") + 
  scale_x_continuous("log(penalty)") + 
  facet_grid(variable ~ prob.label, scales = "free_y")
## Range of storage over the models of each data set, ignoring models
## with zero storage.
gigabyte.ranges <- bench.models[gigabytes > 0, .(
  min.gigabytes = min(gigabytes), 
  max.gigabytes = max(gigabytes), 
  models = .N
), by = .(bedGraph.lines, prob.dir)]
bench.models[, minutes := seconds/60]
## Storage, mean intervals and time in tall format. var is the facet
## name and stat tells whether the value is a mean or a total.
bench.models.tall <- melt(
  bench.models[gigabytes > 0], 
  measure.vars = c("gigabytes", "mean.intervals", "minutes"))
bench.models.tall[, `:=`(
  var = ifelse(
    variable == "mean.intervals", "intervals", paste(variable)), 
  stat = ifelse(
    variable == "mean.intervals", "mean", "total"))]
## Min, max, mean and mid-range of each quantity for each data set.
bench.models.tall.ranges <- bench.models.tall[, .(
  min.value = min(value), 
  max.value = max(value), 
  mean.value = mean(value)
), by = .(bedGraph.lines, prob.dir, variable, var, stat)]
bench.models.tall.ranges[, mid.value := (min.value + max.value)/2]
## Largest number of intervals over the models of each data set.
bench.models.max.intervals <- bench.models[, .(
  max.intervals = max(max.intervals), 
  stat = "max", 
  var = "intervals"
), by = .(bedGraph.lines, prob.dir)]
## The minimum of this data point stands out -- probably an
## optimization error. It is printed, then excluded from the figures.
bench.models.tall.ranges[6 < log10(bedGraph.lines) & min.value < 0.1]
show.segments <- bench.models.tall.ranges[!(6 < log10(bedGraph.lines) & min.value < 0.1)]
## Points to highlight: the data set with the largest max.value for
## each (var, stat) other than the mean intervals, plus the data set
## with the largest max number of intervals.
show.max <- rbind(
  show.segments[variable  !=  "mean.intervals", {
    .SD[which.max(max.value)]
  }, by = .(var, stat)][, .(
    var, stat, bedGraph.lines, prob.dir, max.value
  )], 
  bench.models.max.intervals[which.max(max.intervals), .(
    var, stat, bedGraph.lines, prob.dir, max.value = max.intervals
  )])
## Invisible points that extend the y range of each facet.
blank.dt <- data.table(
  var = c("gigabytes", "minutes", "intervals"), 
  y = c(10^3.2, 10^3.4, 10^3.2))
max.color <- "black"
## One vertical segment per data set from the min to the max of each
## quantity, as a function of log10(data size), one facet row per
## quantity.
gg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(var ~ ., scales = "free") + 
  ## Invisible points that extend the y range of each facet.
  geom_blank(aes(
    3, y), 
    data = blank.dt) + 
  ## Reference line at 1 gigabyte, only in the gigabytes facet.
  geom_hline(aes(
    yintercept = yint), 
    color = "grey50", 
    data = data.table(yint = 1, var = "gigabytes")) + 
  geom_text(aes(
    x, y, label = label), 
    color = "grey50", 
    data = data.table(x = 3, y = 1, label = "1 gigabyte", var = "gigabytes"), 
    vjust = -0.5) + 
  geom_line(aes(
    log10(bedGraph.lines), max.intervals, color = stat), 
    data = bench.models.max.intervals) + 
  geom_segment(aes(
    log10(bedGraph.lines), min.value, 
    color = stat, 
    xend = log10(bedGraph.lines), yend = max.value), 
    data = show.segments) + 
  ## Highlight and label the largest value of each quantity.
  geom_point(aes(
    log10(bedGraph.lines), max.value), 
    shape = 1, 
    color = max.color, 
    data = show.max) + 
  ## big.mark is the thousands separator, so it must be "," (a
  ## separator of ", " prints "1, 234, 567").
  geom_text(aes(
    log10(bedGraph.lines) - 0.05, max.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(max.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = 0, 
    data = show.max) + 
  scale_y_log10("") + 
  scale_x_continuous("log10(number of data after compression = lines in bedGraph file)")
## Group the data sets in 7 boxes of equal width on the log10 scale of
## data size. box.mid is the center of each box and box.min/box.max
## are its limits, half a step away from the center.
## (length.out is spelled out rather than partially matched by l, and
## seq_len(.N) replaces 1:.N.)
log10.range <- log10(range(show.segments$bedGraph.lines))
box.dt <- data.table(
  box.mid = 10^seq(log10.range[1], log10.range[2], length.out = 7))
(diff.vec <- diff(log10(box.dt$box.mid)))
box.w <- diff.vec[1]/2
box.dt[, box.min :=  10^(log10(box.mid) - box.w)]
box.dt[, box.max := 10^(log10(box.mid) + box.w)]
box.dt[, box.i := seq_len(.N)]
## Non-equi joins that attach to every row the box which strictly
## contains its bedGraph.lines. Each stopifnot checks that the join
## did not change the number of rows.
box.models <- box.dt[bench.models, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
stopifnot(nrow(box.models) == nrow(bench.models))
box.segments <- box.dt[show.segments, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
stopifnot(nrow(box.segments) == nrow(show.segments))
## Distribution of the mid-range value over the data sets of each box.
box.segments.stats <- box.segments[, list(
  median = median(mid.value), 
  q95 = quantile(mid.value, 0.95), 
  q05 = quantile(mid.value, 0.05), 
  min = min(mid.value), 
  max = max(mid.value), 
  models = .N
), by = list(var, stat, box.mid)]
box.max <- box.dt[bench.models.max.intervals, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
stopifnot(nrow(box.max) == nrow(bench.models.max.intervals))
## Largest number of intervals over the data sets of each box.
box.max.stats <- box.max[, list(
  max.intervals = max(max.intervals)
), by = list(var, stat, box.mid)]
## Printed for inspection: mean intervals in the box of largest data.
box.segments.stats[var == "intervals" & stat == "mean" & box.mid == max(box.mid)]
## Colors of the statistics shown in the figures. The unnamed entries
## are further colors that are not mapped to a statistic.
stat.colors <- c(
  mean = "#E41A1C", 
  max = "#377EB8", 
  total = "#4DAF4A", 
  "#984EA3", "#FF7F00", "#FFFF33", 
  "#A65628", "#F781BF", "#999999")
line.size <- 1
## Value to draw as a line in each box: the max number of intervals,
## and the median of every other (var, stat) combination.
line.dt <- rbind(box.max.stats[, data.table(
  var, stat, box.mid, line.value = max.intervals
)], box.segments.stats[, data.table(
  var, stat, box.mid, line.value = median
)])
## Mean intervals of the largest data set (printed and highlighted).
(show.point <- show.segments[bedGraph.lines == max(bedGraph.lines) & variable == "mean.intervals"])
## Reference lines at 1 gigabyte and 1 hour (60 minutes).
hline.dt <- rbind(
  data.table(x = 10^3.5, y = 1, vjust = -0.5, label = "1 gigabyte", var = "gigabytes"), 
  data.table(x = 10^3.5, y = 60, vjust = 1.25, label = "1 hour", var = "minutes"))
## Reference curves for the storage: each function of N is rescaled
## so that it passes through the observed value of the smallest box.
## (length.out is spelled out rather than partially matched by l.)
ref.dt <- data.table(N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
fun.list <- list(
  "log(N)" = log, 
  "N log(N)" = function(x)x*log(x), 
  "N" = identity)
one.var <- "gigabytes"
one.line <- line.dt[var == one.var]
first.row <- one.line[order(box.mid)][1]
for (fun.name in names(fun.list)) {
  fun <- fun.list[[fun.name]]
  first.y <- fun(first.row$box.mid)
  ref.dt[[fun.name]] <- fun(ref.dt$N.data)/first.y*first.row$line.value
}
## Compare the observed values of one.var (ribbon from q05 to q95,
## line from one.line) with the reference curves, on log10 axes. The
## curves are labeled directly instead of using a legend.
ref.tall <- melt(ref.dt, id.vars = "N.data")
leg <- ggplot() + 
  scale_x_continuous(
    name = "log10(N = number of data to segment)", 
    limits = c(NA, 7.5)) + 
  geom_line(
    data = ref.tall, 
    mapping = aes(
      x = log10(N.data), 
      y = log10(value), 
      color = variable), 
    size = 2) + 
  geom_ribbon(
    data = box.segments.stats[var == one.var], 
    mapping = aes(
      x = log10(box.mid), 
      ymin = log10(q05), 
      ymax = log10(q95)), 
    alpha = 0.5) + 
  geom_line(
    data = one.line, 
    mapping = aes(
      x = log10(box.mid), 
      y = log10(line.value)))
dl <- direct.label(leg, "last.polygons")
## Reference curves for the mean number of intervals: each function
## of N is rescaled so that it passes through the observed value of
## the smallest box.
## (length.out is spelled out rather than partially matched by l.)
ref.dt <- data.table(N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
fun.list <- list(
  "log(N)" = log, 
  "loglog(N)" = function(x)log(log(x)), 
  "sqrt(log(N))" = function(x)sqrt(log(x)), 
  "sqrt(N)" = sqrt)
one.var <- "intervals"
one.line <- line.dt[var == one.var & stat == "mean"]
first.row <- one.line[order(box.mid)][1]
for (fun.name in names(fun.list)) {
  fun <- fun.list[[fun.name]]
  first.y <- fun(first.row$box.mid)
  ref.dt[[fun.name]] <- fun(ref.dt$N.data)/first.y*first.row$line.value
}
ref.tall <- melt(ref.dt, id.vars = "N.data")
## Observed values (ribbon from q05 to q95, line from one.line)
## against the reference curves, on log10 axes, with direct labels.
leg <- ggplot() + 
  geom_line(aes(
    log10(N.data), log10(value), color = variable), 
    size = 2, 
    data = ref.tall) + 
  geom_ribbon(aes(
    log10(box.mid), ymin = log10(q05), ymax = log10(q95)), 
    alpha = 0.5, 
    data = box.segments.stats[var == one.var]) + 
  geom_line(aes(
    log10(box.mid), log10(line.value)), 
    data = one.line) + 
  ylab("log10(mean number of intervals)") + 
  scale_x_continuous(
    "log10(N = number of data to segment)", 
    limits = c(NA, 7.75)
  )
dl <- direct.label(leg, "last.polygons")
## All three quantities (one facet row each) as a function of data
## size on log scales: ribbon from q05 to q95 and line of line.dt for
## each statistic, with the extreme data sets highlighted.
leg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(var ~ ., scales = "free") + 
  ## Invisible points that extend the y range of each facet.
  geom_blank(aes(
    10^3.5, y), 
    data = blank.dt) + 
  ## Reference lines (1 gigabyte, 1 hour) and their labels.
  geom_hline(aes(
    yintercept = y), 
    color = "grey50", 
    data = hline.dt) + 
  geom_text(aes(
    x, y, label = label), 
    vjust = -0.5, 
    color = "grey50", 
    data = hline.dt) + 
  geom_ribbon(aes(
    box.mid, ymin = q05, ymax = q95, fill = stat), 
    alpha = 0.5, 
    data = box.segments.stats) + 
  geom_line(aes(
    box.mid, line.value, color = stat), 
    size = line.size, 
    data = line.dt) + 
  ## Largest value of each quantity. big.mark is the thousands
  ## separator, so it must be "," (", " prints "1, 234, 567").
  geom_point(aes(
    bedGraph.lines, max.value), 
    shape = 1, 
    color = max.color, 
    data = show.max) + 
  geom_text(aes(
    bedGraph.lines, max.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(max.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = -0.5, 
    data = show.max) + 
  ## Mean value for the rows of show.point.
  geom_point(aes(
    bedGraph.lines, mean.value), 
    shape = 1, 
    color = max.color, 
    data = show.point) + 
  geom_text(aes(
    bedGraph.lines, mean.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(mean.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = -1, 
    data = show.point) + 
  ## paste is used as the labels function so that the breaks are
  ## shown as plain text.
  scale_y_log10("(log scales)", labels = function(chr) {
    paste(chr)
  }) + 
  scale_color_manual(values = stat.colors) + 
  scale_fill_manual(values = stat.colors, guide = "none") + 
  scale_x_log10(
    "N = number of data to segment (log scale)", 
    limits = c(NA, 10^7.25), 
    labels = paste
  )
gg <- direct.label(leg, "last.polygons")
## Computational requirements (every quantity except the intervals)
## as a function of data size, on log scales.
gg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(var ~ ., scales = "free") + 
  ## Invisible points that extend the y range of each facet.
  geom_blank(aes(
    10^4.5, y), 
    data = blank.dt[var !=  "intervals"]) + 
  ## Reference lines (1 gigabyte, 1 hour) and their labels.
  geom_hline(aes(
    yintercept = y), 
    color = "grey50", 
    data = hline.dt) + 
  geom_text(aes(
    10^4, y, label = label, vjust = vjust), 
    color = "grey50", 
    data = hline.dt) + 
  geom_ribbon(aes(
    box.mid, ymin = q05, ymax = q95), 
    alpha = 0.5, 
    data = box.segments.stats[var != "intervals"]) + 
  geom_line(aes(
    box.mid, line.value), 
    size = line.size, 
    data = line.dt[var != "intervals"]) + 
  ## Largest value of each quantity. big.mark is the thousands
  ## separator, so it must be "," (", " prints "1, 234, 567").
  geom_point(aes(
    bedGraph.lines, max.value), 
    shape = 1, 
    color = max.color, 
    data = show.max[var != "intervals"]) + 
  geom_text(aes(
    bedGraph.lines, max.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(max.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = -0.5, 
    data = show.max[var != "intervals"]) + 
  ## paste is used as the labels function so that the breaks are
  ## shown as plain text.
  scale_y_log10("Computational requirements\n(log scales)", labels = function(chr) {
    paste(chr)
  }) + 
  scale_x_log10(
    "N = data to segment (log scale)", 
    labels = paste
  )

gg

# pdf("jss-figure-target-intervals-models-computation.pdf", 3.5, 3)
# print(gg)
# dev.off()

## Left plot: number of intervals (mean and max) as a function of
## data size, on log scales.
leg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  geom_ribbon(aes(
    box.mid, ymin = q05, ymax = q95, fill = stat), 
    alpha = 0.5, 
    data = box.segments.stats[var == "intervals"]) + 
  geom_line(aes(
    box.mid, line.value, color = stat), 
    size = line.size, 
    data = line.dt[var == "intervals"]) + 
  ## Largest number of intervals. big.mark is the thousands
  ## separator, so it must be "," (", " prints "1, 234, 567").
  geom_point(aes(
    bedGraph.lines, max.value), 
    shape = 1, 
    color = max.color, 
    data = show.max[var == "intervals"]) + 
  geom_text(aes(
    1e7, max.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(max.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = -0.5, 
    data = show.max[var == "intervals"]) + 
  ## Mean number of intervals for the rows of show.point.
  geom_point(aes(
    bedGraph.lines, mean.value), 
    shape = 1, 
    color = max.color, 
    data = show.point[var == "intervals"]) + 
  geom_text(aes(
    3e7, mean.value, 
    label = paste(
      format(bedGraph.lines, big.mark = ","), 
      "data, ", 
      round(mean.value), var)), 
    color = max.color, 
    hjust = 1, 
    vjust = -1.75, 
    data = show.point) + 
  ## paste is used as the labels function so that the breaks are
  ## shown as plain text.
  scale_y_log10(
    "Intervals/candidate changepoints\n(log scale)", 
    limits = c(NA, 1000), 
    labels = function(chr) {
      paste(chr)
    }) + 
  scale_color_manual(values = stat.colors) + 
  scale_fill_manual(values = stat.colors, guide = "none") + 
  scale_x_log10(
    "N = data to segment (log scale)", 
    limits = c(NA, 10^7.75), 
    labels = paste
  )
## Version with direct labels instead of a legend (the one printed).
dl <- direct.label(leg, list("last.qp", dl.trans(x = x + 0.1)))
## Variant with the reference curves of ref.tall in grey, each one
## labeled at its largest N.
gg <- leg + 
  geom_line(aes(
    N.data, value, group = variable), 
    color = "grey", 
    data = ref.tall) + 
  geom_text(aes(
    N.data, value, label = variable), 
    hjust = 0, 
    color = "grey", 
    data = ref.tall[N.data == max(N.data)])

dl

# pdf("jss-figure-target-intervals-models.pdf", 3.5, 3)
# print(dl)
# dev.off()


##################################################################
## Figure 10 (disk-based storage is a constant factor slower than
## memory-based storage, about 2x).
##################################################################
## Timings of the disk and memory storage back-ends. The time column
## is divided by 1e9 to obtain bench.seconds.
fit.dt <- readRDS("jss.disk.memory.rds")
fit.dt[, bench.seconds := time/1e9]
## Median and quartiles of the timings for each data size and storage
## type (the expr column is renamed storage).
bench.stats <- fit.dt[, .(
  median = median(bench.seconds), 
  q25 = quantile(bench.seconds, 0.25), 
  q75 = quantile(bench.seconds, 0.75)
), by = .(bedGraph.lines, storage = expr)]
## One column of medians per storage type.
wide <- dcast(bench.stats, bedGraph.lines ~ storage, value.var = "median")
## Only the named entries (disk, memory) are matched to the data.
storage.colors <- c(
  "#E41A1C", # red
  "#377EB8", # blue
  disk = "#4DAF4A", # green
  memory = "#984EA3", # purple
  "#FF7F00", # orange
  "#FFFF33", # yellow
  "#A65628", # brown
  "#F781BF", # pink
  "#999999") # grey
## Timings as a function of the penalty for one data set size.
details.dt <- fit.dt[bedGraph.lines == 106569]
leg <- ggplot() + 
  theme_bw() + 
  scale_color_manual(values = storage.colors) + 
  scale_x_log10(
    name = "lambda = penalty (log scale)", 
    limits = c(NA, 10^5.75)) + 
  scale_y_log10(name = "seconds (log scale)") + 
  geom_point(
    data = details.dt, 
    mapping = aes(
      x = penalty, 
      y = bench.seconds, 
      color = expr), 
    shape = 1)

dl <- direct.label(leg, list("last.qp", dl.trans(x = x + 0.1)))

dl

# pdf("jss-figure-disk-memory-compare-speed-penalty.pdf", 3.3, 3)
# print(dl)
# dev.off()

## Fastest and slowest timing of the data set shown in detail above.
range.dt <- details.dt[, .(
  min.seconds = min(bench.seconds), 
  max.seconds = max(bench.seconds), 
  bedGraph.lines = bedGraph.lines[1]
)]

## Median timing (line) and inter-quartile range (ribbon) of each
## storage type as a function of data size, on log scales.
leg <- ggplot() + 
  theme_bw() + 
  scale_color_manual(values = storage.colors) + 
  scale_fill_manual(values = storage.colors) + 
  scale_y_log10(name = "seconds (log scale)") + 
  scale_x_log10(
    name = "N = number of data to segment (log scale)", 
    limits = c(NA, 10^6.2)) + 
  geom_ribbon(
    data = bench.stats, 
    mapping = aes(
      x = bedGraph.lines, 
      ymin = q25, 
      ymax = q75, 
      fill = storage), 
    alpha = 0.5) + 
  geom_line(
    data = bench.stats, 
    mapping = aes(
      x = bedGraph.lines, 
      y = median, 
      color = storage))
dl <- direct.label(leg, list("last.qp", dl.trans(x = x + 0.1)))

dl 

# pdf("jss-figure-disk-memory-compare-speed.pdf", 3.3, 3)
# print(dl)
# dev.off()


###################################################
## Figure 11 (number of DP iterations using OP is much lower than SN to
## compute a large number of peaks).
###################################################
## Reload the benchmark table (earlier code added columns to it).
bench.models <- fread("jss.bench.models.csv")
bench.models[, gigabytes := megabytes/1024]
## Results of the searches for a given number of peaks; rows with an
## infinite penalty are removed.
jss.variable.peaks <- readRDS("jss.variable.peaks.rds")[others.penalty != Inf]
jss.variable.peaks[, others.minutes := others.seconds/60]
jss.variable.peaks[, others.gigabytes := others.megabytes/1024]
others.tall <- melt(
  jss.variable.peaks, 
  measure.vars = c("others.minutes", "others.gigabytes"))
## var is "minutes" or "gigabytes". fixed = TRUE removes the literal
## prefix; as a regular expression the dot would match any character.
others.tall[, var := sub("others.", "", variable, fixed = TRUE)]
## Summary of each search: OP is its number of rows, and segments is
## 2 * peaks + 1.
prob.stats <- others.tall[, list(
  OP = .N, 
  sum = sum(value), 
  median = median(value), 
  max = max(value), 
  q95 = quantile(value, 0.95), 
  q05 = quantile(value, 0.05)
), by = list(
  var, target.N = as.integer(target.N), 
  bedGraph.lines, segments = loss.peaks*2 + 1, peaks = loss.peaks)]
algo.key <- c(
  peaks = "O(sqrt N) peaks\nin zero-error model", 
  SN = "Segment\nNeighborhood", 
  OP = "Optimal\nPartitioning")
## Only the named entries (OP, SN) are used below.
abbrev.colors <- c(
  "#E41A1C", # red
  "#377EB8", # blue
  OP = "#4DAF4A", # green
  "#984EA3", # purple
  "#FF7F00", # orange
  "#FFFF33", # yellow
  "#A65628", # brown
  "#F781BF", # pink
  SN = "#999999") # grey
op.color <- abbrev.colors[["OP"]]
sn.color <- abbrev.colors[["SN"]]
algo.colors <- structure(abbrev.colors, names = algo.key[names(abbrev.colors)])
## Number of evaluations per algorithm: SN is segments - 1, which is
## 2 * peaks, and OP is the count computed above. The rows for the
## minutes variable are used because one copy of each search suffices.
evals.dt <- prob.stats[var == "minutes"]
evals.dt[, SN := segments - 1]
evals.tall <- melt(
  evals.dt, 
  measure.vars = c(
    "SN", 
    ## "peaks", 
    "OP"
  ), 
  variable.name = "algo", 
  value.name = "evaluations")
evals.tall[, algorithm := algo.key[paste(algo)] ]
## big.mark is the thousands separator, so it must be "," (a
## separator of ", " prints "1, 234, 567").
evals.tall[, N.fac := paste("N  = ", format(bedGraph.lines, big.mark = ","))]
op.evals <- evals.tall[algo == "OP"]
## Reference curves for the number of OP evaluations: each function
## of the number of peaks is rescaled so that it passes through the
## mean number of evaluations observed at first.peaks, separately for
## each target.N.
first.peaks <- 3
## (length.out is spelled out rather than partially matched by l.)
N.peaks <- 10^seq(log10(first.peaks), 5, length.out = 30)
fun.list <- list(
  ##N = identity, 
  "$\\log(N)$" = log
  ## "loglog(N)" = function(x)log(log(x)), 
  ## "sqrt(N)" = sqrt
)
first.dt <- op.evals[peaks == first.peaks, list(
  evaluations = mean(evaluations)
), by = list(target.N)]
ref.tall.list <- list()
## seq_len makes the loop a no-op when first.dt has no rows, whereas
## 1:nrow(first.dt) would iterate over c(1, 0).
for (first.i in seq_len(nrow(first.dt))) {
  first <- first.dt[first.i]
  for (fun.name in names(fun.list)) {
    fun <- fun.list[[fun.name]]
    first.y <- fun(first.peaks)
    ref.tall.list[[paste(fun.name, first.i)]] <- data.table(
      N.peaks, 
      first, 
      fun.name, 
      value = fun(N.peaks)/first.y*first$evaluations)
  }
}
(ref.tall <- do.call(rbind, ref.tall.list))
## Number of DP iterations as a function of the desired number of
## peaks, one panel per target data size, on log-log axes. Points are
## the OP evaluations; the line is SN = 2 * peaks.
gg <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_grid(. ~ target.N, labeller = function(df) {
    df$target.N <- paste("N $\\approx$", df$target.N)
    df
  }) + 
  scale_x_log10("Desired peaks $P^*$ (log scale)") + 
  scale_y_log10("Number of DP iterations (log scale)") + 
  geom_point(aes(
    peaks, evaluations), 
    color = op.color, 
    data = op.evals) + 
  ## geom_abline works in the transformed (log10) coordinates, where
  ## y = 2 * x is log10(y) = log10(x) + log10(2). A slope of 2 with
  ## intercept 0 would draw y = x^2 instead.
  geom_abline(
    slope = 1, intercept = log10(2), 
    size = 1, 
    color = sn.color)
## Zoom on at most 10 peaks for the largest target.N, on linear axes
## with equal units. Points are the OP evaluations and the line is
## SN = 2 * peaks; the two text labels are colored like the algorithm
## they refer to.
gg.zoom <- ggplot() + 
  ggtitle("Zoom to $P^* \\leq 10$\n(linear scales)") + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  coord_equal(xlim = c(1, 11)) + 
  scale_x_continuous(
    name = "Desired peaks $P^*$", 
    breaks = seq(0, 10, by = 2)) + 
  scale_y_continuous(
    name = "Number of DP iterations", 
    breaks = seq(0, 20, by = 2)) + 
  scale_color_manual(values = abbrev.colors, guide = "none") + 
  geom_point(
    data = op.evals[target.N == max(target.N) & peaks <=  10], 
    mapping = aes(x = peaks, y = evaluations), 
    color = op.color) + 
  geom_abline(
    intercept = 0, 
    slope = 2, 
    color = sn.color, 
    size = 1) + 
  geom_text(
    data = data.table(
      x = c(4, 8), 
      y = c(14, 14), 
      algo = c("SN", "OP"), 
      label = c(
        "SN\nfaster\nfor\n$P^*<5$", 
        "OP\nfaster\nfor\n$P^*>5$"), 
      hjust = c(1, 0)), 
    mapping = aes(
      x = x, 
      y = y, 
      label = label, 
      color = algo, 
      hjust = hjust), 
    size = 3, 
    vjust = 0.5)

gg.zoom

# tikz("jss-figure-variable-peaks-zoom.tex", width = 3, height = 3)
# print(gg.zoom)
# dev.off()

## Only the largest data set size is shown: OP evaluations for
## target.N == max(target.N) on log10 scales, the reference
## geom_abline(slope = 2, intercept = 0), and one text annotation per
## algorithm.
gg <- ggplot() +
  ggtitle("All timings (log scales)") +
  theme_bw() +
  theme(panel.spacing = grid::unit(0, "lines")) +
  scale_x_log10("Desired peaks $P^*$") +
  coord_cartesian(ylim = c(6, 25)) +
  scale_y_log10(
    "Number of DP iterations",
    breaks = seq(0, 20, by = 2)) +
  geom_point(
    aes(x = peaks, y = evaluations),
    data = op.evals[target.N == max(target.N)],
    color = op.color) +
  geom_abline(
    color = sn.color,
    size = 1,
    slope = 2, intercept = 0) +
  scale_color_manual(values = abbrev.colors, guide = "none") +
  geom_text(
    aes(x = x, y = y, label = label, color = algo),
    ## One row per annotation, anchored at its top-left corner
    ## (hjust = 0, vjust = 1).
    data = data.table(
      x = c(10, 100),
      y = c(25, 11),
      algo = c("SN", "OP"),
      label = c(
        "Segment Neighborhood\nGPDPA $O(P^*)$ iterations\n\n",
        "Optimal\nPartitioning\nGFPOP\n$O(\\log P^*)$ iterations")),
    size = 3,
    vjust = 1,
    hjust = 0)

gg

# tikz("jss-figure-variable-peaks.tex", width = 3, height = 3)
# print(gg)
# dev.off()


#######################################################################
## Figure 12 (number of GFPOP calls depends on maximum number of peaks).
#######################################################################
jss.more.evals <- readRDS("jss.more.evals.rds")
## One row per (bedGraph.lines, peaks.arg, loss.peaks): the number of
## rows in the group (penalties) and the largest others.peaks seen.
evals.dt <- jss.more.evals[, list(
  penalties = .N,
  max.peaks = max(others.peaks)
), by = .(bedGraph.lines, peaks.arg, loss.peaks)]
## Distinct (bedGraph.lines, others.peaks) pairs at others.penalty == 0.
pen0 <- unique(
  jss.more.evals[others.penalty == 0, list(bedGraph.lines, others.peaks)])
evals.dt[, desired.peaks := peaks.arg]
## Number of evals.dt rows for each desired.peaks value.
problem.counts <- evals.dt[, list(problems = .N), by = .(desired.peaks)]
## Keep the desired.peaks values which have the largest count and are
## less than 2000, then the evals.dt rows for those values.
max.counts <- problem.counts[
  problems == max(problems) & desired.peaks < 2000]
max.evals <- evals.dt[max.counts, on = "desired.peaks"]
evals.tall <- melt(
  max.evals,
  measure.vars = c("max.peaks", "desired.peaks"))
## Mean, standard deviation and count of penalties for every
## (variable, value) pair of the tall table.
stats.dt <- evals.tall[, list(
  mean.penalties = mean(penalties),
  sd.penalties = sd(penalties),
  penalties = .N
), by = .(variable, value)]
## penalties versus max.peaks on log10 scales, one panel for each of
## desired.peaks = 10, 100, 1000.
gg <- ggplot() +
  geom_point(
    aes(x = max.peaks, y = penalties),
    data = max.evals[desired.peaks %in% c(10, 100, 1000)]) +
  scale_y_log10(
    "GFPOP calls required
to compute model with
desired peaks (log scale)",
    breaks = c(5, 10, 20)) +
  coord_cartesian(ylim = c(5, 20)) +
  scale_x_log10("Maximum number of peaks for data set") +
  theme_bw() +
  theme(panel.spacing = grid::unit(0, "lines")) +
  facet_wrap("desired.peaks", labeller = label_both)

gg

# png("jss-figure-more-evals.png", 5, 2, units = "in", res = 300)
# print(gg)
# dev.off()


###################################################################
## Figure 13 (label error).
###################################################################
## Fill colors for the label (annotation) types.
ann.colors <- c(
  noPeaks = "#f6f4bf", 
  peakStart = "#ffafaf", 
  peakEnd = "#ff4c4c", 
  peaks = "#a445ee")
bench.models <- fread("jss.bench.models.csv")
## Top-level expression whose value is not assigned (it is auto-printed
## when the script is run interactively or via Rscript). data.table
## resolves names in i against columns first, so prob.dir here is
## presumably the prob.dir column of the CSV -- TODO confirm.
bench.models[prob.dir == "H3K4me3_TDH_immune/samples/tcell/McGill0008/problems/chr11:96437584-134946516"]
## From here on the GLOBAL prob.dir is the local directory which holds
## labels.bed (read below). NOTE(review): it shares its name with the
## bench.models column used in the filters above and below; keep the
## statement order in mind when editing.
prob.dir <- "jss-figure-label-error"
some.regions <- fread(file.path(prob.dir, "labels.bed"))
## labels.bed is read without a header; name its four columns.
setnames(some.regions, c("chrom", "chromStart", "chromEnd", "annotation"))
## Models of one problem (assigned and printed); prob.dir inside [ ]
## is again expected to be the column, not the global set above.
(some.models <- bench.models[prob.dir == "H3K36me3_TDH_immune/samples/monocyte/McGill0104/problems/chr12:7239876-34856694"])
## Row indices into some.models of the models to display, named by the
## description shown in the figure.
model.i.vec <- c(
  "too many peaks" = 5, 
  "no errors, max peaks" = 12, 
  ## "min incorrect labels" = 15, 
  "no errors, min peaks" = 18, 
  "too few peaks" = 19)
## For every selected model, collect its peaks and its per-label error
## rows, each prefixed with the same meta columns (pen.str, model.name,
## model.y, penalty, peaks).
error.dt.list <- list()
peak.dt.list <- list()
for (i in seq_along(model.i.vec)) {
  model.name <- names(model.i.vec)[[i]]
  model.i <- model.i.vec[[i]]
  cat(sprintf("%4d %s\n", model.i, model.name))
  model <- some.models[model.i]
  ## The penalty is passed as a string; presumably PeakSegFPOP_dir
  ## computes or loads the model for that penalty in prob.dir -- see
  ## the PeakSegDisk documentation.
  pen.str <- paste(model$penalty)
  fit <- PeakSegDisk::PeakSegFPOP_dir(prob.dir, pen.str)
  fit.peaks <- fit$segments[status == "peak"]
  fit.errors <- PeakError::PeakErrorChrom(fit.peaks, some.regions)
  ## model.y is the vertical position of this model in the plots:
  ## -3, -6, ... in the order of model.i.vec. Note that the pen.str
  ## stored here is the ROUNDED penalty, unlike the pen.str above.
  meta <- data.table(
    pen.str = paste(round(model$penalty)), 
    model.name, model.y = -i*3, penalty = model$penalty, peaks = model$peaks)
  peak.dt.list[[model.name]] <- data.table(meta, fit.peaks)
  error.dt.list[[model.name]] <- data.table(meta, fit.errors)
}
peak.dt <- do.call(rbind, peak.dt.list)
error.dt <- do.call(rbind, error.dt.list)
## Columns 1 and 3 of the bedGraph file are dropped; the two remaining
## columns are named chromStart and coverage.
coverage.dt <- fread(file.path(prob.dir, "coverage.bedGraph"), drop = c(1, 3))
setnames(coverage.dt, c("chromStart", "coverage"))
xlim.vec <- c(1.52e7, 1.73e7)
## Linear interpolation of the coverage at 1000 evenly spaced
## positions in xlim.vec; the result has columns x and y.
small.counts <- coverage.dt[, approx(
  chromStart, coverage,
  xout = seq(xlim.vec[1], xlim.vec[2], length.out = 1000))]
## Top-level expression whose value is not assigned.
coverage.dt[xlim.vec[1] < chromStart & chromStart < xlim.vec[2]]
## Peaks which start strictly inside the range covered by error.dt.
show.peaks <- peak.dt[min(error.dt$chromStart) < chromStart & chromStart < max(error.dt$chromEnd)]
## Build one version of the label error figure. All versions share the
## grey coverage line (small.counts), the x limits xlim.vec, the y
## limits c(-14, 31) and the axis titles; they only differ in which
## optional layers are present. Components are added in the same order
## in every version.
##
## keep: NULL to draw no models. Otherwise a function which takes a
##   data.table with a pen.str column (error.dt or show.peaks) and
##   returns the rows to draw; identity draws every model. For each
##   kept model this adds: a rectangle per error.dt row with linetype
##   by status, the peaks as deepskyblue segments and open points, the
##   penalty as text right-aligned at the smallest error.dt chromStart,
##   and the model name with its peak count left-aligned at the
##   chromEnd of the row(s) with the largest chromStart.
## show.labels: if TRUE, draw a geom_tallrect filled by annotation for
##   every row of some.regions whose annotation is not "noPeaks".
##
## Reads the globals ann.colors, some.regions, small.counts, xlim.vec,
## error.dt and show.peaks. Returns the ggplot object.
label.error.plot <- function(keep = NULL, show.labels = TRUE) {
  gg <- ggplot() + 
    theme_bw() + 
    scale_fill_manual("label", values = ann.colors)
  if (show.labels) {
    gg <- gg + 
      geom_tallrect(aes(
        xmin = chromStart, xmax = chromEnd, fill = annotation), 
        alpha = 0.5, 
        size = 0.4, 
        color = "grey", 
        data = some.regions[annotation != "noPeaks"])
  }
  gg <- gg + 
    geom_line(aes(
      x, y), 
      color = "grey50", 
      data = small.counts) + 
    coord_cartesian(
      xlim = xlim.vec, expand = FALSE, 
      ylim = c(-14, 31))
  if (is.function(keep)) {
    ## The min/max chromStart filters are applied to ALL of error.dt
    ## before keep, exactly as in the original per-plot code.
    gg <- gg + 
      geom_rect(aes(
        xmin = chromStart, xmax = chromEnd, 
        linetype = status, 
        ymin = model.y - 1, ymax = model.y + 1), 
        fill = NA, 
        color = "black", 
        size = 0.7, 
        data = keep(error.dt)) + 
      scale_linetype_manual("error type", 
                            values = c(correct = 0, 
                                     "false negative" = 3, 
                                     "false positive" = 1)) + 
      geom_segment(aes(
        chromStart, model.y, 
        xend = chromEnd, yend = model.y), 
        color = "deepskyblue", 
        size = 2, 
        data = keep(show.peaks)) + 
      geom_point(aes(
        chromStart, model.y), 
        color = "deepskyblue", 
        shape = 1, 
        data = keep(show.peaks)) + 
      geom_text(aes(
        chromStart, model.y, 
        label = sprintf("penalty = %s ", pen.str)), 
        hjust = 1, 
        data = keep(error.dt[chromStart == min(chromStart)])) + 
      geom_text(aes(
        chromEnd, model.y, 
        label = sprintf(" %s (%d)", model.name, peaks)), 
        hjust = 0, 
        data = keep(error.dt[chromStart == max(chromStart)]))
  }
  gg + 
    xlab("position on chromosome") + 
    scale_y_continuous(
      "aligned read counts", 
      breaks = seq(0, 30, by = 10))
}

## Coverage only (this gg.data is replaced by the full figure below).
gg.data <- label.error.plot(show.labels = FALSE)
## Coverage and labels, no models.
gg.labels <- label.error.plot()
## Coverage, labels, and the single model with pen.str "278653".
some <- function(dt) {
  dt[pen.str == "278653"]
}
gg.few <- label.error.plot(keep = some)
## Same with two models.
some <- function(dt) {
  dt[pen.str %in% c("278653", "6682")]
}
gg.many <- label.error.plot(keep = some)
## Full figure: every model in error.dt / show.peaks.
gg.data <- label.error.plot(keep = identity)

gg.data

# pdf("jss-figure-label-error.pdf", 8, 3)
# print(gg.data)
# dev.off()


#################################################################
## Figure 14 (N genomic data have sqrt N peaks).
#################################################################
## Reload the benchmark models and add derived unit columns.
bench.models <- fread("jss.bench.models.csv")
bench.models[, `:=`(
  minutes = seconds/60,
  hours = seconds/60/60,
  gigabytes = megabytes/1024)]
## Largest number of errors of any model, per problem.
max.err <- bench.models[, list(
  max.errors = max(errors)
), by = .(prob.dir, bedGraph.lines)]
gg <- ggplot() +
  geom_point(
    aes(x = bedGraph.lines, y = max.errors),
    data = max.err) +
  scale_x_log10() +
  scale_y_log10()
## Per problem, ranges of penalty and peaks over the models which
## attain the minimum number of errors; sorted by bedGraph.lines.
min.err.ranges <- bench.models[, .SD[errors == min(errors), .(
  min.penalty = min(penalty),
  max.penalty = max(penalty),
  min.peaks = min(peaks),
  max.peaks = max(peaks)
)], by = .(bedGraph.lines, prob.dir)][order(bedGraph.lines)]
## Problems whose min-error models all have at least one peak, split
## into those with a range of peak counts and those with a single one.
no.zero <- min.err.ranges[min.peaks > 0]
seg.dt <- no.zero[min.peaks < max.peaks]
point.dt <- no.zero[min.peaks == max.peaks]
## Per problem, range of the penalties strictly between 0 and Inf.
penalty.ranges <- bench.models[penalty > 0 & penalty < Inf, .(
  min.penalty = min(penalty),
  max.penalty = max(penalty)
), by = .(bedGraph.lines, prob.dir)]
## Reference curves log(N), N and sqrt(N) at 100 values of N, evenly
## spaced on the log10 scale from 1e2 to 1e7.
ref.dt <- data.table(N = 10^seq(2, 7, length.out = 100))
fun.list <- list(
  log = log, 
  linear = identity, 
  sqrt = sqrt)
for (fun.name in names(fun.list)) {
  fun <- fun.list[[fun.name]]
  ## set() adds the column by reference, which is the data.table way
  ## to assign a column whose name is held in a variable.
  set(ref.dt, j = fun.name, value = fun(ref.dt$N))
}
ref.tall <- melt(ref.dt, id.vars = "N")
## Penalty ranges per problem as vertical segments (all finite
## positive penalties in black, min-error penalties in red), with the
## reference curves on top; log10 scales on both axes.
leg <- ggplot() + 
  geom_segment(aes(
    bedGraph.lines, min.penalty, 
    xend = bedGraph.lines, yend = max.penalty), 
    data = penalty.ranges) + 
  geom_segment(aes(
    bedGraph.lines, min.penalty, 
    xend = bedGraph.lines, yend = max.penalty), 
    color = "red", 
    data = min.err.ranges) + 
  geom_line(aes(
    N, value, color = variable), 
    data = ref.tall) + 
  scale_x_log10() + 
  scale_y_log10()
## Plot only models with penalty between log N and N.
log.linear.models <- bench.models[log(bedGraph.lines) < penalty & penalty < bedGraph.lines]
## Range of N on the log10 scale. The log10() is required because the
## range is used as an exponent on the next line; without it
## 10^bedGraph.lines overflows.
log10.range <- log.linear.models[, log10(range(bedGraph.lines))]
ref.dt <- data.table(
  N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
fun.list <- list(
  "log(N)" = log, 
  "N" = identity)
for (fun.name in names(fun.list)) {
  fun <- fun.list[[fun.name]]
  ## set() adds the column by reference under the name in fun.name.
  set(ref.dt, j = fun.name, value = fun(ref.dt$N.data))
}
ref.tall <- melt(ref.dt, id.vars = "N.data")
## Penalty of every selected model versus N, with the two bounding
## reference curves; log10 scales on both axes.
leg <- ggplot() + 
  geom_point(aes(
    bedGraph.lines, penalty), 
    shape = 1, 
    data = log.linear.models) + 
  geom_line(aes(
    N.data, value, color = variable), 
    data = ref.tall) + 
  scale_x_log10("N = number of data to segment (log scale)") + 
  scale_y_log10()
## Replace the color legend with direct labels.
dl <- direct.label(leg, "last.polygons")
## Plot a dot at the middle number of peaks.
min.err.ranges[, mid.peaks := (min.peaks + max.peaks)/2]
## do it by experiment type: the experiment is the part of prob.dir
## before the first underscore.
min.err.ranges[, experiment := sub("_.*", "", prob.dir)]
## Make boxes with the median and quartiles of the number of peaks, by
## experiment. There are 8 box centers, evenly spaced on the log10
## scale over the range of bedGraph.lines; every box extends half of
## that spacing on each side of its center.
log10.range <- log10(range(min.err.ranges$bedGraph.lines))
box.dt <- data.table(
  box.mid = 10^seq(log10.range[1], log10.range[2], length.out = 8))
(diff.vec <- diff(log10(box.dt$box.mid)))
box.w <- diff.vec[1]/2
box.dt[, box.min := 10^(log10(box.mid) - box.w)]
box.dt[, box.max := 10^(log10(box.mid) + box.w)]
## Non-equi join: attach to every problem the box whose interval
## (box.min, box.max) contains its bedGraph.lines.
box.models <- box.dt[min.err.ranges, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
## Every problem must fall into exactly one box.
stopifnot(nrow(box.models) == nrow(min.err.ranges))
box.models.stats <- box.models[, list(
  median = median(mid.peaks), 
  q75 = quantile(mid.peaks, 0.75), 
  q25 = quantile(mid.peaks, 0.25), 
  min = min(mid.peaks), 
  max = max(mid.peaks), 
  models = .N
), by = list(box.mid, experiment)][order(box.mid)]
## NOTE(review): ref.dt is not read again in this section.
ref.dt <- data.table(
  N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
log10.range <- log10(range(box.models.stats$box.mid))
N.data <- 10^seq(log10.range[1], log10.range[2], length.out = 100)
fun.list <- list(
  "O(N)" = identity, 
  "O(log N)" = log, 
  "loglog(N)" = function(x)log(log(x)), 
  "O(sqrt N)" = sqrt)
## For every entry: the value y at which its reference curves start,
## and the names of the fun.list entries to draw.
ref.line.list <- list(
  ## OP = list(y = 9, lines = c("log(N)", "sqrt(N)", "loglog(N)")), 
  SN = list(y = 13.5, lines = c("O(N)", "O(log N)", "O(sqrt N)")))
ref.tall.list <- list()
for (ref.name in names(ref.line.list)) {
  ref.info <- ref.line.list[[ref.name]]
  for (fun.name in ref.info$lines) {
    fun <- fun.list[[fun.name]]
    ## Rescale so that the curve equals ref.info$y at min(N.data).
    first.y <- fun(min(N.data))
    ref.tall.list[[paste(fun.name, ref.name)]] <- data.table(
      N.data, 
      ref.name, 
      fun.name, 
      value = fun(N.data)/first.y*ref.info$y)
  }
}
ref.tall <- do.call(rbind, ref.tall.list)
ref.color <- "red"
## Per experiment: inter-quartile band and median line of mid.peaks by
## box, with the reference curves labeled at their right end.
gg.bands <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_wrap("experiment") + 
  geom_line(aes(
    N.data, value, group = paste(ref.name, fun.name)), 
    color = ref.color, 
    data = ref.tall) + 
  geom_text(aes(
    N.data, value, label = fun.name), 
    color = ref.color, 
    hjust = 0, 
    data = ref.tall[N.data == max(N.data)]) + 
  geom_ribbon(aes(
    box.mid, ymin = q25, ymax = q75), 
    data = box.models.stats, 
    alpha = 0.5) + 
  geom_line(aes(
    box.mid, median), 
    data = box.models.stats) + 
  scale_x_log10(
    "N = number of data to segment (log scale)", 
    limits = c(NA, 10^7.5), 
    labels = paste) + 
  scale_y_log10("Peaks in models with min label error\n(log scale)")
## Per experiment: one point per problem, with the same reference
## curves.
gg.points <- ggplot() + 
  theme_bw() + 
  theme(panel.spacing = grid::unit(0, "lines")) + 
  facet_wrap("experiment") + 
  geom_point(aes(
    bedGraph.lines, mid.peaks), 
    data = min.err.ranges) + 
  scale_x_log10() + 
  scale_y_log10() + 
  geom_line(aes(
    N.data, value, group = paste(ref.name, fun.name)), 
    color = ref.color, 
    data = ref.tall) + 
  geom_text(aes(
    N.data, value, label = fun.name), 
    color = ref.color, 
    hjust = 0, 
    data = ref.tall[N.data == max(N.data)])
## Make boxes with the median and quartiles of the number of peaks,
## over all experiments together. There are 8 box centers, evenly
## spaced on the log10 scale over the range of bedGraph.lines; every
## box extends half of that spacing on each side of its center.
log10.range <- log10(range(min.err.ranges$bedGraph.lines))
box.dt <- data.table(
  box.mid = 10^seq(log10.range[1], log10.range[2], length.out = 8))
(diff.vec <- diff(log10(box.dt$box.mid)))
box.w <- diff.vec[1]/2
box.dt[, box.min :=  10^(log10(box.mid) - box.w)]
box.dt[, box.max := 10^(log10(box.mid) + box.w)]
## Non-equi join: attach to every problem the box whose interval
## (box.min, box.max) contains its bedGraph.lines.
box.models <- box.dt[min.err.ranges, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
## Every problem must fall into exactly one box.
stopifnot(nrow(box.models) == nrow(min.err.ranges))
box.models.stats <- box.models[, list(
  median = median(mid.peaks), 
  q75 = quantile(mid.peaks, 0.75), 
  q25 = quantile(mid.peaks, 0.25), 
  min = min(mid.peaks), 
  max = max(mid.peaks), 
  models = .N
), by = list(box.mid)][order(box.mid)]
## NOTE(review): ref.dt is not read again in this section.
ref.dt <- data.table(
  N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
log10.range <- log10(range(box.models.stats$box.mid))
N.data <- 10^seq(log10.range[1], log10.range[2], length.out = 100)
fun.list <- list(
  "O(N)" = identity, 
  "O(log N)" = log, 
  "loglog(N)" = function(x)log(log(x)), 
  "O(sqrt N)" = sqrt)
## For every entry: the value y at which its reference curves start,
## and the names of the fun.list entries to draw.
ref.line.list <- list(
  ##OP = list(y = 9, lines = c("log(N)", "sqrt(N)", "loglog(N)")), 
  SN = list(y = 13.5, lines = c("O(N)", "O(log N)", "O(sqrt N)")))
ref.tall.list <- list()
for (ref.name in names(ref.line.list)) {
  ref.info <- ref.line.list[[ref.name]]
  for (fun.name in ref.info$lines) {
    fun <- fun.list[[fun.name]]
    ## Rescale so that the curve equals ref.info$y at min(N.data).
    first.y <- fun(min(N.data))
    ref.tall.list[[paste(fun.name, ref.name)]] <- data.table(
      N.data, 
      ref.name, 
      fun.name, 
      value = fun(N.data)/first.y*ref.info$y)
  }
}
ref.tall <- do.call(rbind, ref.tall.list)
ref.color <- "red"
## Inter-quartile band and median line of mid.peaks by box, with the
## reference curves labeled at their right end.
gg <- ggplot() + 
  theme_bw() + 
  geom_line(aes(
    N.data, value, group = paste(ref.name, fun.name)), 
    color = ref.color, 
    data = ref.tall) + 
  geom_text(aes(
    N.data, value, label = fun.name), 
    color = ref.color, 
    hjust = 0, 
    data = ref.tall[N.data == max(N.data)]) + 
  geom_ribbon(aes(
    box.mid, ymin = q25, ymax = q75), 
    data = box.models.stats, 
    alpha = 0.5) + 
  geom_line(aes(
    box.mid, median), 
    data = box.models.stats) + 
  scale_x_log10(
    "N = number of data to segment (log scale)", 
    limits = c(NA, 10^7.5), 
    labels = paste) + 
  scale_y_log10("Peaks in models with min label error\n(log scale)")
## Make boxes with the median and quartiles of the number of peaks. (TIKZ)
## Same computation as for the previous plot, but the reference curve
## names are LaTeX math strings. There are 8 box centers, evenly
## spaced on the log10 scale over the range of bedGraph.lines; every
## box extends half of that spacing on each side of its center.
log10.range <- log10(range(min.err.ranges$bedGraph.lines))
box.dt <- data.table(
  box.mid = 10^seq(log10.range[1], log10.range[2], length.out = 8))
(diff.vec <- diff(log10(box.dt$box.mid)))
box.w <- diff.vec[1]/2
box.dt[, box.min := 10^(log10(box.mid) - box.w)]
box.dt[, box.max := 10^(log10(box.mid) + box.w)]
## Non-equi join: attach to every problem the box whose interval
## (box.min, box.max) contains its bedGraph.lines.
box.models <- box.dt[min.err.ranges, on = list(
  box.min < bedGraph.lines, 
  box.max > bedGraph.lines)]
## Every problem must fall into exactly one box.
stopifnot(nrow(box.models) == nrow(min.err.ranges))
box.models.stats <- box.models[, list(
  median = median(mid.peaks), 
  q75 = quantile(mid.peaks, 0.75), 
  q25 = quantile(mid.peaks, 0.25), 
  min = min(mid.peaks), 
  max = max(mid.peaks), 
  models = .N
), by = list(box.mid)][order(box.mid)]
## NOTE(review): ref.dt is not read again in this section.
ref.dt <- data.table(
  N.data = 10^seq(log10.range[1], log10.range[2], length.out = 100))
log10.range <- log10(range(box.models.stats$box.mid))
N.data <- 10^seq(log10.range[1], log10.range[2], length.out = 100)
fun.list <- list(
  "$O(N)$" = identity, 
  "$O(\\log N)$" = log, 
  "$O(\\sqrt N)$" = sqrt)
## All functions of fun.list are drawn, starting at y = 13.5.
ref.line.list <- list(
  SN = list(y = 13.5, lines = names(fun.list)))
ref.tall.list <- list()
for (ref.name in names(ref.line.list)) {
  ref.info <- ref.line.list[[ref.name]]
  for (fun.name in ref.info$lines) {
    fun <- fun.list[[fun.name]]
    ## Rescale so that the curve equals ref.info$y at min(N.data).
    first.y <- fun(min(N.data))
    ref.tall.list[[paste(fun.name, ref.name)]] <- data.table(
      N.data, 
      ref.name, 
      fun.name, 
      value = fun(N.data)/first.y*ref.info$y)
  }
}
ref.tall <- do.call(rbind, ref.tall.list)
ref.color <- "red"
## Inter-quartile band and median line of mid.peaks by box, with the
## reference curves labeled at their right end.
gg <- ggplot() + 
  theme_bw() + 
  geom_line(aes(
    N.data, value, group = paste(ref.name, fun.name)), 
    color = ref.color, 
    data = ref.tall) + 
  geom_text(aes(
    N.data, value, label = fun.name), 
    color = ref.color, 
    hjust = 0, 
    data = ref.tall[N.data == max(N.data)]) + 
  geom_ribbon(aes(
    box.mid, ymin = q25, ymax = q75), 
    data = box.models.stats, 
    alpha = 0.5) + 
  geom_line(aes(
    box.mid, median), 
    data = box.models.stats) + 
  scale_x_log10(
    "N = number of data to segment (log scale)", 
    limits = c(NA, 10^7.5), 
    labels = paste) + 
  scale_y_log10("Peaks in models with\nmin label error\n(log scale)")

## Display a variant with a wider x range (upper limit 10^8); adding a
## second x scale replaces the one stored in gg for this display only.
gg  +  scale_x_log10(
  "N = number of data to segment (log scale)", 
  limits = c(NA, 10^8), 
  labels = paste)

# tikz("jss-figure-data-peaks-slide.tex", width = 4, height = 2)
# print(gg  +  scale_x_log10(
#   "N  =  number of data to segment (log scale)", 
#   limits = c(NA, 10^8), 
#   labels = paste))
# dev.off()

# tikz("jss-figure-data-peaks.tex", width = 6, height = 2)
# print(gg)
# dev.off()

