#' ---
#' title: 'R Markdown replication script for "spsurvey: spatial sampling design and analysis
#'   in R" froom The Journal of Statistical Software'
#' author: "Michael Dumelle"
#' output:
#'   html_document:
#'     theme: flatly
#'     number_sections: yes
#'     highlighted: default
#'     toc: yes
#'     toc_float:
#'       collapsed: no
#'       smooth_scroll: no
#'     toc_depth: 2
#'     keep_md: yes
#'   pdf_document:
#'     toc: yes
#'     toc_depth: '2'
#' ---
#'
#'
## ----setup, include=FALSE------------------------------------------------------------------------
knitr::opts_chunk$set(
  echo = TRUE,
  eval = TRUE,
  warning = TRUE,
  error = FALSE,
  message = TRUE,
  comment = "#>"
)

#'
#'
#' # Preliminaries {.unnumbered}
#'
#' This document contains the replication script for "spsurvey: spatial sampling design and analysis in R" for consideration of publication in the Journal of Statistical Software (JSS). As per the JSS author guidelines, the replication script contains exact code used to reproduce the manuscript. You must install the development version of spsurvey considered in this submission by installing the tarball provided alongside the submission or by running
#'
## ----eval = FALSE--------------------------------------------------------------------------------
## ## install.packages("remotes") # if needed
## ## remotes::install_github("USEPA/spsurvey", ref = "develop")

#'
#'
#' Data and scripts used to create the manuscript itself are available in a supplementary R package that can be downloaded by installing the tarball provided alongside the submission or by running
#'
## ----eval = FALSE--------------------------------------------------------------------------------
## ## remotes::install_github("USEPA/spsurvey.manuscript", ref = "main", dependencies = TRUE)

#'
#'
#' The data can alternatively be downloaded from the online JSS submission.

#'
#' The supplementary R package must be installed before proceeding with the Section 4 (Application) of the replication script. Instructions for using and finding files in the supplementary R package are available in the `README` of the package's GitHub repository: [https://github.com/USEPA/spsurvey.manuscript](https://github.com/USEPA/spsurvey.manuscript). It is recommended read the `README` before proceeding with this file, as the `README` contains some useful contextual information. Note that some of the images used in the manuscript underwent minor adjustments to code presented here in order to use high-quality, publication-ready figures in the manuscript. The code used to generate these higher-quality images is available after downloading the supplementary package. The code used to generate lower-quality images is provided in this document.
#'
#' This document can be viewed without requiring you save and knit it by visiting a pre-knitted Markdown file located in the supplementary package's GitHub repository at [https://github.com/USEPA/spsurvey.manuscript/blob/main/inst/scripts/replication/code.md](https://github.com/USEPA/spsurvey.manuscript/blob/main/inst/scripts/replication/code.md). This is helpful in the event that some version discrepancy prevents you from compiling the document.
#'
#' Before proceeding, load spsurvey and the packages installed alongside the spsurvey.manuscript package.
#'
## ----warning = FALSE, message = FALSE------------------------------------------------------------
library("spsurvey")
library("cowplot")
library("dplyr")
# library("spsurvey.manuscript") # not required if data downloaded from online JSS submission
library("ggplot2")
library("knitr")
library("maps")
library("rmarkdown")
library("xtable")
library("parallel")
#'
#'
#' We will also load a reproducible seed (chosen as 5 because the original submission coincided version 5.0 of spsurvey) by running
#'
## ------------------------------------------------------------------------------------------------
set.seed(5)

#'
#'
#' A raw replication script (with a `.R` extension) was also uploaded in the submission alongside the R Markdown replication script (with a `.Rmd` extension) to provide an easier way to run code interactively. This R script was generated using `knitr::purl(, documentation = 2)`. Next we provide code used to build the manuscript section-by-section.
#'
#' # Introduction
#'
#' There was no code in this section (outside of installing and loading spsurvey).
#'
#' # Spatially balanced sampling
#'
#' ## Summarizing and visualizing sampling frames
#'
#' To load the `NE_Lakes` data from spsurvey into the global environment, run
#'
## ------------------------------------------------------------------------------------------------
data("NE_Lakes", package = "spsurvey")
NE_Lakes
#'
#'
#' To make `NE_Lakes` an `sp_frame` object, run
#'
## ------------------------------------------------------------------------------------------------
NE_Lakes <- sp_frame(NE_Lakes)

#'
#'
#' To summarize lakes by elevation category, run
#'
## ------------------------------------------------------------------------------------------------
summary(NE_Lakes, formula = ~ ELEV_CAT)

#'
#'
#' To visualize lakes by elevation category, run
#'
## ------------------------------------------------------------------------------------------------
plot(NE_Lakes, formula = ~ ELEV_CAT, key.width = lcm(3), key.pos = 4)
#'
#'
#' The `key.width` argument was not included in the code in the manuscript but may be needed to properly display categorical figure legends.
#'
#' To summarize lakes by elevation category and the interaction between elevation category and area category, run
#'
## ------------------------------------------------------------------------------------------------
summary(NE_Lakes, formula = ~ ELEV_CAT + ELEV_CAT:AREA_CAT)

#'
#'
#' To visualize lakes by elevation category and the interaction between elevation category and area category (Figure 1 in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
# Figure 1

plot(NE_Lakes, formula = ~ ELEV_CAT + ELEV_CAT:AREA_CAT, 
     key.width = lcm(3), key.pos = 4)

#'
#'
#' To summarize lake elevation by area category, run
#'
## ------------------------------------------------------------------------------------------------
summary(NE_Lakes, formula = ELEV ~ AREA_CAT)

#'
#'
#' To visualize lake elevation by area category, run
#'
## ------------------------------------------------------------------------------------------------
# output not shown in manuscript
plot(NE_Lakes, formula = ELEV ~ AREA_CAT)

#'
#'
#' ## The Generalized Random Tessellation Stratified (GRTS) algorithm
#'
#' For a visual representation of the GRTS algorithm (Figure 2 in the manuscript), run
#'
## ----fig.show = "hold", out.width = "50%"--------------------------------------------------------
# Figure 2


## set some graphical parameters
ht <-  4
wid <- 4 / 3 * ht

## grid preliminaries
us_states <- map_data("state")
oregon_data <- subset(us_states, region == "oregon")
n_pop <- 5
n_samp <- 3
pop_long <- c(-122.75, -120.80, -117.42, -119.54, -122.08)
pop_lat <- c(42.37, 45.50, 44.25, 43.29, 42.25)
population_data <- data.frame(lat = pop_lat, long = pop_long)
min_long <- min(oregon_data$long)
max_long <- max(oregon_data$long)
min_lat <- min(oregon_data$lat)
max_lat <- max(oregon_data$lat)
scaled_oregon_data <- data.frame(lat = (oregon_data$lat - min_lat) / (max_lat - min_lat), long = (oregon_data$long - min_long) / (max_long - min_long))
scaled_population_data <- data.frame(lat = (population_data$lat - min_lat) / (max_lat - min_lat), long = (population_data$long - min_long) / (max_long - min_long))
low <- 0
q1 <- 0.25
mid <- 0.5
q3 <- 0.75
high <- 1
q18 <- 1 / 8
q38 <- 3 / 8
q58 <- 5 / 8
q78 <- 7 / 8

# Figure 2a     ----------------------------------------------------------------
annotate_size <- 10
grts_level1 <- ggplot() +
  geom_path(scaled_oregon_data, mapping = aes(x = long, y = lat), col = "black") +
  geom_point(data = scaled_population_data, mapping = aes(x = long, y = lat), size = 5) +
  geom_segment(aes(x = low, xend = high, y = low, yend = low), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = mid, yend = mid), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = high, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = low, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = mid, xend = mid, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = high, xend = high, y = low, yend = high), linetype = "dashed", col = "black") +
  annotate(geom = "text", x = q1, y = q1, label = "2", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q1, y = q3, label = "1", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q3, y = q3, label = "3", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q3, y = q1, label = "0", size = annotate_size, col = "black") +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank())

grts_level1

## Figure 2b     ----------------------------------------------------------------
annotate_size <- 10
grts_level2 <- ggplot() +
  geom_path(scaled_oregon_data, mapping = aes(x = long, y = lat), col = "black") +
  geom_point(data = scaled_population_data, mapping = aes(x = long, y = lat), size = 5) +
  geom_segment(aes(x = low, xend = high, y = low, yend = low), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = mid, yend = mid), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = high, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = low, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = mid, xend = mid, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = high, xend = high, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = q1, xend = q1, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = q3, xend = q3, y = low, yend = high), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = q1, yend = q1), linetype = "dashed", col = "black") +
  geom_segment(aes(x = low, xend = high, y = q3, yend = q3), linetype = "dashed", col = "black") +
  # low left
  annotate(geom = "text", x = q18, y = q18, label = "21", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q18, y = q38, label = "22", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q38, y = q18, label = "20", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q38, y = q38, label = "23", size = annotate_size, col = "black") +
  # top left
  annotate(geom = "text", x = q18, y = q58, label = "13", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q18, y = q78, label = "11", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q38, y = q58, label = "12", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q38, y = q78, label = "10", size = annotate_size, col = "black") +
  # top right
  annotate(geom = "text", x = q58, y = q18, label = "01", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q58, y = q38, label = "02", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q78, y = q18, label = "03", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q78, y = q38, label = "00", size = annotate_size, col = "black") +
  # low right
  annotate(geom = "text", x = q58, y = q58, label = "32", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q58, y = q78, label = "31", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q78, y = q58, label = "33", size = annotate_size, col = "black") +
  annotate(geom = "text", x = q78, y = q78, label = "30", size = annotate_size, col = "black") +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank())


grts_level2
# Figure 2c --------------------------------------------------------------------
x <- seq(0, n_samp, length.out = 100)
yht <- 0.4
ysep <- 0.1
yeps <- 0.05
line_data <- data.frame(x = x, y = yht)
xprobs <- 3 / 5 * 1:5
point_data <- data.frame(
  x = c(xprobs,  seq(0.9, 3, by = 1)),
  y = yht,
  Site = c(
    "Site Not Selected",
    "Site Selected",
    "Site Not Selected",
    "Site Selected",
    "Site Selected",
    "Systematic Random Sample",
    "Systematic Random Sample",
    "Systematic Random Sample"
  )
)
ls_size <- 1.2
an_size <- 10

point_data <- subset(point_data, Site != "Systematic Random Sample")

grts_line <- ggplot() +
  geom_line(line_data, mapping = aes(x = x, y = y), size = 1.2, linetype = "dotted") +
  geom_point(point_data, mapping = aes(x = x, y = y, shape = Site), size = 5) +
  scale_shape_manual(values = c(19, 17, 3)) +
  expand_limits(y = c(0, 1)) +
  labs(x = "") +
  # dashed lines from bottom to one-d line
  # geom_segment(aes(x = 0 * high, xend = 0 * high, y = low, yend = yht), linetype = "dashed", col = "black") +
  # geom_segment(aes(x = 1 * high, xend = 1 * high, y = low, yend = yht), linetype = "dashed", col = "black") +
  # geom_segment(aes(x = 2 * high, xend = 2 * high, y = low, yend = yht), linetype = "dashed", col = "black") +
  # geom_segment(aes(x = 3 * high, xend = 3 * high, y = low, yend = yht), linetype = "dashed", col = "black") +
  # first inclusion prob
  geom_segment(aes(x = 0, xend = 0, y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[1], xend = xprobs[1], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = 0, xend = xprobs[1], y = yht + ysep + yeps, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = mean(c(0, xprobs[1])), xend = mean(c(0, xprobs[1])), y = yht + ysep + yeps, yend = yht + 1.5 * ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = mean(c(0, xprobs[1])), y = yht + 2 * ysep + yeps, label = "02", col = "black", size = an_size) +
  # second inclusion prob
  geom_segment(aes(x = xprobs[1], xend = xprobs[1], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[2], xend = xprobs[2], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[1], xend = xprobs[2], y = yht + ysep + yeps, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = mean(c(xprobs[1], xprobs[2])), xend = mean(c(xprobs[1], xprobs[2])), y = yht + ysep + yeps, yend = yht + 1.5 * ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = mean(c(xprobs[1], xprobs[2])), y = yht + 2 * ysep + yeps, label = "10", col = "black", size = an_size) +
  # third inclusion prob
  geom_segment(aes(x = xprobs[2], xend = xprobs[2], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[3], xend = xprobs[3], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[2], xend = xprobs[3], y = yht + ysep + yeps, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = mean(c(xprobs[2], xprobs[3])), xend = mean(c(xprobs[2], xprobs[3])), y = yht + ysep + yeps, yend = yht + 1.5 * ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = mean(c(xprobs[2], xprobs[3])), y = yht + 2 * ysep + yeps, label = "20", col = "black", size = an_size) +
  # fourth inclusion prob
  geom_segment(aes(x = xprobs[3], xend = xprobs[3], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[4], xend = xprobs[4], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[3], xend = xprobs[4], y = yht + ysep + yeps, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = mean(c(xprobs[3], xprobs[4])), xend = mean(c(xprobs[3], xprobs[4])), y = yht + ysep + yeps, yend = yht + 1.5 * ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = mean(c(xprobs[3], xprobs[4])), y = yht + 2 * ysep + yeps, label = "21", col = "black", size = an_size) +
  # fifth inclusion prob
  geom_segment(aes(x = xprobs[4], xend = xprobs[4], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[5], xend = xprobs[5], y = yht + ysep, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = xprobs[4], xend = xprobs[5], y = yht + ysep + yeps, yend = yht + ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  geom_segment(aes(x = mean(c(xprobs[4], xprobs[5])), xend = mean(c(xprobs[4], xprobs[5])), y = yht + ysep + yeps, yend = yht + 1.5 * ysep + yeps), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = mean(c(xprobs[4], xprobs[5])), y = yht + 2 * ysep + yeps, label = "33", col = "black", size = an_size) +
  # uniform points
  annotate(geom = "text", x = 0.9, y = yht - 3 * ysep, label = expression(u[1]), size = an_size) +
  geom_segment(aes(x = 0.9, xend = 0.9, y = yht - 2.25 * ysep, yend = yht), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = 1 + 0.9, y = yht - 3 * ysep, label = expression(u[2]), size = an_size) +
  geom_segment(aes(x = 1 + 0.9, xend = 1 + 0.9, y = yht - 2.25 * ysep, yend = yht), linetype = "solid", col = "black", size = ls_size) +
  annotate(geom = "text", x = 2 + 0.9, y = yht - 3 * ysep, label = expression(u[3]), size = an_size) +
  geom_segment(aes(x = 2 + 0.9, xend = 2 + 0.9, y = yht - 2.25 * ysep, yend = yht), linetype = "solid", col = "black", size = ls_size) +
  # lines to above points
  # geom_segment(aes(x = 0.9, xend = 0.9, y = 0.42, yend = yht + ysep + yeps), linetype = "dotted", col = "black", size = 1.1) +
  # geom_segment(aes(x = 1 + 0.9, xend = 1 + 0.9, y = 0.42, yend = yht + ysep + yeps), linetype = "dotted", col = "black", size = 1.1) +
  # geom_segment(aes(x = 2 + 0.9, xend = 2 + 0.9, y = 0.42, yend = yht + ysep + yeps), linetype = "dotted", col = "black", size = 1.1) +
  # theme
  theme(axis.text.x = element_text(size = 25, face = "bold", color = "black"),
        axis.title.x = element_text(size = 25, face = "bold", color = "black"),
        axis.text.y = element_blank(),
        axis.title.y = element_blank(),
        axis.ticks.y = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(face = "bold", size = 20),
        legend.title = element_blank(),
        legend.position = c(0.5, 0.85))

grts_line

# Figure 2d            ---------------------------------------------------------
population_data$`Site` <-   c(
  "Selected",
  "Selected",
  "Selected",
  "Not Selected",
  "Not Selected"
)
grts_sample <- ggplot() +
  geom_path(oregon_data, mapping = aes(x = long, y = lat), col = "black") +
  geom_point(data = population_data, mapping = aes(x = long, y = lat, shape = `Site`), size = 5) +
  theme(axis.text = element_blank(),
        axis.title = element_blank(),
        axis.ticks = element_blank(),
        panel.grid = element_blank(),
        panel.background = element_blank(),
        legend.key = element_rect(fill = NA),
        legend.text = element_text(face = "bold", size = 20),
        legend.title = element_text(face = "bold", size = 20),
        legend.position = c(0.35, 0.5))

grts_sample

#'
#'
#'
#' To select an equal probability GRTS sample of size 50, run
#'
## ------------------------------------------------------------------------------------------------
eqprob <- grts(NE_Lakes, n_base = 50)

#'
#'
#' To select a stratified GRTS sample with 35 lakes in the low elevation category and 15 lakes in the high elevation category, run
#'
## ------------------------------------------------------------------------------------------------
n_strata <- c(low = 35, high = 15)
eqprob_strat <- grts(
  NE_Lakes,
  n_base = n_strata,
  stratum_var = "ELEV_CAT"
)

#'
#'
#' To select an unequal probability GRTS sample of size 50 with 10 expected lakes in the small area category and 40 expected lakes in the large area category, run
#'
## ------------------------------------------------------------------------------------------------
caty_n <- c(small = 10, large = 40)
uneqprob <- grts(
  NE_Lakes,
  n_base = 50,
  caty_n = caty_n,
  caty_var = "AREA_CAT"
)

#'
#'
#' To select a proportional probability (to lake area) GRTS sample of size 50
#'
## ------------------------------------------------------------------------------------------------
propprob <- grts(
  NE_Lakes,
  n_base = 50,
  aux_var = "AREA"
)

#'
#'
#' ### Legacy sites
#'
#' To select an equal probability GRTS sample of size 50 that incorporates the five legacy sites in `NE_Lakes_Legacy`, run
#'
## ------------------------------------------------------------------------------------------------
eqprob_legacy <- grts(
  NE_Lakes,
  n_base = 50,
  legacy_sites = NE_Lakes_Legacy
)

#'
#'
#' ### A minimum distance between sites
#'
#' To select an equal probability GRTS sample of size 50 with a minimum distance requirement of 1600 meters, run
#'
## ------------------------------------------------------------------------------------------------
min_d <- grts(NE_Lakes, n_base = 50, mindis = 1600)

#'
#'
#' ### Replacement sites
#'
#' To select an equal probability GRTS sample of size 50 with 10 reverse hierarchically ordered replacement sites, run
#'
## ------------------------------------------------------------------------------------------------
eqprob_rho <- grts(NE_Lakes, n_base = 50, n_over = 10)

#'
#'
#' To select an equal probability GRTS sample of size 50 with 2 nearest neighbor replacement sites (for each base site), run
#'
## ------------------------------------------------------------------------------------------------
eqprob_nn <- grts(NE_Lakes, n_base = 50, n_near = 2)

#'
#'
#' ## Summarizing, visualizing, and binding design sites
#'
#' To visualize `eqprob_rho` (Figure 3a in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
# Figure 3a
plot(eqprob_rho, key.width = lcm(3), key.pos = 4)

#'
#'
#' To visualize `eqprob_rho` for only the base sites (Figure 3b in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
# Figure 3b
plot(eqprob_rho, siteuse = "Base", key.width = lcm(3), key.pos = 4)
#'
#'
#' To summarize the design sites by elevation category, run
#'
## ------------------------------------------------------------------------------------------------
summary(eqprob_rho, formula = siteuse ~ ELEV_CAT)

#'
#'
#' To visualize the design sites by elevation category, run
#'
## ------------------------------------------------------------------------------------------------
# output not shown in manuscript
plot(eqprob_rho, formula = siteuse ~ ELEV_CAT, key.width = lcm(3))

#'
#'
#' To summarize lake area by the design site categories, run
#'
## ------------------------------------------------------------------------------------------------
summary(eqprob_rho, formula = AREA ~ siteuse)

#'
#'
#' To visualize lake area by the design site categories, run
#'
## ------------------------------------------------------------------------------------------------
# output not shown in manuscript
plot(eqprob_rho, formula = AREA ~ siteuse)

#'
#'
#' To bind the design sites together, run
#'
## ------------------------------------------------------------------------------------------------
sites_bind <- sp_rbind(eqprob_rho)

#'
#'
#' ## Printing design sites
#'
#' To print a design stratified by lake elevation category with legacy sites, reverse hierarchically ordered replacement sites, and nearest neighbor replacement sites, run
#'
## ------------------------------------------------------------------------------------------------
n_strata <- c(low = 10, high = 10)
n_over_strata <- c(low = 2, high = 5)
print(grts(
  NE_Lakes,
  n_base = n_strata,
  stratum_var = "ELEV_CAT",
  legacy_sites = NE_Lakes_Legacy,
  n_over = n_over_strata,
  n_near = 1
))

#'
#'
#' ## Measuing spatial balance
#'
#' To compute the spatial balance of the design sites in `eqprob`, run
#'
## ------------------------------------------------------------------------------------------------
sp_balance(eqprob$sites_base, NE_Lakes) # grts

#'
#'
#' To select an equal probability IRS sample of size 50, run
#'
## ------------------------------------------------------------------------------------------------
set.seed(5) # resetting seed to match results in the manuscript
eqprob_irs <- irs(NE_Lakes, n_base = 50)

#'
#'
#' To compute the spatial balance of the design sites in `eqprob_irs`, run
#'
## ------------------------------------------------------------------------------------------------
sp_balance(eqprob_irs$sites_base, NE_Lakes) # irs

#'
#' ## Linear and areal sampling frames
#'
#' To select and visualize an equal probability GRTS sample of size 25 from `Illinois_River`, run
#'
## ------------------------------------------------------------------------------------------------
set.seed(5)
eqprob_linear <- grts(Illinois_River, n_base = 25)
plot(eqprob_linear, sframe = Illinois_River, pch = 19, key.pos = 4, key.width = lcm(3.2))

#'
#' To select and visualize an equal probability GRTS sample of size 25 from `Lake_Ontario`, run
## ------------------------------------------------------------------------------------------------
set.seed(5)
eqprob_areal <- grts(Lake_Ontario, n_base = 40)
plot(eqprob_areal, sframe = Lake_Ontario, pch = 19, key.pos = 4, key.width = lcm(3.2))

#'
#' # Analysis
#'
#' ## Categorical variable analysis
#'
#' To load `NLA_PNW` into your global environment (this data set is in the supplementary R package, not spsurvey), run
#'
## ------------------------------------------------------------------------------------------------
data("NLA_PNW", package = "spsurvey")

#'
#'
#' To analyze nitrogen condition, run
#'
## ------------------------------------------------------------------------------------------------
nitr <- cat_analysis(
  NLA_PNW,
  vars = "NITR_COND",
  weight = "WEIGHT"
)

#'
#'
#' To print a subset of the proportion output, run
#'
## ------------------------------------------------------------------------------------------------
subset(nitr, select = c(Category, nResp, Estimate.P, LCB95Pct.P, UCB95Pct.P))

#'
#'
#' To print a subset of the total (`.U` stands for "total units") output, run
#'
## ------------------------------------------------------------------------------------------------
subset(nitr, select = c(Category, nResp, Estimate.U, LCB95Pct.U, UCB95Pct.U))

#'
#'
#' To analyze nitrogen condition separately for each state, run
#'
## ------------------------------------------------------------------------------------------------
nitr_subpop <- cat_analysis(
  NLA_PNW,
  vars = "NITR_COND",
  subpops = "STATE",
  weight = "WEIGHT"
)

#'
#'
#' To print a subset of the total output for Oregon lakes, run
#'
## ------------------------------------------------------------------------------------------------
subset(
  nitr_subpop,
  subset = Subpopulation == "Oregon",
  select = c(Subpopulation, Category, nResp, Estimate.U, LCB95Pct.U, UCB95Pct.U)
)

#'
#'
#' To analyze nitrogen category stratified by urban category, run
#'
## ------------------------------------------------------------------------------------------------
nitr_strat <- cat_analysis(
  NLA_PNW,
  vars = "NITR_COND",
  stratumID = "URBAN",
  weight = "WEIGHT"
)

#'
#'
#' To analyze nitrogen category separately for each state and stratified by urban category, run
#'
## ------------------------------------------------------------------------------------------------
nitr_strat_subpop <- cat_analysis(
  NLA_PNW,
  vars = "NITR_COND",
  subpops = "STATE",
  stratumID = "URBAN",
  weight = "WEIGHT"
)

#'
#'
#' ## Continuous variable analysis
#'
#' To analyze BMMI, run
#'
## ------------------------------------------------------------------------------------------------
bmmi <- cont_analysis(
  NLA_PNW,
  vars = "BMMI",
  weight = "WEIGHT"
)

#'
#'
#' To print a subset of the mean output, run
#'
## ------------------------------------------------------------------------------------------------
subset(bmmi$Mean, select = c(Indicator, nResp, Estimate, LCB95Pct, UCB95Pct))

#'
#'
#' To visualize the cumulative distribution function (Figure 4 in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
# Figure 4
plot(bmmi$CDF)
#'
#'
#' To analyze BMMI separately for each state, run
#'
## ------------------------------------------------------------------------------------------------
bmmi_state <- cont_analysis(
  NLA_PNW,
  vars = "BMMI",
  subpops = "STATE",
  weight = "WEIGHT"
)

#'
#'
#' To print mean output for each state, run
#'
## ------------------------------------------------------------------------------------------------
subset(
  bmmi_state$Mean,
  select = c(Subpopulation, Indicator, nResp, Estimate, LCB95Pct, UCB95Pct)
  )

#'
#'
#' To analyze BMMI stratified by urban category, run
#'
## ------------------------------------------------------------------------------------------------
bmmi_strat <-  cont_analysis(
  NLA_PNW,
  vars = "BMMI",
  stratumID = "URBAN",
  weight = "WEIGHT"
)

#'
#'
#' To analyze BMMI separately for each state and stratified by urban category, run
#'
## ------------------------------------------------------------------------------------------------
bmmi_strat_state <-  cont_analysis(
  NLA_PNW,
  vars = "BMMI",
  subpops = "STATE",
  stratumID = "URBAN",
  weight = "WEIGHT"
)

#'
#'
#' ## Additional analysis approaches
#'
#' There was no code in this section.
#'
#' # Application
#'
#' To load `NLA12` from the spsurvey.manuscript supplementary R package into your global environment, run
#'
## ------------------------------------------------------------------------------------------------
#  data("NLA12") # can be loaded directly if downloading data from the online JSS submission
load("NLA12.rda")
#'
#'
#' To turn `NLA12` into an `sp_frame` object, run
#'
## ------------------------------------------------------------------------------------------------
NLA12 <- sp_frame(NLA12)

#'
#'
#' To summarize Atrazine presence and BMMI, run
#'
## ------------------------------------------------------------------------------------------------
summary(NLA12, formula = ~ AP + BMMI)

#'
#'
#' To visualize Atrazine presence and BMMI (Figure 5), run
#'
## ------------------------------------------------------------------------------------------------
plot(NLA12, formula = ~ AP + BMMI, key.width = lcm(3), key.pos = 4)

#'
#'
#' To store the function that performs a simulation trial, run
#'
## ------------------------------------------------------------------------------------------------
# write function
perform_sim <- function(seed, data, variable, sample_size, cat, ...) {

  # set reproducible seed
  set.seed(seed)

  # set new data
  data <- data[, variable, drop = FALSE] %>%
    na.omit()

  # grts process
  grts_design <- grts(data, n_base = sample_size, ...)
  grts_spbalance <- sp_balance(grts_design$sites_base, data, ...)

  # irs process
  irs_design <- irs(data, n_base = sample_size, ...)
  irs_spbalance <- sp_balance(irs_design$sites_base, data, ...)

  if (cat) { # short for categorical variable

    # take true mean
    true_mean <- sum(data[[variable]] == "Y", na.rm = TRUE) / nrow(data)

    # use spsurvey to analyze data
    grts_analysis <- cat_analysis(
      grts_design$sites_base,
      siteID = "siteID",
      vars = variable,
      weight = "wgt",
      ...
    )

    # filter the analysis output and compute summary statistics
    grts_output <- grts_analysis %>%
      filter(Category == "Y") %>%
      select(Estimate.P, StdError.P, LCB95Pct.P, UCB95Pct.P) %>%
      mutate(
        Estimate.P = Estimate.P / 100,
        StdError.P = StdError.P / 100,
        LCB95Pct.P = LCB95Pct.P / 100,
        UCB95Pct.P = UCB95Pct.P / 100,
        spb = grts_spbalance[["value"]],
        algorithm = "GRTS",
        true_mean = true_mean,
        cover = 1 * (LCB95Pct.P <= true_mean & true_mean <= UCB95Pct.P),
        seed = seed
      )

    # use spsurvey to analyze data
    irs_analysis <- cat_analysis(
      irs_design$sites_base,
      siteID = "siteID",
      vars = variable,
      weight = "wgt",
      vartype = "SRS",
      fpc = nrow(data),
      ...
    )

    # filter the analysis output and compute summary statistics
    irs_output <- irs_analysis %>%
      filter(Category == "Y") %>%
      select(Estimate.P, StdError.P, LCB95Pct.P, UCB95Pct.P) %>%
      mutate(
        Estimate.P = Estimate.P / 100,
        StdError.P = StdError.P / 100,
        LCB95Pct.P = LCB95Pct.P / 100,
        UCB95Pct.P = UCB95Pct.P / 100,
        spb = irs_spbalance[["value"]],
        algorithm = "SRS",
        true_mean = true_mean,
        cover = 1 * (LCB95Pct.P <= true_mean & true_mean <= UCB95Pct.P),
        seed = seed
      )

  } else {

    # take true mean
    true_mean <- mean(data[[variable]], na.rm = TRUE)


    # use spsurvey to analyze data
    grts_analysis <- cont_analysis(
      grts_design$sites_base,
      siteID = "siteID",
      vars = variable,
      weight = "wgt",
      statistics = "Mean",
      ...
    )

    # filter the analysis output and compute summary statistics
    grts_output <- grts_analysis$Mean %>%
      select(Estimate, StdError, LCB95Pct, UCB95Pct) %>%
      mutate(
        spb = grts_spbalance[["value"]],
        algorithm = "GRTS",
        true_mean = true_mean,
        cover = 1 * (LCB95Pct <= true_mean & true_mean <= UCB95Pct),
        seed = seed
      )


    # use spsurvey to analyze data
    irs_analysis <- cont_analysis(
      irs_design$sites_base,
      siteID = "siteID",
      vars = variable,
      weight = "wgt",
      vartype = "SRS",
      fpc = nrow(data),
      statistics = "Mean",
      ...
    )

    # filter the analysis output and compute summary statistics
    irs_output <- irs_analysis$Mean %>%
      select(Estimate, StdError, LCB95Pct, UCB95Pct) %>%
      mutate(
        spb = irs_spbalance[["value"]],
        algorithm = "SRS",
        true_mean = true_mean,
        cover = 1 * (LCB95Pct <= true_mean & true_mean <= UCB95Pct),
        seed = seed
      )

  }

  # combine the results
  output <- rbind(grts_output, irs_output)

  # null row names (personal preference)
  row.names(output) <- NULL

  # return output
  output
}

#'
#'
#' WARNING Running all simulation trials in parallel may take 10 - 60 minutes. Results can instead obtained by loading the `.rda` files included in the online JSS submission or by running (after removing comments)
#'
## ----eval = FALSE--------------------------------------------------------------------------------
## ## load("cat_results.rda") # categorical variable results (Atrazine)
## ## load("cat_summaries.rda") # categorical variable summaries (Atrazine)
## ## load("cont_results.rda") # continuous variable results (BMMI)
## ## load("cont_summaries.rda") # continuous variable summaries (BMMI)
## ##
## ## # printing the summaries
## ## cat_summaries
## ## cont_summaries

#'
#'
#' To perform the 2000 simulation trials in parallel, run
#'
## ----results = "hide"----------------------------------------------------------------------------
# find cores and set cluster
n_clusters <- detectCores()
cluster <- makeCluster(n_clusters)

# run the simulation
clusterEvalQ(cluster, library("dplyr"))
clusterEvalQ(cluster, library("spsurvey"))

# set seeds and sample sizes
seeds <- 1:2000 # the seeds also represent the trial number
sample_size <- 250

# cat results
cat_results <- parLapply(cluster, seeds, perform_sim, NLA12, "AP", sample_size, cat = TRUE) %>%
  bind_rows()

## cont results
cont_results <- parLapply(cluster, seeds, perform_sim, NLA12, "BMMI", sample_size, cat = FALSE) %>%
  bind_rows()

## stop the cluster
stopCluster(cluster)

#'
#'
#' To compute the Atrazine presence summary, run
#'
## ------------------------------------------------------------------------------------------------
cat_summaries <- cat_results %>%
    group_by(algorithm) %>%
    summarize(
      mbias = mean(Estimate.P - true_mean),
      mstderr = mean(StdError.P),
      rmse = sqrt(mean((Estimate.P - true_mean)^2)),
      coverage = mean(cover),
      msbp = mean(spb)
    )

#'
#'
#' To view the Atrazine presence summary, run
#'
## ------------------------------------------------------------------------------------------------
cat_summaries %>%
  as.data.frame() %>%
  print()

#'
#'
#' To generate a LaTeX table from the Atrazine presence summary (Table 1 in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
names(cat_summaries) <- c("Algorithm", "Bias", "Std. Err.", "RMSE", "Coverage", "SPB")
alpha <- 0.05
cat_summaries$MOE <- qnorm(1 - alpha / 2) * cat_summaries$`Std. Err.`
cat_summaries <- cat_summaries[, c("Algorithm", "SPB", "Bias", "RMSE", "Coverage", "MOE"), drop = FALSE]
cat_summaries_table <- xtable(cat_summaries, digits = 4, type = "latex", latex.environments = "center")
print(cat_summaries_table, include.rownames = FALSE, comment = FALSE)

#'
#'
#' To view the Atrazine presence spatial balance values (Figure 6a in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
ggplot(cat_results, mapping = aes(x = algorithm, y = spb)) +
  geom_boxplot() +
  labs(x = "Algorithm", y = "Spatial Balance") +
  theme_minimal_grid()

#'
#'
#' To view the Atrazine presence margin of error values (Figure 6b in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
ggplot(cat_results, mapping = aes(x = algorithm, y = 1.96 * StdError.P)) +
  geom_boxplot() +
  labs(x = "Algorithm (Variance Estimator)", y = "Margin of Error") +
  scale_x_discrete(labels = c("GRTS (Local)", "SRS (SRS)")) +
  theme_minimal_grid()

#'
#'
#' To compute the BMMI summary, run
#'
## ------------------------------------------------------------------------------------------------
cont_summaries <- cont_results %>%
  group_by(algorithm) %>%
  summarize(
    mbias = mean(Estimate - true_mean),
    mstderr = mean(StdError),
    rmse = sqrt(mean((Estimate - true_mean)^2)),
    coverage = mean(cover),
    msbp = mean(spb)
  )

#'
#'
#' To view the BMMI summary, run
#'
## ------------------------------------------------------------------------------------------------
cont_summaries %>%
  as.data.frame() %>%
  print()

#'
#'
#' To generate a LaTeX table from the BMMI summary (Table 2 in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
names(cont_summaries) <- c("Algorithm", "Bias", "Std. Err.", "RMSE", "Coverage", "SPB")
alpha <- 0.05
cont_summaries$MOE <- qnorm(1 - alpha / 2) * cont_summaries$`Std. Err.`
cont_summaries <- cont_summaries[, c("Algorithm", "SPB", "Bias", "RMSE", "Coverage", "MOE"), drop = FALSE]
cont_summaries_table <- xtable(cont_summaries, digits = 4, type = "latex", latex.environments = "center")
print(cont_summaries_table, include.rownames = FALSE, comment = FALSE)

#'
#'
#' To view the BMMI spatial balance values (Figure 7a in the manuscript), run
#'
## ------------------------------------------------------------------------------------------------
ggplot(cont_results, mapping = aes(x = algorithm, y = spb)) +
  geom_boxplot() +
  labs(x = "Algorithm", y = "Spatial Balance") +
  theme_minimal_grid()

#'
#'
#' To view the BMMI margin of error values, run
#'
## ------------------------------------------------------------------------------------------------
ggplot(cont_results, mapping = aes(x = algorithm, y = 1.96 * StdError)) +
  geom_boxplot() +
  labs(x = "Algorithm (Variance Estimator)", y = "Margin of Error") +
  scale_x_discrete(labels = c("GRTS (Local)", "SRS (SRS)")) +
  theme_minimal_grid()

#'
#'
#' # Discussion
#'
#' There was no code in this section.
#'
#' # Session information {.unnumbered}
#'
#' To view the session information used to compile this document, run
#'
## ------------------------------------------------------------------------------------------------
sessionInfo()

#'