# Replication code for qqconf
# Dependencies for running the replication script can be installed via 
# install.packages(c("qqconf", "robustbase", "markdown", "knitr", "ggplot2"))

####### Section 2.4 #######

# Code to verify the following statement from lines 9-11 of p.6 of the paper regarding the case alpha=.01:
# "For example, for n in the range of 15K to 500K, the absolute relative error
# in the approximation based on c_alpha = 1.6 is always more than .0067, while
# that based on c_alpha = 1.591 is always less than .001."

# To replicate this result, we consider local levels for alpha = .01
# for a grid of sample sizes n from 15K to 500K, and compare
# (1) exact value, (2) old asymptotic approximation (with c_alpha = 1.6), and
# (3) new asymptotic approximation (with c_alpha = 1.591), and then we show that
# the absolute relative error of (2) as an approximation of (1) is greater than .0067 on this grid,
# while the absolute relative error of (3) as an approximation of (1) is less than .001 on this grid.

# We define our grid of sample sizes: (15K, 50K, 100K, 150K, ..., 500K)

grid <- c(15, c(1:10) * 50) * 10 ^ 3

# For the grid of sample sizes, the corresponding vector of local levels for global level alpha=.01 is

loc_lev <- c(9.822814e-05, 8.200296e-05, 7.476107e-05, 7.105596e-05,
            6.862968e-05, 6.685227e-05, 6.546293e-05, 6.432979e-05,
            6.337766e-05, 6.255951e-05, 6.184428e-05)

# To demonstrate the correctness of these values, we calculate the global level
# for each case and verify that it is equal to .01 (up to tolerance 1e-06).

glob_lev <- c(rep(0, 11))
for(i in c(1:11)) {

  n <- grid[i]
  eta <- loc_lev[i]
  lbs <- qbeta(eta / 2, c(1:n), c(n:1))
  ubs <- qbeta(1 - eta / 2, c(1:n), c(n:1))
  glob_lev[i] <- qqconf::get_level_from_bounds_two_sided(lbs, ubs)

}

# We verify that the resulting relative errors are all less than 1e-6 in absolute value:

rel_err <- (glob_lev - .01) / .01
range(signif(abs(rel_err), digits = 3))

# All the global levels are equal to .01 (up to relative tolerance 1e-06),  so the values
# in loc_lev are verified.

# The old and new asymptotic approximations are both based on equation (5) of the paper with
# different choices of c_alpha.
# We use the following function to calculate the value from equation (5):

asym_approx <- function(alpha, n, c_alpha){
  -log(1 - alpha) / 2 / log(n) / log(log(n)) * (1 - c_alpha * log(log(log(n))) / log(log(n)))
}

# We obtain the local levels by the old and new asymptotic approximations for each of the
# sample sizes in the grid

local_levels_old_asym = c(rep(0, 11))
local_levels_new_asym = c(rep(0, 11))
for(i in c(1:11)) {
  local_levels_old_asym[i] <- asym_approx(.01, grid[i], 1.6)
  local_levels_new_asym[i] <- asym_approx(.01, grid[i], 1.591)
}

# Range of absolute relative errors for each of these approximations

range(abs((local_levels_old_asym - loc_lev) / loc_lev))
range(abs((local_levels_new_asym - loc_lev) / loc_lev))

# From the above, we can see that the absolute relative error in the old
# approximation (based on c_alpha = 1.6) for these values is more than .0067,
# while that in the new approximation (based on c_alpha = 1.591) is less than .001.

####### Section 2.6 #######
# The simulation results in Table 1 were generated using the following code.
# This code produces two tables: One of simulated means and the other of simulated standard errors
# In our paper these tables are combined for convenience
assess_type1_error <- function(n, alpha, nrep){

  tmp <- qqconf::get_bounds_two_sided(alpha, n)
  lb <- tmp$lower_bound
  ub <- tmp$upper_bound

  results <- c(rep(0, 5))
  z <- matrix(c(rep(0, 5 * n)), n, 5)
  tst <- c(rep(0, 5))

  for(i in 1:nrep){
    x <- rnorm(n)
    x <- sort(x)
    mn <- mean(x)
    md <- median(x)
    s1 <- sd(x)
    s2 <- mad(x)
    s3 <- robustbase::Qn(x)
    s4 <- robustbase::Sn(x)

    z[, 1] <- pnorm((x - mn) / s1)
    z[, 2] <- pnorm((x - md) / s2)
    z[, 3] <- pnorm((x - md) / s3)
    z[, 4] <- pnorm((x - md) / s4)
    z[, 5] <- pnorm(x)

    for(j in 1:5) tst[j] <- sum(as.numeric(z[, j] < lb)) + sum(as.numeric(z[, j] > ub))

    for(j in c(1:5)) results[j] <- results[j] + as.numeric(tst[j] > .5)
  }

  return(results / nrep)
}

set.seed(3141592)
n <- 100
alpha <- .05
nrep <- 10 ^ 4
sim1 <- assess_type1_error(n, alpha, nrep)
sim1sds <- sqrt(sim1 * (1 - sim1) / nrep)
n <- 500
alpha <- .05
nrep <- 10 ^ 4
sim2 <- assess_type1_error(n, alpha, nrep)
sim2sds <- sqrt(sim2 * (1 - sim2) / nrep)
n <- 10 ^ 4
alpha <- .05
nrep <- 10 ^ 4
sim3 <- assess_type1_error(n, alpha, nrep)
sim3sds <- sqrt(sim3 * (1 - sim3) / nrep)

sim_mat <- matrix(c(sim1, sim2, sim3), nrow = 3, byrow = TRUE)
sim_sd_mat <- matrix(c(sim1sds, sim2sds, sim3sds), nrow = 3, byrow = TRUE)

sim_df <- as.data.frame(
  sim_mat,
  row.names = c("100", "500", "10,000")
)

colnames(sim_df) <- c("sample sd", "MAD", "Qn", "Sn", "true")

knitr::kable(sim_df, "html", caption = "Empirical Type 1 error when using", digits = 4)

sim_sd_df <- as.data.frame(
  sim_sd_mat,
  row.names = c("100", "500", "10,000")
)

colnames(sim_sd_df) <- c("sample sd se", "MAD se", "Qn se", "Sn se", "true se")

knitr::kable(sim_sd_df, "html", caption = "Empirical Type 1 error se when using", digits = 4)


####### Section 3.1 #######

# Generate data
set.seed(20)
n <- 100
x <- runif(n)
eta <- rt(n, df = 3)
y <- x + eta

# Fit regression
reg <- lm(y ~ x)

# Figure 1
qnorm_plot <- qqnorm(reg$residuals)
qqline(reg$residuals)

# Figure 2
qqconf::qq_conf_plot(
  obs = reg$residuals,
  points_params = list(col = "blue", pch = 20, cex = .5)
)

# Figure 3
band <- qqconf::get_qq_band(obs = reg$residuals)
plot(
  qnorm_plot,
  col = "blue",
  pch = 20,
  cex = .5,
  xlab = "Expected quantiles",
  ylab = "Observed quantiles"
)

lines(sort(qnorm_plot$x), band$lower_bound, col = "red")
lines(sort(qnorm_plot$x), band$upper_bound, col = "red")

qqline(
  qnorm_plot$x,
  datax = TRUE,
  distribution = function(p) qnorm(
    p, mean = band$dparams$mean, sd = band$dparams$sd
  )
)

# Figure 4
band_df <- data.frame(
  lower = band$lower_bound,
  upper = band$upper_bound,
  obs = reg$residuals
)

build_plot <- ggplot2::ggplot_build(
  ggplot2::ggplot(data = band_df, mapping = ggplot2::aes(sample = obs)) +
    qqplotr::stat_qq_point(dparams = band$dparams)
)

band_df$expected <- build_plot$data[[1]]$x

ggplot2::ggplot(data = band_df, mapping = ggplot2::aes(sample = obs)) +
  ggplot2::geom_ribbon(
    ggplot2::aes(ymin = lower, ymax = upper, x = expected),
    fill = "grey80"
  ) +
  qqplotr::stat_qq_line(dparams = band$dparams, identity = TRUE) +
  qqplotr::stat_qq_point(
    dparams = band$dparams, 
    color = "blue", 
    size = .5
  ) +
  ggplot2::xlab("Expected quantiles") +
  ggplot2::ylab("Observed quantiles")

# Figure 5
qqconf::qq_conf_plot(
  obs = reg$residuals,
  method = "ks",
  points_params = list(col = "blue", pch = 20, cex = .5)
)

####### Section 3.2 #######
set.seed(8675309)
# Generate contingency tables for variables as defined in section 3.1
generate_contingency_tables <- function(n, s, a, b) {

  cell_probs <- c(a * b, a * (1 - b), b * (1 - a), (1 - b) * (1 - a))
  num_sims <- 0
  sim_mat <- matrix(NA, nrow = n, ncol = 4)

  # Generate n simulations and append the contingency tables to a matrix
  while (num_sims < n) {

    proposed_sim <- t(rmultinom(n = 1, size = s, prob = cell_probs))

    # Check to make sure that the chi-square statistic will be defined
    if (proposed_sim[, 1] + proposed_sim[, 2] != 0 &&
        proposed_sim[, 1] + proposed_sim[, 3] != 0 &&
        proposed_sim[, 3] + proposed_sim[, 4] != 0 &&
        proposed_sim[, 4] + proposed_sim[, 2]) {

      num_sims <- num_sims + 1
      sim_mat[num_sims, ] <- proposed_sim

    }

  }

  return(sim_mat)

}

scenario_1 <- generate_contingency_tables(n = 1000, s = 200, a = .15, b = .4)
scenario_2 <- generate_contingency_tables(n = 1000, s = 20, a = .15, b = .4)

# Compute Pearson Chi-Square Test p-values for Set of Contingency Tables
get_p_values_from_c_tables <- function(c_tables, s) {

  chisq_numerator <- (
    (c_tables[, 1] * c_tables[, 4] - c_tables[, 2] * c_tables[, 3]) / s
  ) ^ 2
  chisq_denominator <- (c_tables[, 1] + c_tables[, 2]) *
    (c_tables[, 1] + c_tables[, 3]) *
    (c_tables[, 3] + c_tables[, 4]) *
    (c_tables[, 2] + c_tables[, 4]) /
    (s ^ 3)
  chisq_stats_vec <- chisq_numerator / chisq_denominator
  p_vals <- pchisq(q = chisq_stats_vec, df = 1, lower.tail = FALSE)
  return(p_vals)

}

pvals_scenario_1 <- get_p_values_from_c_tables(scenario_1, s = 200)
pvals_scenario_2 <- get_p_values_from_c_tables(scenario_2, s = 20)
# For the manuscript, these above two vectors of p-values are loaded in as data

par(pty = "s")

# Figure 6
qqconf::qq_conf_plot(
  obs = pvals_scenario_2,
  distribution = qunif,
  points_params = list(
    col = palette.colors(palette = "Okabe-Ito")["vermillion"],
    type="l"
  )
)

qqconf::qq_conf_plot(
  obs = pvals_scenario_1,
  distribution = qunif,
  points_params = list(
    col = palette.colors(palette = "Okabe-Ito")["blue"],
    type="l"
  ),
  add = TRUE,
  asp = 1
)

legend(
  "topleft",
  legend = c("s=200", "s=20"),
  col = c(
    palette.colors(palette = "Okabe-Ito")["blue"], 
    palette.colors(palette = "Okabe-Ito")["vermillion"]
  ),
  lty = 1
)

# Figure 7
qqconf::qq_conf_plot(
  obs = pvals_scenario_2,
  distribution = qunif,
  points_params = list(
    col = palette.colors(palette = "Okabe-Ito")["vermillion"],
    type = "l"
  ),
  log10 = TRUE,
  asp = 1
)

qqconf::qq_conf_plot(
  obs = pvals_scenario_1,
  distribution = qunif,
  points_params = list(
    col = palette.colors(palette = "Okabe-Ito")["blue"], 
    type = "l"
  ),
  log10 = TRUE,
  add = TRUE
)

legend(
  "topleft",
  legend = c("s=200", "s=20"),
  col = c(
    palette.colors(palette = "Okabe-Ito")["blue"], 
    palette.colors(palette = "Okabe-Ito")["vermillion"]
  ),
  lty = 1
)

par(pty = "m")

####### Section 3.3 #######

# Read in data
cjd_df <- read.table("Data/cjd_sample.txt", header = TRUE)

# Figure 8
qqconf::qq_conf_plot(
  obs = cjd_df[,3],
  distribution = qunif,
  points_params = list(pch = 21, cex = 0.2),
  difference = TRUE
)

# Figure 9
qqconf::qq_conf_plot(
  obs = cjd_df[,3],
  distribution = qunif,
  points_params = list(pch = 21, cex = 0.2),
  log10 = TRUE
)

# Figure 10
qqconf::qq_conf_plot(
  obs = cjd_df[,3],
  distribution = qunif,
  points_params = list(pch = 21, cex = 0.2),
  difference = TRUE,
  log10 = TRUE,
  ylim = c(-0.2, 1.1)
)

sessionInfo()
