#### Header ####

## File: R input code associated to the JSS paper "vsgoftest: An R package..."
## Date: 18.08.30
## Author: philippe.regnault(at)univ-reims.fr

#### Installation instructions ####

## Installing missing packages.
## Uncomment and execute to proceed
# reqrd.pkg <- c("vsgoftest", "dbEmpLikeGOF", "devtools", "DescTools",
#                "ggplot2", "dplyr", "microbenchmark", "goftest")
# for (pkg in reqrd.pkg) {
#   if (!requireNamespace(pkg, quietly = TRUE)) {
#     install.packages(pkg)
#   }
# }

## Uncomment the following line to install the vsgoftest package from github
## devtools::install_github(repo = "pregnault/vsgoftest")

#### Estimating entropy ####
library("vsgoftest") 
set.seed(2) 
samp <- rnorm(n = 100, mean = 0, sd = 1) # Sample drawn from normal distribution
entropy.estimate(x = samp, window = 8)  # Estimated entropy
log(2 * pi * exp(1))/2 # The exact value of the entropy of the normal
# distribution with mean 0 and standard deviation 1

## ------------------------------------------------------------------------
## Compute estimates for several window sizes
sapply(1:10, function(w) entropy.estimate(x = samp, window = w)) 

## ------------------------------------------------------------------------
## Select window size maximizing the entropy estimate (maximum entropy principle)
n <- 100
V <- sapply(1:(n/2 - 1), function(w) entropy.estimate(x = samp, window = w))
which.max(V)

## ------------------------------------------------------------------------
set.seed(5)
n <- 100
samp <- rpareto(n, c = 1, mu = 2) # Sample drawn from Pareto distribution
entropy.estimate(x = samp, window = 3)
-log(2) + 3/2 # Exact value of the Pareto Distribution with parameters c=1, mu=2

#### Testing GOF of specified (family of) distributions ####
## ------------------------------------------------------------------------
set.seed(5)
samp <- rnorm(50,2,3)
vs.test(x = samp, densfun = "dlaplace")

## ------------------------------------------------------------------------
set.seed(4)
vs.test(x = samp, densfun = "dnorm")

## ------------------------------------------------------------------------
set.seed(26)
vs.test(x = samp, densfun = "dnorm", param = c(2,3))

## 
set.seed(2)
samp <- rnorm(50, -2, 1)
try(vs.test(samp, densfun = "dnorm", param = -2))
# Error message: -2 is not a suitable parameter for the normal distribution

## ------------------------------------------------------------------------
set.seed(1)
samp <- rweibull(200, shape = 1.05, scale = 1)
set.seed(2)
# MC simulations for computing the p-value
vs.test(samp, densfun = "dexp", simulate.p.value = TRUE, B = 10000)

## ------------------------------------------------------------------------
set.seed(63)
vs.test(samp, densfun = "dexp", delta = 5/30) # Change upper bound for window size

## ------------------------------------------------------------------------
set.seed(8)
samp <- rexp(30, rate = 3)
vs.test(x = samp, densfun = "dlnorm") # Log_normality not rejected
vs.test(x = samp, densfun = "dlnorm", extend = TRUE) # upper bound for window
# size extended to half of the sample; log_normality rejected

## ------------------------------------------------------------------------
samp <- c(samp, rep(4,3)) # Add ties to the sample
try(vs.test(x = samp, densfun = "dexp"))
# Error: too many ties to perform VS test

## ------------------------------------------------------------------------
vs.test(x = samp, densfun = "dexp", extend = TRUE) 

## ------------------------------------------------------------------------
## An example of sample for which VS test can not be applied to
set.seed(84)
ech <- rpareto(20, mu = 1/2, c = 1)
try(vs.test(x = ech, densfun = "dpareto", param = c(1/2, 1)))

## ------------------------------------------------------------------------
data("contaminants", package = "vsgoftest")
set.seed(1)
vs.test(x = aluminium2, densfun = "dpareto")


#### Power comparisons ####
## The execution of these code chunks takes a pretty big amount of time.
## The following chunks yield table 3 of the paper
library("goftest")
lstn <- c(20, 30, 50, 100) # List of samples sizes
N <- 10000 # Number of replicates for MC simulations

## Pareto against log-normal: auxiliary function for simulating a sample,
## applying GOF tests and gathering their pvalues
pop <- function(n, m = 0, s) {
  ech <- 1 + rlnorm(n, meanlog = m, sdlog = s)
  pvs <- vs.test(x = ech, densfun = "dpareto", param = c(1/s, 1), B = 1000)$p.value
  pks <- ks.test(x = ech, y = "ppareto", mu = 1/s, c = 1)$p.value
  pad <- ad.test(x = ech, null = "ppareto", mu = 1/s, c = 1)$p.value
  pcvm <- cvm.test(x = ech, null = "ppareto", mu = 1/s, c = 1)$p.value
  return(c(pvs, pks, pad, pcvm))
}

## Pareto with c = 1, mu = 1 vs. shifted log-normal with meanlog = 0, sdlog = 1
## WARNING: the execution of the following "for" loop may take a couple of
## hours on a standard laptop

mu <- 1
powers1 <- matrix(0, nrow = 4, ncol = length(lstn))
set.seed(54)
t <- Sys.time()
for (i in seq_along(lstn)) {
  res.pow <- replicate(n = N, expr = pop(lstn[i], 0, 1/mu))
  powers1[i,] <- apply(res.pow < 0.05, 1, mean)
}
powers1
Sys.time() - t

## Pareto with c = 1, mu = .8 vs. shifted log-normal with meanlog = 0, sdlog = 1.25
## WARNING: the execution of the following "for" loop may take a couple of
## hours on a standard laptop
mu <- 4/5
powers2 <- matrix(0, nrow = 4, ncol = length(lstn))
set.seed(32)
for (i in seq_along(lstn)) {
  res.pow <- replicate(n = N, expr = pop(lstn[i], 0, 1/mu))
  powers2[i,] <- apply(res.pow < 0.05, 1, mean)
}
powers2
## Exponential vs Weibull
## Auxiliary function
pop <- function(n, sh) {
  ech <- rweibull(n, shape = sh, scale = 1)
  pvs <- vs.test(x = ech, densfun = "dexp", param = 1, B = 1000)$p.value
  pks <- ks.test(x = ech, y = "pexp", rate = 1)$p.value
  pad <- ad.test(x = ech, null = "pexp", rate = 1)$p.value
  pcvm <- cvm.test(x = ech, null = "pexp", rate = 1)$p.value
  return(c(pvs, pks, pad, pcvm))
}

## Exp 1/2 vs Weibull(1.2, 2)
## WARNING: the execution of the following "for" loop may take a couple of
## hours on a standard laptop
powers3 <- matrix(0, nrow = 4, ncol = length(lstn))
set.seed(12)
for (i in seq_along(lstn)) {
  res.pow <- replicate(n = N, expr = pop(lstn[i], 1.2))
  powers3[i,] <- apply(res.pow < 0.05, 1, mean)
}
powers3
## Exp 1/2 vs Weibull(1.3, 2)
## WARNING: the execution of the following "for" loop may take a couple of
## hours on a standard laptop
powers4 <- matrix(0, nrow = 4, ncol = length(lstn))
set.seed(23)
for (i in seq_along(lstn)) {
  res.pow <- replicate(n = N, expr = pop(lstn[i], 1.3))
  powers4[i,] <- apply(res.pow < 0.05, 1, mean)
}
powers4

#### vsgoftest versus dbEmpLikeGOF ####
## ------------------------------------------------------------------------
## Relation between VS and ELR test statistics.
set.seed(1)
samp <- rnorm(50)
res.vs <- vs.test(x = samp, densfun = "dnorm", delta = -1/6, relax = TRUE)
res.vs$statistic * 50 + 1/2
library("dbEmpLikeGOF")
res.el <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE)
res.el$teststat

## Comparison of computation times
library("ggplot2")
library("dplyr")
library("microbenchmark")
## For normal distribution
set.seed(1)
sample <- rnorm(n = 50)
bm <- microbenchmark(vs.test = vs.test(x = sample,
                                       densfun = "dnorm", simulate.p.value = TRUE,
                                       B = 1000, delta = -1/6),
                     dbEmpLikeGOF = dbEmpLikeGOF(x = sample, testcall = "normal",
                                                 pvl.Table = FALSE, num.mc = 1000,
                                                 vrb = FALSE),
                     times = 100L)
## Violin plots for comparing computation times of vs.test and dbEmpLikeGOF
bm %>% ggplot(mapping = aes(x = expr, y = time/10^6)) + geom_violin() +
  coord_flip() +
  stat_summary(fun = "median", geom = "errorbar",
               mapping = aes(ymax = ..y.., ymin = ..y..),
               linetype = "dashed", col = "red", show.legend = TRUE) +
  labs(x = "", y = "Computation time (ms)") +
  theme(text = element_text(size = 18))

## For uniform distribution
set.seed(1)
sample <- runif(n = 50, min = 1, max = 3)
bm <- microbenchmark(vs.test = vs.test(x = sample,
                                       densfun = "dunif", simulate.p.value = TRUE,
                                       B = 1000, delta = -1/6),
                     dbEmpLikeGOF = dbEmpLikeGOF(x = sample, testcall = "uniform",  
                                                 pvl.Table = FALSE, num.mc = 1000,
                                                 vrb = FALSE),
                     times = 100L)

## Violin plots for comparing computation times of vs.test and dbEmpLikeGOF
bm %>%  ggplot(mapping = aes(x = expr, y = time/10^6)) + geom_violin() + coord_flip() +
  stat_summary(fun = "median", geom = "errorbar", 
               mapping = aes(ymax = ..y.., ymin = ..y..),
               linetype = "dashed", col = "red", show.legend = TRUE) +
  labs(x = "", y = "Computation time (ms)") +
  theme(text = element_text(size = 18))


## Power comparison when applied to heavy tailed samples
## The following code commented chunks yield table 4
## For laplace samples
tmp <- function(n = 50) {
  samp <- rlaplace(n, mu = 0, b = 1)
  pvs <- vs.test(x = samp, densfun = "dnorm")$p.value
  pelr <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE)$pvalue
  return(c(VS = pvs, ELR = pelr))
}
set.seed(seed = 3)
res <- replicate(n = 1000, expr = tmp(50))
apply(res < 0.05, 1, mean)
## For student samples
tmp2 <- function(n = 50) {
  samp <- rt(n, df = 4, ncp = 0)
  pvs <- vs.test(x = samp, densfun = "dnorm")$p.value
  pelr <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE)$pvalue
  return(c(VS = pvs, ELR = pelr))
}
set.seed(seed = 4)
res2 <- replicate(n = 1000, expr = tmp2(50))
apply(res2 < 0.05, 1, mean)
## With large samples (n = 200)
## Laplace
set.seed(seed = 5)
res3 <- replicate(n = 1000, expr = tmp(200))
apply(res3 < 0.05, 1, mean)
## Student
set.seed(seed = 6)
res4 <- replicate(n = 1000, expr = tmp2(200))
apply(res4 < 0.05, 1, mean)

#### Real data application ####
## ------------------------------------------------------------------------
data("contaminants", package = "vsgoftest")
unlist(lapply(X = list(aluminium1, manganese, aluminium2, toluene),
              FUN = DescTools::Skew))

## ------------------------------------------------------------------------
knitr::opts_chunk$set(warning = FALSE)

## ------------------------------------------------------------------------
set.seed(1)
vs.test(x = aluminium1, densfun = "dlnorm")

## ------------------------------------------------------------------------
set.seed(1)
vs.test(x = aluminium2, densfun = "dlnorm")

## ------------------------------------------------------------------------
set.seed(1)
vs.test(x = toluene, densfun = "dlnorm", extend = TRUE, relax = TRUE)

## ------------------------------------------------------------------------
set.seed(1)
vs.test(x = log(toluene), densfun ="dnorm", extend = TRUE)

## ------------------------------------------------------------------------
set.seed(1)
vs.test(x = aluminium2, densfun = "dpareto")

## ------------------------------------------------------------------------
res.test <- vs.test(x = toluene, densfun = "dpareto", extend = TRUE, relax = TRUE)
set.seed(5)
vs.test(x = ppareto(toluene, mu = res.test$estimate[1], c = res.test$estimate[2]),
        densfun ="dunif", param = c(0,1), extend = TRUE)