#### Header #### ## File: R input code associated to the JSS paper "vsgoftest: An R package..." ## Date: 18.08.30 ## Author: philippe.regnault(at)univ-reims.fr #### Installation instructions #### ## Installing missing packages. ## Uncomment and execute to proceed # reqrd.pkg <- c("vsgoftest", "dbEmpLikeGOF", "devtools", "DescTools", # "ggplot2", "dplyr", "microbenchmark", "goftest") # for (pkg in reqrd.pkg) { # if (!requireNamespace(pkg, quietly = TRUE)) { # install.packages(pkg) # } # } ## Uncomment the following line to install the vsgoftest package from github ## devtools::install_github(repo = "pregnault/vsgoftest") #### Estimating entropy #### library("vsgoftest") set.seed(2) samp <- rnorm(n = 100, mean = 0, sd = 1) # Sample drawn from normal distribution entropy.estimate(x = samp, window = 8) # Estimated entropy log(2 * pi * exp(1))/2 # The exact value of the entropy of the normal # distribution with mean 0 and standard deviation 1 ## ------------------------------------------------------------------------ ## Compute estimates for several window sizes sapply(1:10, function(w) entropy.estimate(x = samp, window = w)) ## ------------------------------------------------------------------------ ## Select window size maximizing the entropy estimate (maximum entropy principle) n <- 100 V <- sapply(1:(n/2 - 1), function(w) entropy.estimate(x = samp, window = w)) which.max(V) ## ------------------------------------------------------------------------ set.seed(5) n <- 100 samp <- rpareto(n, c = 1, mu = 2) # Sample drawn from Pareto distribution entropy.estimate(x = samp, window = 3) -log(2) + 3/2 # Exact value of the Pareto Distribution with parameters c=1, mu=2 #### Testing GOF of specified (family of) distributions #### ## ------------------------------------------------------------------------ set.seed(5) samp <- rnorm(50,2,3) vs.test(x = samp, densfun = "dlaplace") ## ------------------------------------------------------------------------ set.seed(4) vs.test(x = samp, densfun = "dnorm") ## ------------------------------------------------------------------------ set.seed(26) vs.test(x = samp, densfun = "dnorm", param = c(2,3)) ## set.seed(2) samp <- rnorm(50, -2, 1) try(vs.test(samp, densfun = "dnorm", param = -2)) # Error message: -2 is not a suitable parameter for the normal distribution ## ------------------------------------------------------------------------ set.seed(1) samp <- rweibull(200, shape = 1.05, scale = 1) set.seed(2) # MC simulations for computing the p-value vs.test(samp, densfun = "dexp", simulate.p.value = TRUE, B = 10000) ## ------------------------------------------------------------------------ set.seed(63) vs.test(samp, densfun = "dexp", delta = 5/30) # Change upper bound for window size ## ------------------------------------------------------------------------ set.seed(8) samp <- rexp(30, rate = 3) vs.test(x = samp, densfun = "dlnorm") # Log_normality not rejected vs.test(x = samp, densfun = "dlnorm", extend = TRUE) # upper bound for window # size extended to half of the sample; log_normality rejected ## ------------------------------------------------------------------------ samp <- c(samp, rep(4,3)) # Add ties to the sample try(vs.test(x = samp, densfun = "dexp")) # Error: too many ties to perform VS test ## ------------------------------------------------------------------------ vs.test(x = samp, densfun = "dexp", extend = TRUE) ## ------------------------------------------------------------------------ ## An example of sample for which VS test can not be applied to set.seed(84) ech <- rpareto(20, mu = 1/2, c = 1) try(vs.test(x = ech, densfun = "dpareto", param = c(1/2, 1))) ## ------------------------------------------------------------------------ data("contaminants", package = "vsgoftest") set.seed(1) vs.test(x = aluminium2, densfun = "dpareto") #### Power comparisons #### ## The execution of these code chunks takes a pretty big amount of time. ## The following chunks yield table 3 of the paper library("goftest") lstn <- c(20, 30, 50, 100) # List of samples sizes N <- 10000 # Number of replicates for MC simulations ## Pareto against log-normal: auxiliary function for simulating a sample, ## applying GOF tests and gathering their pvalues pop <- function(n, m = 0, s) { ech <- 1 + rlnorm(n, meanlog = m, sdlog = s) pvs <- vs.test(x = ech, densfun = "dpareto", param = c(1/s, 1), B = 1000)$p.value pks <- ks.test(x = ech, y = "ppareto", mu = 1/s, c = 1)$p.value pad <- ad.test(x = ech, null = "ppareto", mu = 1/s, c = 1)$p.value pcvm <- cvm.test(x = ech, null = "ppareto", mu = 1/s, c = 1)$p.value return(c(pvs, pks, pad, pcvm)) } ## Pareto with c = 1, mu = 1 vs. shifted log-normal with meanlog = 0, sdlog = 1 ## WARNING: the execution of the following "for" loop may take a couple of ## hours on a standard laptop mu <- 1 powers1 <- matrix(0, nrow = 4, ncol = length(lstn)) set.seed(54) t <- Sys.time() for (i in seq_along(lstn)) { res.pow <- replicate(n = N, expr = pop(lstn[i], 0, 1/mu)) powers1[i,] <- apply(res.pow < 0.05, 1, mean) } powers1 Sys.time() - t ## Pareto with c = 1, mu = .8 vs. shifted log-normal with meanlog = 0, sdlog = 1.25 ## WARNING: the execution of the following "for" loop may take a couple of ## hours on a standard laptop mu <- 4/5 powers2 <- matrix(0, nrow = 4, ncol = length(lstn)) set.seed(32) for (i in seq_along(lstn)) { res.pow <- replicate(n = N, expr = pop(lstn[i], 0, 1/mu)) powers2[i,] <- apply(res.pow < 0.05, 1, mean) } powers2 ## Exponential vs Weibull ## Auxiliary function pop <- function(n, sh) { ech <- rweibull(n, shape = sh, scale = 1) pvs <- vs.test(x = ech, densfun = "dexp", param = 1, B = 1000)$p.value pks <- ks.test(x = ech, y = "pexp", rate = 1)$p.value pad <- ad.test(x = ech, null = "pexp", rate = 1)$p.value pcvm <- cvm.test(x = ech, null = "pexp", rate = 1)$p.value return(c(pvs, pks, pad, pcvm)) } ## Exp 1/2 vs Weibull(1.2, 2) ## WARNING: the execution of the following "for" loop may take a couple of ## hours on a standard laptop powers3 <- matrix(0, nrow = 4, ncol = length(lstn)) set.seed(12) for (i in seq_along(lstn)) { res.pow <- replicate(n = N, expr = pop(lstn[i], 1.2)) powers3[i,] <- apply(res.pow < 0.05, 1, mean) } powers3 ## Exp 1/2 vs Weibull(1.3, 2) ## WARNING: the execution of the following "for" loop may take a couple of ## hours on a standard laptop powers4 <- matrix(0, nrow = 4, ncol = length(lstn)) set.seed(23) for (i in seq_along(lstn)) { res.pow <- replicate(n = N, expr = pop(lstn[i], 1.3)) powers4[i,] <- apply(res.pow < 0.05, 1, mean) } powers4 #### vsgoftest versus dbEmpLikeGOF #### ## ------------------------------------------------------------------------ ## Relation between VS and ELR test statistics. set.seed(1) samp <- rnorm(50) res.vs <- vs.test(x = samp, densfun = "dnorm", delta = -1/6, relax = TRUE) res.vs$statistic * 50 + 1/2 library("dbEmpLikeGOF") res.el <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE) res.el$teststat ## Comparison of computation times library("ggplot2") library("dplyr") library("microbenchmark") ## For normal distribution set.seed(1) sample <- rnorm(n = 50) bm <- microbenchmark(vs.test = vs.test(x = sample, densfun = "dnorm", simulate.p.value = TRUE, B = 1000, delta = -1/6), dbEmpLikeGOF = dbEmpLikeGOF(x = sample, testcall = "normal", pvl.Table = FALSE, num.mc = 1000, vrb = FALSE), times = 100L) ## Violin plots for comparing computation times of vs.test and dbEmpLikeGOF bm %>% ggplot(mapping = aes(x = expr, y = time/10^6)) + geom_violin() + coord_flip() + stat_summary(fun = "median", geom = "errorbar", mapping = aes(ymax = ..y.., ymin = ..y..), linetype = "dashed", col = "red", show.legend = TRUE) + labs(x = "", y = "Computation time (ms)") + theme(text = element_text(size = 18)) ## For uniform distribution set.seed(1) sample <- runif(n = 50, min = 1, max = 3) bm <- microbenchmark(vs.test = vs.test(x = sample, densfun = "dunif", simulate.p.value = TRUE, B = 1000, delta = -1/6), dbEmpLikeGOF = dbEmpLikeGOF(x = sample, testcall = "uniform", pvl.Table = FALSE, num.mc = 1000, vrb = FALSE), times = 100L) ## Violin plots for comparing computation times of vs.test and dbEmpLikeGOF bm %>% ggplot(mapping = aes(x = expr, y = time/10^6)) + geom_violin() + coord_flip() + stat_summary(fun = "median", geom = "errorbar", mapping = aes(ymax = ..y.., ymin = ..y..), linetype = "dashed", col = "red", show.legend = TRUE) + labs(x = "", y = "Computation time (ms)") + theme(text = element_text(size = 18)) ## Power comparison when applied to heavy tailed samples ## The following code commented chunks yield table 4 ## For laplace samples tmp <- function(n = 50) { samp <- rlaplace(n, mu = 0, b = 1) pvs <- vs.test(x = samp, densfun = "dnorm")$p.value pelr <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE)$pvalue return(c(VS = pvs, ELR = pelr)) } set.seed(seed = 3) res <- replicate(n = 1000, expr = tmp(50)) apply(res < 0.05, 1, mean) ## For student samples tmp2 <- function(n = 50) { samp <- rt(n, df = 4, ncp = 0) pvs <- vs.test(x = samp, densfun = "dnorm")$p.value pelr <- dbEmpLikeGOF(x = samp, testcall = "normal", vrb = FALSE)$pvalue return(c(VS = pvs, ELR = pelr)) } set.seed(seed = 4) res2 <- replicate(n = 1000, expr = tmp2(50)) apply(res2 < 0.05, 1, mean) ## With large samples (n = 200) ## Laplace set.seed(seed = 5) res3 <- replicate(n = 1000, expr = tmp(200)) apply(res3 < 0.05, 1, mean) ## Student set.seed(seed = 6) res4 <- replicate(n = 1000, expr = tmp2(200)) apply(res4 < 0.05, 1, mean) #### Real data application #### ## ------------------------------------------------------------------------ data("contaminants", package = "vsgoftest") unlist(lapply(X = list(aluminium1, manganese, aluminium2, toluene), FUN = DescTools::Skew)) ## ------------------------------------------------------------------------ knitr::opts_chunk$set(warning = FALSE) ## ------------------------------------------------------------------------ set.seed(1) vs.test(x = aluminium1, densfun = "dlnorm") ## ------------------------------------------------------------------------ set.seed(1) vs.test(x = aluminium2, densfun = "dlnorm") ## ------------------------------------------------------------------------ set.seed(1) vs.test(x = toluene, densfun = "dlnorm", extend = TRUE, relax = TRUE) ## ------------------------------------------------------------------------ set.seed(1) vs.test(x = log(toluene), densfun ="dnorm", extend = TRUE) ## ------------------------------------------------------------------------ set.seed(1) vs.test(x = aluminium2, densfun = "dpareto") ## ------------------------------------------------------------------------ res.test <- vs.test(x = toluene, densfun = "dpareto", extend = TRUE, relax = TRUE) set.seed(5) vs.test(x = ppareto(toluene, mu = res.test$estimate[1], c = res.test$estimate[2]), densfun ="dunif", param = c(0,1), extend = TRUE)