###############################################################
### Script to replicate the results presented in the manuscript
### "An R Package for the Analysis of Geostatistical Count Data
###  Using Gaussian Copulas"
### Authors: Zifei Han (hanzifei1@gmail.com)
###          Victor De Oliveira (victor.deoliveira@utsa.edu)
###############################################################

##################################################
### Section 3.1. Data simulation and visualization
##################################################

library("gcKrig")

# Simulate ten random fields with negative binomial marginals and an
# exponential correlation function (Matern with kappa = 0.5)
grid <- seq(0.05, 0.95, by = 0.1)
xloc <- expand.grid(x = grid, y = grid)[, 1]
yloc <- expand.grid(x = grid, y = grid)[, 2]
set.seed(12345)
sim1 <- simgc(locs = cbind(xloc, yloc), sim.n = 10,
              marginal = negbin.gc(mu = 5, od = 1),
              corr = matern.gc(range = 0.3, kappa = 0.5, nugget = 0.1))

# Visualize one of the simulated datasets
plot(sim1, index = 1, plottype = "Text", col = 2)
plot(sim1, index = 1, plottype = "Dot", col = 2)
plot(sim1, index = 1, plottype = "3D", col = 2)

###############################################################
### Section 3.6. Computation of Frechet-Hoeffding upper bounds
###############################################################

# Frechet-Hoeffding upper bounds for the correlation between two
# negative binomial variables, as a function of the second mean
NBmu <- seq(0.01, 15, by = 0.02)
fhub1 <- vector("numeric", length(NBmu))
for (i in 1:length(NBmu)) {
  fhub1[i] <- FHUBdiscrete(marg1 = "nb", marg2 = "nb",
                           mu1 = 10, od1 = 0.2, mu2 = NBmu[i], od2 = 0.2)
}

# Frechet-Hoeffding upper bounds for the correlation between two gamma
# variables, obtained by evaluating corrTG() at corrGauss = 1
gammaShape <- seq(0.01, 15, by = 0.02)
fhub2 <- vector("numeric", length(gammaShape))
for (i in 1:length(gammaShape)) {
  fhub2[i] <- corrTG(marg1 = gm.gc(shape = gammaShape[i], rate = 1),
                     marg2 = gm.gc(shape = 0.5, rate = 1),
                     corrGauss = 1, method = "integral")
}

# Generate the plots
plot(NBmu, fhub1, type = "l", xlab = expression(E(X[2])), ylab = "FHUB")
plot(gammaShape, fhub2, type = "l",
     xlab = "Shape of a gamma distribution", ylab = "FHUB")

#######################################
### Section 4.1. The Atlantic fish data
#######################################

data("AtlanticFish", package = "gcKrig")

# Calculate the maximum simulated likelihood estimates with three covariates
Fitfish <- mlegc(y = AtlanticFish[, 3], x = AtlanticFish[, 4:6],
                 locs = AtlanticFish[, 1:2], longlat = TRUE,
                 marginal = negbin.gc(link = "log"),
                 corr = matern.gc(kappa = 0.5, nugget = TRUE),
                 method = "GHK")
summary(Fitfish)

# Calculate the 95% Wald-type confidence intervals
round(Fitfish$MLE - qnorm(0.975)*sqrt(diag(solve(Fitfish$hessian))), 3)
round(Fitfish$MLE + qnorm(0.975)*sqrt(diag(solve(Fitfish$hessian))), 3)

# Calculate the 95% profile-likelihood confidence intervals
profile(Fitfish, par.index = 1, method = "GHK")
profile(Fitfish, par.index = 2, method = "GHK")
profile(Fitfish, par.index = 3, method = "GHK")
profile(Fitfish, par.index = 4, method = "GHK")
profile(Fitfish, par.index = 5, method = "GHK")

# Calculate the maximum simulated likelihood estimates with four covariates
Fitfish2 <- mlegc(y = AtlanticFish[, 3], x = AtlanticFish[, 4:7],
                  locs = AtlanticFish[, 1:2], longlat = TRUE,
                  marginal = negbin.gc(link = "log"),
                  corr = matern.gc(kappa = 0.5, nugget = TRUE),
                  method = "GHK")
summary(Fitfish2)
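
# Optional check (not in the manuscript): compare the two fits through their
# maximized log-likelihoods and an AIC-type criterion. A minimal sketch,
# assuming the fitted object stores the maximized log-likelihood in $log.lik
# and the parameter estimates in $MLE, as used elsewhere in this script.
c(logLik3 = Fitfish$log.lik, logLik4 = Fitfish2$log.lik)
c(AIC3 = -2*Fitfish$log.lik + 2*length(Fitfish$MLE),
  AIC4 = -2*Fitfish2$log.lik + 2*length(Fitfish2$MLE))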
####################################
### Section 4.2. The weed count data
####################################

library("parallel")
# Detect the number of cores available for parallel prediction
detectCores(all.tests = FALSE, logical = TRUE)
# Load snowfall for parallel computing
library("snowfall")

data("Weed95", package = "gcKrig")

# Split the data into observed and prediction locations
weedobs <- subset(Weed95, dummy == 1)
weedpred <- subset(Weed95, dummy == 0)

# Parallel prediction with 4 cores using the GHK method
predweed <- predgc(obs.y = weedobs$weedcount, obs.x = weedobs[, 4:5],
                   obs.locs = weedobs[, 1:2], pred.x = weedpred[, 4:5],
                   pred.locs = weedpred[, 1:2],
                   marginal = negbin.gc(link = "log"),
                   corr = matern.gc(kappa = 0.5, nugget = TRUE),
                   method = "GHK", pred.interval = 0.95, parallel = TRUE,
                   paralleloptions = list(n.cores = 4, cluster.type = "SOCK"))
summary(predweed)

plot(predweed, plottype = "2D", xlab = "Coordinate x (m)",
     ylab = "Coordinate y (m)", col = c(3, 2), textcex = 0.8)
plot(predweed, plottype = "Predicted Variance", xlab = "Coordinate x (m)",
     ylab = "Coordinate y (m)", col = c(3, 2), textcex = 0.8)

########################################
### Section 4.3. The oil prevalence data
########################################

data("OilWell", package = "gcKrig")

# Set up a 40 x 40 grid of prediction locations
gridstep <- seq(0.5, 30.5, length = 40)
locOilpred <- data.frame(Easting = expand.grid(gridstep, gridstep)[, 1],
                         Northing = expand.grid(gridstep, gridstep)[, 2])

# Predict the oil abundance with the GHK method using 4 cores in parallel
PredOil <- predgc(obs.y = OilWell[, 3], obs.locs = OilWell[, 1:2],
                  pred.locs = locOilpred,
                  marginal = binomial.gc(link = "logit"),
                  corr = matern.gc(nugget = FALSE),
                  obs.effort = 1, pred.effort = 1, method = "GHK",
                  parallel = TRUE, paralleloptions = list(n.cores = 4))
PredMat <- summary(PredOil)

library("colorspace")
# Generate the contour plots of the predicted means and variances
filled.contour(seq(0.5, 30.5, length = 40), seq(0.5, 30.5, length = 40),
               matrix(PredMat$predMean, 40), zlim = c(0, 1),
               col = rev(heat_hcl(12)), nlevels = 12,
               xlab = "Eastings (km)", ylab = "Northings (km)",
               plot.axes = {axis(1); axis(2);
                 points(OilWell[, 1:2], col = 1,
                        cex = 0.2 + 0.4*OilWell[, 3])})
filled.contour(seq(0.5, 30.5, length = 40), seq(0.5, 30.5, length = 40),
               matrix(PredMat$predVar, 40), zlim = c(0, 0.3),
               col = rev(heat_hcl(12)), nlevels = 10,
               xlab = "Eastings (km)", ylab = "Northings (km)",
               plot.axes = {axis(1); axis(2);
                 points(OilWell[, 1:2], col = 1,
                        cex = 0.2 + 0.4*OilWell[, 3])})

# Predict the oil abundance with the GQT method using 4 cores in parallel:
# simply change the `method' argument from "GHK" to "GQT"
PredOil2 <- predgc(obs.y = OilWell[, 3], obs.locs = OilWell[, 1:2],
                   pred.locs = locOilpred,
                   marginal = binomial.gc(link = "logit"),
                   corr = matern.gc(nugget = FALSE),
                   obs.effort = 1, pred.effort = 1, method = "GQT",
                   parallel = TRUE, paralleloptions = list(n.cores = 4))
PredMat2 <- summary(PredOil2)

# Generate the contour plots
filled.contour(seq(0.5, 30.5, length = 40), seq(0.5, 30.5, length = 40),
               matrix(PredMat2$predMean, 40), zlim = c(0, 1),
               col = rev(heat_hcl(12)), nlevels = 12,
               xlab = "Eastings (km)", ylab = "Northings (km)",
               plot.axes = {axis(1); axis(2);
                 points(OilWell[, 1:2], col = 1,
                        cex = 0.2 + 0.4*OilWell[, 3])})
filled.contour(seq(0.5, 30.5, length = 40), seq(0.5, 30.5, length = 40),
               matrix(PredMat2$predVar, 40), zlim = c(0, 0.3),
               col = rev(heat_hcl(12)), nlevels = 10,
               xlab = "Eastings (km)", ylab = "Northings (km)",
               plot.axes = {axis(1); axis(2);
                 points(OilWell[, 1:2], col = 1,
                        cex = 0.2 + 0.4*OilWell[, 3])})
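
# Optional check (not in the manuscript): how close are the GHK and GQT
# predictions? A minimal sketch, assuming the summary objects store the
# predicted means and variances in $predMean and $predVar, as used in the
# contour plots above.
cor(PredMat$predMean, PredMat2$predMean)
max(abs(PredMat$predMean - PredMat2$predMean))
max(abs(PredMat$predVar - PredMat2$predVar))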
#############################################
### Section 5. Comparison with other packages
#############################################

# Set the sampling locations on a regular 11 x 11 grid
xloc <- rep(seq(0, 1, by = 0.1), 11)
yloc <- rep(seq(0, 1, by = 0.1), each = 11)
simD <- as.matrix(dist(cbind(xloc, yloc)))

# Simulate the dataset
set.seed(321)
simData1 <- simgc(locs = cbind(xloc, yloc), sim.n = 1,
                  marginal = negbin.gc(mu = exp(1 + 0.5*xloc + yloc), od = 1),
                  corr = matern.gc(range = 0.3, nugget = 0))
simDf1 <- data.frame(xloc = xloc, yloc = yloc, data = simData1$data)

# Function computing the negative log-likelihood with package 'mvtnorm'
# by Genz and Bretz
QMCLik <- function(v){
  mu <- exp(v[1] + v[2]*xloc + v[3]*yloc)
  lower <- qnorm(pnbinom(q = simDf1$data - 1, size = 1/v[4], mu = mu))
  upper <- qnorm(pnbinom(q = simDf1$data, size = 1/v[4], mu = mu))
  R <- exp(-simD/v[5])
  # Fix the seed so the quasi-Monte Carlo likelihood is a deterministic
  # function of v, which optim() requires
  set.seed(1234)
  lik <- -log(mvtnorm::pmvnorm(lower = lower, upper = upper,
                               mean = rep(0, length(lower)), sigma = R)[1])
  return(lik)
}

# Find appropriate starting values for the marginal and range parameters
est1 <- MASS::glm.nb(data ~ xloc + yloc, data = simDf1)
start0 <- c(coef(est1), 1/est1$theta, median(simD)/2)
EPS <- .Machine$double.eps

# The GB method using package 'mvtnorm'
library("mvtnorm")
library("numDeriv")
t0 <- proc.time()
GBFit <- optim(par = start0, fn = QMCLik, gr = NULL, method = "L-BFGS-B",
               lower = c(-Inf, -Inf, -Inf, EPS, EPS),
               upper = c(Inf, Inf, Inf, Inf, Inf))
tGB <- proc.time() - t0
HessGB <- hessian(QMCLik, x = GBFit$par)
sdGB <- sqrt(diag(solve(HessGB)))

# The GHK method using package 'gcmr'
library("gcmr")
t0 <- proc.time()
gcmrGHK <- gcmr(data ~ xloc + yloc, data = simDf1,
                marginal = negbin.marg(link = "log"),
                cormat = matern.cormat(D = simD),
                options = gcmr.options(seed = 123, nrep = c(100, 1000)))
tgcmr <- proc.time() - t0

# The GHK method using package 'gcKrig'
library("gcKrig")
t0 <- proc.time()
gcKrigGHK <- mlegc(y = simDf1[, 3], x = simDf1[, 1:2],
                   locs = cbind(xloc, yloc), method = "GHK",
                   marginal = negbin.gc(link = "log"),
                   corr = matern.gc(nugget = FALSE),
                   ghkoptions = list(nrep = c(100, 1000), seed = 123))
tgcKrigGHK <- proc.time() - t0

# The GQT method using package 'gcKrig'
t0 <- proc.time()
gcKrigGQT <- mlegc(y = simDf1[, 3], x = simDf1[, 1:2],
                   locs = cbind(xloc, yloc), method = "GQT",
                   marginal = negbin.gc(link = "log"),
                   corr = matern.gc(nugget = FALSE))
tgcKrigGQT <- proc.time() - t0

# Table 3
est <- cbind(coef(gcmrGHK), gcKrigGHK$MLE, gcKrigGQT$MLE, GBFit$par)
sd <- cbind(sqrt(diag(vcov(gcmrGHK))), sqrt(diag(solve(gcKrigGHK$hessian))),
            sqrt(diag(solve(gcKrigGQT$hessian))), sdGB)
logLiks <- c(logLik(gcmrGHK), gcKrigGHK$log.lik, gcKrigGQT$log.lik,
             -GBFit$value)
times <- c(tgcmr["elapsed"], tgcKrigGHK["elapsed"], tgcKrigGQT["elapsed"],
           tGB["elapsed"])
Est <- matrix(paste0(format(round(est, 3), nsmall = 3), " (",
                     format(round(sd, 3), nsmall = 3), ")"), ncol = 4)
colnames(Est) <- c("gcmr", "gcKrig-GHK", "gcKrig-GQT", "mvtnorm")
rownames(Est) <- names(coef(gcmrGHK))
Table.3 <- rbind(Est, logLik = format(round(logLiks, 2), nsmall = 2),
                 Time = format(round(times, 1), nsmall = 1))
print(Table.3, quote = FALSE)
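
# Optional cross-check (not in the manuscript): evaluate the quasi-Monte
# Carlo log-likelihood at the gcKrig GHK estimates. Table 3 already lines
# the parameters up column-wise, so this sketch assumes gcKrigGHK$MLE is
# ordered as (intercept, xloc, yloc, overdispersion, range), matching the
# argument order of QMCLik().
-QMCLik(gcKrigGHK$MLE)    # should be close to gcKrigGHK$log.lik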
##################################################
### Appendix
##################################################

## Example of specifying a binomial marginal with the t distribution as link
binomial_t.gc <- function(df.t = 6, size = NULL, prob = NULL) {
  ans <- list()
  ans$discrete <- TRUE
  if(is.null(size) & is.null(prob)){
    # Starting values for the regression parameters from a logistic GLM fit
    ans$start <- function(y, x, effort) {
      mfit <- suppressWarnings(glm.fit(x, y/effort,
                                       family = binomial(link = "logit")))
      reg0 <- coef(mfit)
      glb <- qnorm(pbinom(y - 1, size = effort, prob = fitted(mfit)))
      gub <- qnorm(pbinom(y, size = effort, prob = fitted(mfit)))
      names(reg0)[1] <- "Intercept"
      return(reg0)
    }
    ans$nod <- 0
    ans$bounds <- function(y, x, pars, effort) {
      p <- pt(pars[1:ncol(x)]%*%t(x), df = df.t)
      a <- qnorm(pbinom(y - 1, size = effort, prob = p))
      b <- qnorm(pbinom(y, size = effort, prob = p))
      return(list(lower = a, upper = b))
    }
    ans$pdf <- function(y, x, pars, effort){
      p <- pt(pars[1:ncol(x)]%*%t(x), df = df.t)
      pdf <- dbinom(y, size = effort, prob = p, log = FALSE)
      return(pdf)
    }
    ans$cdf <- function(y, x, pars, effort){
      p <- pt(pars[1:ncol(x)]%*%t(x), df = df.t)
      cdf <- pbinom(y, size = effort, prob = p)
      return(cdf)
    }
  }
  if(is.numeric(size) & is.numeric(prob)) {
    q <- function(p) qbinom(p = p, size = size, prob = prob)
    ans$margvar <- size*prob*(1-prob)
    ans$int.marg <- function(order){
      if(requireNamespace("EQL", quietly = TRUE)){
        integrate(function(x, order)
          ifelse((q(pnorm(x)) == Inf | dnorm(x) < .Machine$double.eps), 0,
                 q(pnorm(x))*dnorm(x)*EQL::hermite(x, order, prob = TRUE)),
          order = order, -8, 8, subdivisions = 1500,
          rel.tol = 0.01, stop.on.error = FALSE)
      }else{
        stop("Please install package 'EQL' first!")
      }
    }
    ans$rsample <- function(nrep){
      rbinom(n = nrep, size = size, prob = prob)
    }
    ans$q <- q
  }
  class(ans) <- c("marginal.gc")
  return(ans)
}

## Example of specifying the powered exponential correlation function
powerexp.gc <- function(range = NULL, kappa = 1, nugget = TRUE) {
  ans <- list()
  if (kappa <= 0 | kappa > 2)
    stop("'kappa' must be between 0 and 2")
  if (is.null(range)) {
    if (nugget == TRUE) {
      ans$nug <- 1
      ans$npar.cor <- 2
      ans$start <- function(D) {
        corstart <- c(median(D)/2, 0.2)
        names(corstart) <- c("range", "nugget")
        return(corstart)
      }
      ans$corr <- function(corrpar, D) {
        S <- (1 - corrpar[2])*exp(-abs((D/corrpar[1])^(kappa))) +
          corrpar[2]*diag(NROW(D))
        return(S)
      }
    } else {
      ans$nug <- 0
      ans$npar.cor <- 1
      ans$start <- function(D) {
        corstart <- median(D)/2
        names(corstart) <- "range"
        return(corstart)
      }
      ans$corr <- function(corrpar, D) {
        S <- exp(-abs((D/corrpar[1])^(kappa)))
        return(S)
      }
    }
  }
  if (is.numeric(range) & is.numeric(nugget)) {
    ans$S <- function(D) (1 - nugget)*exp(-abs((D/range)^(kappa))) +
      nugget*diag(NROW(D))
  }
  class(ans) <- c("corr.gc")
  return(ans)
}
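
## Optional usage sketch for the user-defined classes (not part of the
## manuscript). It assumes simgc() accepts any "marginal.gc" object that
## supplies $q/$rsample and any "corr.gc" object that supplies $S, and that
## corrTG() uses the $int.marg field for its "integral" method, as the
## built-in constructors do.
sim2 <- simgc(locs = cbind(xloc, yloc), sim.n = 1,
              marginal = binomial_t.gc(size = 10, prob = 0.4),
              corr = powerexp.gc(range = 0.3, kappa = 1.5, nugget = 0.1))
plot(sim2, index = 1, plottype = "Dot", col = 2)
corrTG(marg1 = binomial_t.gc(size = 10, prob = 0.4),
       marg2 = gm.gc(shape = 0.5, rate = 1),
       corrGauss = 0.5, method = "integral")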