# ---- Packages ------------------------------------------------------------
library("bamlss")
library("gamlss.dist")
library("gamlss.tr")
library("ggplot2")
library("sf")
library("rnaturalearth")
library("AER")
## install.packages("FlashAustria", repos = "http://R-Forge.R-project.org")
library("FlashAustria")


# ---- Section 2.1: Basic Bayesian regression: Logit model -----------------
data("SwissLabor", package = "AER")

## Model formula.
f <- participation ~ income + age + education + youngkids + oldkids + foreign + I(age^2)

## Estimate model.
set.seed(123)
b <- bamlss(f, family = "binomial", data = SwissLabor)

## Model summary.
summary(b)

## Compute DIC.
DIC(b)

## Figure 1: MCMC diagnostics.
plot(b, which = c("samples", "max-acf"))

## Predictions on probability scale.
nd <- data.frame(income = 11, age = seq(2, 6.2, length = 100),
  education = 12, youngkids = 1, oldkids = 1, foreign = "no")
nd$p_swiss <- predict(b, newdata = nd, type = "parameter", FUN = c95)
nd$foreign <- "yes"
nd$p_foreign <- predict(b, newdata = nd, type = "parameter", FUN = c95)

## Plot effect of age on probability.
blues <- function(n, ...) hcl.colors(n, "Blues", rev = TRUE)
plot2d(p_swiss ~ age, data = nd, ylab = "participation",
  ylim = range(c(nd$p_swiss, nd$p_foreign)), fill.select = c(0, 1, 0, 1))
plot2d(p_foreign ~ age, data = nd, add = TRUE,
  fill.select = c(0, 1, 0, 1), axes = FALSE,
  s2.col = blues, col.lines = blues(1))
legend("topright", c("Foreign", "Swiss"), lwd = 1,
  col = c(blues(1), "black"), bty = "n")


# ---- Section 2.2: Flexible model terms and estimators --------------------
## Model formula including a smooth model term for age.
f <- participation ~ income + education +
  youngkids + oldkids + foreign + s(age, k = 10)

## Estimate model.
set.seed(123)
b <- bamlss(f, family = "binomial", data = SwissLabor)

## Plot estimated smooth effect.
plot(b, term = "s(age)",
  ylab = expression(paste("Effect on Logi", t^-1, (pi))))

## Categorize age variable using quantiles.
SwissLabor$cage <- cut(SwissLabor$age,
  breaks = quantile(SwissLabor$age, prob = seq(0, 1, length = 10)),
  include.lowest = TRUE, ordered_result = TRUE)

## Model formula including the fused lasso model term for cage.
f <- participation ~ income + education + youngkids + oldkids + foreign + la(cage, fuse = 2)

## Estimate model using the lasso optimizer function.
b <- bamlss(f, family = "binomial", data = SwissLabor,
  optimizer = opt_lasso, sampler = FALSE, upper = exp(5), lower = 1,
  criterion = "AIC")

## Figure 3: BIC path, paths of coefficients and estimated effect.
par(mfrow = c(1, 3), mar = c(4.1, 4.1, 5, 7.3))
pathplot(b, 1, spar = FALSE)
pathplot(b, 2, spar = FALSE, name = "pi.s.la(cage).cage",
  main = "Coefficient paths age", lwd = 2)
page <- predict(b, term = "cage", intercept = FALSE, mstop = lasso_stop(b))
plot2d(page ~ age, data = SwissLabor, rug = TRUE, lwd = 2,
  ylab = expression(paste("Effect on Logi", t^-1, (pi))))
mtext("Estimated\nnonlinear effect", side = 3, line = 1.5, cex = 1.2, font = 2)


# ---- Section 2.3: Distributional model -----------------------------------
data("fatalities", package = "bamlss")

## Create subsets.
d19 <- subset(fatalities, year <= 2019)
d20 <- subset(fatalities, year >= 2020)

## Model formula.
f <- list(
  log(num) ~ s(week, bs = "cc", k = 20),
  sigma    ~ s(week, bs = "cc", k = 20)
)

## Estimate location-scale model.
set.seed(456)
b <- bamlss(f, data = d19, family = NO)

## Plot estimated effects.
plot(b, model = c("mu", "sigma"))

## Show predicted quantiles with 2020 data.
nd <- data.frame(week = 1:53)
par <- predict(b, newdata = nd, type = "parameter")
nd$fit <- sapply(c(0.05, 0.5, 0.95),
  FUN = function(p) { exp(family(b)$q(p, par)) })
d19w <- reshape(d19, idvar = "week",
  timevar = "year", direction = "wide")

par(mar = c(4.1, 4.1, 0.1, 0.1))
matplot(d19w$week, d19w[, -1],
  type = "o", lty = 1, pch = 16, col = gray(0.1, alpha = 0.1),
  xlab = "Week", ylab = "Number of fatalities")
grid()
matplot(nd$week, nd$fit, type = "l", lty = c(2, 1, 2),
  col = 1, lwd = 2, add = TRUE)
lines(num ~ week, data = d20, col = 2, lwd = 2, type = "o", pch = 16)


## Plot histogram of quantile residuals.
plot(b, which = "hist-resid", spar = FALSE, col = "lightgray")

## Show quantile residuals QQ-plot.
plot(b, which = "qq-resid", spar = FALSE, c95 = TRUE)


# ---- Section 4: The bamlss package ---------------------------------------
## Show optimizer/sampler information for families.
print(engines(NO, gaussian_bamlss, cox_bamlss))

## Model formula from Section 2.1.
f <- participation ~ income + age + education + youngkids + oldkids + foreign + I(age^2)

## Setup the bamlss.frame.
bf <- bamlss.frame(f, data = SwissLabor, family = "binomial")

## Run backfitting model fitting engine.
pm <- with(bf, bfit(x, y, family))

## Run MCMC engine.
set.seed(123)
samps <- with(bf, GMCMC(x, y, family, start = pm$parameters))

## Compute sampling statistics.
stats <- with(bf, samplestats(samps, x, y, family))
print(unlist(stats))


# ---- Section 4.1: The bamlss model frame ---------------------------------
## Generate simulated data set.
set.seed(111)
d <- GAMart()

## A model formula.
f <- list(
  num ~ x1 + s(x2) + s(x3) + te(lon,lat),
  sigma ~ x1 + s(x2) + s(x3) + te(lon,lat)
)

## Create the bamlss frame.
bf <- bamlss.frame(f, data = d, family = "gaussian")

## Show the structure of the bamlss frame object.
print(bf)

## Show names of smooth term list.
## Each element holds design and penalty matrices
## that can be used for estimation.
print(names(bf$x$mu$smooth.construct))


# ---- Section 4.3: Estimation engines ------------------------------------
## Show parameter names of bamlss family.
gaussian_bamlss()$names

## Example of parameter names.
paste0("mu.s.s(x3)", ".b", 1:10)


# ---- Section 5: Flexible count regression for lightning reanalysis ------
data("FlashAustria", package = "FlashAustria")

## Load estimated models.
data("FlashAustriaModel", package = "FlashAustria")

## Show first rows of the data.
head(FlashAustriaTrain)[, c("counts", "q_prof_PC1", "sqrt_cape",
  "sqrt_lsp", "d2m", "cswc_prof_PC4")]

## Number of observations.
nrow(FlashAustriaTrain)

## Generate truncate Sichel distribution.
ztSICHEL <- trun(0, family = "SICHEL", local = FALSE)

## The model formula.
f <- list(
  counts ~ s(d2m, bs = "ps") + s(q_prof_PC1, bs = "ps") +
           s(cswc_prof_PC4, bs = "ps") + s(t_prof_PC1, bs = "ps") +
           s(v_prof_PC2, bs = "ps") + s(sqrt_cape, bs = "ps"), 
         ~ s(sqrt_lsp, bs = "ps"),
         ~ 1
)

## Models are estimated with:
if(FALSE) {
  set.seed(123)
  flash_model_ztnbinom <- bamlss(f, data = FlashAustriaTrain, ## Standard interface.
    family = "ztnbinom", binning = TRUE,                      ## General arguments.
    optimizer = opt_boost, maxit = 1000,                      ## Boosting arguments.
    thin = 3, burnin = 1000, n.iter = 2000,                   ## Sampler arguments.
    light = TRUE, cores = 3)

  set.seed(123)
  flash_model_ztSICHEL <- bamlss(f, data = FlashAustriaTrain,
    family = ztSICHEL, binning = TRUE,
    optimizer = opt_boost, maxit = 1000,
    thin = 3, burnin = 1000, n.iter = 2000,
    light = TRUE, cores = 3)

  save(
    flash_model_ztnbinom,
    flash_model_ztSICHEL,
    file = "FlashAustriaModel.rda"
  )
}

## Compute quantile residuals.
set.seed(321)

resids <- c(
	"ztnbinom" = residuals(flash_model_ztnbinom, newdata = FlashAustriaTrain),
	"ztSICHEL" = residuals(flash_model_ztSICHEL, newdata = FlashAustriaTrain)
)

## Compare using worm plots.
plot(resids, which = "wp", main = "Worm plot",
  col = hcl.colors(2, "Dark 3"), cex = 0.5, ylim2 = c(-0.5, 0.5)) 

## Show coefficient paths of the boosting iterations of the best model.
pathplot(flash_model_ztSICHEL, which = "loglik.contrib", intercept = TRUE)

## Show some samples.
b2 <- flash_model_ztSICHEL
b2$samples[[1]] <- b2$samples[[1]][, c("mu.s.s(q_prof_PC1).b1", "mu.s.s(q_prof_PC1).b2")]
plot(b2, which = "samples", ask = FALSE)

## Plot selection of estimated effects.
par(mfrow = c(1, 3))
plot(flash_model_ztSICHEL,
  term = c("s(q_prof_PC1)", "s(sqrt_cape)", "s(d2m)"),
  ask = FALSE, rug = TRUE, col.rug = "#39393919", spar = FALSE)

## Projection issues may happen, in this case try
sf::st_crs(FlashAustriaCase) <- 4326

## Compute estimated probabilities.
fit <- predict(flash_model_ztSICHEL, newdata = FlashAustriaCase, type = "parameter")
fam <- family(flash_model_ztSICHEL)
FlashAustriaCase$P10 <- 1 - fam$p(9, fit)

world <- rnaturalearth::ne_countries(scale = "medium", returnclass = "sf")

ggplot() + geom_sf(aes(fill = P10), data = FlashAustriaCase) +
  scale_fill_continuous_sequential("Oslo", rev = TRUE, p1 = .85) +
  geom_sf(data = world, col = "white", fill = NA) +
  coord_sf(xlim = c(7.95, 17), ylim = c(45.45, 50), expand = FALSE) +
  facet_wrap(~time, nrow = 2) + theme_minimal() +
  theme(plot.margin = margin(t = 0, r = 0, b = 0, l = 0))


# ---- Appendix A: Gaussian family object --------------------------------
Gauss_bamlss <- function(...) {
  f <- list(
    "family" = "mygauss",
    "names"  = c("mu", "sigma"),
    "links"  = c(mu = "identity", sigma = "log"),
    "d" = function(y, par, log = FALSE) {
      dnorm(y, mean = par$mu, sd = par$sigma, log = log)
    },
    "p" = function(y, par, ...) {
      pnorm(y, mean = par$mu, sd = par$sigma, ...)
    },
    "r" = function(n, par) {
      rnorm(n, mean = par$mu, sd = par$sigma)
    },
    "q" = function(p, par) {
      qnorm(p, mean = par$mu, sd = par$sigma)
    },
    "score" = list(
      mu = function(y, par, ...) {
        drop((y - par$mu) / (par$sigma^2))
      },
      sigma = function(y, par, ...) {
        drop(-1 + (y - par$mu)^2 / (par$sigma^2))
      }
    ),
    "hess" = list(
      mu = function(y, par, ...) {
        drop(1 / (par$sigma^2))
      },
      sigma = function(y, par, ...) { 
        rep(2, length(y))
      }
    )
  )
  class(f) <- "family.bamlss"
  return(f)
}


# ---- Appendix B: Special model terms -----------------------------------
## Smooth construct method.
smooth.construct.gc.smooth.spec <- function(object, data, knots) 
{
  object$X <- matrix(as.numeric(data[[object$term]]), ncol = 1)
  center <- if(!is.null(object$xt$center)) {
    object$xt$center
  } else TRUE
  object$by.done <- TRUE
  if(object$by != "NA")
    stop("by variables not supported!")

  ## Begin special elements to be used with bfit() and GMCMC().
  object$fit.fun <- function(X, b, ...) {
    f <- b[1] * exp(-b[2] * exp(-b[3] * drop(X)))
    if(center)
      f <- f - mean(f)
    f
  }
  object$update <- bfit_optim
  object$propose <- GMCMC_slice
  object$prior <- function(b) { sum(dnorm(b, sd = 1000, log = TRUE)) }
  object$fixed <- TRUE
  object$state$parameters <- c("b1" = 0, "b2" = 0.5, "b3" = 0.1)
  object$state$fitted.values <- rep(0, length(object$X))
  object$state$edf <- 3
  object$special.npar <- 3 ## Important!
  ## End special elements.

  ## Important, This is a special smooth constructor!
  class(object) <- c("gc.smooth", "no.mgcv", "special")

  object
}

## Predict matrix method.
Predict.matrix.gc.smooth <- function(object, data, knots) 
{
  X <- matrix(as.numeric(data[[object$term]]), ncol = 1)
  X
}

## Example using the growth curve smooth constructor
## using simulated data.
set.seed(111)

d <- data.frame("time" = 1:30)
d$y <- 2 + 1 / (1 + exp(0.5 * (15 - d$time))) +
  rnorm(30, sd = exp(-3 + 2 * cos(d$time/30 * 6 - 3)))

f <- list(
  y ~ s2(time, bs = "gc"),
  sigma ~ s(time)
)

b <- bamlss(f, data = d, optimizer = bfit, sampler = GMCMC)

## Plot estimated effects.
par(mfrow = c(1, 2))
p <- predict(b, model = "mu", FUN = c95)
plot(y ~ time, data = d, main = expression(mu))
plot2d(p ~ time, data = d, add = TRUE, axes = FALSE,
  fill.select = c(0, 1, 0, 1))
points(d$time, d$y)
plot(b, model = "sigma", spar = FALSE, main = expression(log(sigma)))


# ---- Appendix C: Model fitting engines for linear regression -----------
## Linear model family object.
lm_bamlss <- function(...) {
  f <- list(
    "family" = "LM",
    "names" = "mu",
    "links" = "identity",
    "d" = function(y, par, log = FALSE) {
      sigma <- sqrt(sum((y - par$mu)^2) / (length(y) - .lm_bamlss.p))
      dnorm(y, mean = par$mu, sd = sigma, log = log)
    },
    "p" = function(y, par, ...) {
      sigma <- sqrt(sum((y - par$mu)^2) / (length(y) - .lm_bamlss.p))
      pnorm(y, mean = par$mu, sd = sigma, ...)
    }
  )
  class(f) <- "family.bamlss"
  return(f)
}

## Simulate some data.
d <- GAMart()

## Test family by creating a bamlss frame.
bf <- bamlss.frame(num ~ x1 + x2, data = d, family = "lm")
print(bf)

## The linear model design matrix.
head(bf$x$mu$model.matrix)

## The response.
head(bf$y)

## Set up an optimizer function that can be used with bamlss().
lm.opt <- function(x, y, ...)
{
  ## Only univariate response.
  y <- y[[1L]]

  ## For illustration this is easier to read.
  X <- x$mu$model.matrix

  ## Estimate model parameters.
  par <- drop(chol2inv(chol(crossprod(X))) %*% crossprod(X, y))

  ## Set parameter names.
  names(par) <- paste0("mu.p.", colnames(X))

  ## Return estimated parameters and fitted values.
  rval <- list(
    "parameters" = par,
    "fitted.values" = drop(X %*% par),
    "edf" = length(par),
    "sigma" = drop(sqrt(crossprod(y - X %*% par) / (length(y) - ncol(X))))
  )

  ## Set edf within .GlobalEnv for the
  ## loglik() function in the lm_bamlss() family.
  .lm_bamlss.p <<- length(par)

  return(rval)
}

## Test the optimizer on the simulated data.
## Note, no MCMC sampling.
f <- num ~ x1 + poly(x2, 5) + poly(x3, 5)
b <- bamlss(f, data = d, family = "lm", optimizer = lm.opt, sampler = FALSE)
summary(b)
nd <- data.frame("x2" = seq(0, 1, length = 100))
nd$p <- predict(b, newdata = nd, term = "x2")

## Plot estimetd effect of x2.
plot2d(p ~ x2, data = nd)

## Set up a sampling engine for linear models.
lm.mcmc <- function(x, y, start = NULL,
  n.iter = 12000, burnin = 2000, thin = 10,
  m = 0, M = 1e+05,
  a = 1, b = 1e-05,
  verbose = TRUE, ...)
{
  ## How many samples are saved?
  itrthin <- seq.int(burnin, n.iter, by = thin)
  nsaves <- length(itrthin)

  ## Only univariate response.
  y <- y[[1L]]

  ## For illustration this is easier to read.
  X <- x$mu$model.matrix

  ## Again, set edf within .GlobalEnv for the
  ## loglik() function in the lm_bamlss() family.
  .lm_bamlss.p <<- ncol(X)

  ## Number of observations and parameters.
  n <- length(y)
  p <- ncol(X)

  ## Matrix saving the samples.
  samples <- matrix(0, nsaves, p + 1L)

  ## Stick to the naming convention.
  pn <- paste0("mu.p.", colnames(X))
  colnames(samples) <- c(
    pn,      ## Regression coefficients and
    "sigma"  ## variance samples.
  )

  ## Setup coefficient vector,
  ## again, use correct names.
  beta <- rep(0, p)
  names(beta) <- pn
  sigma <- sd(y)

  ## Check for starting values obtained,
  ## e.g., from lm.opt() from above.
  if(!is.null(start)) {
    sn <- names(start)
    for(j in names(beta)) {
      if(j %in% sn)
        beta[j] <- start[j]
    }
  }

  ## Process prior information.
  m <- rep(m, length.out = p)
  if(length(M) < 2)
    M <- rep(M, length.out = p)
  if(!is.matrix(M))
    M <- diag(M)
  Mi <- solve(M)

  ## Precompute cross products.
  XX <- crossprod(X)
  Xy <- crossprod(X, y)

  ## Inverse gamma parameter.
  a <- a + n / 2 + p / 2

  ## Start sampling.
  ii <- 1
  for(i in 1:n.iter) {
    ## Sampling sigma
    b2 <- b + 1 / 2 * t(y - X %*% beta) %*% (y - X %*% beta) +
      1 / 2 * t(beta - m) %*% Mi %*% (beta - m)
    sigma2 <- sqrt(1 / rgamma(1, a, b2))

    ## Sampling beta.
    sigma2i <- 1 / sigma2
    Sigma <- chol2inv(chol(sigma2i * XX + sigma2i * Mi))
    mu <- Sigma %*% (sigma2i * Xy + sigma2i * Mi %*% m)
    beta <- MASS::mvrnorm(1, mu, Sigma)
      
    if(i %in% itrthin) {
      samples[ii, pn] <- beta
      samples[ii, "sigma"] <- sqrt(sigma2)
      ii <- ii + 1
    }
    if(verbose) {
      if(i %% 1000 == 0)
        cat("iteration:", i, "\n")
    }
  }

  ## Convert to "mcmc" object.
  samples <- as.mcmc(samples)

  return(samples)
}

## Test both engines on the simulated data.
b <- bamlss(f, data = d, family = "lm", optimizer = lm.opt, sampler = lm.mcmc)

## Show the summary.
summary(b)

## Predict for all terms including 95% credible intervals
nd$x1 <- nd$x3 <- seq(0, 1, length = 100)
for(j in c("x1", "x2", "x3")) {
  nd[[paste0("p.", j)]] <- predict(b, newdata = nd, term = j,
    FUN = c95, intercept = FALSE)
}

## Plot estimated effects.
par(mfrow = c(1, 3))
plot2d(p.x1 ~ x1, data = nd, fill.select = c(0, 1, 0, 1), lty = c(2, 1, 2))
plot2d(p.x2 ~ x2, data = nd, fill.select = c(0, 1, 0, 1), lty = c(2, 1, 2))
plot2d(p.x3 ~ x3, data = nd, fill.select = c(0, 1, 0, 1), lty = c(2, 1, 2))