### R code from vignette source 'hglm.Rnw' Encoding: UTF-8

set.seed(7892)

## Load packages
library("holiglm")
library("detectseparation")
library("ztable")
library("ggplot2")
library("ggbeeswarm")

## 4. The holiglm package

## Fit log binomial model without constraints (similar to stats::glm)
data("Caesarian", package = "lbreg")
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), solver = "ecos",
  data = Caesarian, constraints = NULL)
fit

## Display all generics for class glm in package stats
grep("^[[:alnum:]]*\\.glm$", ls(getNamespace("stats")), value = TRUE)

## Inspect summary of fit (output similar to summary of stats::glm)
summary(fit)

## Fit on model.matrix
x <- model.matrix(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, data = Caesarian)
model <- hglm_model(x = x, y = with(Caesarian, cbind(n1, n0)), binomial(link = "log"))

## Construct optimization problem with dry_run = TRUE
model <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, data = Caesarian, binomial(link = "log"),
  constraints = NULL, dry_run = TRUE)

## View underlying optimization problem and solve it explicitly
op <- as.OP(model)
print(op)
ROI::ROI_solve(op)

## Check fit for same solution
fit <- hglm_fit(model)
fit

## Combine multiple constraints with c
c(k_max(5), rho_max(0.8))


## Fit with global sparsity constraint, k_max = 2
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = k_max(2),
  data = Caesarian)
coef(fit)

## Display active coefficients
active_coefficients(fit)
coef(fit, type = "selected")

## Fit with group sparsity constraint
data("apprentice", package = "GLMsData")
fo <- Apps ~ Dist + log(Dist) + Pop + log(Pop) + Urban + Locn
constraints <- c(group_sparsity(c("Dist", "log(Dist)")), group_sparsity(c("Pop",
  "log(Pop)")))
fit <- hglm(fo, constraints = constraints, family = poisson(), data = apprentice)
coef(fit)


## Fit with global sparsity + inout constraint
fo <- Apps ~ log(Dist) + log(Pop) + Urban + Locn
constraints <- c(k_max(3), group_inout(c("LocnSouth", "LocnWest")))
fit <- hglm(fo, constraints = constraints, family = poisson(), data = apprentice)

## Fit with limited pairwise multicollinearity
fit <- hglm(mpg ~ cyl + disp + hp + drat + wt, data = mtcars, constraints = rho_max(0.9))
coef(fit)

## Fit with inclusion constraint
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = c(k_max(2),
  include("NPLAN")), data = Caesarian)
coef(fit)

## Fit with bounded domains (lower + upper) constraints
constraints <- c(lower(c(RISK = 3, ANTIB = 0.001)), upper(c(NPLAN = -1)))
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = constraints,
  data = Caesarian)
coef(fit)

## Fit with linear constraints RISK <= ANTIB
risk_leq_antib <- c(RISK = 1, ANTIB = -1)
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = linear(risk_leq_antib,
  "<=", 0), data = Caesarian)
coef(fit)

## Fit with linear constraint RISK + NPLAN == 1
risk_nplan <- c(RISK = 1, NPLAN = 1)
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = linear(risk_nplan,
  "==", 1), data = Caesarian)
coef(fit)
coef(fit)[["RISK"]] + coef(fit)[["NPLAN"]]


## Fit with multiple constraints at the same time
L <- rbind(c(RISK = 1, NPLAN = 0, ANTIB = -1), c(1, 1, 0))
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = linear(L,
  c("<=", "=="), c(0, 1)), data = Caesarian)
coef(fit)

## Ensure coefficients of a group are equal
fit <- hglm(cbind(n1, n0) ~ RISK + NPLAN + ANTIB, binomial(link = "log"), constraints = group_equal(c("RISK",
  "NPLAN", "ANTIB")), data = Caesarian)
coef(fit)

## Include lower order polynomials if higher order is included
data("heatcap", package = "GLMsData")
fo <- Cp ~ poly(Temp, 5)
L <- rbind(c(-1, 1, 0, 0, 0), c(0, -1, 1, 0, 0), c(0, 0, -1, 1, 0), c(0, 0, 0, -1,
  1))
colnames(L) <- colnames(model.matrix(fo, data = heatcap))[-1]
lin_z_c <- linear(L, rep("<=", nrow(L)), rep(0, nrow(L)), on_big_m = TRUE)

## Combine lower order polynomials inclusion with global sparsity constraint
constraints <- c(k_max(3), lin_z_c)
fit <- hglm(fo, data = heatcap, constraints = constraints)
coef(fit)

## Enforce same sign on a group of covariates
fo <- Apps ~ 0 + Locn + Locn:Dist + Locn:Pop + Locn:Urban
constraints <- c(sign_coherence(c("LocnNorth:Dist", "LocnSouth:Dist", "LocnWest:Dist")),
  sign_coherence(c("LocnNorth:Pop", "LocnSouth:Pop", "LocnWest:Pop")), sign_coherence(c("LocnNorth:Urban",
    "LocnSouth:Urban", "LocnWest:Urban")))
fit <- hglm(fo, constraints = constraints, family = poisson("identity"), data = apprentice)
coef(fit)

## Enforce same sign on coefficients of pairwise multicollinearited features
fit <- hglm(formula = mpg ~ cyl + disp + hp + drat + wt, data = mtcars, constraints = pairwise_sign_coherence(0.9))
coef(fit)

## Estimate a sequence of models
fit_seq <- hglm_seq(formula = Cp ~ poly(Temp, 5), data = heatcap, constraints = lin_z_c)
fit_seq

## Aggregate data by unique rows
data("Heart", package = "lbreg")
heart <- agg_binomial(Heart ~ ., data = Heart, as_list = FALSE)
c(nrow(Heart), nrow(heart))

## Fit model on aggregated data
hglm(cbind(success, failure) ~ ., binomial(link = "log"), data = heart, constraints = NULL)

## 5. Comparison with existing \proglang{R} packages
lbl_eq <- function(x) label_both(x, sep = " = ")

## Load cached data for plots
supp_dir <- normalizePath("simulations/supp")

sizes <- readRDS(file.path(supp_dir, "sizes.rds"))
binomial_size <- sizes[[2]]

## Plot occupied memory of hglm objects with size 'normal'/'big'
lbls <- c("glm", "hglm", "hglm(\"big\")")
ggplot(binomial_size, aes(n, y = size, group = method)) + geom_point(aes(color = method,
  shape = method)) + geom_line(aes(linetype = method, color = method)) + labs(y = "size of fitted object (in MB)",
  color = "Method", linetype = "Method", shape = "Method") + facet_wrap(~p, labeller = lbl_eq) +
  scale_color_brewer(labels = lbls, palette = "Dark2") + scale_linetype_discrete(labels = lbls) +
  scale_shape_discrete(labels = lbls) + theme(axis.text.x = element_text(angle = 45,
  hjust = 1))

## 6. Uses cases

## Remove rows containing NAs
data("AdultUCI", package = "arules")
AdultUCI <- na.omit(AdultUCI)
dim(AdultUCI)

## Show distribution of repsonse
prop.table(table(AdultUCI$income))

## Show counts for different group combinations
xtabs(~sex + race, data = AdultUCI)


## Combine levels with low count into group Other
levels(AdultUCI$race)[c(1, 2)] <- "Other"

## Observe difference of high income earning in sex and race groups
aggregate(income ~ sex + race, data = AdultUCI, function(x) mean(as.numeric(x) -
  1))

## Classify race and sex as sensitive variables
W <- model.matrix(~0 + sex:race, data = AdultUCI)

## Create formula for predictive model analysis
form <- "income ~ age + relationship + `marital-status` + workclass +
  `education-num` + occupation + I(`capital-gain` - `capital-loss`) +
  `hours-per-week`"

## Estimate unconstrained model with no fairness constraints and upper bound on
## disparate impact covariance measure
m0 <- glm(as.formula(form), family = binomial(), data = AdultUCI)
s <- apply(W, 2, function(w) abs(cov(w, m0$linear.predictors)))

## Create model matrix and constraint matrix for fairness constraint model
xm <- model.matrix(m0)
L <- t(apply(W, 2, function(w) colMeans((w - mean(w)) * xm)))

## Estimate fairness constraint model for various levels of alpha cache results
alpha <- seq(0, 1, by = 0.05)
FILE_OUT <- "results-cache/pred_prob_race_sex.rds"
if (file.exists(FILE_OUT)) {
  pred_prob_race_sex <- readRDS(FILE_OUT)
} else {
  K <- nrow(L)
  pred_prob_race_sex <- sapply(alpha, function(ak) {
    ck <- ak * s
    model_constrained <- hglm(as.formula(form), family = binomial(), data = AdultUCI,
      scaler = "center_standardization", big_m = 5, constraints = c(linear(L,
        rep(">=", K), -ck), linear(L, rep("<=", K), ck)))
    phat <- predict(model_constrained, type = "response")
    phat
  })
  dir.create("results-cache", showWarnings = FALSE, recursive = FALSE)
  saveRDS(pred_prob_race_sex, file = FILE_OUT)
}

## Calculate disparate impact, accuracy and AUROC for different values of alpha
obs <- AdultUCI$income
res <- apply(pred_prob_race_sex, 2, function(phat) {
  tab <- table(phat > 0.5, obs)
  tab_w <- table(phat > 0.5, W[, "sexMale:raceWhite"])
  ptab_w <- prop.table(tab_w)
  DI <- ptab_w[2, 1]/ptab_w[2, 2]
  acc <- sum(diag(tab))/sum(tab)
  c(DI = DI, acc = acc, auc = pROC::roc(obs, phat, quiet = TRUE)$auc)
})

## Create scatterplots of disparate impact, accuracy and AUROC
par(mfrow = c(1, 3))
plot(alpha, res[1, ], type = "b", xlab = expression(alpha), ylab = "Disparate impact")
plot(alpha, res[2, ], type = "b", xlab = expression(alpha), ylab = "Accuracy")
plot(alpha, res[3, ], type = "b", xlab = expression(alpha), ylab = "AUROC")

## Load icu data and drop id column
data("icu", package = "aplore3")
dim(icu)
icu <- icu[, setdiff(colnames(icu), "id")]

## Check if MLE exists for log binomial model
glm(sta ~ ., family = binomial("log"), data = icu, method = "detect_separation")

## Check if MLE exists for logit binomial model
glm(sta ~ ., family = binomial("logit"), data = icu, method = "detect_separation")


## Fit sequence of models and cache results
cache_icu_results_file <- "results-cache/log-binomial_example.rds"
if (!file.exists(cache_icu_results_file)) {
  fits <- hglm_seq(formula = sta ~ ., family = binomial("log"), data = icu, solver = "ecos")
  saveRDS(fits, cache_icu_results_file)
} else {
  fits <- readRDS(cache_icu_results_file)
}


## This is a slightly modified version of make makeHeatmap from the ztable
## package.  Therefore this function is again under GPL-2 license.
make_heatmap <- function(z, palette = "Reds", mycolor = NULL, rows = NULL, cols = NULL,
  changeColor = TRUE, reverse = FALSE, margin = 0, na_color = "#FFFFFF") {
  if (is.null(mycolor))
    mycolor <- palette2colors(palette)
  ncolor <- length(mycolor)
  mycolor <- c(mycolor, na_color)
  df <- z$x
  if (is.null(rows))
    rows <- 1:nrow(df)
  if (is.null(cols))
    cols <- 1:ncol(df)
  df1 <- df[rows, cols]
  select <- sapply(df1, is.numeric)
  selected <- cols[which(select)]
  if (margin == 0) {
    res <- as.matrix(df1[select])
    max <- res[which.max(res)]
    min <- res[which.min(res)]
    res <- round((res - min) * (ncolor - 1)/(max - min)) + 1
    res[is.na(res)] <- length(mycolor)
  } else if (margin == 1) {
    res <- df1[select]
    result <- apply(res, 1, normalize2, ncolor)
    res[] <- t(result)
  } else {
    res <- df1[select]
    res[] <- apply(res, 2, normalize2, ncolor)
  }
  for (i in 1:nrow(res)) {
    for (j in 1:ncol(res)) {
      color <- NULL
      if (changeColor) {
        if (reverse) {
          color <- ifelse(res[i, j] > ncolor/2, "black", "white")
        } else {
          color <- ifelse(res[i, j] > ncolor/2, "white", "black")
        }
      }
      z <- addCellColor(z, rows = rows[i] + 1, cols = selected[j] + 1, bg = mycolor[res[i,
        j]], color = color)
    }
  }
  z
}

## Calculate k, loglike, AIC and BIC from fitted models
stats <- data.frame(k = sapply(fits, "[[", "k_max"), loglik = sapply(fits, logLik),
  aic = sapply(fits, AIC), bic = sapply(fits, BIC))
stats <- cbind(stats, do.call(rbind, lapply(fits, coef)))
stats <- stats[order(stats$k, decreasing = TRUE), ]
stats <- stats[, -2L]

## Build heatmap
d <- stats
rownames(d) <- NULL
d[, 5:ncol(d)][abs(d[, 5:ncol(d)]) < 1e-04] <- NA_real_
d[, 4:ncol(d)] <- exp(d[, 4:ncol(d)])
colnames(d) <- gsub("(Intercept)", "$RR_0$", colnames(d), fixed = TRUE)
colnames(d) <- gsub("<=", "$\\\\leq$", colnames(d))
colnames(d) <- gsub("<", "$<$", colnames(d))
colnames(d) <- gsub(">", "$>$", colnames(d))
z <- ztable::ztable(d, type = "latex", include.rownames = FALSE)
bgcolors <- ztable::gradientColor(low = "yellow", mid = "orange", high = "red", n = 100)
z <- make_heatmap(z, mycolor = bgcolors, cols = 5:ncol(d), changeColor = FALSE)
pale <- rev(ztable::gradientColor(low = "white", mid = "lightgreen", high = "lightblue",
  n = 1000))
z <- make_heatmap(z, mycolor = pale, cols = 2, changeColor = FALSE)
z <- make_heatmap(z, mycolor = pale, cols = 3, changeColor = FALSE)
z

## Appendix

## Load cached data for plots in Appendix
supp_dir <- normalizePath("simulations/supp")

linear_times <- readRDS(file.path(supp_dir, "linear.rds"))
gauss_linear_time <- linear_times[[1]]
binomial_linear_time <- linear_times[[2]]
poisson_linear_time <- linear_times[[3]]

bss_times <- readRDS(file.path(supp_dir, "bss.rds"))
gauss_bss_time <- bss_times[[1]]
binomial_bss_time <- bss_times[[2]]
poisson_bss_time <- bss_times[[3]]

## Plot runtime beeswarm for bss case and gaussian data
lbls <- c("abess", "bestglm", "hglm(ECOS)", "hglm(GUROBI)")
ggplot(gauss_bss_time, aes(x = factor(method), y = time, group = method)) + geom_beeswarm(aes(shape = method,
  color = method)) + labs(y = "time (in s) - log-scale", color = "Package", x = "",
  shape = "Package") + scale_y_log10() + facet_wrap(~p, labeller = lbl_eq) + scale_color_brewer(labels = lbls,
  palette = "Dark2") + scale_x_discrete(labels = lbls) + scale_shape_discrete(labels = lbls) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Plot runtime beeswarm for bss case and binomial data
lbls <- c("abess", "bestglm", "hglm(ECOS)", "hglm(MOSEK)")
ggplot(binomial_bss_time, aes(x = factor(method), y = time, group = method)) + geom_beeswarm(aes(shape = method,
  color = method)) + labs(y = "time (in s) - log-scale", color = "Package", x = "",
  shape = "Package") + scale_y_log10() + facet_wrap(~p, labeller = lbl_eq) + scale_color_brewer(labels = lbls,
  palette = "Dark2") + scale_x_discrete(labels = lbls) + scale_shape_discrete(labels = lbls) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Plot runtime beeswarm for bss case and poisson data
lbls <- c("abess", "bestglm", "hglm(ECOS)", "hglm(MOSEK)")
ggplot(poisson_bss_time, aes(x = factor(method), y = time, group = method)) + geom_beeswarm(aes(shape = method,
  color = method)) + labs(y = "time (in s) - log-scale", color = "Package", x = "",
  shape = "Package") + scale_y_log10() + facet_wrap(~p, labeller = lbl_eq) + scale_color_brewer(labels = lbls,
  palette = "Dark2") + scale_x_discrete(labels = lbls) + scale_shape_discrete(labels = lbls) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Plot runtime beeswarm for linear constraints case and gaussian data
lbls <- c("ConsReg", "hglm(ECOS)", "hglm(GUROBI)", "restriktor")
ggplot(gauss_linear_time, aes(x = factor(method), y = time, group = method)) + geom_beeswarm(aes(shape = method,
  color = method)) + labs(y = "time (in s) - log-scale", color = "Package", x = "",
  shape = "Package") + scale_y_log10() + facet_wrap(~p, labeller = lbl_eq) + scale_color_brewer(labels = lbls,
  palette = "Dark2") + scale_x_discrete(labels = lbls) + scale_shape_discrete(labels = lbls) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

## Plot runtime beeswarm for linear constraints case and binomial data
lbls <- c("ConsReg", "hglm(ECOS)", "hglm(MOSEK)", "restriktor")
ggplot(binomial_linear_time, aes(x = factor(method), y = time, group = method)) +
  geom_beeswarm(aes(shape = method, color = method)) + labs(y = "time (in s) - log-scale",
  color = "Package", x = "", shape = "Package") + scale_y_log10() + facet_wrap(~p,
  labeller = lbl_eq) + scale_color_brewer(labels = lbls, palette = "Dark2") + scale_x_discrete(labels = lbls) +
  scale_shape_discrete(labels = lbls) + theme(axis.text.x = element_text(angle = 45,
  hjust = 1))

## Plot runtime beeswarm for linear constraints case and poisson data
lbls <- c("ConsReg", "hglm(ECOS)", "hglm(MOSEK)", "restriktor")
ggplot(poisson_linear_time, aes(x = factor(method), y = time, group = method)) +
  geom_beeswarm(aes(shape = method, color = method)) + labs(y = "time (in s) - log-scale",
  color = "Package", x = "", shape = "Package") + scale_y_log10() + facet_wrap(~p,
  labeller = lbl_eq) + scale_color_brewer(labels = lbls, palette = "Dark2") + scale_x_discrete(labels = lbls) +
  scale_shape_discrete(labels = lbls) + theme(axis.text.x = element_text(angle = 45,
  hjust = 1))

FALSE
