# This file contains all code used in Section 6 of the paper by Speidel, Drechsler, Jolani.

# install hmi
if (!require("hmi")) {
    install.packages("hmi")
}

# load the required packages
library("hmi")
# We should avoid this:
library("broom.mixed")

# load the required data sets
data("Gcsemv", package = "hmi")
data("selfreport", package = "mice")
data("nhanes_mod", package = "hmi")
data("nhanes_sub", package = "hmi")

## Section 6.1
# summary of the data
summary(Gcsemv)

# check type of data
list_of_types_maker(Gcsemv)

# example on how a variable type can be specified by the user
modified_list <- list_of_types_maker(Gcsemv)
modified_list$coursework <- "ordered_categorical"

# set up a model_formula
model_formula <- written ~ 1 + gender + coursework + (1 + gender | school)

# imputation of the data
set.seed(123)
dat_imputed <- hmi(data = Gcsemv, model_formula = model_formula)
# Note: a warning message that one of the internal models did not converge to
# the allowed tolerance of 0.002, but 0.0045, is given. Since this appears
# in only one of multiple iterations and with only a rather small deviation
# from the allowed tolerance, the author evaluates the issue as neglectable.

# generation of Figure 1
plot(dat_imputed, layout = c(2, 2))

# convergence checks and plots. Will generate 12 plots. From those 12 plots,
# number 5 and 8 are included in the article (Figure 3 and 2).
chaincheck(dat_imputed, thin = NULL)

# pooling
summary(dat_imputed$pooling)

# set up a analysis_function
analysis_function <- function(complete_data) {

  # An empty list which will contain the (temporarily) results is set up.

  parameters_of_interest <- list()

  # The analysis model of interest is specified.

  my_model <- lme4::lmer(written ~ 1 + gender + coursework + (1 + gender | school),
                         data = complete_data)

  # Specify, which parameters from the model should be returned.
  # In the example the analyst is interested in all fixed effects estimates
  # and the random effects covariance matrix.

  parameters_of_interest[[1]] <- lme4::fixef(my_model)
  parameters_of_interest[[2]] <- lme4::VarCorr(my_model)[[1]][, ]
  # The following line is essential if the elements of interest should be labeled.
  ret <- unlist(parameters_of_interest)

  # Labeling the output
  names(ret) <- c("intercept", "gender", "coursework",
                  "sigma0", "sigma01", "sigma10", "sigma1")

  # Return the results.

  return(ret)
}

# pooling with hmi_pool
hmi_pool(mids = dat_imputed, analysis_function = analysis_function)

## Section 6.2
# initialization of arrays
low <- array(dim = 9971)
up <- array(dim = 9971)

# filling the lower bounds array
low[nhanes_sub$ind310 == 1] <- 0
low[nhanes_sub$ind310 == 2] <- 3001
low[nhanes_sub$ind310 == 3] <- 5001
low[nhanes_sub$ind310 == 4] <- 10001
low[nhanes_sub$ind310 == 5] <- 15001
low[nhanes_sub$ind310 == 6] <- 0
low[nhanes_sub$ind310 == 7] <- 20001
low[nhanes_sub$ind310 == 8] <- 0

# filling the upper bounds array
up[nhanes_sub$ind310 == 1] <- 3000
up[nhanes_sub$ind310 == 2] <- 5000
up[nhanes_sub$ind310 == 3] <- 10000
up[nhanes_sub$ind310 == 4] <- 15000
up[nhanes_sub$ind310 == 5] <- 20000
up[nhanes_sub$ind310 == 6] <- 20000
up[nhanes_sub$ind310 == 7] <- Inf
up[nhanes_sub$ind310 == 8] <- Inf

# generation of an interval object
ind310interval <- generate_interval(low, up)

# inspecting the interval object
head(ind310interval)

# imputation of the data
set.seed(123)
nhanes_imp <- hmi(nhanes_mod, maxit = 20, k = 3)

# table of an interval object
table(nhanes_mod$ind310)

# generation of Figure 4
plot(nhanes_mod$ind310, sort = "mostprecise_increasing")

# generating midpoints of an interval object
midpoints <- center_interval(nhanes_mod$ind310)

# table of the midpoints
table(midpoints)

# switching from interval to idf format
idf <- interval2idf(nhanes_mod$ind310)

# switching from idf to interval format
intervaldf <- idf2interval(idf)

# splitting up an interval object
bounds <- split_interval(nhanes_mod$ind310)
head(bounds)

# calculations for interval objects
log_savings_in_euro <- log(nhanes_mod$ind310 * 0.8)

## Section 6.3

# calculation of the shares of observations rounded to the nearest multiple of 5 and 10
sum(selfreport$wr %% 10 == 0)/nrow(selfreport) # 0.1941748
sum(selfreport$wr %%  5 == 0)/nrow(selfreport) # 0.3800971

# imputation of the data
set.seed(123)
selfreport_imputed <- hmi(selfreport[, c("hr", "wr")],
                          rounding_degrees = list(wr = c(1, 5, 10)))
# generation of Figure 5
par(mfrow = c(1, 2))
hist(selfreport$wr, breaks = 100,
     xlab = "selfreported weight", main = "Original, heaped data",
     ylim = c(0, 100))
hist(mice::complete(selfreport_imputed, 1)$wr, breaks = 100,
     xlab = "selfreported weight", main = "Data after first imputation",
     ylim = c(0, 100))