## This file contains the code required to reproduce the results presented
## in the paper. Before running the code, make sure you have the necessary packages
## installed. Open the file in your R environment and execute it. This should
## reproduce the results from the paper, including any tables, figures, or
## statistical analyses. The code may generate output files, plots, or tables.
## Check the code comments or documentation for details on where to find these
## outputs.


## * packages

library("hdpGLM")
library("PReMiuM")
## 
library("magrittr")
library("glue")
library("tictoc")
library("SimDesign")
library("ggplot2")

## * setup

theme_set(theme_bw() +
          theme(legend.position = "top") + 
          theme(strip.background = element_rect(colour="white", fill="white"),
                strip.text.x = element_text(size=11, face='bold', hjust=0),
                strip.text.y = element_text(size=11, face="bold", vjust=0),
                legend.background= element_rect(fill='transparent'),
                legend.key.height = grid::unit(.1, "cm"),
                legend.key.width = grid::unit(.8, "cm")
                )
          )

## * argument constant


## Example
## Parameters of prior distribution for a DPP with gaussian outcome
## that regress an outcome on two covariates
const_list = list(
    ## For beta (vector of size tree for intercept and two linear coefficients)
    mu_beta = c(0, 0, 0),
    Sigma_beta = diag(3),
    ## For the sigma_k
    df_sigma=10,
    s2_sigma=10,
    ## for SBC
    alpha=1
)


## * str of hdpGLM object, output of hdpGLM()

options(digits = 5)
set.seed(777)
sim <- hdpGLM_simulateData(n = 200, nCov = 2, K = 2, nCovj = 1, J = 5,
                           family = "gaussian")
df <- sim$data
mod <- hdpGLM(y ~ X1 + X2, y ~ W1, data = df, K = 50,
              mcmc = list(burn.in = 0, n.iter = 100), n.display = 30)
mod$max_active
class(mod)
str(mod)

## * Summary of hdpGLM

summary(mod)
summary_tidy(mod)

## * Visualization (non-hierarchical DPP)

set.seed(777)

## Simulate data to show examples of visualization using DPP model
sim <- hdpGLM_simulateData(n = 500, nCov = 2, K = 3, nCovj = 1, J = 1,
                           family = "gaussian")

## Estimate the model
mod <- hdpGLM(y ~ X1 + X2, data = sim$data,
              mcmc = list(burn.in = 0, n.iter = 100))

## Plots
## -----
g <- plot(mod); g

## with option to plot clusters separately
g <- plot(mod, separate = T); g

## with option to plot clusters separately and selecting covars to display
plot(mod, separate=T, term=c('X1', 'X2'))

## * Visualization (hierarchical DPP)

set.seed(777)
sim <- hdpGLM_simulateData(n = 500, nCov = 2, K = 3, nCovj = 1, J = 5,
                           family = "gaussian")
## creating context labels
sim$data <- sim$data %>% dplyr::mutate(school = paste0("School ",
                                                       toupper(letters[C])))
## estimation
mod <- hdpGLM(y ~ X1 + X2, y ~ W1, data = sim$data, context.id = "school",
             mcmc = list(burn.in = 0, n.iter = 100))

## Plots
## -----
g <- plot(mod, context.id = "school", term=c("X1", "X2")); g

g <- plot_tau(mod); g

g <- plot_pexp_beta(mod, smooth.line = T); g

## * Classification

set.seed(777)
## Simulate data
sim <- hdpGLM_simulateData(n = 500, nCov = 2, K = 3, nCovj = 1, J = 1,
                           family = "gaussian")
## estimate the model
mod <- hdpGLM(y ~ X1 + X2, data = sim$data,
              mcmc = list(burn.in = 0, n.iter = 100))

## classify the data points
hdpGLM_classify(data=sim$data, samples=mod) %>% head()

## * Examples
## ** Applied (DPP)

data("welfare", package = "hdpGLM")
head(welfare)

welfare %>% tibble::as_tibble()

mod <- lm(support ~ inequality + income + ideology, data = welfare)
summary(mod)

set.seed(777)
mcmc <- list(burn.in = 1000, n.iter = 5000)
mod <- hdpGLM(support ~ inequality + income + ideology, data = welfare, 
              mcmc = mcmc, K = 30)
summary(mod)

nclusters(mod)
classify(welfare, mod) %>% head()

## ** Applied (hDPP)

help(welfare2)
data("welfare2", package = "hdpGLM")
head(welfare2)

## adding country labels
welfare2 <- welfare2 %>%
    dplyr::mutate(country = paste0("Ctr ", country) ) 

## estimation
set.seed(777)
mcmc <- list(burn.in = 1, n.iter = 50)
mod <- hdpGLM(support ~ inequality + income + ideology, context.id = "country",
             support ~ gap, data = welfare2, mcmc = mcmc)

summary_tidy(mod)
plot(mod, ncol = 4, context.id = "country")
plot_pexp_beta(mod, smooth.line = T)
plot_tau(mod)


## saving the plots
g <- plot(mod, ncol = 4, context.id = "country"); g
g <- plot_pexp_beta(mod, smooth.line = T); g

## ** Simulated (DPP)

## Complete example
## ----------------
library(hdpGLM)


set.seed(777)
## simlating model parameters of the DPP regression
parameters <- hdpGLM_simParameters(nCov = 3, K = 2) 
## setting all linear covariates of second cluster to be equal the first
parameters$beta[[2]] <- parameters$beta[[1]]                          
## setting beta2 of the first cluster to be 2
parameters$beta[[1]]['beta2'] <- 2                                        
## setting beta2 of the second cluster to be -1.5
parameters$beta[[2]]['beta2'] <- -1.5                                     
parameters


## simulating the data
sim <- hdpGLM_simulateData(n = 2000, nCov = 3, K = 2, parameters = parameters,
                           family = "gaussian") 
str(sim)
head(sim$data)
df <- sim$data
mod <- hdpGLM(y ~ X1 + X2 + X3, data = df, 
              mcmc = list(burn.in = 500, n.iter = 10000))

summary(mod)
## summary_tidy(mod)

g <- plot(mod, separate = TRUE, ncol = 4); g

dfc <- hdpGLM_classify(data = df, samples = mod)
head(dfc)

## * Session info
sessionInfo()