###############################################################################
#                                                                             #
# Purpose:       GLMcat: An R package for GLMs for Categorical Responses      #
# Author:        Lorena Leon, Jean Peyhardi and Catherine Trottier            #
# Contact:       ylorenaleonv@gmail.com                                       #
# Comment:       Script associated to the submitted paper for the JSS         #
#                                                                             #
###############################################################################


# install.packages('GLMcat')
library("GLMcat")
library("ggplot2")
library("tidyr")
library("scatterplot3d")
library("dplyr")

##---------------------------------------------------------------
##                  First example (Section 2)                  --
##---------------------------------------------------------------

glmcat(formula = Level ~ Age, data = DisturbedDreams,
  categories_order = c("Not.severe", "Severe.1", "Severe.2",
    "Very.severe"), ratio = "adjacent", cdf = "gompertz")

##----------------------------------------------------------------
##                  Second example (Section 2)                  --
##----------------------------------------------------------------

glmcat(formula = Level ~ Age, data = DisturbedDreams,
   ref_category = "Very.severe", ratio = "reference", cdf = "gompertz")

##---------------------------------------------------------------
##                  First model (Section 3.1)                  --
##---------------------------------------------------------------

adj_gumbel_p <- glmcat(formula = Level ~ Age, data = DisturbedDreams,
   ratio = "adjacent", cdf = "gumbel", categories_order = c("Not.severe",
      "Severe.1", "Severe.2", "Very.severe"))
logLik(adj_gumbel_p)
summary(adj_gumbel_p)

adj_gompertz_rev <- glmcat(formula = Level ~ Age, data = DisturbedDreams,
   ratio = "adjacent", cdf = "gompertz", categories_order = c("Very.severe",
      "Severe.2", "Severe.1", "Not.severe"))
logLik(adj_gompertz_rev)
summary(adj_gompertz_rev)

##---------------------------------------------------------------
##                         Section 3.2                         --
##---------------------------------------------------------------

mod_ref_log_c <- glmcat(formula = Level ~ Age, ratio = "reference",
   parallel = FALSE, data = DisturbedDreams, cdf = "logistic")

mod_adj_log_c <- glmcat(formula = Level ~ Age, ratio = "adjacent",
   parallel = FALSE, data = DisturbedDreams, cdf = "logistic")

logLik(mod_ref_log_c); logLik(mod_adj_log_c)
coef(mod_ref_log_c)
coef(mod_adj_log_c)

A <- matrix(c(1, 0, 0, -1, 1, 0, 0, -1, 1), nrow = 3)
A %*% coef(mod_ref_log_c)[1:3]
A %*% coef(mod_ref_log_c)[4:6]

##---------------------------------------------------------------
##                         Section 3.4                        --
##---------------------------------------------------------------

data("accidents")

glmcat(accident_severity ~ road + urban_or_rural_area +
   day_of_week + number_of_casualties + weather +
   light_conditions + speed_limit, data = accidents,
   parallel = FALSE, ratio = "cumulative", cdf = "cauchy")

cum_cau <- glmcat(accident_severity ~ road + urban_or_rural_area +
   day_of_week + number_of_casualties + weather +
   light_conditions + speed_limit, data = accidents,
   parallel = TRUE, ratio = "cumulative", cdf = "cauchy")

summary(cum_cau)

adj_cau <- glmcat(accident_severity ~ road + urban_or_rural_area +
   day_of_week + number_of_casualties + weather +
   light_conditions + speed_limit, data = accidents,
   parallel = FALSE, ratio = "adjacent", cdf = "cauchy")

summary(adj_cau)

AIC(cum_cau)

AIC(adj_cau)

##---------------------------------------------------------------
##                          Section 4                          --
##---------------------------------------------------------------

head(TravelChoice, 8)

logistic_car <- discrete_cm(formula = choice ~ hinc +
   psize + gc + ttme, case_id = "indv", alternatives = "mode",
   reference = "car", data = TravelChoice, alternative_specific = c("gc",
      "ttme"), cdf = "logistic")
summary(logistic_car)
logLik(logistic_car)

logistic_car_alt <- discrete_cm(formula = choice ~
   hinc[air] + psize[air] + gc + ttme, case_id = "indv",
   alternatives = "mode", reference = "car", data = TravelChoice,
   alternative_specific = c("gc", "ttme"), cdf = "logistic")
summary(logistic_car_alt)
logLik(logistic_car_alt)

##---------------------------------------------------------------
# Finding base reference

mod_train <- discrete_cm(formula = choice ~ hinc +
              psize + gc + ttme, 
            case_id = "indv", alternatives = "mode",
            reference = "train", 
            alternative_specific = c("gc", "ttme"), normalization = 0.95,
            data = TravelChoice, cdf = "student", find_nu = TRUE)
mod_train$cdf
mod_car <- discrete_cm(formula = choice ~ hinc +
                     psize + gc + ttme, 
                   case_id = "indv", alternatives = "mode",
                   reference = "car", 
                   alternative_specific = c("gc", "ttme"), normalization = 0.95,
                   data = TravelChoice, cdf = "student", find_nu = TRUE)
mod_car$cdf
mod_air <- discrete_cm(formula = choice ~ hinc +
                     psize + gc + ttme, 
                   case_id = "indv", alternatives = "mode",
                   reference = "air", 
                   alternative_specific = c("gc", "ttme"), normalization = 0.95,
                   data = TravelChoice, cdf = "student", find_nu = TRUE)
mod_air$cdf
mod_bus <- discrete_cm(formula = choice ~ hinc +
                     psize + gc + ttme, 
                   case_id = "indv", alternatives = "mode",
                   reference = "bus", 
                   alternative_specific = c("gc", "ttme"), normalization = 0.95,
                   data = TravelChoice, cdf = "student", find_nu = TRUE)
mod_bus$cdf
which.max(c(logLik(mod_train), logLik(mod_car), logLik(mod_air), logLik(mod_bus))) 
logLik(mod_car)
summary(mod_car, normalized = TRUE)


# Figure 6
# ----------------------------------------------------------------

d_f <- c(seq(0.2, 2, by = 0.05), 3:20)
LogLik_car <- NA
i = 1
for (variable in d_f) {
   car_0 <- discrete_cm(formula = choice ~ hinc +
      psize + gc + ttme, case_id = "indv", alternatives = "mode",
      reference = "car", alternative_specific = c("gc",
         "ttme"), data = TravelChoice, cdf = list("student",
         df = variable))
   LogLik_car[i] <- car_0$LogLikelihood
   i = i + 1

}
datcar <- data.frame(d_f, LogLik = LogLik_car, Alternative = "Car")

LogLik_bus <- NA
i = 1
for (variable in d_f) {
   car_0 <- discrete_cm(formula = choice ~ hinc +
      psize + gc + ttme, case_id = "indv", alternatives = "mode",
      reference = "bus", alternative_specific = c("gc",
         "ttme"), data = TravelChoice, cdf = list("student",
         df = variable))
   LogLik_bus[i] <- car_0$LogLikelihood
   i = i + 1

}

datbus <- data.frame(d_f, LogLik = LogLik_bus, Alternative = "Bus")

LogLik_air <- NA
i = 1
for (variable in d_f) {
   car_0 <- discrete_cm(formula = choice ~ hinc +
      psize + gc + ttme, case_id = "indv", alternatives = "mode",
      reference = "air", alternative_specific = c("gc",
         "ttme"), data = TravelChoice, cdf = list("student",
         df = variable))
   LogLik_air[i] <- car_0$LogLikelihood
   i = i + 1

}

datair <- data.frame(d_f, LogLik = LogLik_air, Alternative = "Air")

LogLik_train <- NA
i = 1
for (variable in d_f) {
   car_0 <- discrete_cm(formula = choice ~ hinc +
      psize + gc + ttme, case_id = "indv", alternatives = "mode",
      reference = "train", alternative_specific = c("gc",
         "ttme"), data = TravelChoice, cdf = list("student",
         df = variable))
   LogLik_train[i] <- car_0$LogLikelihood
   i = i + 1

}

dattrain <- data.frame(d_f, LogLik = LogLik_train,
   Alternative = "Train")

dat2 <- rbind(datcar, datbus, datair, dattrain)

Figure6 <- ggplot(dat2, aes(x = d_f, y = LogLik, color = Alternative)) +
   scale_color_manual(values = c("red", "orange",
      "darkgreen", "skyblue")) + geom_point() + geom_line() +
   labs(x = latex2exp::TeX("$Student(\\nu)$"), y = "Log-Likelihood") +
   theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(),
      panel.background = element_blank(), axis.line = element_line(colour = "black"),
      legend.position = "none") + geom_hline(yintercept = -177.4541,
   linetype = "dotted", color = "black", size = 1) +
   geom_vline(xintercept = 8, linetype = "dotted",
      color = "black", size = 1) + scale_y_continuous(name = "Log-Likelihood",
   breaks = c(-200, -177, -147)) + scale_x_continuous(breaks = c(0,
   5, 8, 10, 15, 20))

Figure6  # Saved as pdf 6 * 3.1
# Save the plot as a PDF file
ggsave("Figures/Figure6.pdf", plot = Figure6, width = 6, height = 3.1)

# Model with only ttme
dis_stu_2 <- discrete_cm(formula = choice ~ ttme, case_id = "indv",
   alternatives = "mode", reference = "car", data = TravelChoice,
   alternative_specific = "ttme", cdf = list("student",
      df = 0.490052))

logLik(dis_stu_2)



# Transform data to wide format with 'ttme' spread across 'mode'
wide_data <- TravelChoice %>%
  select(individual = indv, mode, travel_time = ttme) %>%
  spread(key = mode, value = travel_time)

# Select choices where choice is TRUE and join with wide data
chosen_modes <- filter(TravelChoice, choice) %>%
  select(individual = indv, mode)
joined_data <- left_join(wide_data, chosen_modes, by = "individual")

# Sort joined data by descending order of modes
sorted_data <- joined_data %>%
  arrange(desc(air), desc(train), desc(bus), desc(car))

head(sorted_data[,-1],20)

# Count distinct 'mode' values for each transport combination
mode_summary <- sorted_data %>%
  group_by(air, train, bus, car) %>%
  summarise(modes_count = n_distinct(mode)) %>%
  ungroup
# Display summary of mode counts
summary(mode_summary$modes_count)


# Figure 7
# ----------------------------------------------------------------

head(TravelChoice)
data_wide <- spread(TravelChoice[, c("indv", "mode",
   "ttme")], mode, ttme)
choice <- TravelChoice[TravelChoice$choice == T, c("indv",
   "mode")]
dat_plot1 <- left_join(data_wide, choice)

dat_summarized <- dat_plot1 %>%
   group_by(air, bus, car, train, mode) %>%
   summarise(freq = n())
cols <- c("skyblue", "orange", "darkgreen", "red")

# Open a PDF device
pdf("Figures/Travel_link1.pdf")

s3d <- scatterplot3d(dat_summarized$air, dat_summarized$bus,
   dat_summarized$train, xlab = "ttme air", ylab = "",
   zlab = "ttme train", pch = 16, cex.symbols = as.numeric(as.factor(dat_summarized$freq))/10 +
      0.5, color = cols[as.numeric(dat_summarized$mode)])

legend(s3d$xyz.convert(-4, 9, 90), bty = "n", inset = -0.25,
   xpd = TRUE, horiz = FALSE, cex = 1, legend = levels(as.factor(c("air",
      "bus", "train", "car"))), col = cols, pch = 16)


dims <- par("usr")
x <- dims[1] + 0.9 * diff(dims[1:2]) - 1
y <- dims[3] + 0.08 * diff(dims[3:4])
text(x, y, "ttme bus", srt = 35)

# Close the PDF device
dev.off()


# Call sessionInfo
# --------------------------------------------------------
# knitr::spin('reproduction_script.R')
sessionInfo()