############################################################################### # # # Purpose: GLMcat: An R package for GLMs for Categorical Responses # # Author: Lorena Leon, Jean Peyhardi and Catherine Trottier # # Contact: ylorenaleonv@gmail.com # # Comment: Script associated to the submitted paper for the JSS # # # ############################################################################### # install.packages('GLMcat') library("GLMcat") library("ggplot2") library("tidyr") library("scatterplot3d") library("dplyr") ##--------------------------------------------------------------- ## First example (Section 2) -- ##--------------------------------------------------------------- glmcat(formula = Level ~ Age, data = DisturbedDreams, categories_order = c("Not.severe", "Severe.1", "Severe.2", "Very.severe"), ratio = "adjacent", cdf = "gompertz") ##---------------------------------------------------------------- ## Second example (Section 2) -- ##---------------------------------------------------------------- glmcat(formula = Level ~ Age, data = DisturbedDreams, ref_category = "Very.severe", ratio = "reference", cdf = "gompertz") ##--------------------------------------------------------------- ## First model (Section 3.1) -- ##--------------------------------------------------------------- adj_gumbel_p <- glmcat(formula = Level ~ Age, data = DisturbedDreams, ratio = "adjacent", cdf = "gumbel", categories_order = c("Not.severe", "Severe.1", "Severe.2", "Very.severe")) logLik(adj_gumbel_p) summary(adj_gumbel_p) adj_gompertz_rev <- glmcat(formula = Level ~ Age, data = DisturbedDreams, ratio = "adjacent", cdf = "gompertz", categories_order = c("Very.severe", "Severe.2", "Severe.1", "Not.severe")) logLik(adj_gompertz_rev) summary(adj_gompertz_rev) ##--------------------------------------------------------------- ## Section 3.2 -- ##--------------------------------------------------------------- mod_ref_log_c <- glmcat(formula = Level ~ Age, ratio = "reference", parallel = FALSE, data = DisturbedDreams, cdf = "logistic") mod_adj_log_c <- glmcat(formula = Level ~ Age, ratio = "adjacent", parallel = FALSE, data = DisturbedDreams, cdf = "logistic") logLik(mod_ref_log_c); logLik(mod_adj_log_c) coef(mod_ref_log_c) coef(mod_adj_log_c) A <- matrix(c(1, 0, 0, -1, 1, 0, 0, -1, 1), nrow = 3) A %*% coef(mod_ref_log_c)[1:3] A %*% coef(mod_ref_log_c)[4:6] ##--------------------------------------------------------------- ## Section 3.4 -- ##--------------------------------------------------------------- data("accidents") glmcat(accident_severity ~ road + urban_or_rural_area + day_of_week + number_of_casualties + weather + light_conditions + speed_limit, data = accidents, parallel = FALSE, ratio = "cumulative", cdf = "cauchy") cum_cau <- glmcat(accident_severity ~ road + urban_or_rural_area + day_of_week + number_of_casualties + weather + light_conditions + speed_limit, data = accidents, parallel = TRUE, ratio = "cumulative", cdf = "cauchy") summary(cum_cau) adj_cau <- glmcat(accident_severity ~ road + urban_or_rural_area + day_of_week + number_of_casualties + weather + light_conditions + speed_limit, data = accidents, parallel = FALSE, ratio = "adjacent", cdf = "cauchy") summary(adj_cau) AIC(cum_cau) AIC(adj_cau) ##--------------------------------------------------------------- ## Section 4 -- ##--------------------------------------------------------------- head(TravelChoice, 8) logistic_car <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "car", data = TravelChoice, alternative_specific = c("gc", "ttme"), cdf = "logistic") summary(logistic_car) logLik(logistic_car) logistic_car_alt <- discrete_cm(formula = choice ~ hinc[air] + psize[air] + gc + ttme, case_id = "indv", alternatives = "mode", reference = "car", data = TravelChoice, alternative_specific = c("gc", "ttme"), cdf = "logistic") summary(logistic_car_alt) logLik(logistic_car_alt) ##--------------------------------------------------------------- # Finding base reference mod_train <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "train", alternative_specific = c("gc", "ttme"), normalization = 0.95, data = TravelChoice, cdf = "student", find_nu = TRUE) mod_train$cdf mod_car <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "car", alternative_specific = c("gc", "ttme"), normalization = 0.95, data = TravelChoice, cdf = "student", find_nu = TRUE) mod_car$cdf mod_air <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "air", alternative_specific = c("gc", "ttme"), normalization = 0.95, data = TravelChoice, cdf = "student", find_nu = TRUE) mod_air$cdf mod_bus <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "bus", alternative_specific = c("gc", "ttme"), normalization = 0.95, data = TravelChoice, cdf = "student", find_nu = TRUE) mod_bus$cdf which.max(c(logLik(mod_train), logLik(mod_car), logLik(mod_air), logLik(mod_bus))) logLik(mod_car) summary(mod_car, normalized = TRUE) # Figure 6 # ---------------------------------------------------------------- d_f <- c(seq(0.2, 2, by = 0.05), 3:20) LogLik_car <- NA i = 1 for (variable in d_f) { car_0 <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "car", alternative_specific = c("gc", "ttme"), data = TravelChoice, cdf = list("student", df = variable)) LogLik_car[i] <- car_0$LogLikelihood i = i + 1 } datcar <- data.frame(d_f, LogLik = LogLik_car, Alternative = "Car") LogLik_bus <- NA i = 1 for (variable in d_f) { car_0 <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "bus", alternative_specific = c("gc", "ttme"), data = TravelChoice, cdf = list("student", df = variable)) LogLik_bus[i] <- car_0$LogLikelihood i = i + 1 } datbus <- data.frame(d_f, LogLik = LogLik_bus, Alternative = "Bus") LogLik_air <- NA i = 1 for (variable in d_f) { car_0 <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "air", alternative_specific = c("gc", "ttme"), data = TravelChoice, cdf = list("student", df = variable)) LogLik_air[i] <- car_0$LogLikelihood i = i + 1 } datair <- data.frame(d_f, LogLik = LogLik_air, Alternative = "Air") LogLik_train <- NA i = 1 for (variable in d_f) { car_0 <- discrete_cm(formula = choice ~ hinc + psize + gc + ttme, case_id = "indv", alternatives = "mode", reference = "train", alternative_specific = c("gc", "ttme"), data = TravelChoice, cdf = list("student", df = variable)) LogLik_train[i] <- car_0$LogLikelihood i = i + 1 } dattrain <- data.frame(d_f, LogLik = LogLik_train, Alternative = "Train") dat2 <- rbind(datcar, datbus, datair, dattrain) Figure6 <- ggplot(dat2, aes(x = d_f, y = LogLik, color = Alternative)) + scale_color_manual(values = c("red", "orange", "darkgreen", "skyblue")) + geom_point() + geom_line() + labs(x = latex2exp::TeX("$Student(\\nu)$"), y = "Log-Likelihood") + theme(panel.grid.major = element_blank(), panel.grid.minor = element_blank(), panel.background = element_blank(), axis.line = element_line(colour = "black"), legend.position = "none") + geom_hline(yintercept = -177.4541, linetype = "dotted", color = "black", size = 1) + geom_vline(xintercept = 8, linetype = "dotted", color = "black", size = 1) + scale_y_continuous(name = "Log-Likelihood", breaks = c(-200, -177, -147)) + scale_x_continuous(breaks = c(0, 5, 8, 10, 15, 20)) Figure6 # Saved as pdf 6 * 3.1 # Save the plot as a PDF file ggsave("Figures/Figure6.pdf", plot = Figure6, width = 6, height = 3.1) # Model with only ttme dis_stu_2 <- discrete_cm(formula = choice ~ ttme, case_id = "indv", alternatives = "mode", reference = "car", data = TravelChoice, alternative_specific = "ttme", cdf = list("student", df = 0.490052)) logLik(dis_stu_2) # Transform data to wide format with 'ttme' spread across 'mode' wide_data <- TravelChoice %>% select(individual = indv, mode, travel_time = ttme) %>% spread(key = mode, value = travel_time) # Select choices where choice is TRUE and join with wide data chosen_modes <- filter(TravelChoice, choice) %>% select(individual = indv, mode) joined_data <- left_join(wide_data, chosen_modes, by = "individual") # Sort joined data by descending order of modes sorted_data <- joined_data %>% arrange(desc(air), desc(train), desc(bus), desc(car)) head(sorted_data[,-1],20) # Count distinct 'mode' values for each transport combination mode_summary <- sorted_data %>% group_by(air, train, bus, car) %>% summarise(modes_count = n_distinct(mode)) %>% ungroup # Display summary of mode counts summary(mode_summary$modes_count) # Figure 7 # ---------------------------------------------------------------- head(TravelChoice) data_wide <- spread(TravelChoice[, c("indv", "mode", "ttme")], mode, ttme) choice <- TravelChoice[TravelChoice$choice == T, c("indv", "mode")] dat_plot1 <- left_join(data_wide, choice) dat_summarized <- dat_plot1 %>% group_by(air, bus, car, train, mode) %>% summarise(freq = n()) cols <- c("skyblue", "orange", "darkgreen", "red") # Open a PDF device pdf("Figures/Travel_link1.pdf") s3d <- scatterplot3d(dat_summarized$air, dat_summarized$bus, dat_summarized$train, xlab = "ttme air", ylab = "", zlab = "ttme train", pch = 16, cex.symbols = as.numeric(as.factor(dat_summarized$freq))/10 + 0.5, color = cols[as.numeric(dat_summarized$mode)]) legend(s3d$xyz.convert(-4, 9, 90), bty = "n", inset = -0.25, xpd = TRUE, horiz = FALSE, cex = 1, legend = levels(as.factor(c("air", "bus", "train", "car"))), col = cols, pch = 16) dims <- par("usr") x <- dims[1] + 0.9 * diff(dims[1:2]) - 1 y <- dims[3] + 0.08 * diff(dims[3:4]) text(x, y, "ttme bus", srt = 35) # Close the PDF device dev.off() # Call sessionInfo # -------------------------------------------------------- # knitr::spin('reproduction_script.R') sessionInfo()