# This file contains all of the code necessary to replicate the code examples as # well as the figures and tables used in the JSS submission titled "logitr: Fast # Estimation of Multinomial and Mixed Logit Models with Preference Space and # Willingness to Pay Space Utility Parameterizations". # ---- Install and Load Packages ---- # Packages must be installed once. To do so, uncomment the lines below. Once # installed, these lines should be re-commented out. # Install packages for creating figures # install.packages(c( # "tidyverse", "cowplot", "ggrepel", "randtoolbox", "RColorBrewer" # )) # Install packages for benchmarking # install.packages(c("logitr", "mlogit", "gmnl")) # Load libraries for making figures library("tidyverse") library("cowplot") library("ggrepel") library("randtoolbox") library("RColorBrewer") # Load libraries for estimating models library("logitr") library("mlogit") library("gmnl") # This replication file is structured as follows: # # 1) Code to replicate the example models used in manuscript (Section 5). Since # Section 5 is the first section manuscript that contains code, it is included # first in this replication file even though Figure 1, Figure 2, and Table 1 # appear earlier in the manuscript. All other code after this section is not # included in the manuscript but rather is used to replicate Figure 1, Figure 2, # and Table 1. # # 2) Code to replicate Figure 1 (Section 3.1), which involves first estimating # several models and then using the estimates from those models to create the # figure itself. # # 3) Code to replicate Figure 2 (Section 4.2). This requires first estimating # several models using different packages and then creating the figure based on # the estimation times of each model. The exact estimation times will *always* # vary for every run of each model, even when run on the same machine due to # variations in background processes. As a result, the only way to perfectly # reproduce Figure 2 in the manuscript is to use a saved data set of the run # times exported from a specific run on a specific machine. To facilitate this, # the code to estimate the models used to create Figure 2 was first run on the # following publicly accessible Google colab notebook: # # https://colab.research.google.com/drive/1vYlBdJd4xCV43UwJ33XXpO3Ys8xWkuxx?usp=sharing # # The run times were then exported as a csv and are included in the logitr # package as a dataframe called "runtimes" that is loaded with the package. # This data set is used in the code in this section to replicate Figure 2 in # the manuscript. # # 4) Code to replicate Table 1 (Section 4.2). This is computed from the # "runtimes" data frame used to create Figure 2. # # # Note that logitr uses parallel processing to simultaneously estimate models # that use a multi-start via the "numMultiStarts" argument. To ensure perfect # reproducibility with the results in the manuscript, numCores is set to 1 for # all models that use a multi-start so that all multi-start iterations are run # in series rather than in parallel. This will result in slower estimation, but # is necessary for precise reproduction of the manuscript results. In practice, # it is recommended to use more cores for faster performance. # 1) Replication of example models used in manuscript (Section 5) ---- # Section 5.2 - Data format ---- head(yogurt) # Section 5.3 - Model specification interface ---- # Estimate a multinomial logit model in the preference space with fixed # parameters mnl_pref <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", pars = c("price", "feat", "brand") ) # Estimate a multinomial logit model in the WTP space with fixed parameters mnl_wtp <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", pars = c("feat", "brand"), scalePar = "price" ) # Comparing WTP space specification with the {gmnl} package data_gmnl <- mlogit.data( data = yogurt, shape = "long", choice = "choice", id.var = "id", alt.var = "alt", chid.var = "obsID", opposite = "price" ) mnl_wtp <- gmnl( data = data_gmnl, formula = choice ~ price + feat + brand | 0 | 0 | 0 | 1, fixed = c(TRUE, FALSE, FALSE, FALSE, FALSE, TRUE, FALSE), model = "smnl", method = "bhhh", start = c(1, 0, 0, 0, 0, 0, 0) ) # Section 5.4 - Continuous and discrete variable coding ---- # Manually setting factor levels to set the default level for "brand" yogurt2 <- logitr::yogurt yogurt2$brand <- factor( x = yogurt2$brand, levels = c("weight", "hiland", "yoplait", "dannon") ) levels(yogurt2$brand) # Section 5.5 - Estimating multinomial logit models ---- # Preference space model mnl_pref <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", pars = c("price", "feat", "brand") ) # View summary of results summary(mnl_pref) # Extract coefficients using coef() coef(mnl_pref) # Extract standard errors using se() se(mnl_pref) # Extract the log-likelihood using coef() logLik(mnl_pref) # Extract variance-covariance matrix using vcov() vcov(mnl_pref) # Section 5.6 - Estimating willingness to pay ---- # Compute willingness to pay (WTP) wtp(mnl_pref, scalePar = "price") # Estimate a multinomial logit model in the WTP space with fixed parameters set.seed(123) mnl_wtp <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", pars = c("feat", "brand"), scalePar = "price", numMultiStarts = 10, numCores = 1 ) # View summary of results summary(mnl_wtp) # Compare computed WTP versus directly estimated WTP wtpCompare( model_pref = mnl_pref, model_wtp = mnl_wtp, scalePar = "price" ) # Section 5.7 - Estimating mixed logit models ---- # Estimate a preference space mixed logit model with a normally-distributed # brand parameter and a fixed price parameter set.seed(456) mxl_pref <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", panelID = "id", pars = c("price", "feat", "brand"), randPars = c(feat = "n", brand = "n"), numMultiStarts = 10, numCores = 1 ) # View summary of results summary(mxl_pref) # Section 5.8 - Estimating WTP space mixed logit models ---- # Estimate a WTP space mixed logit model with a normally-distributed brand WTP # parameter and feat parameter, and a fixed price / scale parameter set.seed(6789) mxl_wtp <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", panelID = "id", pars = c("feat", "brand"), scalePar = "price", randPars = c(feat = "n", brand = "n"), numMultiStarts = 10, numCores = 1 ) # View summary of results summary(mxl_wtp) # Compare computed WTP versus directly estimated WTP wtpCompare( model_pref = mxl_pref, model_wtp = mxl_wtp, scalePar = "price" ) # Section 5.9 - Weighted models ---- # Estimate an *unweighted* WTP space multinomial logit model with fixed # parameters using a multi-start loop set.seed(5678) mnl_wtp_unweighted <- logitr( data = cars_us, outcome = "choice", obsID = "obsnum", pars = c( "hev", "phev10", "phev20", "phev40", "bev75", "bev100", "bev150", "american", "japanese", "chinese", "skorean", "phevFastcharge", "bevFastcharge","opCost", "accelTime"), scalePar = "price", robust = TRUE, numMultiStarts = 10, numCores = 1 ) # View summary of results summary(mnl_wtp_unweighted) # Estimate a *weighted* WTP space multinomial logit model with fixed # parameters using a multi-start loop set.seed(5678) mnl_wtp_weighted <- logitr( data = cars_us, outcome = "choice", obsID = "obsnum", pars = c( "hev", "phev10", "phev20", "phev40", "bev75", "bev100", "bev150", "american", "japanese", "chinese", "skorean", "phevFastcharge", "bevFastcharge","opCost", "accelTime"), scalePar = "price", weights = "weights", robust = TRUE, numMultiStarts = 10, numCores = 1 ) # View summary of results summary(mnl_wtp_weighted) # Compare weighted versus unweighted coefficients data.frame( Unweighted = coef(mnl_wtp_unweighted), Weighted = coef(mnl_wtp_weighted) ) # Section 5.10 - Predicting probabilities ---- # Predict probabilities and outcomes using estimated models. # Predicted probabilities using the preferences space MNL model probs <- predict(mnl_pref) head(probs) # Same predictions, but also returning the data probs <- predict(mnl_pref, returnData = TRUE) head(probs) # Predict probabilities for a subset of alternatives data <- subset( yogurt, obsID %in% c(42, 13), select = c("obsID", "alt", "price", "feat", "brand")) probs_mnl_pref <- predict( mnl_pref, newdata = data, obsID = "obsID", ) probs_mnl_pref # Include a 95% confidence interval set.seed(5678) probs_mnl_pref <- predict( mnl_pref, newdata = data, obsID = "obsID", ci = 0.95 ) probs_mnl_pref # Predict probabilities using WTP space model set.seed(5678) probs_mnl_wtp <- predict( mnl_wtp, newdata = data, obsID = "obsID", ci = 0.95 ) probs_mnl_wtp # Section 5.11 - Predicting outcomes ---- # Predict outcomes using the estimated preference space model set.seed(5678) outcomes_pref <- predict( mnl_pref, type = "outcome", returnData = TRUE ) head(outcomes_pref) # Predict outcomes using the estimated WTP space model set.seed(5678) outcomes_wtp <- predict( mnl_wtp, type = "outcome", returnData = TRUE ) head(outcomes_wtp) # Compute accuracy of each model chosen_pref <- subset(outcomes_pref, choice == 1) chosen_pref$correct <- chosen_pref$choice == chosen_pref$predicted_outcome accuracy_pref <- sum(chosen_pref$correct) / nrow(chosen_pref) accuracy_pref chosen_wtp <- subset(outcomes_wtp, choice == 1) chosen_wtp$correct <- chosen_wtp$choice == chosen_wtp$predicted_outcome accuracy_wtp <- sum(chosen_wtp$correct) / nrow(chosen_wtp) accuracy_wtp # Section 5.11 - Additional options ---- # Use the WTP values computed from preference space model as starting points in # a WTP space model. # First compute the WTP values from the mnl_pref model: wtp_est <- wtp(mnl_pref, scalePar = "price")$Estimate # Now use wtp_est as the starting values for the first iteration of the # multi-start loop: set.seed(5678) mnl_wtp2 <- logitr( data = yogurt, outcome = "choice", obsID = "obsID", pars = c("feat", "brand"), scalePar = "price", startVals = wtp_est, numMultiStarts = 10, numCores = 1 ) # 2) Replication of Figure 1 ---- # Figure 1 plots the coefficients from the next 6 models. Each model can # take several minutes to estimate as each one involves a 30-iteration # multi-start, which is used because some of the models (e.g., where # price is log-normally distributed) can have trouble converging depending on # the random starting points. # Set up the data used to create the figure (copy of the yogurt data exported) # from {logitr} fig1_data <- logitr::yogurt # Set the reference brand to "weight" for the fig1_data data frame # This is done simply for stability purposes as some of the models # converge better when "weight" is the reference level brands <- c("weight", "hiland", "yoplait", "dannon") fig1_data$brand <- factor(fig1_data$brand, levels = brands) # Preference space mixed logit model with a normally-distributed brand # parameter and a fixed price / scale parameter: set.seed(1234) mxl_pref_f <- logitr( data = fig1_data, outcome = "choice", obsID = "obsID", pars = c("price", "brand"), randPars = c(brand = "n"), numMultiStarts = 30 ) # Preference space mixed logit model with a normally-distributed brand # parameter and a normally-distributed price / scale parameter: set.seed(1234) mxl_pref_n <- logitr( data = fig1_data, outcome = "choice", obsID = "obsID", pars = c("price", "brand"), randPars = c(price = "n", brand = "n"), numMultiStarts = 30 ) # Preference space mixed logit model with a normally-distributed brand # parameter and a log-normally-distributed price / scale parameter. # For this model, use negative of price since log-normal forces positivity: fig1_data_neg_price <- fig1_data fig1_data_neg_price$price <- -1*fig1_data$price set.seed(1234) mxl_pref_ln <- logitr( data = fig1_data_neg_price, outcome = "choice", obsID = "obsID", pars = c("price", "brand"), randPars = c(price = "ln", brand = "n"), numMultiStarts = 30 ) # WTP space mixed logit model with a normally-distributed brand WTP # parameter and a fixed price / scale parameter: set.seed(1234) mxl_wtp_f <- logitr( data = fig1_data, outcome = "choice", obsID = "obsID", pars = "brand", scalePar = "price", randPars = c(brand = "n"), numMultiStarts = 30 ) # WTP space mixed logit model with a normally-distributed brand WTP # parameter and normally-distributed price / scale parameter: set.seed(1234) mxl_wtp_n <- logitr( data = fig1_data, outcome = "choice", obsID = "obsID", pars = "brand", scalePar = "price", randPars = c(brand = "n"), randScale = "n", numMultiStarts = 30 ) # WTP space mixed logit model with a normally-distributed brand WTP # parameter and log-normally-distributed price / scale parameter: set.seed(1234) mxl_wtp_ln <- logitr( data = fig1_data, outcome = "choice", obsID = "obsID", pars = "brand", scalePar = "price", randPars = c(brand = "n"), randScale = "ln", numMultiStarts = 30 ) # Compare log-likelihood of each model to check that they all reached # the same solution: data.frame( pref_f = logLik(mxl_pref_f), pref_n = logLik(mxl_pref_n), pref_ln = logLik(mxl_pref_ln), wtp_f = logLik(mxl_wtp_f), wtp_n = logLik(mxl_wtp_n), wtp_ln = logLik(mxl_wtp_ln) ) # Generate simulated data from the estimated parameters using halton draws pars_pref_f <- coef(mxl_pref_f) pars_pref_n <- coef(mxl_pref_n) pars_pref_ln <- coef(mxl_pref_ln) pars_wtp_f <- coef(mxl_wtp_f) pars_wtp_n <- coef(mxl_wtp_n) pars_wtp_ln <- coef(mxl_wtp_ln) N <- 10^4 draws <- randtoolbox::halton(N, normal = TRUE) mu <- "brandyoplait" sd <- "sd_brandyoplait" pars <- tibble( alpha_f = rep(pars_pref_f["price"], N), alpha_n = pars_pref_n["price"] + draws*pars_pref_n["sd_price"], alpha_ln = exp(pars_pref_ln["price"] + draws*pars_pref_ln["sd_price"]), beta_f = pars_pref_f[mu] + draws*pars_pref_f[sd], beta_n = pars_pref_n[mu] + draws*pars_pref_n[sd], beta_ln = pars_pref_ln[mu] + draws*pars_pref_ln[sd], omega_f = pars_wtp_f[mu] + draws*pars_wtp_f[sd], omega_n = pars_wtp_n[mu] + draws*pars_wtp_n[sd], omega_ln = pars_wtp_ln[mu] + draws*pars_wtp_ln[sd]) %>% # Reshape simulated draws for plotting mutate( pref_f = beta_f / (-1*alpha_f), pref_n = beta_n / (-1*alpha_n), pref_ln = beta_ln / alpha_ln ) %>% select(starts_with(c("pref", "omega"))) %>% gather(key = "model", value = "wtp", everything()) %>% mutate(model = fct_recode(model, "pref_f" = "pref_f", "pref_n" = "pref_n", "pref_ln" = "pref_ln", "wtp_f" = "omega_f", "wtp_n" = "omega_n", "wtp_ln" = "omega_ln" )) %>% separate(model, into = c("space", "model"), sep = "_") %>% mutate(space = fct_recode(space, "Preference" = "pref", "WTP" = "wtp" )) %>% # Remove extreme values by dropping 0.1% and 99.9% quantiles group_by(space, model) %>% mutate( lower = quantile(wtp, 0.01), upper = quantile(wtp, 0.99)) %>% filter(wtp > lower, wtp < upper) %>% select(-lower, -upper) # Create data frame for Figure 1 from estimated models fig1data <- pars %>% # Add labels for facets left_join(data.frame( model = c("f", "n", "ln"), title = c( "A) Price Fixed", "B) Price Normally Distributed", "C) Price Log-normally Distributed" ) )) %>% # Compute means and standard deviations group_by(space, model) %>% mutate( mean = mean(wtp), sd = sd(wtp), label = paste0( "Mean: ", round(mean, 2), "\nSD: ", round(sd, 2))) %>% ungroup() %>% mutate(space = fct_relevel(space, c("Preference", "WTP"))) # Make plot font <- "Fira Sans Condensed" plotColors <- c("red", "grey42") fig1 <- ggplot(fig1data) + geom_density( aes(x = wtp, y = ..density.., fill = space), color = "black", size = 0.1, alpha = 0.42) + geom_vline( mapping = aes(xintercept = mean, color = space), linetype = "dashed") + facet_wrap(vars(title), scales = "free") + scale_y_continuous(expand = expansion(mult = c(0, 0.05))) + coord_cartesian(xlim = c(2, 8), ylim = c(0, 8)) + scale_fill_manual( values = plotColors, guide = guide_legend(reverse = TRUE) ) + scale_color_manual(values = plotColors, guide = "none") + theme_minimal_hgrid(font_family = font, font_size = 16) + panel_border() + labs( x = "WTP for Yoplait brand", y = "Density", fill = "Model space:") + theme( legend.position = "bottom", plot.title = element_text(hjust = 0.5, size = 14), axis.title.x = element_text(size = 13), axis.title.y = element_text(size = 13)) + geom_text( data = fig1data %>% group_by(space, model) %>% slice(1), mapping = aes( label = label, x = c(4.6, 5, 5.1, 2.8, 2.7, 2.7), y = 4, color = space), size = 5, family = font, hjust = 0.5) if (!dir.exists("Figures")) dir.create("Figures") # Save chart ggsave("Figures/fig1.png", fig1, width = 13, height = 4.8) # 2) Replication of Figure 2 (benchmarking) ---- # The code below compares the relative computation time to estimate # a preference space mixed logit models using the # {logitr}, {mixl}, {mlogit}, {gmnl}, and {apollo} packages # # The exact estimation times obtained will *always* vary for every run # of each model, even when run on the same machine due to variations in # background processes. As a result, the only way to perfectly reproduce # Figure 2 in the manuscript is to use a data set of the run times from a # specific run on a specific machine. To facilitate this, the code to estimate # the models used to create Figure 2 was first run on the following publicly # accessible Google colab notebook: # # https://colab.research.google.com/drive/1vYlBdJd4xCV43UwJ33XXpO3Ys8xWkuxx?usp=sharing # # The run times were then exported as a csv and are included in the # logitr package as a dataframe called "runtimes" that is loaded with the # package. This data set is used in the code in this section to # replicate Figure 2 in the manuscript. # Visualize estimation times using "runtimes" data frame from Google # colab notebook (Figure 2 in manuscript) plotColors <- c("black", RColorBrewer::brewer.pal(n = 5, name = "Set1"), "gold") fig2 <- logitr::runtimes %>% ggplot(aes(x = numDraws, y = time_sec, color = package)) + geom_line() + geom_point() + geom_text_repel( data = . %>% filter(numDraws == max(numDraws)), aes(label = package), hjust = 0, nudge_x = 40, direction = "y", size = 4.5, segment.size = 0 ) + scale_x_continuous( limits = c(0, 1200), breaks = unique(logitr::runtimes$numDraws), labels = scales::comma) + scale_y_continuous(limits = c(0, 300), breaks = seq(0, 300, 100)) + scale_color_manual(values = plotColors) + guides( point = guide_legend(override.aes = list(label = "")), color = guide_legend(override.aes = list(label = ""))) + theme_bw(base_size = 18) + theme( panel.grid.minor = element_blank(), panel.grid.major.x = element_blank(), legend.position = "none", axis.line.x = element_blank(), axis.ticks.x = element_blank() ) + labs( x = "Number of random draws", y = "Computation time (seconds)" ) # Save chart ggsave("Figures/fig2.png", fig2, width = 8, height = 5.5) # 3) Replication of Table 1 ---- # Compute speed comparison tables using the "runtimes" data frame from # the Google colab benchmark logitr_time <- logitr::runtimes %>% filter(package == "logitr") %>% rename(time_logitr = time_sec) temp <- logitr::runtimes %>% left_join(select(logitr_time, -package), by = "numDraws") %>% mutate(mult = round(time_sec/ time_logitr, 1)) %>% select(-time_logitr) table_time <- temp %>% select(-mult) %>% pivot_wider(names_from = numDraws, values_from = time_sec) table_mult <- temp %>% select(-time_sec) %>% pivot_wider(names_from = numDraws, values_from = mult) # Print the tables table_time table_mult # Fine sessionInfo()