############################################################################ ### Section 2 On using mnlogit ############################################################################ library("mnlogit") # Display expected data format data("Fish", package = "mnlogit") head(Fish, 8) # Formula examples fm <- formula(mode ~ price | income | catch) # A set of equivalent formulas fm <- formula(mode ~ price | income - 1 | catch) fm <- formula(mode ~ price | income | catch - 1) fm <- formula(mode ~ 0 + price | income | catch) # A set of non-equivalent formulas fm <- formula(mode ~ 1 | income | catch) fm <- formula(mode ~ price | 1 | catch) fm <- formula(mode ~ price | income | 1) fm <- formula(mode ~ price | 1 | 1) fm <- formula(mode ~ 1 | 1 | price + catch) # Another set of equivalent formulas - showing omission of certain slots fm <- formula(mode ~ price + catch | 1 | 1) fm <- formula(mode ~ price + catch | 1) fm <- formula(mode ~ price + catch) # Fitting a basic model fm <- formula(mode ~ price | income | catch) fit <- mnlogit(fm, Fish, ncores=2) class(fit) # Fitting same model using multiple processor cores fit <- mnlogit(fm, Fish, choiceVar = "alt", ncores = 2) class(fit) # Printing estimation information print(fit, what = "eststat") print(fit, what = "modsize") ############################################################################ ### 4 Benchmarking performance ############################################################################ ############################################################################ ### Table 1 ############################################################################ library("mnlogit") library("mlogit") source("simChoiceModel.R") # Has makeModel() to generate simulated data ## Parameter Setting # Num choices numChoices <- 100 # setting used to generate results in Table 1 of paper ## NOTE ## May take quite long to run, for quick results uncomment line below #numChoices <- 10 cat(paste0("\nProfiling mnlogit with ", numChoices, " alternatives.\n")) ############################################################################ ## Generate simulated data and model formulas ############################################################################ # Type "X" problems # Make formula vars <- paste("X", 1:50, sep = "", collapse = " + ") fmX <- formula(paste("response ~ 1|", vars, " - 1 | 1")) # Make formula for type "Y" problems vars <- paste("X", 1:50, sep = "", collapse = " + ") fmY <- formula(paste("response ~ 1| - 1 | ", vars)) # Formula for type "Z" problems vars <- paste("X", 1:50, sep = "", collapse = " + ") fmZ <- formula(paste("response ~ ", vars, "| - 1 | 1")) # Formula for type "YZ" problems # 5 variables of type "Z" and 45 variables of type "Y" vars <- paste("X", 1:45, sep = "", collapse = " + ") fmYZ <- formula(paste("response ~ X46 + X47 + X48 + X49 + X50| - 1 | ", vars)) ############################################################################ ## Run code and generate results ############################################################################ # Table for recording results. See Table 1 of the paper time.table <- matrix(rep(0, 5 * 4), nrow = 4) colnames(time.table) <- c("Pre-processing time(s)", "NR time(s)", "Hessian time(s)", "Total time(s)", "n_p") rownames(time.table) <- c("Problem X", "Problem Y", "Problem Z", "Problem YZ") ## Problem X dataX <- makeModel("X", K = numChoices) # generate data # Default args set: p = 50 variables, N = K * p * 20 observations # Run mnlogit on 1 proc mnlogit.time <- system.time(fit <- mnlogit(fmX, dataX, "choices"))[3] # Store results time.table[1, 1] <- fit$est.stat$prepTimeSecs time.table[1, 2] <- mnlogit.time - fit$est.stat$prepTimeSecs time.table[1, 3] <- fit$est.stat$hessMins * 60 time.table[1, 4] <- mnlogit.time time.table[1, 5] <- 50 * (numChoices - 1) # num vars = 50 , as set above cat(paste0("\n\tDone problem X.\n")) # Data for Type "Y, "Z" & "YZ" problems data <- makeModel("Y", K = numChoices) # generate data # Default args set: p = 50 variables, N = K * p * 20 observations ## Problem Y # Run mnlogit on 1 proc mnlogit.time <- system.time(fit <- mnlogit(fmY, data, "choices"))[3] # Store results time.table[2, 1] <- fit$est.stat$prepTimeSecs time.table[2, 2] <- mnlogit.time - fit$est.stat$prepTimeSecs time.table[2, 3] <- fit$est.stat$hessMins * 60 time.table[2, 4] <- mnlogit.time time.table[2, 5] <- 50 * numChoices # num vars = 50 , as set above cat(paste0("\n\tDone problem Y.\n")) ## Problem Z # Run mnlogit on 1 proc mnlogit.time <- system.time(fit <- mnlogit(fmZ, data, "choices"))[3] # Store results time.table[3, 1] <- fit$est.stat$prepTimeSecs time.table[3, 2] <- mnlogit.time - fit$est.stat$prepTimeSecs time.table[3, 3] <- fit$est.stat$hessMins * 60 time.table[3, 4] <- mnlogit.time time.table[3, 5] <- 50 # num vars = 50 , as set above cat(paste0("\n\tDone problem Z.\n")) ## Problem YZ # Run mnlogit on 1 proc mnlogit.time <- system.time(fit <- mnlogit(fmYZ, data, "choices"))[3] # Store results time.table[4, 1] <- fit$est.stat$prepTimeSecs time.table[4, 2] <- mnlogit.time - fit$est.stat$prepTimeSecs time.table[4, 3] <- fit$est.stat$hessMins * 60 time.table[4, 4] <- mnlogit.time time.table[4, 5] <- 45 * numChoices + 5 # Following the formula "fmYZ" cat(paste0("\n\tDone problem YZ.\n")) cat("\nFinished running tests, printing results (See Table 1 of paper).\n") print(time.table) ############################################################################ ### Table 2 ############################################################################ ## Parameter Setting # List of number of alternatives to keep in test multinomial logit models numChoicesVec <- c(10, 20, 30) ## NOTE ## Above setting used is to generate Table 2 in paper ## May take about 4 hours to run - depending on the machine ## Quicker results may be obtained by uncommenting the following line # numChoicesVec <- c(3, 5, 7) if (length(numChoicesVec) < 1) stop("Can't be empty!") if (any(numChoicesVec <= 2)) stop("Can't specify any model with fewer than 2 choices!") ############################################################################ ## Generate simulated data and model formulas ############################################################################ # Type "X" problems # Make formula vars <- paste("X", 1:50, sep = "", collapse = " + ") fmX <- formula(paste("response ~ 1|", vars, " - 1 | 1")) # Make formula for type "Y" problems vars <- paste("X", 1:50, sep = "", collapse = " + ") fmY <- formula(paste("response ~ 1| - 1 | ", vars)) # Formula for type "Z" problems vars <- paste("X", 1:50, sep = "", collapse = " + ") fmZ <- formula(paste("response ~ ", vars, "| - 1 | 1")) # Formula for type "YZ" problems # 5 variables of type "Z" and 45 variables of type "Y" vars <- paste("X", 1:45, sep = "", collapse = " + ") fmYZ <- formula(paste("response ~ X46 + X47 + X48 + X49 + X50| - 1 | ", vars)) ############################################################################ ## Run code and generate results ############################################################################ # Tables for recording runnign times ratio wrt to mnlogit # See Table 2 of the paper # Newton-Raphson ratios nr.ratio.table <- matrix(rep(0, 4 * length(numChoicesVec)), nrow = 4) colnames(nr.ratio.table) <- numChoicesVec rownames(nr.ratio.table) <- c("Problem X", "Problem Y", "Problem YZ", "Problem Z") # BFGS ratios bfgs.ratio.table <- matrix(rep(0, 4 * length(numChoicesVec)), nrow = 4) colnames(bfgs.ratio.table) <- numChoicesVec rownames(bfgs.ratio.table) <- c("Problem X", "Problem Y", "Problem YZ", "Problem Z") for (i in 1:length(numChoicesVec)) { numChoices <- numChoicesVec[i] cat(paste0("\nRunning speed test with ", numChoices, " alternatives.\n")) ## Problem X dataX <- makeModel("X", K = numChoices) # generate data # Default args set: p = 50 variables, N = K * p * 20 observations # Data for mlogit mdatX <- mlogit.data(dataX[order(dataX$indivID), ], "response", shape = "long", alt.var = "choices") # Run mnlogit on 1 proc mnlogit.time <- system.time(fit.mnlogit <- mnlogit(fmX, dataX, "choices"))[3] # Run mlogit nr.time <- system.time(fit.mlogit <- mlogit(fmX, mdatX))[3] # Newton-Raphson bfgs.time <- system.time(fit.mlogit <- mlogit(fmX, mdatX, method = "bfgs"))[3] # Compute & Store ratios nr.ratio.table[1, i] <- nr.time / mnlogit.time bfgs.ratio.table[1, i] <- bfgs.time / mnlogit.time cat(paste0("\n\tDone problem X.\n")) # Data for Type "Y, "Z" & "YZ" problems data <- makeModel("Y", K = numChoices) # generate data # Default args set: p = 50 variables, N = K * p * 20 observations mdat <- mlogit.data(data[order(data$indivID), ], "response", shape = "long", alt.var = "choices") ## Problem Y # Run mnlogit on 1 proc mnlogit.time <- system.time(fit.mnlogit <- mnlogit(fmY, data, "choices"))[3] # Run mlogit nr.time <- system.time(fit.mlogit <- mlogit(fmY, mdat))[3] # Newton-Raphson bfgs.time <- system.time(fit.mlogit <- mlogit(fmY, mdat, method = "bfgs"))[3] # Compute & Store ratios nr.ratio.table[2, i] <- nr.time / mnlogit.time bfgs.ratio.table[2, i] <- bfgs.time / mnlogit.time cat(paste0("\n\tDone problem Y.\n")) ## Problem YZ # Run mnlogit on 1 proc mnlogit.time <- system.time(fit.mnlogit <- mnlogit(fmYZ, data, "choices"))[3] # Run mlogit nr.time <- system.time(fit.mlogit <- mlogit(fmYZ, mdat))[3] # Newton-Raphson bfgs.time <- system.time(fit.mlogit <- mlogit(fmYZ, mdat, method = "bfgs"))[3] # Compute & Store ratios nr.ratio.table[3, i] <- nr.time / mnlogit.time bfgs.ratio.table[3, i] <- bfgs.time / mnlogit.time cat(paste0("\n\tDone problem YZ.\n")) ## Problem Z # Run mnlogit on 1 proc mnlogit.time <- system.time(fit.mnlogit <- mnlogit(fmZ, data, "choices"))[3] # Run mlogit nr.time <- system.time(fit.mlogit <- mlogit(fmZ, mdat))[3] # Newton-Raphson bfgs.time <- system.time(fit.mlogit <- mlogit(fmZ, mdat, method = "bfgs"))[3] # Compute & Store ratios nr.ratio.table[4, i] <- nr.time / mnlogit.time bfgs.ratio.table[4, i] <- bfgs.time / mnlogit.time cat(paste0("\n\tDone problem Z.\n")) } cat("\nFinished running tests, printing runtime ratios (See Table 2 of paper).") cat("\nNewton-Raphson (mlogit / mnlogit)\n") print(nr.ratio.table) cat("\nBFGS(mlogit / mnlogit)\n") print(bfgs.ratio.table) ############################################################################ ### Table 3 ############################################################################ ## Parameter Setting (used for Table 3 of paper) numChoices <- 100 ## NOTE ## Above setting can take very long, uncomment the line below for quick results #numChoices <- 20 problem <- c("X", "Y", "Z", "YZ") data.problem <- c("X", "Y", "Y", "Y") for (i in 1:4) { ## Type of problem data <- makeModel(data.problem[i], K = numChoices) # generate data ## Make formula fm <- get(paste0("fm", problem[i])) ## Matrix to store results results <- matrix(rep(0, 8), nrow = 2) colnames(results) <- c("Serial fraction", "S_2", "S_4", "S_8") rownames(results) <- c("Actual speedup", "Amdahl's law") time.1 <- system.time(fit.1 <- mnlogit(fm, data, "choices", ncores = 1))[3] ## Estimated fraction of code running in serial f_s <- 1.0 - fit.1$est.stat$hessMins * 60 / time.1 results[1, 1] <- f_s results[2, 1] <- f_s cat("\nFinished running on 1 core.\n") time.2 <- system.time(fit.2 <- mnlogit(fm, data, "choices", ncores = 2))[3] results[1, 2] <- time.1 / time.2 results[2, 2] <- 1.0 / (f_s + (1 - f_s) / 2) cat("\nFinished running on 2 cores.\n") time.3 <- system.time(fit.4 <- mnlogit(fm, data, "choices", ncores = 4))[3] results[1, 3] <- time.1 / time.3 results[2, 3] <- 1.0 / (f_s + (1 - f_s) / 4) cat("\nFinished running on 4 cores.\n") time.4 <- system.time(fit.8 <- mnlogit(fm, data, "choices", ncores = 8))[3] results[1, 4] <- time.1 / time.4 results[2, 4] <- 1.0 / (f_s + (1 - f_s) / 8) cat("\nFinished running on 8 cores.\n") cat("Problem ", problem[i], "\n") cat("\nActual mnlogit and Amdahl's law parallel speedup (see Table 3 of paper):\n") print(results) cat("\nNote: Serial fraction estimated from single core runtime.\n") } ############################################################################## ## Code to Fig. 1 (Parallel Hessian Calculation Speedup) ## ## ## ## Speed tests single & multi threaded mnlogit using simulated data. ## ## ## ## Running this script will produce single and multicore runtimes on your ## ## LOCAL machine. Runtimes WILL fluctuate if other programs are running. ## ############################################################################## ############################################################################## ## User parameters ## ## These settings are used for results of Fig 1 in the paper ## ############################################################################## # Vector with number of cores to run tests on cores.run <- c(1, 2, 4, 8, 16) # Separate timing runs for each # Number of choices num.choices <- 100 # Controls problem size # 50 covariates is default choice set below in "nvars" parameter & formulas. ############################################################################## ## Note to user ## ############################################################################## # With above settings it takes about 4 hours to run this script on dev machine. # For quick results run with `smaller" values, e.g. # cores.run <- c(1, 2, 4) # num.choices <- 20 ############################################################################## ## File to plot Hessian calculation speedup data (NULL disables plotting) ## ############################################################################## hessSpeedupFile <- paste0("speedupHess_K", num.choices, ".pdf") # NULL #hessSpeedupFile <- "HessianSpeedups.pdf" # NULL ############################################################################## # Runs mnlogit and returns runtimes (all in seconds) time.mnlogit <- function(formula, data, num.cores = 1) { cat("\n==============\nTiming Run Starts\n==============\n") cat(paste0("Using ", num.cores, " processors.\n")) elapsed <- system.time( # Increase print.level to 1 to enable in-iteration printing fit <- mnlogit(formula, data, choiceVar = "choices", ncores = num.cores, maxiter = 50, ftol = 1e-12, gtol = 1e-3, print.level = 0) ) return(list(tot.time = elapsed[3], hess.time = fit$est.stat$hessMins * 60)) cat("\n==============\nTiming Run Ends\n==============\n") } # Creates a table of runtimes (all in sec), after running over multiple cores run.tests <- function(formula, data, num.cores.vec) { if (!is.vector(num.cores.vec)) stop("Invalid num.cores.vec argument.") timings <- matrix(rep(0, 3 * length(num.cores.vec)), nrow = length(num.cores.vec), ncol = 3) colnames(timings) <- c("ncores", "hess.time", "tot.time") for (i in c(1:length(num.cores.vec))) { ncores <- num.cores.vec[i] stats <- time.mnlogit(formula, data, num.cores = ncores) timings[i, 1] <- ncores timings[i, 2] <- stats$hess.time timings[i, 3] <- stats$tot.time } return(timings) } ############################################################################# ## Formulas of the 4 problem classes. ############################################################################# # 50 variables of type "X" fmX <- formula(response ~ 1| X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23 + X24 + X25 + X26 + X27 + X28 + X29 + X30 + X31 + X32 + X33 + X34 + X35 + X36 + X37 + X38 + X39 + X40 + X41 + X42 + X43 + X44 + X45 + X46 + X47 + X48 + X49 + X50 - 1| 1) # 50 variables of type "Y" fmY <- formula(response ~ 1| -1 | X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23 + X24 + X25 + X26 + X27 + X28 + X29 + X30 + X31 + X32 + X33 + X34 + X35 + X36 + X37 + X38 + X39 + X40 + X41 + X42 + X43 + X44 + X45 + X46 + X47 + X48 + X49 + X50) # 50 variables of type "Z" fmZ <- formula(response ~ X1 + X2 + X3 + X4 + X5 + X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23 + X24 + X25 + X26 + X27 + X28 + X29 + X30 + X31 + X32 + X33 + X34 + X35 + X36 + X37 + X38 + X39 + X40 + X41 + X42 + X43 + X44 + X45 + X46 + X47 + X48 + X49 + X50| -1 | 1) # 45 variables of type "Y", 5 variables of type "Z" fmYZ <- formula(response ~ X1 + X2 + X3 + X4 + X5 | -1 | X6 + X7 + X8 + X9 + X10 + X11 + X12 + X13 + X14 + X15 + X16 + X17 + X18 + X19 + X20 + X21 + X22 + X23 + X24 + X25 + X26 + X27 + X28 + X29 + X30 + X31 + X32 + X33 + X34 + X35 + X36 + X37 + X38 + X39 + X40 + X41 + X42 + X43 + X44 + X45 + X46 + X47 + X48 + X49 + X50) nvars <- 50 # number of variables in each of the 4 problem classes ############################################################################# # Run problem type "X" timing tests ############################################################################# cat("\n--------------\nCASE X\n--------------\n") df <- makeModel("X", K = num.choices, numCovars = nvars) timeX <- run.tests(fmX, df, cores.run) cat("\n----------------------------\n") ############################################################################# # Run problem type "Y" timing tests ############################################################################# cat("\n--------------\nCASE Y\n--------------\n") df <- makeModel("Y", K = num.choices, numCovars = nvars) timeY <- run.tests(fmY, df, cores.run) cat("\n----------------------------\n") ############################################################################# # Run problem type "Z" timing tests ############################################################################# cat("\n--------------\nCASE Z\n--------------\n") df <- makeModel("Y", K = num.choices, numCovars = nvars) timeZ <- run.tests(fmZ, df, cores.run) cat("\n----------------------------\n") ############################################################################# # Run problem type "YZ" timing tests ############################################################################# cat("\n--------------\nCASE YZ\n--------------\n") df <- makeModel("Y", K = num.choices, numCovars = nvars) timeYZ <- run.tests(fmYZ, df, cores.run) cat("\n----------------------------\n") ############################################################################# cat("\n***** FINAL STATS *****\n") ############################################################################# cat("\n--------------\nCASE X\n--------------\n") print(timeX) cat("\n--------------\nCASE Y\n--------------\n") print(timeY) cat("\n--------------\nCASE Z\n--------------\n") print(timeZ) cat("\n--------------\nCASE YZ\n--------------\n") print(timeYZ) ############################################################################# # Total Execution time speedups (Table 3 of the paper) ############################################################################# totTimes <- cbind(timeX[ ,3], timeY[ ,3], timeZ[ ,3], timeYZ[ ,3]) totSpeedup <- apply(totTimes, 2, function(vec) 1.0 / (vec / vec[1])) totSpeedup <- cbind(timeX[ ,1], totSpeedup) colnames(totSpeedup) <- c("ncores", "caseX", "caseY", "caseZ", "caseYZ") cat("\n--------------\nTotal parallel speedup\n--------------\n") print(totSpeedup) ############################################################################# # Hessian calculation speedups ############################################################################# hessTimes <- cbind(timeX[ ,2], timeY[ ,2], timeZ[ ,2], timeYZ[ ,2]) hessSpeedup <- apply(hessTimes, 2, function(vec) 1.0/(vec/vec[1])) hessSpeedup <- cbind(timeX[ ,1], hessSpeedup) colnames(hessSpeedup) <- c("ncores", "caseX", "caseY", "caseZ", "caseYZ") cat("\n--------------\nHessian parallel speedup\n--------------\n") print(hessSpeedup) ############################################################################# ## Plotting hessian speedup. See Fig 1 of the vignette. ############################################################################# if (!is.null(hessSpeedupFile)) { pdf(hessSpeedupFile) matplot(hessSpeedup[ , 1], hessSpeedup[ , 2:5], type = "b", pch = 1:4, col = 1:4, xlab = "procs", ylab = "Speedup") legend("topleft", c("case X", "case Y", "case YZ", "case Z"), pch = 1:4, col = 1:4) dev.off() cat(paste("\nPlotted speedups to file:", hessSpeedupFile, "\n")) }