## NOTE: For this script to run in reasonable time (< 1 min), the size of the
## simulation grid for the timing plots (nlist, plist, nsim) can be reduced by
## setting fast <- TRUE
fast <- FALSE

#####
## CODE FOR SECTION 3.1
#####
class(binomial(link = "probit"))
str(binomial(link = "probit"))

library("glmnet")
data("BinomialExample", package = "glmnet")
fit <- glm(y ~ x, data = BinomialExample, family = binomial(link = "probit"))

#####
## CODE FOR SECTION 3.3
#####
data("PoissonExample", package = "glmnet")
x <- PoissonExample$x
y <- PoissonExample$y
fit1 <- glmnet(x, y, family = "poisson")
fit2 <- glmnet(x, y, family = poisson())
cbind(coef(fit1, s = 0.1), coef(fit2, s = 0.1))

fit <- glmnet(x, y, family = quasipoisson())

#####
## CODE FOR TIMING PLOTS (FIGURE 1)
#####
library("glmnet")
library("survival")
library("tidyverse")

## parameters that are common across the gaussian and binomial simulations
if (fast) {
  nlist <- c(100, 300, 1000)
  plist <- c(100, 300)
  nsim <- 2
} else {
  nlist <- c(100, 300, 1000, 3000)
  plist <- c(100, 300, 1000)
  nsim <- 5
}
s <- 0.1        # I assume that p * s is always an integer
thresh <- 1e-7  # default

custom_theme <- theme_bw() +
  theme(legend.position = "right",
        plot.title = element_text(size = rel(1.5), face = "bold", hjust = 0.5),
        axis.title = element_text(size = rel(1.2)),
        strip.text = element_text(size = rel(1.2)),
        axis.text = element_text(size = rel(1)),
        legend.text = element_text(size = rel(1.2)),
        axis.text.x = element_text(angle = 45, hjust = 1))

## run gaussian() vs. "gaussian" comparison
set.seed(1)
res <- matrix(NA, nrow = length(nlist) * length(plist) * nsim, ncol = 6)
rownum <- 0
for (n in nlist) {
  for (p in plist) {
    for (i in 1:nsim) {
      rownum <- rownum + 1
      cat("row", rownum, "n", n, "p", p, "simulation no.", i, fill = TRUE)

      # create fake data
      x <- matrix(rnorm(n * p), nrow = n)
      beta <- matrix(c(2 + rnorm(p * s), rep(0, times = p * (1 - s))), ncol = 1)
      y <- scale(x %*% beta + rnorm(n))
      row <- c(n, p, i)

      # run models
      old_time <- system.time(glmnet(x, y, family = "gaussian", thresh = thresh))[1]
      new_time <- system.time(glmnet(x, y, family = gaussian(), thresh = thresh))[1]
      cat("old time:", old_time, fill = TRUE)
      cat("new time:", new_time, fill = TRUE)

      row <- c(row, old_time, new_time, new_time / old_time)
      res[rownum, ] <- row
    }
  }
}
df <- data.frame(res)
names(df) <- c("n", "p", "run", "glmnet", "glmnetFlex", "ratio")
saveRDS(df, "timing_gaussian.rds")

input_file <- "timing_gaussian.rds"
df <- readRDS(input_file)

## make plot
prefix <- "gaussian"
df_tidy <- df %>%
  select(-ratio) %>%
  gather(glmnet:glmnetFlex, key = "method", value = "time") %>%
  group_by(n, p, method) %>%
  summarize(mean_time = mean(time)) %>%
  mutate(family = ifelse(method == "glmnet",
                         paste0("\"", prefix, "\""),
                         paste0(prefix, "()"))) %>%
  mutate(family = fct_relevel(family, "gaussian()", "\"gaussian\""))

df_tidy %>%
  rename(`# features` = p) %>%
  ggplot(aes(x = n, y = mean_time, col = family)) +
  geom_point() +
  geom_line() +
  labs(title = paste0("Fitting time: \"", prefix, "\" vs. ", prefix, "()"),
       x = "# observations", y = "Mean fitting time (s)") +
  facet_wrap(~ `# features`, nrow = 1, labeller = label_both) +
  scale_y_log10() +
  scale_x_log10() +
  custom_theme
# ggsave(paste0(prefix, ".png"), width = 10, height = 3)
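## Illustrative aside (not part of the original replication script): on a small
## simulated data set, the character interface ("gaussian") and the family-object
## interface (gaussian()) should agree up to convergence tolerance. The data
## dimensions and the variable names x_chk / y_chk are assumptions made for this
## check only.
set.seed(2)
x_chk <- matrix(rnorm(100 * 20), nrow = 100)
y_chk <- x_chk %*% c(rep(1, 5), rep(0, 15)) + rnorm(100)
max(abs(coef(glmnet(x_chk, y_chk, family = "gaussian"), s = 0.1) -
        coef(glmnet(x_chk, y_chk, family = gaussian()), s = 0.1)))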
"binomial" set.seed(1) res <- matrix(NA, nrow = length(nlist) * length(plist) * nsim, ncol = 6) rownum <- 0 for (n in nlist) { for (p in plist) { for (i in 1:nsim) { rownum <- rownum + 1 cat("row", rownum, "n", n, "p", p, "simulation no.", i, fill = T) ## create fake data x <- matrix(rnorm(n * p), nrow = n) beta <- matrix(c(2 + rnorm(p * s), rep(0, times = p * (1-s))), ncol = 1) y <- scale(x %*% beta + rnorm(n)) biny <- ifelse(y > 0, 1, 0) row <- c(n, p, i) ## run models old_time <- system.time(glmnet(x, biny, family = "binomial", thresh = thresh))[1] new_time <- system.time(glmnet(x, biny, family = binomial(), thresh = thresh))[1] cat("old time:", old_time, fill = T) cat("new time:", new_time, fill = T) row <- c(row, old_time, new_time, new_time / old_time) res[rownum, ] <- row } } } df <- data.frame(res) names(df) <- c("n", "p", "run", "glmnet", "glmnetFlex", "ratio") saveRDS(df, "timing_binomial.rds") input_file <- "timing_binomial.rds" df <- readRDS(input_file) # make plot prefix <- "binomial" df_tidy <- df %>% select(-ratio) %>% gather(glmnet:glmnetFlex, key = "method", value = "time") %>% group_by(n, p, method) %>% summarize(mean_time = mean(time)) %>% mutate(family = ifelse(method == "glmnet", paste0("\"", prefix, "\""), paste0(prefix, "()"))) %>% mutate(family = fct_relevel(family, "binomial()", "\"binomial\"")) df_tidy %>% rename(`# features` = p) %>% ggplot(aes(x = n, y = mean_time, col = family)) + geom_point() + geom_line() + labs(title = paste0("Fitting time: \"", prefix, "\" vs. ", prefix, "()"), x = "# observations", y = "Mean fitting time (s)") + facet_wrap(~ `# features`, nrow = 1, labeller = label_both) + scale_y_log10() + scale_x_log10() + custom_theme # ggsave(paste0(prefix, ".png"), width = 10, height = 3) ##### ## CODE FOR SECTION 3.5 ##### ## get data data("QuickStartExample", package = "glmnet") x <- QuickStartExample$x y <- QuickStartExample$y alphas <- c(1, 0.8, 0.5, 0.2, 0) fits <- list() set.seed(1) fits[[1]] <- cv.glmnet(x, y, keep = TRUE) foldid <- fits[[1]]$foldid for (i in 2:length(alphas)) { fits[[i]] <- cv.glmnet(x, y, alpha = alphas[i], foldid = foldid) } best_cvm <- unlist(lapply(fits, function(fit) min(fit$cvm))) alphas[which.min(best_cvm)] set.seed(1) cfit <- cv.glmnet(x, y) predict(cfit, x) predict(cfit, x, s = "lambda.min") set.seed(1) x[sample(seq(length(x)), 1600)] <- 0 filter <- function(x, ...) 
set.seed(1)
cfit <- cv.glmnet(x, y)
predict(cfit, x)
predict(cfit, x, s = "lambda.min")

set.seed(1)
x[sample(seq(length(x)), 1600)] <- 0
filter <- function(x, ...) which(colMeans(x == 0) > 0.8)
exclude <- filter(x)
fit1 <- glmnet(x, y, exclude = exclude)
fit2 <- glmnet(x, y, exclude = filter)
all.equal(fit1, fit2)

data("BinomialExample", package = "glmnet")
x <- BinomialExample$x
y <- BinomialExample$y
fit <- bigGlm(x, y, family = "binomial")
which(coef(fit) < -1)
fit <- bigGlm(x, y, family = "binomial", lower.limits = -1)
which(coef(fit) < -1)

#####
## CODE FOR CV CURVE (FIGURE 2)
#####
## print CV curve
# pdf("cv_curve.pdf", width = 8, height = 5)
plot(cfit)
# dev.off()

#####
## CODE FOR SECTION 4
#####
data("CoxExample", package = "glmnet")
glmnet(CoxExample$x, CoxExample$y, family = "cox")

#####
## CODE FOR SECTION 4.1
#####
library("survival")
head(bladder2)
y <- with(bladder2, Surv(start, stop, event))
head(y)
x <- as.matrix(bladder2[, 2:4])
glmnet(x, y, family = "cox")

#####
## CODE FOR SECTION 4.2
#####
data("CoxExample", package = "glmnet")
x <- CoxExample$x
y <- CoxExample$y
strata <- c(rep(1, 500), rep(2, 500))
y2 <- stratifySurv(y, strata)
glmnet(x, y2, family = "cox")

#####
## CODE FOR SECTION 4.3 & CODE FOR SURVIVAL CURVES (FIGURE 3)
#####
## create data x and response y
set.seed(1)
nobs <- 100; nvars <- 15
x <- matrix(rnorm(nobs * nvars), nrow = nobs)
ty <- rep(rexp(nobs / 5), each = 5)
tcens <- rbinom(n = nobs, prob = 0.3, size = 1)
y <- Surv(ty, tcens)

## fit model and plot survival curves
fit <- glmnet(x, y, family = "cox")
sf_obj <- survfit(fit, s = 0.05, x = x, y = y, newx = x[1:2, ])
# pdf("survival_curve.pdf", width = 7, height = 5.5)
par(mar = c(2.5, 2.5, 1, 1))
plot(sf_obj, col = 1:2, mark.time = TRUE, pch = "12")
# dev.off()

set.seed(1)
cfit <- cv.glmnet(x, y, family = "cox", nfolds = 5)
survfit(cfit, s = "lambda.min", x = x, y = y, newx = x[1:2, ])

#####
## CODE FOR SECTION 5
#####
data("QuickStartExample", package = "glmnet")
x <- QuickStartExample$x
y <- QuickStartExample$y
fitr <- glmnet(x, y, relax = TRUE)
fitr
predict(fitr, x, s = 1, gamma = 0.5)
set.seed(1)
cfitr <- cv.glmnet(x, y, gamma = 0, relax = TRUE)
cfitr

#####
## CODE FOR CV CURVE FOR RELAXED LASSO (FIGURE 4)
#####
set.seed(1)
cfitr <- cv.glmnet(x, y, relax = TRUE)
# pdf("cv_curve_relaxed.pdf", width = 7, height = 5.5)
plot(cfitr)
# dev.off()

#####
## CODE FOR SECTION 5.1
#####
set.seed(1)
cfitr <- cv.glmnet(x, y, gamma = 0, relax = TRUE)
cfitr

#####
## CODE FOR SECTION 6
#####
data_url <- paste0("https://archive.ics.uci.edu/ml/",
                   "machine-learning-databases/spambase/spambase.data")
df <- read.csv(data_url, header = FALSE)
x <- as.matrix(df[, 1:57])
y <- df[, 58]

set.seed(1)
cv.glmnet(x, y, family = "binomial", type.measure = "auc")

set.seed(1)
itrain <- sample(nrow(x), size = nrow(x) / 2)
fit <- glmnet(x[itrain, ], y[itrain], family = "binomial")
err <- assess.glmnet(fit, newx = x[-itrain, ], newy = y[-itrain])
names(err)
err$deviance

assess.glmnet(fit, newx = x[-itrain, ], newy = y[-itrain], s = 0.1)$auc

set.seed(1)
cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "binomial")
cerr <- assess.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
cerr$auc

pred <- predict(fit, newx = x[-itrain, ])
err <- assess.glmnet(pred, newy = y[-itrain], family = "binomial")
err$auc

set.seed(1)
cfit <- cv.glmnet(x, y, family = "binomial", keep = TRUE)
err <- assess.glmnet(cfit$fit.preval, newy = y, family = "binomial")
err$mae

set.seed(1)
cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "binomial")
roc_val <- roc.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain], s = "lambda.min")
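## Illustrative aside (not part of the original script): roc_val holds the
## false/true positive rate pairs that trace out the ROC curve plotted below;
## inspecting its structure before plotting can be a useful check.
str(roc_val)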
#####
## CODE FOR ROC CURVE (FIGURE 5)
#####
# pdf("roc_curve.pdf", width = 5, height = 4)
plot(roc_val, main = "ROC Curve on Test Data")
# dev.off()

data("MultinomialExample", package = "glmnet")
x <- MultinomialExample$x
y <- MultinomialExample$y
set.seed(101)
itrain <- sample(1:500, 400, replace = FALSE)
cfit <- cv.glmnet(x[itrain, ], y[itrain], family = "multinomial")
cnf <- confusion.glmnet(cfit, newx = x[-itrain, ], newy = y[-itrain])
print(cnf)

sessionInfo()