###################################################################################### # This script provides R code that produces the numerical results shown in the paper. # # NOTE - Because this script reproduces the numerical simulations exactly, it is slow # to run. If the goal is code-checking or an approximation of the results, # please run Replication_code_Fast.R # # Approximate run time of this script: 8-10 hours ###################################################################################### # Last update: Nov 30 2024 # Require R > 4.0.0 # Packages needed to run this code: # ---------------------------------------------------------------------------------------- ## Package glmnet is needed to construct Figure 2 and Table 4. ## Package SIS is needed to construct Figures 7-9 and Table 4. ## Package qqman is needed to construct Figures 6-8. ## Package SMLE is the main package to implement joint feature screening in most examples. ## Package abess is needed to construct Table 4 ## Package ggplot2 and tidyr are needed to construct Figures 9-10 ## Remark: using lower-version packages may lead to results different from the paper. library("glmnet") # version 4.1.8 library("SIS") # version 0.8.8 library("qqman") # version 0.1.9 library("SMLE") # version 2.2.0 library("abess") # version 0.4.9 library("VariableScreening") # version 0.2.1 library("ggplot2") # version 3.5.1 library("tidyr") # version 1.3.0 #################################################################### # Additional functions required to run this replication script # #################################################################### # This is the source code for "The Sparse MLE for Ultrahigh-Dimensional Feature Screening" # Journal of the American Statistical Association, 109(507), 1257–1269. # Authors: Chen Xu and Jiahua Chen ##################################################################################### # Hard thresholding function-------------------------------- Hard <- function(t = c(0, 0), k = -1, lam = 1) { y <- t t <- abs(t) if (k > 0) { lam <- sort(t, decreasing = TRUE)[k] } y[t < lam] <- 0 return(y) } # A: p*n, a: n*1------------------------------- Fmt <- function(A, a, rr = 5) { pp <- dim(A)[1] r <- pp / rr b <- matrix(0, nrow = pp, ncol = 1) for (i in 1:rr) { b[((i - 1) * r + 1):(i * r)] <- A[((i - 1) * r + 1):(i * r), ] %*% a } return(b) } # log-likelihood function------------------------- lh <- function(Y, X, beta, sig = 1, tag = "N") { X <- as.matrix(X) dx <- dim(X) n <- dx[1] p <- dx[2] ind_0 <- (1:p)[beta != 0] bc_0 <- beta[ind_0] bc_0 <- as.matrix(bc_0, ncol = 1) Xs_0 <- X[, ind_0] R_0 <- Xs_0 %*% bc_0 if (tag == "N") { R_1 <- R_0^2 / 2 } if (tag == "B") { R_1 <- log(1 + exp(R_0)) } if (tag == "P") { R_1 <- exp(R_0) } ll <- sum((Y * R_0 - R_1) / sig^2) if (tag == "P") { ll <- sum(dpois(Y, R_1, log = TRUE)) } return(ll) } # function for u check------------------------------ Uh <- function(A, uh, b0, b1, tag = "N") { if (tag != "P") { if (tag == "N") { rho <- 1 } if (tag == "B") { rho <- 0.25 } bb <- b1 - b0 in_b <- (1:length(bb))[bb != 0] bt <- 0 if (length(in_b) != 0) { bt <- bb[in_b] bt <- as.matrix(bt, ncol = 1) } tm1 <- sum(bt^2) / uh tm2 <- sum((A[, in_b] %*% bt)^2) * rho Bt <- exp(tm1 - tm2) } if (tag == "P") { theta_0 <- 0 theta_1 <- 0 in_b0 <- (1:length(b0))[b0 != 0] in_b1 <- (1:length(b1))[b1 != 0] if (length(in_b0) != 0) { bt0 <- b0[in_b0] bt0 <- as.matrix(bt0, ncol = 1) theta_0 <- A[, in_b0] %*% bt0 } if (length(in_b1) != 0) { bt1 <- b1[in_b1] bt1 <- as.matrix(bt1, ncol = 1) theta_1 <- A[, in_b1] %*% bt1 } theta_t <- pmax(theta_0, theta_1) bb <- b1 - b0 in_b <- (1:length(bb))[bb != 0] bt <- 0 if (length(in_b) != 0) { bt <- bb[in_b] bt <- as.matrix(bt, ncol = 1) } tm1 <- sum(bt^2) / uh tm2 <- sum(exp(theta_t) * (theta_0 - theta_1)^2) Bt <- exp(tm1 - tm2) } return(Bt) } ################################### ## X: n*p; Y: n*1--------------- ################################### AHT <- function(Y, X, beta0, k = 20, rr = 5, T = 500, U = 1, er = 10^(-3), tag = "N") { # initializing------------------------------ dX <- dim(X) n <- dX[1] p <- dX[2] X_t <- t(X) ind_0 <- (1:p)[beta0 != 0] # Case if beta0==0-------------------------- if (length(ind_0) == 0) { R_0 <- matrix(0, ncol = 1, nrow = n) if (tag == "B") { R_0 <- exp(R_0) / (1 + exp(R_0)) } if (tag == "P") { R_0 <- exp(R_0) } V_0 <- Fmt(A = X_t, a = Y - R_0, rr = rr) beta1 <- Hard(t = V_0, k = k) ind_1 <- (1:p)[beta1 != 0] Xs_1 <- X[, ind_1] tau_1 <- svd(Xs_1)$d[1] ############################ uu <- U / tau_1^2 ############################# } # Case if beta0!=0----------------------- if (length(ind_0) != 0) { bc_0 <- beta0[ind_0] bc_0 <- as.matrix(bc_0, ncol = 1) Xs_0 <- X[, ind_0] R_0 <- Xs_0 %*% bc_0 if (tag == "B") { R_0 <- exp(R_0) / (1 + exp(R_0)) } if (tag == "P") { R_0 <- exp(R_0) } V_0 <- Fmt(A = X_t, a = Y - R_0, rr = rr) tau_0 <- svd(Xs_0)$d[1] ############################ uu <- U / tau_0^2 ############################# } # iteration start--------------------------- u_c <- uu i <- 1 repeat{ repeat { gamma0 <- beta0 + uu * V_0 beta1 <- Hard(t = gamma0, k = k) ########## u-check ################# ucheck <- Uh(A = X, uh = uu, b0 = beta0, b1 = beta1, tag = tag) if (ucheck >= 1) { break } else { uu <- 0.5 * uu } #################################### } ######## convergence check ############## MSE <- sqrt(sum((beta1 - beta0)^2)) if ((i > T) || (MSE < er)) { break } ######################################### beta0 <- beta1 ind_0 <- (1:p)[beta0 != 0] bc_0 <- beta0[ind_0] bc_0 <- as.matrix(bc_0, ncol = 1) Xs_0 <- X[, ind_0] R_0 <- Xs_0 %*% bc_0 if (tag == "B") { R_0 <- exp(R_0) / (1 + exp(R_0)) } if (tag == "P") { R_0 <- exp(R_0) } V_0 <- Fmt(A = X_t, a = Y - R_0, rr = rr) uu <- u_c i <- i + 1 } index <- (1:p)[beta1 != 0] list(index = index, B = bc_0, step = i) } ########################### # Code for Section 3 # ########################### # Section 3.2---Demo code---Simulating ultrahigh-dimensional GLM data--------------------- set.seed(1) Data_ctg <- Gen_Data( n = 200, p = 1000, family = "gaussian", pos_ctgidx = c(1, 2, 3), level_ctgidx = c(3, 4, 5) ) head(Data_ctg$X)[, 1:5] # Section 3.2---Demo code---Joint feature screening-------------------------------- fit <- SMLE( Y = Data_ctg$Y, X = Data_ctg$X, k = 15, family = "gaussian", keyset = c(1, 4, 5), categorical = TRUE, group = TRUE ) fit # Section 3.2---Demo code---Post-screening Selection------------------------------- fit_s <- smle_select(fit, criterion = "ebic", gamma_seq = seq(0, 1, 0.2), vote = TRUE) fit_s # Section 3.2---Figure 2----------------- # Figure 2 is constructed based on NumIter=100 repetitions and it # will have a long running time. Set NumIter=5 for checking purposes. # Note that running time is system dependent and so there may be # slight deviations in run time when the code is run on different # computers. NumIter <- 100 # "OSMLE" is the original IHT implementation used in Xu and Chen (2014). OSMLE <- function(X, Y, family, k) { fit_pre <- glmnet(x = X, y = Y, family = family) Beta0 <- c(fit_pre$beta[, dim(fit_pre$beta)[2]]) tag <- switch(family, "gaussian" = "N", "binomial" = "B", "poisson" = "P" ) AHT(Y = Y, X = X, beta0 = Beta0, k = k, U = 1, rr = 1, T = 100, tag = tag, er = 10^(-3)) } # _g and _b stands for the running time in linear model and logistic model. SMLE_time_g <- matrix(0, nrow = 9, ncol = NumIter) IHT_time_g <- matrix(0, nrow = 9, ncol = NumIter) SMLE_time_b <- matrix(0, nrow = 9, ncol = NumIter) IHT_time_b <- matrix(0, nrow = 9, ncol = NumIter) p <- seq(from = 4000, to = 20000, length.out = 9) n <- 1000 k <- 10 for (j in 1:NumIter) { for (i in 1:9) { family <- "gaussian" DD <- Gen_Data(n, p[i], family = family) Y <- DD$Y X <- DD$X iT <- proc.time() OSMLE(X = X, Y = Y, k = k, family = family) IHT_time_g[i, j] <- (proc.time() - iT)[3] sT <- proc.time() SMLE( X = X, Y = Y, k = k, family = family, intercept = FALSE, standardize = FALSE, categorical = FALSE, max_iter = 100, tol = 10^(-3) ) SMLE_time_g[i, j] <- (proc.time() - sT)[3] } } # linear model comparison IHT_average_time <- apply(IHT_time_g, 1, mean) plot( x = p, y = IHT_average_time, type = "b", pch = 18, col = "blue", cex.main = 1.5, cex.lab = 1.5, xlab = "The number of features", ylab = "Elapsed Time(s)", main = "Linear model", ylim = c(0, 1.2 * max(IHT_average_time)) ) lines(x = p, y = apply(SMLE_time_g, 1, mean), pch = 19, col = "red", type = "b", lty = 2) legend("topleft", legend = c("Original Code", "SMLE()"), pch = 18:19, lty = 1:2, col = c("blue", "red") ) # logistic model comparison for (j in 1:NumIter) { for (i in 1:9) { DD <- Gen_Data( n = n, p = p[i], num_truecoef = 10, family = "binomial", effect_truecoef = rep(1.5, 10), rho = 0.3 ) Y <- DD$Y X <- DD$X iT <- proc.time() OSMLE(X = X, Y = Y, k = k, family = "binomial") IHT_time_b[i, j] <- (proc.time() - iT)[3] sT <- proc.time() SMLE( X = X, Y = Y, k = k, family = "binomial", intercept = F, standardize = FALSE, categorical = FALSE, max_iter = 100, tol = 10^(-3), U = 4 ) SMLE_time_b[i, j] <- (proc.time() - sT)[3] } } IHT_average_time <- apply(IHT_time_b, 1, mean) plot( x = p, y = IHT_average_time, type = "b", pch = 18, col = "blue", xlab = "The number of features", ylab = "Elapsed Time(s)", main = "Logistic model", ylim = c(0, 1.2 * max(IHT_average_time)), cex.main = 1.5, cex.lab = 1.5 ) lines(x = p, y = apply(SMLE_time_b, 1, mean), pch = 19, col = "red", type = "b", lty = 2) legend("topleft", legend = c("Original Code", "SMLE()"), pch = 18:19, lty = 1:2, col = c("blue", "red") ) # Section 3.2---Figures 3 & 4 & 5 -------- set.seed(1) Data_sim <- Gen_Data( n = 200, p = 1000, correlation = "AR", rho = 0.9, sigma = 1, family = "gaussian", pos_truecoef = c(1, 3, 5, 7, 9), effect_truecoef = (0.8) * c(2, -3, -3, 2, -4) ) fit_path <- SMLE(Y = Data_sim$Y, X = Data_sim$X, k = 10, coef_initial = rep(0, 1000)) plot(fit_path, cex = 0.8, cex.axis = 1.5, cex.lab = 1.5) # The following comment is a reminder that the object fit_s is defined previously. # fit_s <- smle_select(fit, criterion = "ebic", gamma_seq = seq(0,1,0.2), vote = TRUE) plot(fit_s, cex.axis = 1.5, cex.lab = 1.5) ########################### # Code for Section 4 # ########################### # Section 4.1---Demo Code for SMLE-screening--- set.seed(1) Data_eg <- Gen_Data( n = 400, p = 1000, family = "binomial", correlation = "AR", rho = 0.9, pos_truecoef = c(1, 3, 5, 7, 9), effect_truecoef = c(2, 3, -3, 3, -4) ) print(Data_eg) fit1 <- SMLE(Y = Data_eg$Y, X = Data_eg$X, family = "binomial", k = 10, coef_initial = rep(0, 1000)) summary(fit1) with(Data_eg, coef(summary(glm(Y ~ X[, 2], family = "binomial")))) fit1_s <- smle_select(fit1, criterion = "ebic") fit1_s summary(fit1_s) # Section 4.2 --- Table 4 --------------------------- # SIS/ISIS is implemented by Package "SIS". # Lasso is implemented by package "glmnet". # screenIID is implemented by package "VariableScreening" # We have suppressed warnings from GLMNET so that the output file is not too large. # The warning message from GLMNET package is due to the fact that Lasso may # not always be able to produce an estimate with sparsity exactly matching the # pre-specified screening size k. When it happens, a Lasso estimate with the # largest sparsity not exceeding k is returned. This is expected and the warning # message is safe to ignore. # We have suppressed routine iteration output and warnings from ISIS being # printed to screen since the resulting output file would be much too large. # The warnings relate to ISIS fitting a saturated model periodically with Poisson # data. However, SIS, glmnet, SMLE and abess do not give warnings with this data. # The abess function modifies the seed of the random number generator in a # way that affects the ability to reproduce results exactly. For this reason, we # randomly create a vector of seeds with length equal to the number of iterations # and we re-seed the random number generator each iteration. # Table 4 is summarized based on NumIter=500 repetitions, which takes a long time # to run. Set NumIter=5 for quick checking purposes. NumIter <- 500 num_methods <- 6 ########### Linear Regression #################################### set.seed(1) seed.vec <- round(sample(c(-1, 1), size = NumIter, replace = TRUE) * runif(NumIter, min = 0, max = 1e7)) Lasso_avg_length <- rep(1:NumIter) PRR <- matrix(0, nrow = num_methods, ncol = NumIter) SSR <- matrix(0, nrow = num_methods, ncol = NumIter) TIME <- matrix(0, nrow = num_methods, ncol = NumIter) time <- rep(0, num_methods) k <- 20 for (j in 1:NumIter) { set.seed(seed.vec[j]) Data <- Gen_Data( n = 100, p = 2000, pos_truecoef = c(1, 2, 3, 4), family = "gaussian", effect_truecoef = rep(2.5, 4), correlation = "CS", rho = 0.3 ) ## models fitting # SIS sis_a <- proc.time() SIS <- SIS(x = Data$X, y = Data$Y, family = "gaussian", iter = F, nsis = k) sis_b <- proc.time() time[1] <- (sis_b - sis_a)[3] # ISIS isis_a <- proc.time() suppressWarnings(capture.output(ISIS <- SIS(x = Data$X, y = Data$Y, family = "gaussian", iter = T, nsis = k))) isis_b <- proc.time() time[2] <- (isis_b - isis_a)[3] # Lasso Lasso_a <- proc.time() suppressWarnings(Lasso_fit <- glmnet(x = Data$X, y = Data$Y, family = "gaussian", pmax = k)) Lasso_b <- proc.time() time[3] <- (Lasso_b - Lasso_a)[3] # abess abess_a <- proc.time() abess_fit <- abess(x = Data$X, y = Data$Y, family = "gaussian", support.size = k) abess_b <- proc.time() time[4] <- (abess_b - abess_a)[3] # VariableScreening Vscreening_a <- proc.time() Vscreening_fit <- screenIID(X = Data$X, Y = Data$Y) Vscreening_b <- proc.time() time[5] <- (Vscreening_b - Vscreening_a)[3] # SMLE SMLE_a <- proc.time() SMLE_fit <- SMLE(X = Data$X, Y = Data$Y, family = "gaussian", standardize = F, categorical = FALSE, k = k, fast = F) SMLE_b <- proc.time() time[6] <- (SMLE_b - SMLE_a)[3] Lasso_index <- (1:dim(Data$X)[2])[Lasso_fit$beta[, dim(Lasso_fit$beta)[2]] != 0] Lasso_avg_length[j] <- length(Lasso_index) ind <- abess::extract(abess_fit)$support.vars ind <- as.numeric(regmatches(ind, gregexpr("[[:digit:]]+", ind))) rank_id <- order(Vscreening_fit$rank)[1:k] index_set <- list(SIS$ix0, ISIS$ix0, Lasso_index, ind, rank_id, SMLE_fit$ID_retained) PRR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) })) SSR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) == 4 })) TIME[, j] <- time } # Order: SIS / ISIS / glmnet / abess / screenIID / SMLE # Screening Accuracy PRR_linear <- apply(PRR, 1, mean) / 4 SSR_linear <- apply(SSR, 1, mean) # Computational Time Time_linear <- apply(TIME, 1, mean) ########### Logistic Regression #################################### set.seed(1) seed.vec <- round(sample(c(-1, 1), size = NumIter, replace = TRUE) * runif(NumIter, min = 0, max = 1e7)) Lasso_avg_length <- rep(1:NumIter) PRR <- matrix(0, nrow = num_methods, ncol = NumIter) SSR <- matrix(0, nrow = num_methods, ncol = NumIter) TIME <- matrix(0, nrow = num_methods, ncol = NumIter) time <- rep(0, num_methods) k <- 30 for (j in 1:NumIter) { set.seed(seed.vec[j]) Data <- Gen_Data( n = 600, p = 2000, pos_truecoef = c(1, 2, 3, 4), family = "binomial", effect_truecoef = rep(1.1, 4), correlation = "CS", rho = 0.3 ) ## models fitting # SIS sis_a <- proc.time() SIS <- SIS(x = Data$X, y = Data$Y, family = "binomial", iter = F, nsis = k) sis_b <- proc.time() time[1] <- (sis_b - sis_a)[3] # ISIS isis_a <- proc.time() suppressWarnings(capture.output(ISIS <- SIS(x = Data$X, y = Data$Y, family = "binomial", iter = T, nsis = k))) isis_b <- proc.time() time[2] <- (isis_b - isis_a)[3] # Lasso Lasso_a <- proc.time() suppressWarnings(Lasso_fit <- glmnet(x = Data$X, y = Data$Y, family = "binomial", pmax = k)) Lasso_b <- proc.time() time[3] <- (Lasso_b - Lasso_a)[3] # abess abess_a <- proc.time() abess_fit <- abess(x = Data$X, y = Data$Y, family = "binomial", support.size = k) abess_b <- proc.time() time[4] <- (abess_b - abess_a)[3] # VariableScreening Vscreening_a <- proc.time() Vscreening_fit <- screenIID(X = Data$X, Y = Data$Y) Vscreening_b <- proc.time() time[5] <- (Vscreening_b - Vscreening_a)[3] # SMLE SMLE_a <- proc.time() SMLE_fit <- SMLE(X = Data$X, Y = Data$Y, family = "binomial", standardize = F, categorical = FALSE, k = k, fast = F) SMLE_b <- proc.time() time[6] <- (SMLE_b - SMLE_a)[3] Lasso_index <- (1:dim(Data$X)[2])[Lasso_fit$beta[, dim(Lasso_fit$beta)[2]] != 0] Lasso_avg_length[j] <- length(Lasso_index) ind <- abess::extract(abess_fit)$support.vars ind <- as.numeric(regmatches(ind, gregexpr("[[:digit:]]+", ind))) index_set <- list(SIS$ix0, ISIS$ix0, Lasso_index, ind, rank_id, SMLE_fit$ID_retained) PRR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) })) SSR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) == 4 })) TIME[, j] <- time } # Order: SIS / ISIS / glmnet / abess / screenIID / SMLE # Screening Accuracy PRR_binomial <- apply(PRR, 1, mean) / 4 SSR_binomial <- apply(SSR, 1, mean) # Computational Time Time_binomial <- apply(TIME, 1, mean) ########### Poisson Regression #################################### set.seed(1) seed.vec <- round(sample(c(-1, 1), size = NumIter, replace = TRUE) * runif(NumIter, min = 0, max = 1e7)) Lasso_avg_length <- rep(1:NumIter) PRR <- matrix(0, nrow = num_methods, ncol = NumIter) SSR <- matrix(0, nrow = num_methods, ncol = NumIter) TIME <- matrix(0, nrow = num_methods, ncol = NumIter) time <- rep(0, num_methods) k <- 10 for (j in 1:NumIter) { set.seed(seed.vec[j]) Data <- Gen_Data( n = 250, p = 2000, pos_truecoef = c(1, 2, 3, 4), family = "poisson", effect_truecoef = rep(0.7, 4), correlation = "CS", rho = 0.3 ) ## models fitting # SIS sis_a <- proc.time() SIS <- SIS(x = Data$X, y = Data$Y, family = "poisson", iter = F, nsis = k) sis_b <- proc.time() time[1] <- (sis_b - sis_a)[3] # ISIS isis_a <- proc.time() suppressWarnings(capture.output(ISIS <- SIS(x = Data$X, y = Data$Y, family = "poisson", iter = T, nsis = k))) isis_b <- proc.time() time[2] <- (isis_b - isis_a)[3] # Lasso Lasso_a <- proc.time() suppressWarnings(Lasso_fit <- glmnet(x = Data$X, y = Data$Y, family = "poisson", pmax = k)) Lasso_b <- proc.time() time[3] <- (Lasso_b - Lasso_a)[3] # abess abess_a <- proc.time() abess_fit <- abess(x = Data$X, y = Data$Y, family = "poisson", support.size = k) abess_b <- proc.time() time[4] <- (abess_b - abess_a)[3] # VariableScreening Vscreening_a <- proc.time() Vscreening_fit <- screenIID(X = Data$X, Y = Data$Y) Vscreening_b <- proc.time() time[5] <- (Vscreening_b - Vscreening_a)[3] # SMLE SMLE_a <- proc.time() SMLE_fit <- SMLE(X = Data$X, Y = Data$Y, family = "poisson", standardize = F, categorical = FALSE, k = k, fast = F) SMLE_b <- proc.time() time[6] <- (SMLE_b - SMLE_a)[3] Lasso_index <- (1:dim(Data$X)[2])[Lasso_fit$beta[, dim(Lasso_fit$beta)[2]] != 0] Lasso_avg_length[j] <- length(Lasso_index) ind <- abess::extract(abess_fit)$support.vars ind <- as.numeric(regmatches(ind, gregexpr("[[:digit:]]+", ind))) index_set <- list(SIS$ix0, ISIS$ix0, Lasso_index, ind, rank_id, SMLE_fit$ID_retained) PRR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) })) SSR[, j] <- unlist(lapply(1:num_methods, function(i) { sum(Data$subset_true %in% index_set[[i]]) == 4 })) TIME[, j] <- time } # Order: SIS / ISIS / glmnet / abess / screenIID / SMLE # Screening Accuracy PRR_poisson <- apply(PRR, 1, mean) / 4 SSR_poisson <- apply(SSR, 1, mean) # Computational Time Time_poisson <- apply(TIME, 1, mean) # Summary # Order: SIS / ISIS / glmnet / abess / screenIID / SMLE PRR_linear PRR_poisson PRR_binomial SSR_linear SSR_poisson SSR_binomial # Note that computational time depends on the system so the time # values will vary. Time_linear Time_poisson Time_binomial # Section 4.3--- Impact of initialization --- set.seed(1) # Functions to run SMLE with different initial values run_ID_smle_simulation <- function(initialization_method) { # Generate example data Data_S1 <- Gen_Data( n = 300, p = 1000, family = "binomial", correlation = "ID", rho = 0.9, pos_truecoef = c(1, 3, 5, 7, 9), effect_truecoef = c(2, 3, -3, 3, -4) ) # Set initial coefficients based on the method if (initialization_method == "default") { coef_initial <- NULL # Assuming the default method will handle this } else if (initialization_method == "zero") { coef_initial <- rep(0, 1000) } # Fit the SMLE model fit <- SMLE(Y = Data_S1$Y, X = Data_S1$X, family = "binomial", k = 10, coef_initial = coef_initial) return(fit) } run_AR_smle_simulation <- function(initialization_method) { # Generate example data Data_S2 <- Gen_Data( n = 400, p = 1000, family = "binomial", correlation = "AR", rho = 0.9, pos_truecoef = c(1, 3, 5, 7, 9), effect_truecoef = c(2, 3, -3, 3, -4) ) # Set initial coefficients based on the method if (initialization_method == "default") { coef_initial <- NULL # Assuming the default method will handle this } else if (initialization_method == "zero") { coef_initial <- rep(0, 1000) } # Fit the SMLE model fit <- SMLE(Y = Data_S2$Y, X = Data_S2$X, family = "binomial", k = 10, coef_initial = coef_initial) return(fit) } # Function to summarize the results of the simulation summarize_results <- function(results_list) { # Example: summarizing the coefficients SSR <- sapply(results_list, function(fit) floor(sum(c(1, 3, 5, 7, 9) %in% fit$ID_retained) / 5)) PSR <- sapply(results_list, function(fit) sum(c(1, 3, 5, 7, 9) %in% fit$ID_retained) / 5) return(list(SSR = SSR, PSR = PSR)) } # Number of simulations n_simulations <- 100 # Run simulations for different initialization methods results_default <- replicate(n_simulations, run_AR_smle_simulation("default"), simplify = FALSE) results_zero <- replicate(n_simulations, run_AR_smle_simulation("zero"), simplify = FALSE) # Summarize results summary_default <- summarize_results(results_default) summary_zero <- summarize_results(results_zero) # Combine results into a data frame for plotting results <- data.frame( Initialization = rep(c("Default", "Zero"), each = 100), SSR = c(summary_default$SSR, summary_zero$SSR), PSR = c(summary_default$PSR, summary_zero$PSR) ) # Calculate mean SSR and PSR for each Initialization method mean_results_AR <- aggregate(cbind(SSR, PSR) ~ Initialization, data = results, FUN = mean) # Create a bar plot barplot_heights <- as.matrix(mean_results_AR[, 2:3]) barplot_names_AR <- mean_results_AR$Initialization # Plot the barplot barplot( t(barplot_heights), beside = TRUE, names.arg = barplot_names_AR, ylab = "Rate", cex.main = 1.5, # Increase main title size cex.lab = 1.5, # Increase label size ylim = c(0, 1), # Set y-axis range from 0 to 1 legend.text = c("SSR", "PSR"), args.legend = list(x = "topright", cex = 1.5) # Increase legend text size ) results_default <- replicate(n_simulations, run_ID_smle_simulation("default"), simplify = FALSE) results_zero <- replicate(n_simulations, run_ID_smle_simulation("zero"), simplify = FALSE) # Summarize results summary_default <- summarize_results(results_default) summary_zero <- summarize_results(results_zero) # Combine results into a data frame for plotting results <- data.frame( Initialization = rep(c("Default", "Zero"), each = 100), SSR = c(summary_default$SSR, summary_zero$SSR), PSR = c(summary_default$PSR, summary_zero$PSR) ) # Calculate mean SSR and PSR for each Initialization method mean_results_ID <- aggregate(cbind(SSR, PSR) ~ Initialization, data = results, FUN = mean) # Create a bar plot barplot_heights_ID <- as.matrix(mean_results_ID[, 2:3]) barplot_names_ID <- mean_results_ID$Initialization # Plot the barplot # Section 4.2---Figures 6 -------- barplot( t(barplot_heights_ID), beside = TRUE, names.arg = barplot_names_ID, ylab = "Rate", cex.main = 1.5, # Increase main title size cex.lab = 1.5, # Increase label size ylim = c(0, 1), # Set y-axis range from 0 to 1 legend.text = c("SSR", "PSR"), args.legend = list(x = "topright", cex = 1.5) # Increase legend text size ) # Section 4.4 --- Application to high-dimensional genetic data----------- data("synSNP") Y_SNP <- synSNP[, 1] X_SNP <- as.matrix(synSNP[, -1]) # Casual SNPs' positions and effects generating the data was saved in "answers.txt'. # We use this to point casual SNPs and estimate the screening results. answers <- readLines(paste0(path.package("SMLE"), "/answers.txt")) level1 <- strsplit(answers[1], split = " ")[[1]] effects1 <- c(answers[2], answers[2]) level2 <- strsplit(answers[3], split = " ")[[1]] effects2 <- strsplit(answers[4], split = " ")[[1]] level3 <- strsplit(answers[5], split = " ")[[1]] effects3 <- strsplit(answers[6], split = " ")[[1]] level4 <- strsplit(answers[7], split = " ")[[1]] effects4 <- strsplit(answers[8], split = " ")[[1]] snps_chr <- list(level1, level2, level3, level4) # Section 4.3 --- figure 7 --------- # The marginal p-values of each SNPs is pre-calculated and saved in "pvals". data("pvals") # Code corresponding to the calculation of p-values pvals$P <- unlist(lapply(2:10032, function(i) { summary(lm(Y_SNP ~ synSNP[, i], data = synSNP))$coef[2, 4] })) # The first three columns of "pvals" are position information on chromosome, which are # provided in the original data set. qqman::manhattan(pvals, ylim = c(0, 8), genomewideline = FALSE) pvals$logp <- -log10(pvals$P) pvals$index <- rep.int(seq_along(unique(pvals$CHR)), times = tapply(pvals$SNP, pvals$CHR, length)) pvals$pos <- pvals$BP lastbase <- 0 for (i in unique(pvals$index)) { if (i == 1) { pvals[pvals$index == i, ]$pos <- pvals[pvals$index == i, ]$BP } else { lastbase <- lastbase + max(pvals[pvals$index == (i - 1), "BP"]) pvals[pvals$index == i, "BP"] <- pvals[pvals$index == i, "BP"] - min(pvals[pvals$index == i, "BP"]) + 1 pvals[pvals$index == i, "pos"] <- pvals[pvals$index == i, "BP"] + lastbase } } colors <- c("red", "orange", "blue", "green3") for (i in 1:4) { pvals.highlight <- pvals[which(pvals$SNP %in% snps_chr[[i]]), ] with(pvals.highlight, points(pos, logp, col = colors[[i]], pch = 20)) } # Section 4.3 --- figure 8 --------- SMLE_fit <- SMLE(Y = Y_SNP, X = X_SNP, family = "gaussian", k = 80, fast = F) SIS_fit <- SIS(y = Y_SNP, x = X_SNP, family = "gaussian", nsis = 80, iter = F) suppressWarnings(Lasso_fit <- glmnet(x = X_SNP, y = Y_SNP, family = "gaussian", pmax = 80)) Lasso_index <- (1:dim(X_SNP)[2])[Lasso_fit$beta[, dim(Lasso_fit$beta)[2]] != 0] par(mar = c(5, 6, 4, 2) + 0.1) # SMLE screening results are showed in red line. qqman::manhattan(pvals, ylim = c(0, 8), genomewideline = FALSE, cex.lab = 2, cex.main = 2, main = "SMLE()") pvals.screened <- pvals[SMLE_fit$ID_retained, ] with(pvals.screened, abline(v = pos, col = "red", pch = 20)) for (i in 1:4) { pvals.highlight <- pvals[which(pvals$SNP %in% snps_chr[[i]]), ] with(pvals.highlight, points(pos, logp, col = colors[[i]], pch = 20)) } # SIS screening results are showed in red line. qqman::manhattan(pvals, ylim = c(0, 8), genomewideline = FALSE, cex.lab = 2, cex.main = 2, main = "SIS()") pvals.screened <- pvals[SIS_fit$ix0, ] with(pvals.screened, abline(v = pos, col = "red", pch = 20)) for (i in 1:4) { pvals.highlight <- pvals[which(pvals$SNP %in% snps_chr[[i]]), ] with(pvals.highlight, points(pos, logp, col = colors[[i]], pch = 20)) } # Lasso screening results are showed in red line. qqman::manhattan(pvals, ylim = c(0, 8), genomewideline = FALSE, cex.lab = 2, cex.main = 2, main = "glmnet()") pvals.screened <- pvals[Lasso_index, ] with(pvals.screened, abline(v = pos, col = "red", pch = 20)) for (i in 1:4) { pvals.highlight <- pvals[which(pvals$SNP %in% snps_chr[[i]]), ] with(pvals.highlight, points(pos, logp, col = colors[[i]], pch = 20)) } # Section 4.3 --- Screening with varying k---- k <- c(40, 60, 80, 100) MD <- list() for (i in 1:length(k)) { SMLE_fit <- SMLE( Y = Y_SNP, X = X_SNP, k = k[i], family = "gaussian", standardize = F, fast = F ) SIS_fit <- SIS( x = X_SNP, y = Y_SNP, family = "gaussian", nsis = k[i], iter = F, standardize = F )$ix0 suppressWarnings(Lasso_fit <- glmnet(x = X_SNP, y = Y_SNP, family = "gaussian", pmax = k[i])) Lasso_index <- (1:dim(X_SNP)[2])[Lasso_fit$beta[, dim(Lasso_fit$beta)[2]] != 0] screened <- c( list(colnames(X_SNP)[SIS_fit]), list(colnames(X_SNP)[SMLE_fit$ID_retained]), list(colnames(X_SNP)[Lasso_index]) ) # Causal Snps' positions snps_postion <- match(unlist(snps_chr), pvals$SNP) # Causal Snps' effects effects <- as.numeric(c(effects1, effects2, effects3, effects4)) results <- matrix(nrow = length(levels), ncol = 3) distances <- data.frame(snps = snps_postion, effects = effects, results) colnames(distances)[3:5] <- c("SIS", "SMLE", "glmnet") for (l in 1:nrow(distances)) { for (j in 1:3) { subset <- match(screened[[j]], pvals$SNP) distances[l, j + 2] <- min(abs(distances[l, 1] - subset)) } } # distances$w_min_SIS = abs(distances$effects) * distances$SIS # distances$w_min_SMLE = abs(distances$effects) * distances$SMLE md <- colMeans(distances[, 3:5]) MD <- rbind(MD, md) # Section 4.3 ---- figure ------ if (i == 3) { # Set y-axis limits for this specific case ylimits <- c(-100, 2000) distances_long <- data.frame( EffectSize = abs(distances[, 2]), Distance = c(distances[, 3], distances[, 4], distances[, 5]), Method = factor(rep(c("SIS", "SMLE", "glmnet"), each = nrow(distances))) ) md <- c(md[1], md[2], md[3]) labels <- paste("Averaged MRD =", round(md, 2)) Figure9 <- lapply(unique(distances_long$Method), function(method) { data_plot <- subset(distances_long, Method == method) label_text <- labels[which(unique(distances_long$Method) == method)] ggplot(data_plot, aes(x = EffectSize, y = Distance)) + geom_point() + labs(x = "Effect size", y = "Distance", title = paste0(method, "()")) + ylim(ylimits) + theme_bw() + theme( plot.title = element_text(hjust = 0.5, size = 24, face = "bold"), axis.title = element_text(size = 20), axis.text = element_text(size = 16), plot.margin = margin(20, 20, 20, 20) ) + annotate( "text", x = max(data_plot$EffectSize) * 0.75, y = ylimits[2] * 0.75, label = label_text, size = 6 ) }) } } print(Figure9) row.names(MD) <- paste0("k=", k) colnames(MD) <- c("SIS_MRD", "SMLE_MRD", "GLMNET_MRD") MD_df <- as.data.frame(MD) # Add k values as a separate column by extracting from row names MD_df$Screening_Size <- as.numeric(sub("k=", "", rownames(MD_df))) # Reorder columns to put Screening_Size first MD_df <- MD_df[, c("Screening_Size", "SIS_MRD", "SMLE_MRD", "GLMNET_MRD")] MD_df$Screening_Size <- as.numeric(MD_df$Screening_Size) MD_df$SIS_MRD <- as.numeric(MD_df$SIS_MRD) MD_df$SMLE_MRD <- as.numeric(MD_df$SMLE_MRD) MD_df$GLMNET_MRD <- as.numeric(MD_df$GLMNET_MRD) MD_long <- MD_df %>% pivot_longer( cols = c("SIS_MRD", "SMLE_MRD", "GLMNET_MRD"), names_to = "Method", values_to = "MRD" ) MD_long$Method <- factor(MD_long$Method, levels = c("SIS_MRD", "SMLE_MRD", "GLMNET_MRD"), labels = c("SIS()", "SMLE()", "glmnet()") ) # Plot using ggplot2 ggplot(MD_long, aes(x = Screening_Size, y = MRD, color = Method, group = Method)) + geom_line(size = 1) + geom_point(size = 3) + scale_color_manual(values = c("SIS()" = "blue", "SMLE()" = "red", "glmnet()" = "orange")) + labs( x = "Screening size (k)", y = "Averaged MRD", color = "Method" ) + theme_minimal(base_size = 20) + # Increase base size to make text larger theme( legend.position = c(0.95, 0.95), # Top right position legend.justification = c("right", "top"), # Align the legend to the top right legend.box.just = "right", # Box alignment to the right plot.title = element_text(size = 24, face = "bold"), # Larger title axis.title = element_text(size = 20), # Larger axis titles axis.text = element_text(size = 18), # Larger axis labels legend.text = element_text(size = 18), # Larger legend text legend.title = element_text(size = 20) # Larger legend title )