################################################## ########## Comparison in Section 6 ############### ################################################## ## note that the comparison may spend several days due to the low efficiency of package regsem ## library("devtools") install_version("lsl", version = "0.5.6", repos = "https://cloud.R-project.org/") install_version("regsem", version = "0.9.2", repos = "https://cloud.R-project.org/") library("lslx") library("lavaan") library("lsl") library("regsem") library("ggplot2") library("dplyr") library("tidyr") set.seed(9487) ## function for specifying covariance structure ## specify_sigma <- function() { n_y <- 9 n_x <- 6 n_f <- 3 beta_yf <- diag(1, n_f) %x% matrix(0.7, n_y / n_f, 1) beta_yf[2, 3] <- beta_yf[5, 1] <- beta_yf[8, 2] <- 0.2 beta_fx <- rbind(matrix(c(.2, .2, .2, .2, 0, 0), 1, n_x), matrix(c(0, .2, .2, .2, .2, 0), 1, n_x), matrix(c(0, 0, .2, .2, .2, .2), 1, n_x)) psi_x <- 0.3 * matrix(1, n_x, 1) %*% matrix(1, 1, n_x) + 0.7 * diag(1, n_x) psi_f_diag <- 1 - diag(beta_fx %*% psi_x %*% t(beta_fx)) psi_f <- .2 * matrix(1, n_f, 1) %*% matrix(1, 1, n_f) + .8 * diag(1, n_f) diag(psi_f) <- psi_f_diag sigma_ff <- (beta_fx %*% psi_x %*% t(beta_fx) + psi_f) psi_y <- diag(1 - diag(beta_yf %*% sigma_ff %*% t(beta_yf))) sigma <- rbind( cbind(psi_x, t(beta_yf %*% beta_fx %*% psi_x)), cbind( beta_yf %*% beta_fx %*% psi_x, beta_yf %*% sigma_ff %*% t(beta_yf) + psi_y ) ) name_var <- c(paste0("x", 1:(n_x)), paste0("y", 1:(n_y))) colnames(sigma) <- name_var rownames(sigma) <- name_var return(sigma) } ## function for generating data ## generate_data <- function(n_obs, sigma) { sigma_chol <- chol(sigma) data <- as.data.frame(matrix(rnorm(n_obs * nrow(sigma)), n_obs, nrow(sigma)) %*% sigma_chol) return(data) } ## function for computing lslx-fisher result ## compute_lslx_fisher <- function(data, type) { if (type == "simple") { model_lslx <- " f1 :=> fix(.7) * y1 + y2 + y3 f2 :=> fix(.7) * y4 + y5 + y6 f3 :=> fix(.7) * y7 + y8 + y9 f1 + f2 + f3 <=> f1 + f2 + f3 x1 + x2 + x3 + x4 + x5 + x6 ~> f1 + f2 + f3 " } else { model_lslx <- " f1 :=> fix(.7) * y1 + y2 + y3 f2 :=> fix(.7) * y4 + y5 + y6 f3 :=> fix(.7) * y7 + y8 + y9 f1 :~> y4 + y5 + y6 + y7 + y8 + y9 f2 :~> y1 + y2 + y3 + y7 + y8 + y9 f3 :~> y1 + y2 + y3 + y4 + y5 + y6 f1 + f2 + f3 <=> f1 + f2 + f3 x1 + x2 + x3 + x4 + x5 + x6 ~> f1 + f2 + f3 " } r6_lslx <- lslx$new(model = model_lslx, data = data, verbose = FALSE) start_time <- Sys.time() r6_lslx$fit_lasso( lambda_grid = .1, algorithm = "fisher", iter_out_max = 1000, iter_in_max = 50, tol_out = .00001, verbose = FALSE ) end_time <- Sys.time() time_taken <- difftime(end_time, start_time, units = "secs") result <- data.frame( type = type, n_obs = nrow(data), algorithm = "lslx-fisher", objective_value = r6_lslx$extract_numerical_condition(include_faulty = TRUE)[["objective_value"]], time_taken = time_taken, n_iter = ( r6_lslx$extract_numerical_condition(include_faulty = TRUE)[["n_iter_out"]] ) ) return(result) } ## function for computing lslx-bfgs result ## compute_lslx_bfgs <- function(data, type) { if (type == "simple") { model_lslx <- " f1 :=> fix(.7) * y1 + y2 + y3 f2 :=> fix(.7) * y4 + y5 + y6 f3 :=> fix(.7) * y7 + y8 + y9 f1 + f2 + f3 <=> f1 + f2 + f3 x1 + x2 + x3 + x4 + x5 + x6 ~> f1 + f2 + f3 " } else { model_lslx <- " f1 :=> fix(.7) * y1 + y2 + y3 f2 :=> fix(.7) * y4 + y5 + y6 f3 :=> fix(.7) * y7 + y8 + y9 f1 :~> y4 + y5 + y6 + y7 + y8 + y9 f2 :~> y1 + y2 + y3 + y7 + y8 + y9 f3 :~> y1 + y2 + y3 + y4 + y5 + y6 f1 + f2 + f3 <=> f1 + f2 + f3 x1 + x2 + x3 + x4 + x5 + x6 ~> f1 + f2 + f3 " } r6_lslx <- lslx$new(model = model_lslx, data = data, verbose = FALSE) start_time <- Sys.time() r6_lslx$fit_lasso( lambda_grid = .1, algorithm = "bfgs", iter_out_max = 1000, iter_in_max = 50, tol_out = .00001, verbose = FALSE ) end_time <- Sys.time() time_taken <- difftime(end_time, start_time, units = "secs") result <- data.frame( type = type, n_obs = nrow(data), algorithm = "lslx-bfgs", objective_value = r6_lslx$extract_numerical_condition(include_faulty = TRUE)[["objective_value"]], time_taken = time_taken, n_iter = ( r6_lslx$extract_numerical_condition(include_faulty = TRUE)[["n_iter_out"]] ) ) return(result) } ## function for computing lslx-ecm result ## compute_lsl <- function(data, type) { lambda_x <- diag(1, 6) lambda_y <- matrix(0, 9, 3) lambda_y[1:(3), 1] <- lambda_y[(3 + 1):(3 * 2), 2] <- lambda_y[(3 * 2 + 1):(3 * 3), 3] <- .7 lambda <- matrix(0, 15, 3 + 6) lambda[1:6, 1:6] <- lambda_x lambda[7:15, 7:9] <- lambda_y lambda_pattern <- matrix(0, 15, 9) if (type == "simple") { lambda_pattern_y <- matrix(0, 9, 3) } else { lambda_pattern_y <- matrix(NA, 9, 3) } lambda_pattern_y[1, 1] <- lambda_pattern_y[4, 2] <- lambda_pattern_y[7, 3] <- 0 lambda_pattern_y[2:3, 1] <- lambda_pattern_y[5:6, 2] <- lambda_pattern_y[8:9, 3] <- 1 lambda_pattern[7:15, 7:9] <- lambda_pattern_y psi <- diag(c(rep(0, 6), rep(.2, 9))) psi_pattern <- 1 * (psi != 0) beta <- matrix(0, 9, 9) beta_pattern <- matrix(0, 9, 9) beta_pattern[7:9, 1:6] <- NA phi <- diag(.5, 9) phi_pattern <- diag(1, 9) phi_pattern[1:6, 1:6] <- 1 phi_pattern[7:9, 7:9] <- 1 rc_sem <- lslSEM() rc_sem$input(raw = data) rc_sem$specify( pattern = list( lambda = lambda_pattern, beta = beta_pattern, psi = psi_pattern, phi = phi_pattern ), value = list( lambda = lambda, beta = beta, psi = psi, phi = phi ), scale = FALSE ) start_time <- Sys.time() rc_sem$learn( penalty = list(type = "l1", gamma = 0.1), control = list(maxit = 1000, epsilon = 10 ^ -5) ) end_time <- Sys.time() time_taken <- difftime(end_time, start_time, units = "secs") result <- data.frame( type = type, n_obs = nrow(data), algorithm = "lsl-ecm", objective_value = rc_sem$summarize(type = "overall")["dpl", "bic optimal"], time_taken = time_taken, n_iter = rc_sem$summarize(type = "overall")["it", "bic optimal"] ) return(result) } ## function for computing regsem-default result ## compute_regsem_default <- function(data, type) { if (type == "simple") { model_lavaan <- " f1 =~ .7 * y1 + y2 + y3 f2 =~ .7 * y4 + y5 + y6 f3 =~ .7 * y7 + y8 + y9 f1 ~~ f2 + f3 f2 ~~ f3 f1 ~ x1 + x2 + x3 + x4 + x5 + x6 f2 ~ x1 + x2 + x3 + x4 + x5 + x6 f3 ~ x1 + x2 + x3 + x4 + x5 + x6 " idx_pen <- 7:24 } else { model_lavaan <- " f1 =~ .7 * y1 + y2 + y3 + y4 + y5 + y6 + y7 + y8 + y9 f2 =~ y1 + y2 + y3 + .7 * y4 + y5 + y6 + y7 + y8 + y9 f3 =~ y1 + y2 + y3 + y4 + y5 + y6 + .7 * y7 + y8 + y9 f1 ~~ f2 + f3 f2 ~~ f3 f1 ~ x1 + x2 + x3 + x4 + x5 + x6 f2 ~ x1 + x2 + x3 + x4 + x5 + x6 f3 ~ x1 + x2 + x3 + x4 + x5 + x6 " idx_pen <- c(3:11, 14:22, 25:42) } fit_lavaan <- lavaan( model_lavaan, data, fixed.x = FALSE, auto.fix.first = FALSE, std.lv = FALSE, auto.fix.single = FALSE, auto.var = TRUE, orthogonal = FALSE, int.ov.free = TRUE, meanstructure = TRUE, do.fit = FALSE ) start_time <- Sys.time() fit_regsem <- regsem( fit_lavaan, lambda = .05, type = "lasso", pars_pen = idx_pen, max.iter = 1000, tol = 1e-05 ) end_time <- Sys.time() time_taken <- difftime(end_time, start_time, units = "secs") result <- data.frame( type = type, n_obs = nrow(data), algorithm = "regsem-default", objective_value = 2 * fit_regsem$optim_fit, time_taken = time_taken, n_iter = fit_regsem$iterations ) return(result) } ## function for computing lslx-cd result ## compute_regsem_cd <- function(data, type) { if (type == "simple") { model_lavaan <- " f1 =~ .7 * y1 + y2 + y3 f2 =~ .7 * y4 + y5 + y6 f3 =~ .7 * y7 + y8 + y9 f1 ~~ f2 + f3 f2 ~~ f3 f1 ~ x1 + x2 + x3 + x4 + x5 + x6 f2 ~ x1 + x2 + x3 + x4 + x5 + x6 f3 ~ x1 + x2 + x3 + x4 + x5 + x6 " idx_pen <- 7:24 } else { model_lavaan <- " f1 =~ .7 * y1 + y2 + y3 + y4 + y5 + y6 + y7 + y8 + y9 f2 =~ y1 + y2 + y3 + .7 * y4 + y5 + y6 + y7 + y8 + y9 f3 =~ y1 + y2 + y3 + y4 + y5 + y6 + .7 * y7 + y8 + y9 f1 ~~ f2 + f3 f2 ~~ f3 f1 ~ x1 + x2 + x3 + x4 + x5 + x6 f2 ~ x1 + x2 + x3 + x4 + x5 + x6 f3 ~ x1 + x2 + x3 + x4 + x5 + x6 " idx_pen <- c(3:11, 14:22, 25:42) } fit_lavaan <- lavaan( model_lavaan, data, fixed.x = FALSE, auto.fix.first = FALSE, std.lv = FALSE, auto.fix.single = FALSE, auto.var = TRUE, orthogonal = FALSE, int.ov.free = TRUE, meanstructure = TRUE, do.fit = FALSE ) start_time <- Sys.time() fit_regsem <- regsem( fit_lavaan, lambda = .05, type = "lasso", pars_pen = idx_pen, max.iter = 1000, tol = 1e-05, solver = TRUE, solver.maxit = 50, line.search = TRUE ) end_time <- Sys.time() time_taken <- difftime(end_time, start_time, units = "secs") result <- data.frame( type = type, n_obs = nrow(data), algorithm = "regsem-cd", objective_value = 2 * fit_regsem$optim_fit, time_taken = time_taken, n_iter = fit_regsem$iterations ) return(result) } fun_simulate <- function(n_obs, type) { sigma <- specify_sigma() data <- generate_data(n_obs, sigma) result <- rbind( lslx_fisher = compute_lslx_fisher(data, type), lslx_bfgs = compute_lslx_bfgs(data, type), lsl_ecm = compute_lsl(data, type), regsem_default = compute_regsem_default(data, type), regsem_cd = compute_regsem_cd(data, type) ) return(result) } ## run sumulation ## type_all <- c("simple", "complex") n_obs_all <- c(200, 400, 600, 800) n_rep <- 1000 result <- do.call(rbind, lapply( type_all, FUN = function(type) { do.call(rbind, lapply( n_obs_all, FUN = function(n_obs) { return(do.call(rbind, lapply( 1:n_rep, FUN = function(i_rep) { result <- data.frame(iter = paste(type, n_obs, i_rep, sep = "-"), fun_simulate(n_obs, type)) return(result) } ))) } )) } )) ## write simulation result ## write.csv(result, "result.csv", row.names = FALSE) ## read simulation result ## result <- read.csv("result.csv") # min and max for convergence result %>% mutate(iter = sapply( strsplit(as.character(iter), split = "-"), FUN = function(x) { as.numeric(x[3]) } )) %>% filter(iter <= 500) %>% group_by(type, n_obs, algorithm) %>% summarise(rate = sum((n_iter < 1000) & (!is.na(objective_value))) / 500) %>% group_by(algorithm) %>% summarise(min_rate = min(rate), max_rate = max(rate)) # plot for convergence result %>% mutate(iter = sapply( strsplit(as.character(iter), split = "-"), FUN = function(x) { as.numeric(x[3]) } )) %>% filter(iter <= 500) %>% group_by(type, n_obs, algorithm) %>% summarise(rate = sum((n_iter < 1000) & (!is.na(objective_value))) / 500) %>% ggplot(aes(x = algorithm, y = rate)) + geom_bar(stat = "identity") + coord_flip() + facet_grid(type ~ n_obs) ## data for plot df_for_plot <- result %>% filter(algorithm != "regsem-default") %>% group_by(iter) %>% summarise(iter_success = all(!is.na(objective_value)) & all(n_iter < 1000)) %>% ungroup() %>% left_join(result, ., by = "iter") %>% filter(iter_success, algorithm != "regsem-default") df_for_plot <- df_for_plot %>% split(paste(df_for_plot$type, df_for_plot$n_obs, sep = "-")) %>% lapply( FUN = function(df_for_plot_i) df_for_plot_i[1:2000, ] ) %>% do.call(rbind, .) %>% select(type, n_obs, algorithm, objective_value, time_taken, n_iter) %>% gather(index, value, objective_value:n_iter) df_for_plot$n_obs <- paste("N", df_for_plot$n_obs, sep = "=") df_for_plot$index <- ifelse( df_for_plot$index == "objective_value", "function value", ifelse( df_for_plot$index == "n_iter", "number of iterations", "time in seconds" ) ) ################################################## ########## Figure 5 in Section 6 ################# ################################################## df_for_plot %>% filter(type == "simple") %>% ggplot(mapping = aes(x = factor( algorithm, levels = c("regsem-cd", "lsl-ecm", "lslx-bfgs", "lslx-fisher") ), y = value)) + geom_boxplot() + coord_flip() + facet_grid(n_obs ~ index, scales = "free_x") + labs(title = "Simple Model Specification", y = NULL, x = NULL) + theme(plot.title = ggplot2::element_text(hjust = 0.5)) ggsave( "JSS3359-figure-5a.pdf", plot = last_plot(), width = 8, height = 4 ) df_for_plot %>% filter(type == "complex") %>% ggplot(mapping = aes(x = factor( algorithm, levels = c("regsem-cd", "lsl-ecm", "lslx-bfgs", "lslx-fisher") ), y = value)) + geom_boxplot() + coord_flip() + facet_grid(n_obs ~ index, scales = "free_x") + labs(title = "Complex Model Specification", y = NULL, x = NULL) + theme(plot.title = ggplot2::element_text(hjust = 0.5)) ggsave( "JSS3359-figure-5b.pdf", plot = last_plot(), width = 8, height = 4 )