# install.packages("gips") library("gips") # Install in that order: # install.packages(c("BiocManager", "ggplot2", "igraph", "gRim", "magrittr", "dplyr", "huge", "tidyr", "mvtnorm")) # BiocManager::install(c("GEOquery", "RBGL")) # install.packages("rags2ridges") ##### # 3.1 breast cancer # Code for this section 3.1 took us under 2 hours to compute with AMD EPYC 7413 on a single core gset <- GEOquery::getGEO("GSE3494", GSEMatrix = TRUE, AnnotGPL = TRUE)[[1]] Biobase::fvarLabels(gset) <- make.names(Biobase::fvarLabels(gset)) gens <- gset@assayData$exprs # Get the subset of data to analyze; 150 gens of 58 people: data2 <- t(gens[c( "1053_at", "200039_s_at", "200053_at", "200079_s_at", "200628_s_at", "200639_s_at", "200670_at", "200687_s_at", "200710_at", "200740_s_at", "200773_x_at", "200783_s_at", "200795_at", "200810_s_at", "200811_at", "200822_x_at", "200840_at", "200853_at", "200854_at", "200855_at", "200996_at", "201041_s_at", "201077_s_at", "201088_at", "201104_x_at", "201114_x_at", "201115_at", "201124_at", "201140_s_at", "201170_s_at", "201195_s_at", "201196_s_at", "201197_at", "201201_at", "201202_at", "201236_s_at", "201263_at", "201266_at", "201281_at", "201291_s_at", "201292_at", "201327_s_at", "201342_at", "201384_s_at", "201394_s_at", "201395_at", "201397_at", "201455_s_at", "201479_at", "201508_at", "201557_at", "201584_s_at", "201591_s_at", "201663_s_at", "201664_at", "201685_s_at", "201694_s_at", "201710_at", "201755_at", "201761_at", "201770_at", "201790_s_at", "201791_s_at", "201795_at", "201833_at", "201890_at", "201896_s_at", "201897_s_at", "201930_at", "201970_s_at", "201977_s_at", "201978_s_at", "202011_at", "202088_at", "202095_s_at", "202105_at", "202107_s_at", "202109_at", "202117_at", "202174_s_at", "202188_at", "202200_s_at", "202233_s_at", "202240_at", "202270_at", "202276_at", "202307_s_at", "202338_at", "202370_s_at", "202371_at", "202409_at", "202487_s_at", "202503_s_at", "202580_x_at", "202589_at", "202590_s_at", "202613_at", "202666_s_at", "202690_s_at", "202705_at", "202725_at", "202754_at", "202779_s_at", "202808_at", "202815_s_at", "202854_at", "202858_at", "202870_s_at", "202954_at", "202962_at", "203022_at", "203046_s_at", "203066_at", "203071_at", "203126_at", "203130_s_at", "203139_at", "203145_at", "203187_at", "203188_at", "203208_s_at", "203213_at", "203214_x_at", "203223_at", "203249_at", "203265_s_at", "203266_s_at", "203276_at", "203287_at", "203296_s_at", "203345_s_at", "203347_s_at", "203358_s_at", "203362_s_at", "203380_x_at", "203405_at", "203418_at", "203422_at", "203428_s_at", "203432_at", "203438_at", "203439_s_at", "203463_s_at", "203484_at", "203554_x_at", "203560_at", "203594_at", "203625_x_at", "203693_s_at", "203696_s_at" ), c( "GSM79115", "GSM79116", "GSM79118", "GSM79122", "GSM79123", "GSM79131", "GSM79138", "GSM79144", "GSM79145", "GSM79147", "GSM79148", "GSM79154", "GSM79159", "GSM79165", "GSM79188", "GSM79189", "GSM79191", "GSM79196", "GSM79202", "GSM79211", "GSM79216", "GSM79223", "GSM79227", "GSM79241", "GSM79242", "GSM79244", "GSM79247", "GSM79253", "GSM79259", "GSM79263", "GSM79270", "GSM79271", "GSM79280", "GSM79285", "GSM79287", "GSM79292", "GSM79298", "GSM79299", "GSM79302", "GSM79304", "GSM79305", "GSM79306", "GSM79313", "GSM79321", "GSM79325", "GSM79329", "GSM79333", "GSM79336", "GSM79338", "GSM79340", "GSM79342", "GSM79346", "GSM79349", "GSM79351", "GSM79352", "GSM79353", "GSM79356", "GSM79363" )]) S <- cov(data2) n <- dim(data2)[1] # 58 p <- dim(data2)[2] # 150 g <- gips(S, n, was_mean_estimated = TRUE, D_matrix = 
set.seed(2022)
g_MAP <- find_MAP(g, max_iter = 150000, optimizer = "MH")
# 1 hour 25 minutes on an AMD EPYC 7413 Processor (single core, because multithreading would not help here)

my_sum <- summary(g_MAP)
my_sum$whole_optimization_time # 1 h 25 min
my_sum # this is printed in the paper

table_comparison <- data.frame(matrix(numeric(4 * 2), nrow = 4, ncol = 2))
colnames(table_comparison) <- c("gips", "Python")
rownames(table_comparison) <- c("n0", "n_parameters", "BIC", "AIC")
table_comparison[1, 1] <- my_sum$n0 # 25
table_comparison[2, 1] <- my_sum$n_parameters # 611
table_comparison[3, 1] <- my_sum$BIC # 8741.694
table_comparison[4, 1] <- my_sum$AIC # 7482.764

my_sum$acceptance_rate # 0.195%

plot(g_MAP, type = "both", logarithmic_x = TRUE) # We see the search plateaued

# The permutation found by Graczyk et al. (the same algorithm, the same parameters, code in Python):
string_python <- "(0, 1, 138, 148, 60, 51, 7, 144)(2, 10, 8, 88, 5, 101, 119, 3)(4, 46, 89)(6, 12, 137, 90, 116, 141, 142, 71, 145, 49, 135, 21, 56, 86, 123, 113, 83, 29)(9, 98, 38, 20, 100, 25, 36, 72)(11, 76, 99, 132, 121)(13, 18, 75, 146)(14, 70, 126, 109)(15, 91, 82, 33, 139, 26, 48, 136)(16, 97, 68)(17, 64, 133, 87, 106, 74, 107, 105, 81, 108, 122, 67)(19, 50, 134, 104, 37, 95, 24, 44)(22, 110, 23, 41, 66, 42, 130, 111)(30, 57, 65, 93, 80)(31, 32)(34, 92, 63, 85, 127, 147, 131, 102, 59, 149, 143, 128, 117, 69, 96, 120)(35, 84, 140)(43, 55, 118, 125, 103, 77, 78, 47)(45, 129, 114, 73, 115, 58, 112, 124, 94)"
# Shift every index by +1, because Python numbers variables from 0, while gips_perm() expects 1-based indices:
string_new <- stringr::str_replace_all(string_python, pattern = "\\d+", function(number_str) {
  return(as.character(1 + as.numeric(number_str)))
})
perm_python <- gips_perm(string_new, 150)
g_python <- gips(S, 58,
  was_mean_estimated = TRUE, perm = perm_python,
  D_matrix = diag(1, p)
)

abline(log_posteriori_of_gips(g_python), 0, col = "green")
# We see our MH search reached the level of the Python implementation around the 10,000th iteration.

python_sum <- summary(g_python)
table_comparison[1, 2] <- python_sum$n0 # 30
table_comparison[2, 2] <- python_sum$n_parameters # 844
table_comparison[3, 2] <- python_sum$BIC # 9380.918
table_comparison[4, 2] <- python_sum$AIC # 7641.904

table_comparison
#                  gips   Python
# n0             25.000   30.000
# n_parameters  611.000  844.000
# BIC          8741.694 9380.918
# AIC          7482.764 7641.904

compare_posteriories_of_perms(g_MAP, g_python, print_output = FALSE)
# Our permutation is 1.60518 * 10^29 times more likely a posteriori.

# Make the graph for Figure 7:
library("igraph")
library("gRim")
library("RBGL")
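# Aside (our addition, a minimal sketch): graph_from_g_MAP() below computes
# partial correlations by hand from the precision matrix K = solve(S), using
# the identity pcor_ij = -K_ij / sqrt(K_ii * K_jj), i.e., -cov2cor(K) with
# the diagonal flipped back to +1. A toy 3x3 illustration of that identity:
toy_S <- matrix(c(2, 1, 0, 1, 2, 1, 0, 1, 2), nrow = 3)
toy_K <- solve(toy_S)
toy_PC <- -cov2cor(toy_K)
diag(toy_PC) <- -diag(toy_PC) # the diagonal of a partial correlation matrix is 1
toy_PC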
# Get the graph and its BIC:
graph_from_g_MAP <- function(gips_object, my_quantile) {
  S <- attr(gips_object, "S")
  S_MAP <- project_matrix(S, gips_object)
  K <- solve(S_MAP)
  # We know about gRbase::cov2pcor(), but something was wrong with it on the cluster we used,
  # so we compute the partial correlations manually:
  PC_MAP <- -cov2cor(K)
  diag(PC_MAP) <- -diag(PC_MAP)
  cutoff_val <- quantile(abs(PC_MAP), my_quantile)
  adj_mat <- (abs(PC_MAP) > cutoff_val) * 1
  diag(adj_mat) <- 0
  my_graph <- graph_from_adjacency_matrix(adj_mat, mode = "undirected")

  # BIC:
  maxCliques_G <- maxClique(as_graphnel(my_graph))$maxCliques
  MLE_MAP <- ggmfit(S_MAP, n = n, maxCliques_G)
  kG <- length(unique(c(S_MAP * adj_mat))) - 1
  BICG <- kG * log(n) - 2 * (-((n - 1) * p / 2) * log(2 * pi) +
    ((n - 1) / 2) * log(MLE_MAP$detK) - 0.5 * (n - 1) * p)
  print(paste0("BIC: ", BICG))

  # For the plot:
  K_masked <- K * adj_mat
  my_graph_weighted <- graph_from_adjacency_matrix(K_masked, mode = "undirected", weighted = TRUE)
  my_graph_weighted <- simplify(my_graph_weighted)

  # Edge list:
  my_edge_list <- get.edgelist(my_graph_weighted)
  my_edges_data_frame <- data.frame(my_edge_list)
  colnames(my_edges_data_frame) <- c("from", "to")
  my_edges_data_frame$value <- mapply(
    function(row, col) K_masked[row, col],
    my_edges_data_frame$from, my_edges_data_frame$to
  )
  my_edges_data_frame$value <- round(my_edges_data_frame$value, 3)
  my_edges_data_frame$for_embedding <- abs(my_edges_data_frame$value)
  big_edge <- max(my_edges_data_frame$for_embedding)

  # Vertices have the same color iff they have the same variance:
  my_perm <- gips_object[[1]]
  list_of_cycles <- unclass(my_perm)
  num_of_cycles <- length(list_of_cycles)
  my_vertices_data_frame <- data.frame(colnames(attr(gips_object, "S")))
  colnames(my_vertices_data_frame) <- "name"
  my_vertices_data_frame$colors <- numeric(150)
  for (i in 1:num_of_cycles) {
    # vertices:
    my_vertices_data_frame[list_of_cycles[[i]], "colors"] <- i
    # edges:
    potential_from_edge <- (my_edges_data_frame$from %in% my_vertices_data_frame[list_of_cycles[[i]], "name"])
    potential_to_edge <- (my_edges_data_frame$to %in% my_vertices_data_frame[list_of_cycles[[i]], "name"])
    edge_is_in_cluster <- potential_from_edge & potential_to_edge
    my_edges_data_frame[edge_is_in_cluster, "for_embedding"] <- 10 * big_edge
  }

  invisible(list(BICG, my_graph_weighted, my_edges_data_frame, my_vertices_data_frame))
}

# Python:
BIC(g_python) # This is the full model; BIC: 9381
graph_python <- graph_from_g_MAP(g_python, 0.9112)
# Graczyk et al. said they used alpha = cutoff_val = 0.15, which corresponds to the 0.9112 quantile.
graph_python[[1]] # This is the smaller graph model; BIC: 9174
# 9174 < 9381, so the smaller model is better.

# gips:
BIC(g_MAP) # This is the full model; BIC: 8644
graph_MAP <- graph_from_g_MAP(g_MAP, 0.7) # 0.7 minimizes the BIC
graph_MAP[[1]] # This is the smaller graph model; BIC: 7807
# 7807 < 8644, so the smaller model is better.

# The `graph_MAP` object was used to make Figure 7.
# Figure 7 was produced with Cytoscape (https://cytoscape.org/).
# Using the `graph_python` object instead would reproduce the graph in Figure 6 of
# Graczyk et al. (2022) ("Model selection in the space of Gaussian models invariant by symmetry").
# Layout and colors:
# 1. Default black style.
# 2. Edge-weighted Spring Embedded Layout based on the "value" column of `graph_MAP[[3]]`.
#    We intended to use the "for_embedding" column, but it turned out to be worse, so we rejected it.
#    This was the basis for our layout; we then adjusted the placement of the nodes according to our preferences.
# 3. Colors of the vertices were based on the "colors" column of `graph_MAP[[4]]`.
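# One possible hand-off to Cytoscape (our addition; the file names are our
# hypothetical choices, not from the paper): GraphML is one of the formats
# Cytoscape imports, and the edge/vertex tables can be attached from CSVs.
# igraph::write_graph(graph_MAP[[2]], "graph_MAP.graphml", format = "graphml")
# write.csv(graph_MAP[[3]], "graph_MAP_edges.csv", row.names = FALSE)
# write.csv(graph_MAP[[4]], "graph_MAP_vertices.csv", row.names = FALSE)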
#####
# 3.2 Hyperparameters' Influence
# The code for this Section 3.2 took us under 5 minutes to compute with an AMD EPYC 7413 on 24 cores.

set.seed(2022)

## Create matrices
p <- 8
large_structure_perm <- "(12345678)"
moderate_structure_perm <- "(1234)"
Z <- MASS::mvrnorm(n = p, mu = numeric(p), Sigma = diag(1, p))
S <- (t(Z) %*% Z) / p

# cov_large_str creation:
cov_large_str <- project_matrix(S, large_structure_perm)
# cov_moderate_str creation:
cov_moderate_str <- project_matrix(S, moderate_structure_perm)
# cov_no_str creation:
cov_no_str <- S

## Define functions
library("magrittr")

show_progress_bar <- TRUE
p <- nrow(cov_large_str)
n_points <- 30
d_list <- c(0.1, 1, 10, 100)
delta_list <- c(3, 30)
structures <- c("large", "moderate", "no")

set.seed(2022)
# Make S_large, S_moderate and S_no:
for (struc in structures) {
  Sigma <- get(paste0("cov_", struc, "_str"))
  Z <- MASS::mvrnorm(n_points, mu = rep(0, p), Sigma = Sigma)
  S <- (t(Z) %*% Z) / n_points # we know the mean is 0
  assign(paste0("S_", struc), S)
}

tasks <- expand.grid(
  d = d_list,
  delta = delta_list,
  matrix_structure = structures
)

get_n_dim_from_perm_name <- function(g_perm_name, size = p) {
  g_perm <- gips_perm(g_perm_name, size = size)
  n_dim <- sum(get_structure_constants(g_perm)[["dim_omega"]])
  n_dim
}

get_n0_from_perm_name <- function(g_perm_name, size = p) {
  structure_constants <- get_structure_constants(gips_perm(g_perm_name, p))
  n0 <- max(structure_constants[["r"]] * structure_constants[["d"]] / structure_constants[["k"]])
  n0
}

conduct_trial <- function(d, delta, matrix_structure) {
  S <- get(paste0("S_", matrix_structure))
  D_matrix <- diag(ncol(S)) * d
  g <- gips(S, n_points,
    delta = delta, D_matrix = D_matrix,
    was_mean_estimated = FALSE
  )
  g_MAP <- find_MAP(g,
    optimizer = "BF",
    show_progress_bar = show_progress_bar,
    save_all_perms = TRUE,
    return_probabilities = TRUE
  )
  probs <- get_probabilities_from_gips(g_MAP)

  # n_parameters:
  visited_n_dim <- sapply(names(probs), get_n_dim_from_perm_name) %>%
    factor()
  n_dim_distribution <- split(probs, visited_n_dim) %>%
    sapply(sum) %>%
    setNames(attr(visited_n_dim, "levels"))

  # n0:
  visited_n0 <- sapply(names(probs), get_n0_from_perm_name) %>%
    factor()
  n0_distribution <- split(probs, visited_n0) %>%
    sapply(sum) %>%
    setNames(attr(visited_n0, "levels"))

  list(
    "n_dim_distribution" = n_dim_distribution,
    "n0_distribution" = n0_distribution
  )
}

execute_task <- function(task_id) {
  # set.seed() is not needed, because the computations are deterministic
  task_info <- as.vector(tasks[task_id, ])
  delta <- task_info[["delta"]]
  d <- task_info[["d"]]
  matrix_structure <- task_info[["matrix_structure"]]

  trial_results <- conduct_trial(d, delta, matrix_structure)

  out_list <- as.list(tasks[task_id, ])
  attr(out_list, "out.attrs") <- NULL
  out_list[["task_id"]] <- task_id
  out_list[["n_dim_distribution"]] <- trial_results[["n_dim_distribution"]]
  out_list[["n0_distribution"]] <- trial_results[["n0_distribution"]]
  out_list
}

## Run Brute Force
library("parallel")
numCores <- detectCores()
available_cores <- numCores
if (available_cores > nrow(tasks)) {
  available_cores <- nrow(tasks) # more cores would not be used
}
task_ids <- 1:nrow(tasks)
# mclapply() parallelisation does not work on Windows:
if (available_cores > 1) {
  task_results <- mclapply(task_ids, execute_task, mc.cores = available_cores)
} else {
  task_results <- lapply(task_ids, execute_task)
}
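# Portable alternative (our addition, a sketch we did not run on the original
# cluster): on Windows, mclapply() silently falls back to serial execution,
# while a PSOCK cluster parallelises on every platform; clusterExport(cl, ls())
# copies the global workspace to the workers.
# cl <- makeCluster(available_cores)
# clusterEvalQ(cl, {
#   library("gips")
#   library("magrittr")
# })
# clusterExport(cl, ls())
# task_results <- parLapply(cl, task_ids, execute_task)
# stopCluster(cl)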
## Visualization
library("ggplot2")
library("dplyr")
theme_set(theme_bw())

job_results <- task_results

# n_dim distribution:
n_dim_df <- lapply(job_results, function(el) {
  n_dim <- names(el$n_dim_distribution)
  n_dim_prob <- el$n_dim_distribution %>% setNames(NULL)
  el$n_dim_distribution <- NULL
  el$n0_distribution <- NULL
  as.data.frame(el) %>%
    cbind(data.frame(n_dim = n_dim, n_dim_prob = n_dim_prob))
}) %>% bind_rows()

# n0 distribution:
n0_df <- lapply(job_results, function(el) {
  n0 <- names(el$n0_distribution)
  n0_prob <- el$n0_distribution %>% setNames(NULL)
  el$n_dim_distribution <- NULL
  el$n0_distribution <- NULL
  as.data.frame(el) %>%
    cbind(data.frame(n0 = n0, n0_prob = n0_prob))
}) %>% bind_rows()

n0_df_forplot <- mutate(n0_df,
  x_elements = as.numeric(as.character(n0)),
  y_elements = n0_prob
)
n_dim_df_forplot <- mutate(n_dim_df,
  x_elements = as.numeric(as.character(n_dim)),
  y_elements = n_dim_prob
)

plot_heatmap_true <- function(my_ggplot) {
  my_ggplot +
    labs(fill = "covariance") +
    theme(
      title = element_text(size = 16),
      legend.title = element_text(size = 16)
    )
}

plot_for_matrix_structure <- function(my_matrix_structure, x_axis_type, hide_legend, title) {
  stopifnot(x_axis_type %in% c("n0", "n_dim"))
  if (x_axis_type == "n0") {
    my_df <- n0_df_forplot
    true_line <- ifelse(my_matrix_structure == "large", 1,
      ifelse(my_matrix_structure == "moderate", 5,
        ifelse(my_matrix_structure == "no", 8, NA)
      )
    )
    my_xlabel <- "Simplicity of the model (n0)"
    x_breaks <- 1:8
  } else {
    my_df <- n_dim_df_forplot
    true_line <- ifelse(my_matrix_structure == "large", 5,
      ifelse(my_matrix_structure == "moderate", 17,
        ifelse(my_matrix_structure == "no", 36, NA)
      )
    )
    my_xlabel <- "Number of parameters in a model (n_dim)"
    x_breaks <- if (my_matrix_structure == "moderate") {
      c(5, 10, 17, 20, 30, 36) # additional 17
    } else {
      c(5, 10, 20, 30, 36)
    }
  }
  out_plot <- filter(my_df, matrix_structure == my_matrix_structure) %>%
    ggplot(aes(x = x_elements, y = y_elements, fill = factor(delta))) +
    geom_col(position = "dodge", width = 0.7) +
    geom_vline(xintercept = true_line, linetype = "dashed") +
    facet_wrap(~d, nrow = 2, labeller = "label_both") +
    xlab(my_xlabel) +
    ylab("Probability") +
    guides(fill = guide_legend(title = "delta", override.aes = aes(alpha = 1))) +
    labs(title = title) +
    lims(y = c(0, 1)) +
    scale_x_continuous(breaks = x_breaks) +
    theme(title = element_text(size = 15))
  if (hide_legend) {
    out_plot <- out_plot + theme(legend.position = "none")
  }
  out_plot
}

my_heatmap_true_large <- plot_heatmap_true(gips:::pretty_plot_matrix(
  cov_large_str,
  bquote(atop("Heatmap of the true covariance matrix", bold("Large structure ")))
))
my_heatmap_true_moderate <- plot_heatmap_true(gips:::pretty_plot_matrix(
  cov_moderate_str,
  bquote(atop("Heatmap of the true covariance matrix", bold("Moderate structure ")))
))
my_heatmap_true_no <- plot_heatmap_true(gips:::pretty_plot_matrix(
  cov_no_str,
  bquote(atop("Heatmap of the true covariance matrix", bold("No structure ")))
))

# Figure 8
my_heatmap_true_large
my_heatmap_true_moderate
my_heatmap_true_no
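# Quick numeric counterpart (our addition) to the block structure visible in
# the Figure 8 heatmaps: a matrix returned by project_matrix() is invariant
# under its permutation, i.e., permuting its rows and columns simultaneously
# leaves it unchanged. For the length-8 cycle:
shifted <- c(2:8, 1) # the image of 1..8 under the cycle (12345678)
stopifnot(isTRUE(all.equal(
  cov_large_str[shifted, shifted], cov_large_str,
  check.attributes = FALSE
)))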
structures <- c("no", "moderate", "large")
hide_legend <- c(TRUE, TRUE, FALSE)
plot_width <- c(7, 7, 7.7)
x_axis_type <- "n_dim"

# Figure 9
plot_for_matrix_structure(structures[1], x_axis_type, hide_legend[1],
  title = bquote(atop(
    "Effect of parameters on a posteriori structure distribution",
    "for a matrix with" ~ bold("no") ~ "structure "
  ))
)
plot_for_matrix_structure(structures[2], x_axis_type, hide_legend[2],
  title = bquote(atop(
    "Effect of parameters on a posteriori structure distribution",
    "for a matrix with" ~ bold("moderate") ~ "structure "
  ))
)
plot_for_matrix_structure(structures[3], x_axis_type, hide_legend[3],
  title = bquote(atop(
    "Effect of parameters on a posteriori structure distribution",
    "for a matrix with" ~ bold("large") ~ "structure "
  ))
)


#####
# 3.3 Comparison with other methods
# The code for this Section 3.3 took us around 1 hour 30 minutes to compute with an AMD EPYC 7413 on 45 cores.

## Create matrices
change_no_zeros_to_zeros <- function(no_zeros_matrix) {
  with_zeros_matrix <- no_zeros_matrix
  cutoff_val <- quantile(abs(with_zeros_matrix), 0.25)
  with_zeros_matrix[abs(with_zeros_matrix) < cutoff_val] <- 0
  with_zeros_matrix
}

symmetric_solve <- function(symmetric_matrix) {
  solved_symmetric_matrix <- solve(symmetric_matrix)
  (solved_symmetric_matrix + t(solved_symmetric_matrix)) / 2 # this averaging makes almost no difference
}

p <- 50 # the dimension of the created covariance matrices
n <- 50
set.seed(2022)
Z <- MASS::mvrnorm(n = n, mu = numeric(p), Sigma = diag(1, p))
S <- (t(Z) %*% Z) / p

# Permutations for structures:
large_structure_perm <- paste0("(", toString(1:50), ")")
moderate_structure_perm <- paste0("(", toString(1:25), ")")

# Large structure, no zeros:
cov_large_str_no_zeros <- project_matrix(S, large_structure_perm)
prec_large_str_no_zeros <- symmetric_solve(cov_large_str_no_zeros)
# Large structure, zeros:
prec_large_str_zeros <- change_no_zeros_to_zeros(prec_large_str_no_zeros)
cov_large_str_zeros <- symmetric_solve(prec_large_str_zeros)

# Moderate structure, no zeros:
cov_mod_str_no_zeros <- project_matrix(S, moderate_structure_perm)
prec_mod_str_no_zeros <- symmetric_solve(cov_mod_str_no_zeros)
# Moderate structure, zeros:
prec_mod_str_zeros <- change_no_zeros_to_zeros(prec_mod_str_no_zeros)
cov_mod_str_zeros <- symmetric_solve(prec_mod_str_zeros)

# No structure, no zeros:
cov_no_str_no_zeros <- S + 0.1 * diag(p) # a correction to ensure positive definiteness of prec_no_str_zeros
prec_no_str_no_zeros <- symmetric_solve(cov_no_str_no_zeros)
# No structure, zeros:
prec_no_str_zeros <- change_no_zeros_to_zeros(prec_no_str_no_zeros)
cov_no_str_zeros <- symmetric_solve(prec_no_str_zeros)

## Setup and functions
library("huge")
library("rags2ridges")

N_TRIAL <- 10
GIPS_N_ITER <- 300000
METHODS <- c("rags2ridges", "huge", "gips")
DEFAULT_D_MATRIX <- diag(p)

tasks <- expand.grid(
  matrix_structure = factor(c(
    "none_zeros", "none_nozeros",
    "moderate_zeros", "moderate_nozeros",
    "large_zeros", "large_nozeros"
  )),
  n_points = c(10, 20, 40),
  n_trial = 1:N_TRIAL
)

execute_task <- function(task_id) {
  set.seed(task_id)
  task_info <- as.vector(tasks[task_id, ])
  true_cov <- switch(as.character(task_info[["matrix_structure"]]),
    none_zeros = cov_no_str_zeros,
    none_nozeros = cov_no_str_no_zeros,
    moderate_zeros = cov_mod_str_zeros,
    moderate_nozeros = cov_mod_str_no_zeros,
    large_zeros = cov_large_str_zeros,
    large_nozeros = cov_large_str_no_zeros
  )
  n_points <- as.numeric(as.character(task_info[["n_points"]]))
  trial_results <- conduct_trial(true_cov, n_points, task_id)
  cbind(trial_results, tasks[task_id, ], data.frame(task_id = task_id))
}

conduct_trial <- function(true_cov, n_points, task_id = -1) {
  withr::with_seed(
    task_id,
    Z <- MASS::mvrnorm(n_points, mu = rep(0, nrow(true_cov)), Sigma = true_cov)
  )
  estimated_covs <- lapply(METHODS, function(algorithm) {
    estimate_covariance(Z, n_points, algorithm, task_id)
  })
  loglikelihoods <- sapply(estimated_covs, function(estimated_cov) {
    calculate_loglikelihood(Z, estimated_cov)
  })
  Frobenius_norms <- sapply(estimated_covs, function(estimated_cov) {
    calculate_Frobenius_Norm(true_cov, estimated_cov)
  })
  data.frame(
    algorithm = METHODS,
    loglikelihood = loglikelihoods,
    Frobenius_norm = Frobenius_norms
  )
}
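# Optional check (our addition): before sampling from them, all six
# constructed covariance matrices should be positive definite, i.e.,
# their smallest eigenvalues should be strictly positive.
sapply(
  list(
    cov_large_str_no_zeros, cov_large_str_zeros,
    cov_mod_str_no_zeros, cov_mod_str_zeros,
    cov_no_str_no_zeros, cov_no_str_zeros
  ),
  function(M) min(eigen(M, symmetric = TRUE, only.values = TRUE)$values) > 0
)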
estimate_covariance <- function(Z, n_points, method = "", task_id = -1) {
  print(paste0("Began estimating covariance with method ", method, " at ", Sys.time()))
  out <- switch(method,
    gips = estimate_covariance_with_gips(Z, n_points, task_id),
    rags2ridges = estimate_covariance_with_ridge(Z, n_points),
    huge = estimate_covariance_with_huge(Z, n_points)
  )
  print(paste0("Finished estimating covariance with method ", method, " at ", Sys.time()))
  out
}

estimate_covariance_with_gips <- function(Z, n_points, task_id) {
  S <- (t(Z) %*% Z) / n_points # we know the mean is 0
  g <- gips(S, n_points, was_mean_estimated = FALSE)
  g_MAP <- find_MAP(g,
    max_iter = GIPS_N_ITER, optimizer = "MH",
    save_all_perms = FALSE
  )
  print(g_MAP)
  project_matrix(S, g_MAP[[1]])
}

estimate_covariance_with_ridge <- function(Z, n_points) {
  S <- cov(Z)
  target <- solve(DEFAULT_D_MATRIX)
  opt <- optPenalty.kCVauto(Z,
    lambdaMin = 0.001, lambdaMax = 100, # values from the README
    target = target
  )
  optLambda <- opt[["optLambda"]]
  precisionM <- ridgeP(S, optLambda, target = target)
  solve(precisionM)
}

estimate_covariance_with_huge <- function(Z, n_points) {
  huge_obj <- huge(Z, # huge will estimate the mean
    nlambda = 40, lambda.min.ratio = 0.02, # values from the paper
    method = "glasso", cov.output = TRUE
  )
  select_obj <- huge.select(huge_obj)
  select_obj$opt.cov
}

# Note: this returns the NEGATIVE loglikelihood (lower is better):
calculate_loglikelihood <- function(Z, est_cov) {
  loglikelihoods <- mvtnorm::dmvnorm(Z, sigma = est_cov, log = TRUE)
  -sum(loglikelihoods)
}

calculate_Frobenius_Norm <- function(true_cov, est_cov) {
  my_diff <- (true_cov - est_cov)
  sqrt(sum(abs(my_diff)^2))
}
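# Sanity check of the helper on a case with a known answer (our addition):
# the Frobenius norm of diag(3, 4) is sqrt(3^2 + 4^2) = 5.
calculate_Frobenius_Norm(diag(c(3, 4)), matrix(0, 2, 2)) # 5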
## Run
library("parallel")
numCores <- 45
task_ids <- 1:nrow(tasks) # 180 tasks
# mclapply() parallelisation does not work on Windows:
if (numCores > 1) {
  task_results <- mclapply(task_ids, execute_task, mc.cores = numCores)
} else {
  task_results <- lapply(task_ids, execute_task)
}
job_results_df <- do.call(rbind, task_results)

## Visualization
library("ggplot2")
library("dplyr")
theme_set(theme_bw())

# Captions on the plots:
switch_names_structure_size <- function(structure_size_name) {
  ifelse(structure_size_name == "large", "large structure",
    ifelse(structure_size_name == "moderate", "moderate structure",
      "no structure"
    )
  )
}

# Transform:
task_results_df_1 <- job_results_df %>%
  rename(sample_size = n_points)
results_df <- task_results_df_1 %>%
  mutate(
    matrix_info = factor(matrix_structure, levels = c(
      "large_nozeros", "moderate_nozeros", "none_nozeros",
      "large_zeros", "moderate_zeros", "none_zeros"
    )),
    zeros_present = strsplit(as.character(matrix_info), "_") %>%
      sapply(function(v) v[2]),
    matrix_structure = strsplit(as.character(matrix_info), "_") %>%
      sapply(function(v) v[1]),
    avg_neg_loglik = loglikelihood / sample_size,
    sample_size = factor(sample_size)
  )

# It is possible that a package returned an estimator that was not proper.
# For example, a `gips` run could end on a permutation whose n0 exceeds n,
# in which case the projected estimator is not positive definite.
# Fortunately, all estimators were proper (the table below is full of 1s).
results_df %>%
  group_by(algorithm, sample_size, matrix_structure, zeros_present) %>%
  summarise(positive_definite = mean(!is.infinite(loglikelihood))) %>%
  tidyr::pivot_wider(names_from = sample_size, values_from = positive_definite)

# The results for the matrices without zeros look very similar (to our surprise),
# so we only plot the matrices with zeros:
results_df_zeros <- results_df %>%
  filter(zeros_present == "zeros")

plot_loglik_comparison <- results_df_zeros %>%
  ggplot(aes(x = algorithm, y = avg_neg_loglik, col = sample_size)) +
  geom_boxplot() +
  facet_wrap(vars(switch_names_structure_size(matrix_structure))) +
  labs(
    title = "Comparison between estimated and actual covariance matrix\nacross different matrix structures",
    col = "sample size (n)"
  ) +
  xlab("Estimating algorithm") +
  ylab("Negative loglikelihood divided by sample size")

# Figure 10
plot_loglik_comparison

plot_spectral_comparison <- results_df_zeros %>%
  ggplot(aes(x = algorithm, y = Frobenius_norm, col = sample_size)) +
  geom_boxplot() +
  scale_y_continuous(trans = "log10", breaks = c(1.25, 2.5, 5, 10, 20)) +
  facet_wrap(vars(switch_names_structure_size(matrix_structure))) +
  labs(
    title = "Comparison between estimated and actual covariance matrix\nacross different matrix structures",
    col = "sample size (n)"
  ) +
  xlab("Estimating algorithm") +
  ylab("Frobenius norm")

# Figure 11
plot_spectral_comparison
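# The two figures can be written to disk with ggplot2::ggsave(); the file
# names and sizes below are our (hypothetical) choices, not from the paper:
# ggsave("figure10_loglikelihood.pdf", plot_loglik_comparison, width = 8, height = 5)
# ggsave("figure11_Frobenius.pdf", plot_spectral_comparison, width = 8, height = 5)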