# Stability Selection and Consensus Clustering in R: The R Package sharp # Barbara Bodinier, Sabrina Rodrigues, Maryam Karimi, Sarah Filippi, Julien Chiquet, Marc Chadeau-Hyam ################################################### ### code chunk number 1: packages ################################################### library("fake") library("sharp") library("corpcor") library("rpart") library("plotrix") ################################################### ### code chunk number 2: data_simulation ################################################### # Regression set.seed(1) data_reg <- SimulateRegression( n = 100, pk = 20, beta_abs = 1 ) # Classification set.seed(1) data_class <- SimulateRegression( n = 1000, pk = 50, ev_xy = 0.9, family = "binomial" ) # Structural equation modelling set.seed(1) data_sem <- SimulateStructural( n = 200, pk = c(5, 5, 5), nu_between = 0.2, v_between = 1 ) # Gaussian graphical modelling set.seed(1) data_ggm <- SimulateGraphical( n = 100, pk = 20, topology = "scale-free" ) # Clustering set.seed(1) data_clust <- SimulateClustering( n = c(5, 10, 20), pk = 20, ev_xc = 0.3 ) ################################################### ### code chunk number 3: typical ################################################### ## Regression # Example data generation set.seed(1) data_reg <- SimulateRegression( n = 100, pk = 20, beta_abs = 1 ) x_reg <- data_reg$xdata y_reg <- data_reg$ydata # Stability selection stab_reg <- VariableSelection( xdata = x_reg, ydata = y_reg, verbose = FALSE ) # Calibration CalibrationPlot(stab_reg) Argmax(stab_reg) # Visualisation plot(stab_reg) ## Structural equation modelling # Example data generation set.seed(1) data_sem <- SimulateStructural( n = 200, pk = c(5, 5, 5), nu_between = 0.2, v_between = 1 ) x_sem <- data_sem$data # Stability selection dag <- LayeredDAG( layers = c(5, 5, 5) ) Lambda <- LambdaSequence( lmax = 1, lmin = 1e-5 ) stab_sem <- StructuralModel( xdata = x_sem, adjacency = dag, Lambda = Lambda, verbose = FALSE ) # Calibration CalibrationPlot(stab_sem) Argmax(stab_sem) # Visualisation plot(stab_sem) ## Gaussian graphical modelling # Example data generation set.seed(1) data_ggm <- SimulateGraphical( n = 100, pk = 20, topology = "scale-free" ) x_ggm <- data_ggm$data # Stability selection stab_ggm <- GraphicalModel( xdata = x_ggm, verbose = FALSE ) # Calibration CalibrationPlot(stab_ggm) Argmax(stab_ggm) # Visualisation plot(stab_ggm) ## Clustering # Example data generation set.seed(1) data_clust <- SimulateClustering( n = c(5, 10, 20), pk = 20, ev_xc = 0.3 ) x_clust <- data_clust$data # Stability selection stab_clust <- Clustering( xdata = x_clust, verbose = FALSE ) # Calibration CalibrationPlot(stab_clust) Argmax(stab_clust) # Visualisation plot(stab_clust) ################################################### ### code chunk number 4: seed ################################################### stab1 <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, K = 5, seed = 1, verbose = FALSE ) stab1_bis <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, K = 5, seed = 1, verbose = FALSE ) all(SelectionProportions(stab1) == SelectionProportions(stab1_bis)) ################################################### ### code chunk number 5: n_cores (eval = FALSE) ################################################### stab <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, K = 100, n_cores = 2, verbose = FALSE ) ################################################### ### code chunk number 6: combine ################################################### stab1 <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, K = 10, seed = 1, verbose = FALSE ) stab2 <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, K = 10, seed = 2, verbose = FALSE ) stab <- Combine(stab1, stab2) stab$params$K ################################################### ### code chunk number 7: optim ################################################### stab <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, optimisation = "nloptr" ) ################################################### ### code chunk number 8: ellipsis ################################################### stab <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, Lambda = 1, K = 1, penalty.factor = c(0, 0, 0, rep(1, 17)), verbose = FALSE ) print(stab$Beta) ################################################### ### code chunk number 9: cart (eval = FALSE) ################################################### stab <- VariableSelection( xdata = data_reg$xdata, ydata = data_reg$ydata, implementation = CART, verbose = FALSE ) ################################################### ### code chunk number 10: shrinkage1 (eval = FALSE) ################################################### ShrinkageSelection <- function(xdata, Lambda, ...) { mypcor <- corpcor::pcor.shrink(xdata, verbose = FALSE ) adjacency <- array(NA, dim = c(nrow(mypcor), ncol(mypcor), nrow(Lambda))) for (k in 1:nrow(Lambda)) { A <- ifelse(abs(mypcor) >= Lambda[k, 1], yes = 1, no = 0) diag(A) <- 0 adjacency[, , k] <- A } return(list(adjacency = adjacency)) } ################################################### ### code chunk number 11: shrinkage (eval = FALSE) ################################################### stab <- GraphicalModel( xdata = data_ggm$data, Lambda = matrix(c(0.01, 0.05, 0.1), ncol = 1), implementation = ShrinkageSelection, verbose = FALSE ) ################################################### ### code chunk number 12: split ################################################### set.seed(1) ids <- Split( data = data_class$ydata, family = "binomial", tau = c(0.7, 0.3) ) xtrain <- data_class$xdata[ids[[1]], , drop = FALSE] ytrain <- data_class$ydata[ids[[1]], , drop = FALSE] xtest <- data_class$xdata[ids[[2]], , drop = FALSE] ytest <- data_class$ydata[ids[[2]], , drop = FALSE] stab <- VariableSelection( xdata = xtrain, ydata = ytrain, family = "binomial", verbose = FALSE ) perf <- ExplanatoryPerformance( xdata = xtrain, ydata = ytrain, new_xdata = xtest, new_ydata = ytest, stability = stab ) perf$AUC ################################################### ### code chunk number 13: incremental ################################################### incr <- Incremental( xdata = xtrain, ydata = ytrain, new_xdata = xtest, new_ydata = ytest, stability = stab, n_predictors = ncol(data_class$xdata), verbose = FALSE ) plot(incr, ylim = c(0.5, 1)) ################################################### ### code chunk number 14: selection_perf ################################################### set.seed(1) simul <- SimulateRegression( n = 100, pk = 20 ) stab <- VariableSelection( xdata = simul$xdata, ydata = simul$ydata, verbose = FALSE ) SelectionPerformance(stab, simul) ################################################### ### code chunk number 15: clustering_perf ################################################### set.seed(1) simul <- SimulateClustering( n = c(30, 30, 30), ev_xc = 0.4 ) stab <- Clustering( xdata = simul$data, verbose = FALSE ) ClusteringPerformance(stab, simul) ################################################### ### code chunk number 16: K ################################################### lout <- NULL K_vect <- c(10, 100, 1000) for (K in K_vect) { stab <- VariableSelection( xdata = data_class$xdata, ydata = data_class$ydata, K = K, verbose = FALSE ) lout <- c(lout, list(SelectionPerformance(stab, data_class))) } names(lout) <- K_vect print(lout) ################################################### ### code chunk number 17: grids ################################################### par(mfrow = c(1, 2), mar = c(7, 7, 7, 5)) # Run 1 stab <- VariableSelection( xdata = data_class$xdata, ydata = data_class$ydata, Lambda = LambdaSequence(lmax = 0.2, lmin = 0.1), verbose = FALSE ) CalibrationPlot(stab) SelectionPerformance(stab, data_class) # Run 2 stab <- VariableSelection( xdata = data_class$xdata, ydata = data_class$ydata, Lambda = LambdaSequence(lmax = 0.2, lmin = 0.01), verbose = FALSE ) CalibrationPlot(stab) SelectionPerformance(stab, data_class) ################################################### ### code chunk number 18: tau ################################################### par(mfrow = c(1, 3), mar = c(7, 7, 7, 5)) lout <- NULL tau_vect <- c(0.2, 0.5, 0.8) for (tau in tau_vect) { stab <- VariableSelection( xdata = data_class$xdata, ydata = data_class$ydata, tau = tau, family = "binomial", verbose = FALSE ) CalibrationPlot(stab) lout <- c(lout, list(SelectionPerformance(stab, data_class))) } names(lout) <- tau_vect print(lout) ################################################### ### code chunk number 19: PFER ################################################### stab <- GraphicalModel( xdata = data_ggm$data, verbose = FALSE ) SelectionPerformance(stab, data_ggm) stab <- GraphicalModel( xdata = data_ggm$data, PFER_thr = 20, verbose = FALSE ) SelectionPerformance(stab, data_ggm) ################################################### ### code chunk number 20: selection_appli ################################################### par(mfrow = c(1, 3), mar = c(7, 7, 7, 5)) dat <- readRDS("e-mtab-11349.rds") y <- dat[, 1] x <- dat[, -1] set.seed(1) ids <- Split(data = y, family = "binomial", tau = c(0.7, 0.3)) stab <- VariableSelection( xdata = x[ids[[1]], ], ydata = y[ids[[1]]], family = "binomial", verbose = FALSE ) CalibrationPlot(stab) plot(stab, n_predictors = 20) sum(Stable(stab)) Argmax(stab) incr <- Incremental( xdata = x[ids[[1]], ], ydata = y[ids[[1]]], new_xdata = x[ids[[2]], ], new_ydata = y[ids[[2]]], stability = stab, n_predictors = 20, verbose = FALSE ) plot(incr, ylim = c(0.5, 0.9)) incr$AUC[[sum(Stable(stab))]] ################################################### ### code chunk number 21: clustering_appli ################################################### par(mfrow = c(1, 2), mar = rep(5, 4)) dat <- readRDS("single_cell_10x_5cl.rds") y <- dat$cell_line z <- dat$doublet x <- dat[, -c(1, 2)] stab <- Clustering( xdata = x, nc = seq_len(10), verbose = FALSE ) CalibrationPlot(stab) plot( stab, theta_star = y, cex.axis = 0.3 ) ################################################### ### code chunk number 22: doublets ################################################### table(Clusters(stab), y, z) ################################################### ### session info ################################################### sessionInfo()