## This file contains code to do a performance comparison between the ## packages hdpGLM and PReMiuM. These are the currently available R packages for ## the purposes of estimating hierarchical Dirichlet process prior regression ## models. ## * packages library("hdpGLM") library("PReMiuM") ## library("magrittr") library("glue") library("tictoc") library("SimDesign") library("ggplot2") ## * functions get_formula <- function(df) { X = df %>% dplyr::select(dplyr::matches("^X")) %>% names(.) %>% paste0(., collapse = "+") f = glue("y ~ {X}") %>% as.character(.) return(as.formula(f)) } get_estimation_time <- function(res, model, family, K, Kmax, time) { tab = dplyr::bind_rows(time) %>% tidyr::separate(., col=msg, into=c('n', 'ncov'), sep="_") %>% dplyr::mutate( model = model, family = family, elapsed = toc-tic, K = K, Kmax_allowed = Kmax ) res = res %>% dplyr::bind_rows(tab) return(res) } estimate_hdpGLM <- function(res, df, mcmc, K, Kmax, family, Kinit, formula) { cat(glue("\nEstimating hdpGLM (K={K}, n={nrow(df)}, nCov={ncol(df)-1})...") ) case = glue("{n}_{nCov}") ## tic(case, quiet=T) est=quiet( hdpGLM(formula, data=df, mcmc=mcmc, K=Kmax, family=family, n.display=10000)) time = toc(quiet=T) res = get_estimation_time(res=res, model="hdpGLM", family=family, K=K, Kmax=Kmax, time=time) cat("done!\n") return(res) } estimate_premium <- function(res, df, mcmc, K, Kmax, family, Kinit, formula) { cat(glue("\nEstimating Premium (K={K}, n={nrow(df)}, nCov={ncol(df)-1})...") ) case = glue("{n}_{nCov}") ## tic(case, quiet=T) est = quiet(PReMiuM::profRegr( yModel = 'Normal', xModel = 'Normal', nSweeps = mcmc$n.iter, nBurn = mcmc$burn.in, outcome = "y", covNames = formula.tools::rhs.vars(formula), nClusInit = Kinit, data = df, predict = df, run = TRUE) ) dissimObj = quiet(PReMiuM::calcDissimilarityMatrix(est)) clusObj = quiet(PReMiuM::calcOptimalClustering(dissimObj, maxNClusters=Kmax)) time = toc(quiet=T) res = get_estimation_time(res=res, model="PReMiuM", family=family, K=K, Kmax=Kmax, time=time) cat("done!\n") return(res) } simulate_data <- function(n, nCov, K, family) { sim = hdpGLM_simulateData(n=n, K=K, nCov=nCov, family=family) df = sim$data %>% tibble::as_tibble() return(df) } ## * Performance set.seed(12345) ## n.iter = 500 burn.in = 0 subjects = c(1500, 2500, 5000) covariates = c(5, 10, 15) Kmax = 30 ## res=tibble::tibble() for (family in c("gaussian", 'binomial')) { cat(glue("\n\nFamily: {family}\n\n") ) for (n in subjects) { for (nCov in covariates) { ## clusters and MCMC K = sample(1:5, size=1) mcmc = list(n.iter=n.iter, burn.in=burn.in) ## simulate data df = simulate_data(n, nCov, K, family) ## Estimate formula = get_formula(df) res = estimate_hdpGLM (res, df, mcmc, K=K, Kmax=Kmax, family=family, Kinit=1, formula=formula) res = estimate_premium(res, df, mcmc, K=K, Kmax=Kmax, family=family, Kinit=1, formula=formula) } } } res