########################################################################### ### ### ### Simulating Incomplete Data Sets ### ### ### ########################################################################### suppressWarnings(RNGversion("3.5.0")) library(MixtureMissing) set.seed(123) iris_80_cases <- hide_values(iris[1:4], n_cases = 80) sum(!complete.cases(iris_80_cases)) # [1] 80 head(iris_80_cases) # Sepal.Length Sepal.Width Petal.Length Petal.Width # 1 5.1 3.5 1.4 NA # 2 4.9 3.0 1.4 0.2 # 3 NA NA NA 0.2 # 4 4.6 3.1 1.5 0.2 # 5 5.0 NA NA NA # 6 5.4 NA NA NA ############################################################################ ### ### ### Multivariate Contaminated Normal Mixture ### ### ### ############################################################################ data("UScost", package = "MixtureMissing") set.seed(123) X <- hide_values(UScost[3:5], n_cases = 10) mod_CN3 <- MCNM(X, G = 3, init_method = "kmedoids", max_iter = 200) # Mixture: Contaminated Normal (CN) # # Data Set: Incomplete # # Initialization: kmedoids # # Fitting G = 3 was successful with 21/200 iterations # # The fitted mixture model with G = 3 has BIC = 1146.959 plot(mod_CN3, cex.point = 1.8, cex.axis = 1.5, lwd = 2) summary(mod_CN3) # Model: 3-Component Contaminated Normal Mixture with Incomplete Data # # Observations with missing values: 10 / 50 # # Missing values per variable: # Grocery Housing Utilities # 3 7 6 # Iterations: 21 / 200 # # Initialization: kmedoids # # Component frequency table: # comp1 comp2 comp3 # 30 13 7 # # Total outliers: 1 # # Outliers per component: # comp1 comp2 comp3 # 0 1 0 # # Mixing proportions: # comp1 comp2 comp3 # 0.5768903 0.2721738 0.1509359 # # Component location vectors: # Grocery Housing Utilities # comp1 96.65776 80.23405 95.42366 # comp2 109.15298 116.37169 106.17192 # comp3 121.80816 206.66333 114.89822 # # Component dispersion matrices: # , , comp1 # # Grocery Housing Utilities # Grocery 22.87359540 14.88862 -0.04339625 # Housing 14.88861638 48.28071 5.01427967 # Utilities -0.04339625 5.01428 35.89938258 # # , , comp2 # # Grocery Housing Utilities # Grocery 69.01612 60.66522 118.0932 # Housing 60.66522 190.24648 151.7993 # Utilities 118.09318 151.79934 388.3211 # # , , comp3 # # Grocery Housing Utilities # Grocery 374.4364 910.3327 461.8503 # Housing 910.3327 2398.3366 1122.6205 # Utilities 461.8503 1122.6205 601.5073 # # # Final log-likelihood: -505.0193 # # Total parameters: 35 # # Information Criteria: # AIC BIC KIC KICc AIC3 CAIC AICc ICL AWE CLC # 1080.039 1146.959 1118.039 1350.568 1115.039 1181.959 1260.039 1148.76 1392.481 1006.438 UScost[which(mod_CN3$outliers), 1:5] # Abbr State Grocery Housing Utilities # 2 AK Alaska 134.2 133.9 154.2 library(usmap) library(ggplot2) dataplot <- data.frame( state = UScost$Abbr, cluster = factor(mod_CN3$clusters, levels = 1:3, labels = paste('Cluster', 1:3)) ) plot_usmap(regions = "state", data = dataplot , values = 'cluster', labels = TRUE, exclude = 'DC') + theme(legend.position = 'right', legend.title = element_blank()) + scale_fill_manual(values = c('#377eb8', '#e41a1c', '#4daf4a')) mod_CN3_mc <- MCNM(X, G = 3, init_method = "mclust", max_iter = 200) cls0 <- c(rep(1, 20), rep(2, 20), rep(3, 10)) mod_CN3_user <- MCNM(X, G = 3, init_method = "manual", clusters = cls0, max_iter = 200) ########################################################################### ### ### ### Multivariate Generalized Hyperbolic Mixture ### ### ### ########################################################################### set.seed(12345) X <- hide_values(bankruptcy[, 2:3], prop_cases = 0.2) mod_GH <- MGHM(X, G = 2, model = "GH", init_method = "kmedoids", max_iter = 200, progress = FALSE) plot(mod_GH, cex.point = 1.8, lwd = 2) mod_GH$pi # comp1 comp2 # 0.4001022 0.5998978 mod_GH$mu # RE EBIT #comp1 61.90255 23.14936 #comp2 53.69775 34.83119 mod_GH$beta # RE EBIT # comp1 -480.62946 -124.33229 # comp2 -40.00375 -40.63468 mod_St <- MGHM(X, G = 2, model = "St", init_method = "kmedoids", max_iter = 200, progress = FALSE) mod_SNIG <- MGHM(X, G = 2, model = "SNIG", init_method = "kmedoids", max_iter = 200, progress = FALSE) mod_N <- MGHM(X, G = 2, model = "N", init_method = "kmedoids", max_iter = 200, progress = FALSE) ########################################################################### ### ### ### Mixture Model Selection ### ### ### ########################################################################### data("auto", package = "MixtureMissing") sapply(auto[, 1:15], function(a) sum(is.na(a))) # normalized_losses wheel_base length width # 41 0 0 0 # height curb_weight engine_size bore # 0 0 0 4 # stroke compression_ratio horsepower peak_rpm # 4 0 2 2 # city_mpg highway_mpg price # 0 0 4 vars_miss <- c("normalized_losses", "bore", "stroke", "horsepower", "peak_rpm", "price") X <- auto[, vars_miss] sum(!complete.cases(X)) # [1] 45 mod_auto1 <- select_mixture(X, G = 2, criterion = "BIC", init_method = "kmedoids", max_iter = 200) # Data Set: Incomplete # # Information Criterion: BIC # # Initialization: kmedoids # # Model Fitting: # CN GH NIG SNIG SC C St t N SGH HUM H SH # # According to BIC, the best mixture model is based on the Contaminated Normal # distribution # # Model rank according to BIC: # 1. Contaminated Normal: 10489.11 # 2. t: 10492.61 # 3. Symmetric Hyperbolic: 10504.03 # 4. Normal-Inverse Gaussian: 10512.29 # 5. Hyperbolic Univariate Marginals: 10516.73 # 6. Hyperbolic: 10516.73 # 7. Generalized Hyperbolic: 10521.75 # 8. Skew-t: 10522.31 # 9. Symmetric Normal-Inverse Gaussian: 10529.81 # 10. Symmetric Generalized Hyperbolic: 10540.45 # 11. Skew-Cauchy: 10604.35 # 12. Normal: 10619.21 # 13. Cauchy: 10697.46 mod_auto2 <- select_mixture(X, G = 2, criterion = "ICL", model = c("t", "SC", "SGH", "NIG"), init_method = "kmedoids", max_iter = 200) # Data Set: Incomplete # # Information Criterion: ICL # # Initialization: kmedoids # # Model Fitting: # t SC SGH NIG # # According to ICL, the best mixture model is based on the t distribution # # Model rank according to ICL: # 1. t: 10493.85 # 2. Normal-Inverse Gaussian: 10515.03 # 3. Symmetric Generalized Hyperbolic: 10545.58 # 4. Skew-Cauchy: 10606.54