################## ## computation time library("VIM") library("microbenchmark") library("ggplot2") ################## ## Timing Hot-deck Imputation ## Generate a data set with nRows rows and several variables Rows <- c(100, 500, 1000, 5000, 10000, 1e+05, 1e+06, 1e+07) timeshd <- list() timesknn <- list() timesirmi <- list() timesimi <- list() for (i in seq_along(Rows)) { nRows <- Rows[i] print(nRows) xtmp <- rnorm(nRows) x <- data.frame(x = xtmp, y = 5 * xtmp + rnorm(nRows)/10, z = sample(LETTERS, nRows, rep = T), d1 = sample(LETTERS[1:3], nRows, rep = T), d2 = sample(LETTERS[1:2], nRows, rep = T), o1 = rnorm(nRows), o2 = rnorm(nRows), o3 = rnorm(100)) origX <- x x[sample(1:nRows, nRows/10), 1] <- NA x[sample(1:nRows, nRows/10), 2] <- NA x[sample(1:nRows, nRows/10), 3] <- NA x[sample(1:nRows, nRows/10), 4] <- NA timeshd[[i]] <- microbenchmark(xImp <- hotdeck(x, ord_var = c("o1", "o2", "o3"), domain_var = "d2"), times = 10) if (i <= 5) { timesknn[[i]] <- microbenchmark(xImp <- kNN(x, dist_var = c("d2", "o1", "o2", "o3"), weights = c(10, 1, 1, 1)), times = 10) timesirmi[[i]] <- microbenchmark(xImp <- irmi(x, robust = TRUE, force = TRUE), times = 10) timesimi[[i]] <- microbenchmark(xImp <- irmi(x, robust = FALSE), times = 10) } } data1 <- data.frame(n = c(Rows[1:5], Rows[1:5], Rows[1:5], Rows[1:5]), times = c(sapply(timesknn, function(x) median(x[, 2])/1e+09)[1:5], sapply(timeshd, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesirmi, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesimi, function(x) median(x[, 2])/1e+09)[1:5]), method = c(rep("kNN", 5), rep("hot-deck", 5), rep("IRMI", 5), rep("IMI", 5))) data1 <- data.frame(n = c(Rows[1:5], Rows[1:5], Rows[1:5], Rows[1:5]), times = c(sapply(timesknn, function(x) median(x[, 2])/1e+09)[1:5], sapply(timeshd, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesirmi, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesimi, function(x) median(x[, 2])/1e+09)[1:5]), method = c(rep("kNN", 5), rep("hot-deck", 5), rep("IRMI", 5), rep("IMI", 5))) data2 <- data.frame(n = Rows, times = sapply(timeshd, function(x) median(x[, 2])/1e+09)) pdf(file = "Timing1.pdf") ggplot(data = data1, aes(x = n, y = log(times), color = method)) + geom_line() + scale_y_continuous("Computation time (in log(seconds))") + scale_x_continuous("Number of observations") + theme_bw() + theme(axis.title = element_text(size = rel(1.6)), axis.text = element_text(size = rel(1.6)), legend.title = element_text(size = rel(1.6)), legend.text = element_text(size = rel(1.6))) dev.off() pdf(file = "Timing2.pdf") ggplot(data = data2, aes(x = n, y = times)) + geom_line() + scale_y_continuous("Computation time (in seconds)") + scale_x_continuous("Number of observations") + theme_bw() + theme(axis.title = element_text(size = rel(1.6)), axis.text = element_text(size = rel(1.6)), legend.title = element_text(size = rel(1.6)), legend.text = element_text(size = rel(1.6))) dev.off() ## hotdeck example library("VIM") data("eusilcP", package = "simFrame") ## This data set is synthetically generated from real Austrian EU-SILC ## (European Union Statistics on Income and Living Conditions) ## data. See simFrame package Since the data.frame consists only of ## about 58654 observations, the data set is enlarged to 8 million ## observations eusilcP <- eusilcP[eusilcP$age > 15 & eusilcP$age < 65, ] pop <- eusilcP[sample(1:nrow(eusilcP), 5500000, replace = TRUE), c("region", "gender", "hsize", "age", "eqIncome", "ecoStat", "citizenship", "py010n", "py050n", "py090n")] dim(pop) for (v in c("ecoStat", "citizenship", "py010n", "py050n", "py090n")) { pop[sample(1:nrow(pop), round(nrow(pop)/10)), v] <- NA } ## ordered by equalised income, age and household size splitted by ## region and gender system.time(popImp <- hotdeck(pop, ord_var = c("eqIncome", "age", "hsize"), domain_var = c("region", "gender"))) ## kNN example data("eusilcP", package = "simFrame") eusilcP <- eusilcP[eusilcP$age > 15 & eusilcP$age < 65, ] ## 14000 is the sample size of Austrian SILC samp <- eusilcP[sample(1:nrow(eusilcP), 14000, replace = FALSE), c("region", "gender", "hsize", "age", "eqIncome", "ecoStat", "citizenship", "py010n", "py050n", "py090n")] for (v in c("ecoStat", "citizenship", "py050n", "py090n", "py010n")) { samp[sample(1:nrow(samp), round(nrow(samp) / 10)), v] <- NA } medianMixed <- function(x) { # relative frequency of 0 nr <- sum(x == 0)/length(x) # Sampling 0 with probability equivalent to the relative frequency out <- sample(c(0, 1), size = 1, prob = c(nr, 1 - nr)) if (out == 0 || nr == 1) return(0) else return(median(x[x != 0])) } ## income, age, household size, region and gender are used for ## distance computation 5 nearest neighbours are used the function ## medianMixed is used for the aggregation of numerical variables sampImp <- kNN(samp, dist_var = c("eqIncome", "age", "hsize", "region", "gender"), k = 5, numFun = medianMixed) ## irmi example library("VIM") data("ses", package = "laeken") variables <- c("earningsMonth", "earningsOvertime", "overtimeHours", "location") for (v in variables) { ses[sample(1:nrow(ses), round(nrow(ses) / 10)), v] <- NA } modelFormulas <- list(earningsMonth = c("earningsHour", "fullPart"), earningsOvertime = c("overtimeHours", "earningsHour"), overtimeHours = c("earningsOvertime", "shareNormalHours"), location = c("NACE1", "size")) sesImp <- irmi(ses, init.method = "median", mixed = c("overtimeHours", "earningsOvertime"), modelFormulas = modelFormulas) sesImpRob <- irmi(ses, init.method = "median", mixed = c("overtimeHours", "earningsOvertime"), modelFormulas = modelFormulas, robust = TRUE) ## regressionImp example library("VIM") data("ses", package = "laeken") ## Defining a model for the variable earningsHour form1 <- earningsHour ~ location + size + economicFinanc + payAgreement + sex + age + education + occupation + contract + lengthService + overtimeHours + holiday + notPaid + earningsOvertime + paymentsShiftWork + earningsMonth + earnings ## Defining a model for the multinomial variable NACE1 = business branch form2 <- NACE1 ~ location + size + economicFinanc + payAgreement + sex + age + education + occupation + earnings ## 10% of th observations in the two variable set to missing for (v in c("earningsHour", "NACE1")) { ses[sample(1:nrow(ses), round(nrow(ses) / 10)), v] <- NA } ## Imputation of the two variables ses <- regressionImp(form1, ses) ses <- regressionImp(form2, ses)