##################
## computation time
library("VIM")
library("microbenchmark")
library("ggplot2")

##################
## Timing Hot-deck Imputation
## Generate a data set with nRows rows and several variables
Rows <- c(100, 500, 1000, 5000, 10000, 1e+05, 1e+06, 1e+07)
timeshd <- list()
timesknn <- list()
timesirmi <- list()
timesimi <- list()
for (i in seq_along(Rows)) {
  nRows <- Rows[i]
  print(nRows)
  xtmp <- rnorm(nRows)
  x <- data.frame(x = xtmp, y = 5 * xtmp + rnorm(nRows)/10, z = sample(LETTERS, 
    nRows, rep = T), d1 = sample(LETTERS[1:3], nRows, rep = T), d2 = sample(LETTERS[1:2], 
    nRows, rep = T), o1 = rnorm(nRows), o2 = rnorm(nRows), o3 = rnorm(100))
  origX <- x
  x[sample(1:nRows, nRows/10), 1] <- NA
  x[sample(1:nRows, nRows/10), 2] <- NA
  x[sample(1:nRows, nRows/10), 3] <- NA
  x[sample(1:nRows, nRows/10), 4] <- NA
  timeshd[[i]] <- microbenchmark(xImp <- hotdeck(x, ord_var = c("o1", "o2", "o3"), 
    domain_var = "d2"), times = 10)
  
  if (i <= 5) {
    timesknn[[i]] <- microbenchmark(xImp <- kNN(x, dist_var = c("d2", "o1", "o2", 
      "o3"), weights = c(10, 1, 1, 1)), times = 10)
    timesirmi[[i]] <- microbenchmark(xImp <- irmi(x, robust = TRUE, force = TRUE), 
      times = 10)
    timesimi[[i]] <- microbenchmark(xImp <- irmi(x, robust = FALSE), times = 10)
  }
}
data1 <- data.frame(n = c(Rows[1:5], Rows[1:5], Rows[1:5], Rows[1:5]), times = c(sapply(timesknn, 
  function(x) median(x[, 2])/1e+09)[1:5], sapply(timeshd, function(x) median(x[, 
  2])/1e+09)[1:5], sapply(timesirmi, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesimi, 
  function(x) median(x[, 2])/1e+09)[1:5]), method = c(rep("kNN", 5), rep("hot-deck", 
  5), rep("IRMI", 5), rep("IMI", 5)))
data1 <- data.frame(n = c(Rows[1:5], Rows[1:5], Rows[1:5], Rows[1:5]), times = c(sapply(timesknn, 
  function(x) median(x[, 2])/1e+09)[1:5], sapply(timeshd, function(x) median(x[, 
  2])/1e+09)[1:5], sapply(timesirmi, function(x) median(x[, 2])/1e+09)[1:5], sapply(timesimi, 
  function(x) median(x[, 2])/1e+09)[1:5]), method = c(rep("kNN", 5), rep("hot-deck", 
  5), rep("IRMI", 5), rep("IMI", 5)))
data2 <- data.frame(n = Rows, times = sapply(timeshd, function(x) median(x[, 2])/1e+09))
pdf(file = "Timing1.pdf")
ggplot(data = data1, aes(x = n, y = log(times), color = method)) + geom_line() + 
  scale_y_continuous("Computation time (in log(seconds))") + scale_x_continuous("Number of observations") + 
  theme_bw() + theme(axis.title = element_text(size = rel(1.6)), axis.text = element_text(size = rel(1.6)), 
  legend.title = element_text(size = rel(1.6)), legend.text = element_text(size = rel(1.6)))
dev.off()

pdf(file = "Timing2.pdf")
ggplot(data = data2, aes(x = n, y = times)) + geom_line() + scale_y_continuous("Computation time (in seconds)") + 
  scale_x_continuous("Number of observations") + theme_bw() + theme(axis.title = element_text(size = rel(1.6)), 
  axis.text = element_text(size = rel(1.6)), legend.title = element_text(size = rel(1.6)), 
  legend.text = element_text(size = rel(1.6)))
dev.off()


## hotdeck example
library("VIM")
data("eusilcP", package = "simFrame")
## This data set is synthetically generated from real Austrian EU-SILC
## (European Union Statistics on Income and Living Conditions)
## data. See simFrame package Since the data.frame consists only of
## about 58654 observations, the data set is enlarged to 8 million
## observations
eusilcP <- eusilcP[eusilcP$age > 15 & eusilcP$age < 65, ]
pop <- eusilcP[sample(1:nrow(eusilcP), 5500000, replace = TRUE), c("region", "gender", 
  "hsize", "age", "eqIncome", "ecoStat", "citizenship", "py010n", "py050n", "py090n")]
dim(pop)
for (v in c("ecoStat", "citizenship", "py010n", "py050n", "py090n")) {
  pop[sample(1:nrow(pop), round(nrow(pop)/10)), v] <- NA
}
## ordered by equalised income, age and household size splitted by
## region and gender
system.time(popImp <- hotdeck(pop, ord_var = c("eqIncome", "age", "hsize"),
  domain_var = c("region", "gender")))


## kNN example
data("eusilcP", package = "simFrame")
eusilcP <- eusilcP[eusilcP$age > 15 & eusilcP$age < 65, ]
## 14000 is the sample size of Austrian SILC
samp <- eusilcP[sample(1:nrow(eusilcP), 14000, replace = FALSE), c("region", "gender", 
  "hsize", "age", "eqIncome", "ecoStat", "citizenship", "py010n", "py050n", "py090n")]
for (v in c("ecoStat", "citizenship", "py050n", "py090n", "py010n")) {
  samp[sample(1:nrow(samp), round(nrow(samp) / 10)), v] <- NA
}
medianMixed <- function(x) {
  # relative frequency of 0
  nr <- sum(x == 0)/length(x)
  # Sampling 0 with probability equivalent to the relative frequency
  out <- sample(c(0, 1), size = 1, prob = c(nr, 1 - nr))
  if (out == 0 || nr == 1) 
    return(0) else return(median(x[x != 0]))
}
## income, age, household size, region and gender are used for
## distance computation 5 nearest neighbours are used the function
## medianMixed is used for the aggregation of numerical variables
sampImp <- kNN(samp, dist_var = c("eqIncome", "age", "hsize", "region", "gender"), 
  k = 5, numFun = medianMixed)

## irmi example
library("VIM")
data("ses", package = "laeken")
variables <- c("earningsMonth", "earningsOvertime", "overtimeHours",
  "location")
for (v in variables) {
  ses[sample(1:nrow(ses), round(nrow(ses) / 10)), v] <- NA
}
modelFormulas <- list(earningsMonth = c("earningsHour", "fullPart"), earningsOvertime = c("overtimeHours", 
  "earningsHour"), overtimeHours = c("earningsOvertime", "shareNormalHours"), location = c("NACE1", 
  "size"))
sesImp <- irmi(ses, init.method = "median", mixed = c("overtimeHours", "earningsOvertime"), 
  modelFormulas = modelFormulas)
sesImpRob <- irmi(ses, init.method = "median", mixed = c("overtimeHours", "earningsOvertime"), 
  modelFormulas = modelFormulas, robust = TRUE)

## regressionImp example
library("VIM")
data("ses", package = "laeken")
## Defining a model for the variable earningsHour
form1 <- earningsHour ~ location + size + economicFinanc + payAgreement + sex + age + 
  education + occupation + contract + lengthService + overtimeHours + holiday + 
  notPaid + earningsOvertime + paymentsShiftWork + earningsMonth + earnings
## Defining a model for the multinomial variable NACE1 = business branch
form2 <- NACE1 ~ location + size + economicFinanc + payAgreement + sex + age + education + 
  occupation + earnings
## 10% of th observations in the two variable set to missing
for (v in c("earningsHour", "NACE1")) {
  ses[sample(1:nrow(ses), round(nrow(ses) / 10)), v] <- NA
}
## Imputation of the two variables
ses <- regressionImp(form1, ses)
ses <- regressionImp(form2, ses)