## Preliminaries ----------------------------------------------------------- # Make sure we do not exceed 76 columns in output as per JSS standard. options(width = 76) # So anything depending on RNG is reproducible set.seed(2015) if (!require("validate")) install.packages("validate"); if (!require("dcmodify")) install.packages("dcmodify") if (!require("errorlocate")) install.packages("errorlocate") if (!require("simputation")) install.packages("simputation") if (!require("rspa")) install.packages("rspa") ## Section 3 --------------------------------------------------------------- library("validate") data("retailers", package = "validate") head(retailers[3:8], 3) retailers$id <- sprintf("RET%2d", 1:nrow(retailers)) rules <- validator( st = staff >= 0, to = turnover >= 0, or = other.rev >= 0, st.cs = if (staff > 0) staff.costs > 0, bl = turnover + other.rev == total.rev, mn = mean(profit, na.rm = TRUE) >= 1 ) confront(retailers, rules, key = "id") # We use a variable not occurring in the dataset badrule <- validator(employees >= 0) confront(retailers, badrule) check <- confront(retailers, rules, key = "id") summary(check) output <- as.data.frame(check) tail(output, 3) # passing all checks? all(check) # ignore missings: all(check, na.rm = TRUE) plot(check[1:5]) ## this out <- check_that(retailers, staff >= 0, other.rev >= 0) ## is equivalent to out <- confront(retailers, validator(staff >= 0, other.rev >= 0)) ## Section 3.2 ------------------------------------------------------------- rules <- validator(x > 0, mean(x)) checks <- check_that(retailers, !any(duplicated(id))) all(checks) checks <- check_that(retailers, is.numeric(turnover), is.factor(size)) all(checks) correlation_check <- validator(cor(height, weight) > 0.5) summary(confront(women, correlation_check) ) checks <- check_that(retailers, grepl("^sc[0-9]$", size)) all(checks) checks <- check_that(retailers, size %in% c("sc0", "sc1", "sc2", "sc3")) all(checks) check <- check_that(retailers, if (staff > 0) staff.costs > 0) all(check, na.rm = TRUE) summary(check) rules <- validator(street ~ postal_code) rules <- validator(city + street ~ postal_code) rules <- validator(postal_code ~ city + street) ## Section 3.3 ------------------------------------------------------------- rules <- validator( nrow(.) >= 100, "Species" %in% names(.)) rules <- validator( fraction := mean(Species == "versicolor"), vc_upr = fraction >= 0.25, vc_lwr = fraction <= 0.50) as.data.frame(confront(iris, rules))["expression"] codelist <- data.frame( validSpecies = c("versicolor", "virginica","setosa"), stringsAsFactors = FALSE) rules <- validator(Species %in% ref$validSpecies) summary(confront(iris, rules, ref = codelist))[1:5] rules <- validator(x >= 0, y >= 0, z >= 0, x <= 1, y <= 1, z <= 1) rules <- validator(G := var_group(x, y, z), G >= 0, G <= 1) ## Section 3.4 ------------------------------------------------------------- rules <- validator(minht = height >= 40, maxht = height <= 95) # Subsetting returns a 'validator' object. rules[2] rules[[1]] label(rules) <- c("least height", "largest height") rules variables(rules) variables(rules, as = "matrix") all(names(women) %in% variables(rules)) names(women)[!names(women) %in% variables(rules)] ## Section 3.5 ------------------------------------------------------------- rules <- validator(.file = "rules.txt") rules origin(rules) ## Section 3.6 ------------------------------------------------------------- data("retailers", package = "validate") rules <- validator(turnover >= 0, turnover + other.rev == total.rev) summary(confront(retailers, rules))[-8] summary(confront(retailers, rules, na.value = FALSE))[-8] voptions(rules, lin.eq.eps = 0, lin.ineq.eps = 0) rules summary(confront(retailers, rules))[-(6:7)] ## Section 3.7 ------------------------------------------------------------- rules <- validator(other.rev >= 0, turnover >= 0, turnover + other.rev == total.rev) check <- confront(retailers, rules) summary(check[1:2]) head(values(check), n = 3) sort(check, by = "rule") rules <- validator(staff >= 0, turnover >= 0, mean(profit) >= 1) check <- confront(retailers, rules) class(values(check[1:2])) class(values(check)) ## Section 5 --------------------------------------------------------------- library("dcmodify") library("errorlocate") library("simputation") library("rspa") data("retailers", package = "validate") dat_raw <- retailers[, -(1:2)] rules <- validator(.file = "rules.R") voptions(rules, lin.eq.eps = 0.01) rules[c(1, 4, 11)] # show a few examples dat_raw[15, c("staff", "turnover", "staff.costs")] modifiers <- dcmodify::modifier(.file = "modifiers.R") modifiers[c(1, 4, 11)] dat_mod <- dcmodify::modify(dat_raw, modifiers) compare(rules, raw = dat_raw, modified = dat_mod) weights <- setNames(rep(1, ncol(dat_raw)), names(dat_raw)) weights[c("staff","vat")] <- 10 dat_el <- errorlocate::replace_errors(dat_mod, rules, weight = weights) colSums(summary(confront(dat_el, rules))[3:5]) dat_imp <- simputation::impute_cart(dat_el, . ~ .) colSums(summary(confront(dat_imp, rules))[3:5]) # Compute weights for weighted Euclidean distance W <- t(t(dat_imp)) # convert to numeric matrix W <- 1/(abs(W)+1) # compute weights W <- W/rowSums(W) # normalize by row # adjust the imputed values. dat_adj <- rspa::match_restrictions(dat_imp, rules, adjust = is.na(dat_el), weight = W, maxiter = 1e4) all(confront(dat_adj, rules)) plot(compare(rules, raw= dat_raw, modified = dat_mod, errors_located = dat_el, imputed = dat_imp, adjusted = dat_adj, how = "sequential")) plot(cells(raw = dat_raw, modified = dat_mod, errors_located = dat_el, imputed = dat_imp, adjusted = dat_adj, compare = "sequential"))