# Replication Script for 'Optimum Allocation for Adaptive Multi-Wave Sampling # in R: The R Package optimall' # Summary: This script replicates all of the code and results from the # submitted manuscript. It is broken down into into sections # according to the manuscript for organization. # Load package ------------------------------------ # Install, if necessary: # library(devtools) # uncomment and run if necessary # install_github("yangjasp/optimall") # uncomment and run if necessary # Or install from CRAN with: # install.packages("optimall) library("optimall") # Install suggested packages for Shiny app and multiwave_diagram() if not # already installed. # These are not required to run the basic functions of optimall, # but they are required for some of the features demonstrated in the paper. # install.packages("DiagrammeR") # install.packages("shiny") # install.packages("bslib") # install.packages("DT") # install.packages("survey") ############# ### Section 3 ---------------- ############# ###### ## Section 3.1: defining, splitting, and merging strata ###### # view species in iris dataset library("datasets") table(iris$Species) # demonstrate split_strata() function iris2 <- split_strata(data = iris, strata = "Species", split = c("setosa", "virginica"), split_var = "Sepal.Width", split_at = c(0.5), type = "local quantile") table(iris2$new_strata) # demonstrate merge_strata() function iris3 <- merge_strata(data = iris, strata = "Species", merge = c("setosa", "versicolor"), name = "set_or_vers") table(iris3$new_strata) ###### ## Section 3.2: optimally allocating samples ###### # demonstrate optimum_allocation() function: sampling_design <- optimum_allocation(data = iris, strata = "Species", y = "Sepal.Width", nsample = 40, method = "WrightII") sampling_design # demonstrate optimum_allocation() with simpler input: iris_summary <- data.frame(strata = unique(iris$Species), size = c(50, 50, 50), sd = c(0.3791, 0.3138, 0.3225)) optimum_allocation(data = iris_summary, strata = "strata", sd_h = "sd", N_h = "size", nsample = 40, method = "WrightII") ###### ## Section 3.3: sampling based on results ###### ## demonstrate sample_strata() function iris$id <- 1:150 set.seed(743) iris <- sample_strata(data = iris, strata = "Species", id = "id", design_data = sampling_design, design_strata = "strata", n_allocated = "stratum_size", probs = ~stratum_size/npop) head(iris[,c("id", "Species", "Sepal.Width", "sample_indicator", "sampling_prob")]) #Drop extra column for space ## extract ids to sample ids_to_sample <- iris$id[iris$sample_indicator == 1] head(ids_to_sample) length(ids_to_sample) ###### ## Section 3.4: allocating samples in waves ###### # Set up wave 1: # Collect Sepal.Width from only the 30 samples wave1_design <- data.frame(strata = c("setosa", "virginica", "versicolor"), stratum_size = c(7, 16, 7)) phase1_data <- subset(datasets::iris, select = -Sepal.Width) phase1_data$id <- 1:nrow(phase1_data) #Add id column set.seed(234) # for sampling reproducibility phase1_data <- sample_strata(data = phase1_data, strata = "Species", id = "id", design_data = wave1_design, design_strata = "strata", n_allocated = "stratum_size") wave1_ids <- iris$id[phase1_data$sample_indicator == 1] wave1_sampled_data <- iris[iris$id %in% wave1_ids, c("id","Sepal.Width")] wave1_data <- merge(phase1_data, wave1_sampled_data, by = "id", no.dups = TRUE, all.x = TRUE) # We have Sepal.Width for our 30 samples table(is.na(wave1_data$Sepal.Width), wave1_data$Species) ## Now, run allocate_wave() to generate wave 2 design: wave2_design <- allocate_wave(data = wave1_data, strata = "Species", y = "Sepal.Width", already_sampled = "sample_indicator", nsample = 10, detailed = TRUE) wave2_design[,1:6] ## Demonstrate sample_strata again wave2_data <- sample_strata(data = wave1_data, strata = "Species", id = "id", design_data = wave2_design, design_strata = "strata", n_allocated = "n_to_sample") ############# ### Section 4 ---------------- ############# ###### ## Section 4.1: Launching the app and uploading data ###### # Run the app # optimall_shiny() # Uncomment to run app # The images in Figures 2-5 are obtained by loading the iris dataset in, # matching the inputs in Figure 3, and pressing the 'Confirm Split' button. ############# ### Section 5 ---------------- ############# ###### ## Section 5.2: Getting Started with the Multiwave Object ###### # Initialize object MySurvey <- new_multiwave(phases = 2, waves = c(1,3)) ###### ## Section 5.3: Accessing and writing slots ###### # To access Phase 2, Wave 2 design get_mw(MySurvey, phase = 2, wave = 2, slot = "design") # To access overall metadata get_mw(MySurvey, phase = 1, slot = "metadata") # To write overall metadata set_mw(MySurvey, phase = NA, slot = "metadata") <- list(title = "Sepal Width Survey") # To access Overall metadata get_mw(MySurvey, phase = NA, slot = "metadata") ###### ## Section 5.4: Call functions with fewer arguments ###### # To place iris data in Phase 1 iris <- datasets::iris iris$id <- 1:150 set_mw(MySurvey, phase = 1, slot = "data") <- subset(iris, select = c(id, Species, Sepal.Length)) ## Use apply_multiwave() to apply optimum_allocation() MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1, fun = "optimum_allocation", strata = "Species", y = "Sepal.Length", nsample = 30, method = "WrightII") ## Add species to metadata set_mw(MySurvey, phase = 2, slot = "metadata") <- list(strata = "Species") ## Re-run apply_multiwave() MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1, fun = "optimum_allocation", y = "Sepal.Length", nsample = 30, method = "WrightII") ## View design get_mw(MySurvey, phase = 2, wave = 1, slot = "design") ## Use apply_multiwave() to apply sample_strata() set.seed(340) MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1, fun = "sample_strata", id = "id", design_strata = "strata", n_allocated = "stratum_size", probs = ~stratum_size/npop) ## View samples get_mw(MySurvey, phase = 2, wave = 1, slot = "samples") ## Set sampled_data set_mw(MySurvey, phase = 2, wave = 1, slot = "sampled_data") <- iris[iris$id %in% get_mw(MySurvey, phase = 2, wave = 1, slot = "samples")$ids, c("id", "Sepal.Width")] ## Call merge_samples() MySurvey <- merge_samples(MySurvey, phase = 2, wave = 1, id = "id", include_probs = TRUE) ## View data head head(get_mw(MySurvey, phase = 2, wave = 1, slot = "data")) ###### ## Section 5.5: View summary diagram of survey ###### multiwave_diagram(MySurvey) ############# ### Section 6 ---------------- ############# ###### ## Section 6.1: Introduction and data set up ###### ## View Data head(MatWgt_Sim) ## Make phase 1 data phase1 <- subset(MatWgt_Sim, select = -mat_weight_true) ###### ## Section 6.2: Estimating the Mean of an expensive covariate in a single wave ###### # Initialize strata column phase1$strata <- phase1$race # Merge the smallest race categories into "other" phase1 <- merge_strata(data = phase1, strata = "strata", merge = c("Other","Asian"), name = "Other") # Split each race stratum at 25th and 75th global percentile of wgt change phase1 <- split_strata(data = phase1, strata = "new_strata", split = NULL, split_var = "mat_weight_est", type = "global quantile", split_at = c(0.25,0.75), trunc = "MWC_est") table(phase1$new_strata) # 9 strata ## Initialize Survey1 Survey1 <- multiwave(phases = 2, waves = c(1,1)) # Add a title set_mw(Survey1, phase = NA, slot = "metadata") <- list(title = "Single Wave Mat Wgt Survey") # Add Phase 1 data set_mw(Survey1, phase = 1, slot = "data") <- phase1 # Run optimum_allocation: Survey1 <- apply_multiwave(Survey1, phase = 2, wave = 1, fun = "optimum_allocation", strata = "new_strata", y = "mat_weight_est", nsample = 750, method = "WrightII") ## View results get_mw(Survey1, phase = 2, wave = 1, slot = "design") # Set seed for reproducibility set.seed(435) # Select samples Survey1 <- apply_multiwave(Survey1, phase = 2, wave = 1, fun = "sample_strata", strata = "new_strata", id = "id", design_strata = "strata", n_allocated = "stratum_size") ## Simulate sample selection set_mw(Survey1, phase = 2, wave = 1, slot = "sampled_data") <- MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey1, phase = 2, wave = 1, slot = "samples")$ids, c("id","mat_weight_true")] ## Merge samples with data Survey1 <- merge_samples(Survey1, phase = 2, wave = 1, id = "id") #Find mean with survey package library("survey") # Extract final data final_data <- get_mw(Survey1, phase = 2, wave = 1, slot = "data") # Specify two-phase design twophasedesign <- twophase(id = list(~1,~1), strata = list(NULL,~new_strata), subset = ~as.logical(sampled_phase2), data = final_data, method = "approx") # Stratified Mean Estimate survey::svymean(~mat_weight_true, design = twophasedesign) ###### ## Section 6.3: Estimating the Mean of an expensive covariate over multiple waves ###### ## Initialize multiwave object Survey2 <- multiwave(phases = 2, waves = c(1,3)) # Add a title set_mw(Survey2, phase = NA, slot = "metadata") <- list(title = "Multi-wave Mat Wgt Survey") # Add Phase 1 data and metadata set_mw(Survey2, phase = 1, slot = "data") <- phase1 set_mw(Survey2, phase = 1, slot = "metadata") <- list(title = "Phase 1", description = "This is simulated data generated in December 2020 based on a real maternal weight study", data_dict = data.frame(id = "unique identifier", new_strata = "Name of updated strata", race = "self-identified race of mother", mat_weight_est = "error-prone estimate of maternal weight change in lbs. during pregnancy", obesity = "1/0 indicator for child obesity within 6 years of birth", diabetes = "1/0 indicator for mother's diabetes status" )) ##### Phase 2, Wave 1 ## Design Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 1, fun = "optimum_allocation", strata = "new_strata", y = "mat_weight_est", nsample = 250, method = "Neyman") get_mw(Survey2, phase = 2, wave = 1, slot = "design") ## Write phase 2 metadata set_mw(Survey2, phase = 2, slot = "metadata") <- list(title = "Phase 2", description = "This phase will sample 750 patients across three waves of 250. The first wave will use proportional allocation, and the next two will use Wright Algorithm II based on the previous wave's samples", strata = "new_strata", design_strata = "strata", id = "id", y = "mat_weight_true") ## Select wave 1 ids set.seed(983) # for sampling reproducibility Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 1, fun = "sample_strata", n_allocated = "stratum_size") ## Collect the sampled data set_mw(Survey2, phase = 2, wave = 1, slot = "sampled_data") <- MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2, phase = 2, wave = 1, slot = "samples")$ids, c("id","mat_weight_true")] ## Merge with merge_samples Survey2 <- merge_samples(Survey2, phase = 2, wave = 1) ## View completed Wave 1 data dim(get_mw(Survey2, phase = 2, wave = 1, slot = "data")) table(is.na(get_mw(Survey2, phase = 2, wave = 1, "data")$mat_weight_true)) ##### Phase 2, Wave 2 ## Apply allocate_wave() Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 2, fun = "allocate_wave", already_sampled = "sampled_phase2", nsample = 250) get_mw(Survey2, phase = 2, wave = 2, slot = "design") ## Select ids to sample set.seed(584) # for sampling reproducibility Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 2, fun = "sample_strata", already_sampled = "sampled_phase2", design_strata = "strata", n_allocated = "n_to_sample") ## "Collect" data set_mw(Survey2, phase = 2, wave = 2, slot = "sampled_data") <- MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2, phase = 2, wave = 2, slot = "samples")$ids, c("id","mat_weight_true")] ## Merge it back with data Survey2 <- merge_samples(Survey2, phase = 2, wave = 2) ## Confirm sampling names(get_mw(Survey2, phase = 2, wave = 2, slot = "data")) table(get_mw(Survey2, phase = 2, wave = 2, "data")$sampled_phase2) ##### Phase 2, Wave 3 ## Allocate samples Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 3, fun = "allocate_wave", already_sampled = "sampled_phase2", nsample = 250, detailed = FALSE) ## Select ids to samples set.seed(584) # for sampling reproducibility Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 3, fun = "sample_strata", already_sampled = "sampled_phase2", design_strata = "strata", n_allocated = "n_to_sample") ## "Collect" the data set_mw(Survey2, phase = 2, wave = 3, slot = "sampled_data") <- MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2, phase = 2, wave = 3, slot = "samples")$ids, c("id","mat_weight_true")] ## Merged sampled data to full data Survey2 <- merge_samples(Survey2, phase = 2, wave = 3) ###### ## Section 6.4: A multi-wave sampling design for regression modelling ###### ## Initialize multiwave object, add data and metadata Survey3 <- multiwave(phases = 2, waves = c(1,2)) set_mw(Survey3, phase = NA, slot = "metadata") <- list(title = "Survey for Regression of Childhood Obesity") set_mw(Survey3, phase = 1, slot = "data") <- phase1 ## Demonstrate influence function calculation # Model fit using error-prone maternal weight fit <- glm(obesity ~ mat_weight_est + diabetes + race, data = get_mw(Survey3, phase = 1, slot = "data"), family = "quasibinomial") # Fisher information design_mat <- model.matrix(fit) I_hat <- (t(design_mat) %*% (design_mat * fit$fitted.values * (1 - fit$fitted.values))) / nrow(design_mat) # Calculate influence function influence <- (design_mat * resid(fit, type = "response")) %*% solve(I_hat) influence <- as.data.frame(influence) # Add it as new column to phase 1 data set_mw(Survey3, phase = 1, slot = "data") <- cbind(get_mw(Survey3, phase = 1, slot = "data"), naive_influence = influence[,"mat_weight_est"]) ## Allocate Wave 1 using influence function: Survey3 <- apply_multiwave(Survey3, phase = 2, wave = 1, fun = "optimum_allocation", strata = "new_strata", y = "naive_influence", nsample = 250, method = "WrightII") ## FINISH - Call to sessionInfo() sessionInfo()