# Replication Script for 'Optimum Allocation for Adaptive Multi-Wave Sampling
# in R: The R Package optimall'

# Summary: This script replicates all of the code and results from the
# submitted manuscript. It is broken down into into sections
# according to the manuscript for organization.

# Load package ------------------------------------
# Install, if necessary:
# library(devtools) # uncomment and run if necessary
# install_github("yangjasp/optimall") # uncomment and run if necessary
# Or install from CRAN with:
# install.packages("optimall)
library("optimall")

# Install suggested packages for Shiny app and multiwave_diagram() if not
# already installed.
# These are not required to run the basic functions of optimall,
# but they are required for some of the features demonstrated in the paper.

# install.packages("DiagrammeR")
# install.packages("shiny")
# install.packages("bslib")
# install.packages("DT")
# install.packages("survey")

#############
### Section 3 ----------------
#############

######
## Section 3.1: defining, splitting, and merging strata
######

# view species in iris dataset
library("datasets")
table(iris$Species)

# demonstrate split_strata() function
iris2 <- split_strata(data = iris,
                      strata = "Species",
                      split = c("setosa", "virginica"),
                      split_var = "Sepal.Width",
                      split_at = c(0.5), type = "local quantile")
table(iris2$new_strata)

# demonstrate merge_strata() function
iris3 <- merge_strata(data = iris,
                      strata = "Species",
                      merge = c("setosa", "versicolor"),
                      name = "set_or_vers")
table(iris3$new_strata)

######
## Section 3.2: optimally allocating samples
######

# demonstrate optimum_allocation() function:
sampling_design <- optimum_allocation(data = iris,
                                      strata = "Species",
                                      y = "Sepal.Width",
                                      nsample = 40,
                                      method = "WrightII")
sampling_design

# demonstrate optimum_allocation() with simpler input:
iris_summary <- data.frame(strata = unique(iris$Species),
                           size = c(50, 50, 50),
                           sd = c(0.3791, 0.3138, 0.3225))

optimum_allocation(data = iris_summary,
                   strata = "strata",
                   sd_h = "sd",
                   N_h = "size",
                   nsample = 40,
                   method = "WrightII")

######
## Section 3.3: sampling based on results
######

## demonstrate sample_strata() function
iris$id <- 1:150
set.seed(743)

iris <- sample_strata(data = iris, strata = "Species", id = "id",
                      design_data = sampling_design,
                      design_strata = "strata",
                      n_allocated = "stratum_size",
                      probs = ~stratum_size/npop)
head(iris[,c("id", "Species", "Sepal.Width",
             "sample_indicator", "sampling_prob")]) #Drop extra column for space

## extract ids to sample
ids_to_sample <- iris$id[iris$sample_indicator == 1]
head(ids_to_sample)
length(ids_to_sample)

######
## Section 3.4: allocating samples in waves
######

# Set up wave 1:

# Collect Sepal.Width from only the 30 samples
wave1_design <- data.frame(strata = c("setosa",
                                      "virginica",
                                      "versicolor"),
                           stratum_size = c(7, 16, 7))

phase1_data <- subset(datasets::iris, select =  -Sepal.Width)

phase1_data$id <- 1:nrow(phase1_data) #Add id column

set.seed(234) # for sampling reproducibility
phase1_data <- sample_strata(data = phase1_data,
                             strata = "Species", id = "id",
                             design_data = wave1_design,
                             design_strata = "strata",
                             n_allocated = "stratum_size")

wave1_ids <- iris$id[phase1_data$sample_indicator == 1]

wave1_sampled_data <- iris[iris$id %in% wave1_ids, c("id","Sepal.Width")]

wave1_data <- merge(phase1_data, wave1_sampled_data, by = "id",
                    no.dups =  TRUE, all.x = TRUE)

# We have Sepal.Width for our 30 samples
table(is.na(wave1_data$Sepal.Width), wave1_data$Species)

## Now, run allocate_wave() to generate wave 2 design:
wave2_design <- allocate_wave(data = wave1_data,
                              strata = "Species",
                              y = "Sepal.Width",
                              already_sampled = "sample_indicator",
                              nsample = 10,
                              detailed = TRUE)
wave2_design[,1:6]

## Demonstrate sample_strata again
wave2_data <- sample_strata(data = wave1_data,
                            strata = "Species", id = "id",
                            design_data = wave2_design,
                            design_strata = "strata",
                            n_allocated = "n_to_sample")

#############
### Section 4 ----------------
#############

######
## Section 4.1: Launching the app and uploading data
######

# Run the app
# optimall_shiny() # Uncomment to run app

# The images in Figures 2-5 are obtained by loading the iris dataset in,
# matching the inputs in Figure 3, and pressing the 'Confirm Split' button.

#############
### Section 5 ----------------
#############

######
## Section 5.2: Getting Started with the Multiwave Object
######

# Initialize object
MySurvey <- new_multiwave(phases = 2, waves = c(1,3))

######
## Section 5.3: Accessing and writing slots
######

# To access Phase 2, Wave 2 design
get_mw(MySurvey, phase = 2, wave = 2, slot = "design")

# To access overall metadata
get_mw(MySurvey, phase = 1, slot = "metadata")

# To write overall metadata
set_mw(MySurvey, phase = NA, slot = "metadata") <-
  list(title = "Sepal Width Survey")

# To access Overall metadata

get_mw(MySurvey, phase = NA, slot = "metadata")

######
## Section 5.4: Call functions with fewer arguments
######

# To place iris data in Phase 1
iris <- datasets::iris
iris$id <- 1:150
set_mw(MySurvey, phase = 1, slot = "data") <-
  subset(iris, select = c(id, Species, Sepal.Length))

## Use apply_multiwave() to apply optimum_allocation()
MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1,
                            fun = "optimum_allocation",
                            strata = "Species", y = "Sepal.Length",
                            nsample = 30, method = "WrightII")

## Add species to metadata
set_mw(MySurvey, phase = 2, slot = "metadata") <-
  list(strata = "Species")

## Re-run apply_multiwave()
MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1,
                            fun = "optimum_allocation",
                            y = "Sepal.Length",
                            nsample = 30, method = "WrightII")

## View design
get_mw(MySurvey, phase = 2, wave = 1, slot = "design")

## Use apply_multiwave() to apply sample_strata()
set.seed(340)
MySurvey <- apply_multiwave(MySurvey, phase = 2, wave = 1,
                            fun = "sample_strata", id = "id",
                            design_strata = "strata",
                            n_allocated = "stratum_size",
                            probs = ~stratum_size/npop)

## View samples
get_mw(MySurvey, phase = 2, wave = 1, slot = "samples")

## Set sampled_data
set_mw(MySurvey, phase = 2, wave = 1, slot = "sampled_data") <-
  iris[iris$id %in% get_mw(MySurvey,
                             phase = 2,
                             wave = 1,
                             slot = "samples")$ids,
       c("id", "Sepal.Width")]

## Call merge_samples()
MySurvey <- merge_samples(MySurvey, phase = 2, wave = 1,
                          id = "id", include_probs = TRUE)

## View data head
head(get_mw(MySurvey, phase = 2, wave = 1, slot = "data"))

######
## Section 5.5: View summary diagram of survey
######
multiwave_diagram(MySurvey)

#############
### Section 6 ----------------
#############

######
## Section 6.1: Introduction and data set up
######

## View Data
head(MatWgt_Sim)

## Make phase 1 data
phase1 <- subset(MatWgt_Sim, select = -mat_weight_true)

######
## Section 6.2: Estimating the Mean of an expensive covariate in a single wave
######

# Initialize strata column
phase1$strata <- phase1$race

# Merge the smallest race categories into "other"
phase1 <- merge_strata(data = phase1,
                       strata = "strata",
                       merge = c("Other","Asian"),
                       name = "Other")

# Split each race stratum at 25th and 75th global percentile of wgt change
phase1 <- split_strata(data = phase1, strata = "new_strata", split = NULL,
                       split_var = "mat_weight_est",
                       type = "global quantile",
                       split_at = c(0.25,0.75),
                       trunc = "MWC_est")

table(phase1$new_strata) # 9 strata

## Initialize Survey1
Survey1 <- multiwave(phases = 2, waves = c(1,1))

# Add a title
set_mw(Survey1, phase = NA, slot = "metadata") <-
  list(title = "Single Wave Mat Wgt Survey")

# Add Phase 1 data
set_mw(Survey1, phase = 1, slot = "data") <- phase1

# Run optimum_allocation:
Survey1 <- apply_multiwave(Survey1, phase = 2, wave = 1,
                           fun = "optimum_allocation",
                           strata = "new_strata",
                           y = "mat_weight_est",
                           nsample = 750,
                           method = "WrightII")

## View results
get_mw(Survey1, phase = 2, wave = 1, slot = "design")

# Set seed for reproducibility
set.seed(435)

# Select samples
Survey1 <- apply_multiwave(Survey1, phase = 2, wave = 1,
                           fun = "sample_strata",
                           strata  = "new_strata",
                           id = "id",
                           design_strata = "strata",
                           n_allocated = "stratum_size")

## Simulate sample selection
set_mw(Survey1, phase = 2, wave = 1, slot = "sampled_data") <-
  MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey1,
                                         phase = 2,
                                         wave = 1,
                                         slot = "samples")$ids,
             c("id","mat_weight_true")]

## Merge samples with data
Survey1 <- merge_samples(Survey1, phase = 2, wave = 1,
                         id = "id")

#Find mean with survey package
library("survey")

# Extract final data
final_data <- get_mw(Survey1, phase = 2, wave = 1, slot = "data")

# Specify two-phase design
twophasedesign <- twophase(id = list(~1,~1),
                           strata = list(NULL,~new_strata),
                           subset = ~as.logical(sampled_phase2),
                           data = final_data, method = "approx")

# Stratified Mean Estimate
survey::svymean(~mat_weight_true, design = twophasedesign)

######
## Section 6.3: Estimating the Mean of an expensive covariate over multiple waves
######

## Initialize multiwave object
Survey2 <- multiwave(phases = 2, waves = c(1,3))

# Add a title
set_mw(Survey2, phase = NA, slot = "metadata") <-
  list(title = "Multi-wave Mat Wgt Survey")

# Add Phase 1 data and metadata
set_mw(Survey2, phase = 1, slot = "data") <- phase1

set_mw(Survey2, phase = 1, slot = "metadata") <-
  list(title = "Phase 1",
       description = "This is simulated data
       generated in December 2020 based on a real maternal
       weight study",
       data_dict = data.frame(id = "unique identifier",
                              new_strata = "Name of updated strata",
                              race = "self-identified race of mother",
                              mat_weight_est = "error-prone estimate of
                              maternal weight change in lbs. during
                              pregnancy",
                              obesity = "1/0 indicator for child obesity
                              within 6 years of birth",
                              diabetes = "1/0 indicator for mother's
                              diabetes status"
                              ))

##### Phase 2, Wave 1

## Design
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 1,
                           fun = "optimum_allocation",
                           strata = "new_strata",
                           y = "mat_weight_est",
                           nsample = 250, method = "Neyman")

get_mw(Survey2, phase = 2, wave = 1, slot = "design")

## Write phase 2 metadata
set_mw(Survey2, phase = 2, slot = "metadata") <-
  list(title = "Phase 2",
       description = "This phase will sample 750 patients across three
       waves of 250. The first wave will use proportional allocation,
       and the next two will use Wright Algorithm II based on the
       previous wave's samples",
       strata = "new_strata",
       design_strata = "strata",
       id = "id",
       y = "mat_weight_true")

## Select wave 1 ids
set.seed(983) # for sampling reproducibility
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 1,
                           fun = "sample_strata",
                           n_allocated = "stratum_size")

## Collect the sampled data
set_mw(Survey2, phase = 2, wave = 1, slot = "sampled_data") <-
  MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2,
                                         phase = 2,
                                         wave = 1,
                                         slot = "samples")$ids,
             c("id","mat_weight_true")]

## Merge with merge_samples
Survey2 <- merge_samples(Survey2, phase = 2, wave = 1)

## View completed Wave 1 data
dim(get_mw(Survey2, phase = 2, wave = 1, slot = "data"))
table(is.na(get_mw(Survey2, phase = 2,
                     wave = 1, "data")$mat_weight_true))

##### Phase 2, Wave 2

## Apply allocate_wave()
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 2,
                           fun = "allocate_wave",
                           already_sampled = "sampled_phase2",
                           nsample = 250)

get_mw(Survey2, phase = 2, wave = 2, slot = "design")

##  Select ids to sample
set.seed(584) # for sampling reproducibility
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 2,
                           fun = "sample_strata",
                           already_sampled = "sampled_phase2",
                           design_strata  = "strata",
                           n_allocated = "n_to_sample")

## "Collect" data
set_mw(Survey2, phase = 2, wave = 2, slot = "sampled_data") <-
  MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2,
                                         phase = 2,
                                         wave = 2,
                                         slot = "samples")$ids,
             c("id","mat_weight_true")]

## Merge it back with data
Survey2 <- merge_samples(Survey2, phase = 2, wave = 2)

## Confirm sampling
names(get_mw(Survey2, phase = 2, wave = 2, slot = "data"))
table(get_mw(Survey2, phase = 2, wave = 2, "data")$sampled_phase2)

##### Phase 2, Wave 3

## Allocate samples
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 3,
                           fun = "allocate_wave",
                           already_sampled = "sampled_phase2",
                           nsample = 250, detailed = FALSE)

## Select ids to samples
set.seed(584) # for sampling reproducibility
Survey2 <- apply_multiwave(Survey2, phase = 2, wave = 3,
                           fun = "sample_strata",
                           already_sampled = "sampled_phase2",
                           design_strata  = "strata",
                           n_allocated = "n_to_sample")

## "Collect" the data
set_mw(Survey2, phase = 2, wave = 3, slot = "sampled_data") <-
  MatWgt_Sim[MatWgt_Sim$id %in% get_mw(Survey2,
                                         phase = 2,
                                         wave = 3,
                                         slot = "samples")$ids,
             c("id","mat_weight_true")]

## Merged sampled data to full data
Survey2 <- merge_samples(Survey2, phase = 2, wave = 3)

######
## Section 6.4: A multi-wave sampling design for regression modelling
######

## Initialize multiwave object, add data and metadata
Survey3 <- multiwave(phases = 2, waves = c(1,2))

set_mw(Survey3, phase = NA, slot = "metadata") <-
  list(title = "Survey for Regression of Childhood Obesity")

set_mw(Survey3, phase = 1, slot = "data") <- phase1

## Demonstrate influence function calculation
# Model fit using error-prone maternal weight
fit <- glm(obesity ~ mat_weight_est + diabetes + race,
           data = get_mw(Survey3, phase = 1, slot = "data"),
           family = "quasibinomial")

# Fisher information
design_mat <- model.matrix(fit)
I_hat <- (t(design_mat) %*% (design_mat *
                               fit$fitted.values *
                               (1 - fit$fitted.values))) / nrow(design_mat)

# Calculate influence function
influence <- (design_mat * resid(fit, type = "response")) %*% solve(I_hat)
influence <- as.data.frame(influence)

# Add it as new column to phase 1 data
set_mw(Survey3, phase = 1, slot = "data") <-
  cbind(get_mw(Survey3, phase = 1, slot = "data"),
        naive_influence = influence[,"mat_weight_est"])

## Allocate Wave 1 using influence function:
Survey3 <- apply_multiwave(Survey3, phase = 2, wave = 1,
                           fun = "optimum_allocation",
                           strata = "new_strata",
                           y = "naive_influence",
                           nsample = 250,
                           method = "WrightII")


## FINISH - Call to sessionInfo()
sessionInfo()