## REndo: Internal Instrumental Variables to Address Endogeneity
## Authors: Gui, Meierer, Schilter, Algesheimer
## Operating system: Windows 10 Enterprise version 21H2; 
##                   Processor: Intel Core i7-6600U CPU @ 2.60GHz 2.81 GHz; 
##                   Memory: 20 GB; System type: 64-bit operating system, x64-based processor

## R version 4.3.0

##      Replication Manuscript
##      REndo - v.2.4.8

# list.of.packages <- c("ggplot2", "AER", "MASS", "gridExtra", "data.table",
#                       "REndo", "corpcor", "AER", "optimx", "compiler", "lme4",
#                       "fastDummies", "MCMCglmm", "rgl")
# new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[,"Package"])]
# if (length(new.packages)) install.packages(new.packages, dependencies = TRUE)


library("REndo")
library("data.table")
library("ggplot2")
library("corpcor")
library("MASS")
library("MCMCglmm")
library("AER")
library("gridExtra")
library("compiler")
library("lme4")
library("fastDummies")
library("rgl")

## save figures to PDF: TRUE/FALSE
doPDF <- FALSE

if (doPDF) dir.create("Figures")


#### Chapter 2 ####
##### Figure 2  #####
## Estimates bias at different correlation levels between endogenous regressor
## and model error.

## Shows the inconsistency of estimates of regressors depending on the correlation 
## between variables and between the endogenous regressor and the error.
## We consider four cases:
## -  low correlation btw variables and low correlation btw end regressor and error 
## -  low correlation btw regressors and high correlation btw end reg and error 
## -  high correlation btw regressors and low correlation btw end reg and error 
## -  high correlation btw regressors and high correlation btw end reg and error 
## 
## The model we use is:  
## 
##   y = b0 + b1 * P + b2 * X + error_y   
##   P = g0 + g1 * Z + error_x

set.seed(43210)

## Set true parameters
b0_true <- 1
b1_true <- -1
b2_true <- 1
g0_true <- 1
g1_true <- 1

## Number of observations
nobs <- 2500

## Generate omitted variable (standard normal)
OV <- rnorm(n = nobs, mean = 0, sd = 1)

## Generate errors error_y and error_x
error_y <- rnorm(n = nobs, mean = 0, sd = 0.5)
error_p <- rnorm(n = nobs, mean = 0, sd = 0.5)
rho <- 0.2    # rho - cor(P, X)
m <- c(0, 0)   # mean
gamma <- c(0.1, 0.3, 0.5)  

## covariance matrix between P and X - the 2 regressors
R <- matrix(c(1, rho, rho, 1), 2, 2)

## Generate data frame to store observed data
df <- data.frame(obs.id = as.factor(c(1:nobs)))

sim <- 1000

## coefficients and std deviation 
coef <- matrix(NA, sim, length(gamma))
stdev <-  vector("list", sim)
confInt <- list(confInt1 = matrix(NA, sim, 2), confInt2 = matrix(NA, sim, 2),
  confInt3 = matrix(NA, sim, 2))

## cor(P, OV) = 0.1
for (k in 1:length(gamma)){
  for (i in 1:sim) {
    
    ## list of the 2 regressors X and W, with different correlation btw them  
    regressors <-  MASS::mvrnorm(nobs, m, R)
    
    ## case 1
    df$P1 <- regressors[, 1]
    df$X <- regressors[, 2]
    
    ## Generate endogenous regressor
    df$P <- g0_true + g1_true * df$P1 + gamma[k] * OV + error_p
    
    ## Generate dependent variable
    df$Y <- b0_true + b1_true * df$P  + b2_true * df$X + OV + error_y
    
    ## run the OLS
    model1 <- lm(Y ~ P + X, data = df)
    
    ## save coefficients for the 3 cases, each column represents the coefficients 
    ## 1,2,3 of each model
    coef[i, k] <- coef(model1)["P"]
    
    ## confidence interval for the endogeneous variable P for each value of correlation
    confInt[[k]][i, ] <- confint(model1, "P", level = 0.95)                  
  }
}
## corr(P,error) = 0.1
means_0.1 <- colMeans(coef)[1]
stdev_0.1 <- apply(coef, 2, sd)[1]
confInt1 <- apply(confInt[[1]], 2, mean)
conf11 <- confInt1[2] + 1 

## corr(P,error) = 0.3
means_0.3 <- colMeans(coef)[2]
stdev_0.3 <- apply(coef, 2, sd)[2]
confInt2 <-  apply(confInt[[2]], 2, mean)

## corr(P,error) = 0.5
means_0.5 <- colMeans(coef)[3]
stdev_0.5 <- apply(coef, 2, sd)[3]
confInt3  <- apply(confInt[[3]], 2, mean)

## estimates of b1 at different correlation levels between end. regressor and error 
dd_b1 <- rbind(as.matrix(means_0.1, 3, 1), as.matrix(means_0.3, 3, 1), 
  as.matrix(means_0.5, 3, 1))
colnames(dd_b1) <- c("beta")
dd_b1 <- cbind(dd_b1, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 1))
dd_b1 <- as.data.table(dd_b1)
dd_b1[, bias := beta + 1]

## create graph
Corr_P_error <- factor(dd_b1$corr_P_error)
dd_b1 <- as.data.table(dd_b1)
dd_b1[, confInt_low := c(confInt1[1] + 1, confInt2[1] + 1, confInt3[1] + 1)]
dd_b1[, confInt_up := c(confInt1[2] + 1, confInt2[2] + 1, confInt3[2] + 1)]

## beta1 at different correlation levels between error and endogenous regressor X
p2 <- ggplot(data = dd_b1, aes(as.factor(corr_P_error), bias, group = 1)) + 
  geom_line() + geom_point()   
p22 <- p2 + theme_grey(base_family = "serif", base_size = 10) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(size = 14)) +
  xlab("Correlation between P and error")  + 
  ylab("b1 Estimates Bias")  + 
  ylim(c(0, 0.5)) +
  geom_errorbar(aes(ymin = confInt_low, ymax = confInt_up, width = 0.2))

if (doPDF) pdf(file = "Figures/jss_3807_fig_2_new.pdf") # width = 6, height = 6)
plot(p22)
if (doPDF) dev.off() 

####   Chapter 4   #### 
##      Figures 4 - 8 
##
## The performance of the five internal instrumental variables approaches in 
## REndo package is compared to that  of ordinary least squares, over 
## 1000 simulations, with sample size equal to 2500 observations.
## 
## To replicate the exact figures in the manuscript, the number of
## simulations has to be changed to 1000. 

beta1 <- beta2 <- NULL
rho <- 0.5
R <- matrix(c(1, rho,  rho, 1), 2, 2)

m <- c(0, 0)

b0 <- 3
b1 <- 1.5
b2 <- 3
b3 <- 3
a <- -1
g1 <- 1


##### Figure 4  ####
## Latent instrumental variable vs OLS: parameter estimates were obtained over 
## 1000 simulated samples, each of size 2500; true parameter value is -1.

set.seed(432111)
beta1 <- beta2 <- rep(NA, sim)

for (i in 1:sim) {
  
  N1 <- MASS::mvrnorm(nobs, m, R)
  eps <- N1[,1]
  nu <- N1[,2]
  
  # Take it discrete
  Z <- rbinom(n = nobs, size = 1, p = 0.3) 
  
  P1 <- g1*Z + nu
  
  y1 <- b0 + a * P1  + eps
  
  d1 <- data.frame(cbind(y1 = y1,P1 = P1))
  
  beta1[i] <- coef(lm(y1 ~ P1, data = d1))["P1"]
  beta2[i] <- coef(latentIV(y1 ~ P1, data = d1))["P1"]
  
}

beta1 <- as.matrix(beta1)
colnames(beta1) <- c("beta1")
beta2 <- as.matrix(beta2)
colnames(beta2) <- c("beta2")
dLIV <- data.frame(rbind(cbind(beta1, "OLS"), cbind(beta2, "LIV")))
dLIV$beta1 <- as.numeric(as.character(dLIV$beta1))

##### Estimates ####
cat("Latent IV vs OLS", "\n", "OLS estimate - mean:",
  mean(dLIV$beta1[dLIV$V2 == "OLS"]), "\n",
  "LIV estimate - mean:", mean(dLIV$beta1[dLIV$V2 == "LIV"]))

cat("Latent IV vs OLS", "\n", "OLS estimate - sd:", sd(dLIV$beta1[dLIV$V2 == "OLS"]),
  "\n", "LIV estimate - sd:", sd(dLIV$beta1[dLIV$V2 == "LIV"]))

p4 <- ggplot(dLIV, aes(x = beta1, fill = V2), size = 3) + 
  geom_density(alpha = 0.3) +
  labs(x = "Endogenous Regressor", y = "Density")  + 
  geom_vline(aes(xintercept = -1,linetype = "-1"), show.legend = TRUE) + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(size = 14))+
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2))+
  labs(fill = "Method")  +
  xlim(c(-1.5, -0.5)) + ylim(c(0, 30))
## Adjust key height and width
pp4 <- p4 + theme(legend.key.height = unit(0.6, "cm"),
  legend.key.width = unit(1, "cm")) + 
  theme(aspect.ratio = 5/4)

if (doPDF) pdf(file = "Figures/jss_3807_fig_4_liv.pdf") # width = 6, height = 6)
plot(pp4)
if (doPDF) dev.off() 


##### Figure 5 ##### 
## Copula correction vs.  OLS: parameter estimates were obtained over 
## 1000 simulated samples, each of size 2500; true parameter value is -1. 

## The number of bootstraps used here is 2, due to time considerations. 
## But this does not influence the coefficient estimates. 
## But the the confidence intervals are be computed. 
## In general, it is advisable to use around 1000 bootstraps. 

set.seed(432112)
beta5 <- beta6 <- rep(0, sim)
rho <- 0.5
m <- c(0, 0)
R <- matrix(c(1, rho,  rho, 1), 2, 2)

b0 <- 3
b1 <- 1.5
b2 <- 3
b3 <- 3
a <- -1

I <- rep(1, nobs)

for (k2 in 1:sim){
  
  X1 <- rnorm(nobs, 2, 1)
  X2 <-  rnorm(nobs, 1.5, 1)
  
  N <- MASS::mvrnorm(nobs, mu = m, R)
  eps <- N[, 1]
  OV <- N[, 2]
  
  # P1 has a t-distribution 
  P1 <- qt(pnorm(OV), 3)
  
  y <-  b0*I +  b1 * X1 + b2 * X2 + a * P1 + eps
  data_CopCorr <- data.frame(y, I, X1, X2, P1)
  
  beta5[k2] <- coef(lm(y ~ P1 + X1 + X2, data = data_CopCorr))["P1"]
  
  cc <- copulaCorrection(y ~ P1 + X1 + X2 -1| continuous(P1), data = data_CopCorr, 
    num.boots = 2, verbose = FALSE)
  beta6[k2] <- coef(cc)["P1"]
}

dCC <- data.frame(rbind(cbind(beta5, "OLS"), cbind(beta6, "Copula Correction")))
dCC$beta5 <- as.numeric(as.character(dCC$beta5))

##### Estimates ####
cat("Copula Correction vs OLS", "\n", "OLS estimate - mean:", mean(dCC$beta5[dCC$V2 == "OLS"]), "\n",
  "Copula estimate - mean:", mean(dCC$beta5[dCC$V2 == "Copula Correction"]))
cat("Copula Correction vs OLS", "\n", "OLS estimate - sd:", sd(dCC$beta5[dCC$V2 == "OLS"]),"\n",
  "Copula estimate - sd:", sd(dCC$beta5[dCC$V2 == "Copula Correction"]))

p5 <- ggplot(dCC, aes(x = beta5, fill = V2), size = 3) + geom_density(alpha = 0.3)

pp5 <- p5 + labs(x = "Endogenous Regressor", y = "Density")  +
  coord_cartesian(xlim = c(-1.2, -0.5)) + 
  geom_vline(aes(xintercept = -1, linetype = "-1"), show.legend = TRUE) + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1")+
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") + xlim(c(-1.2, -0.6)) + ylim(c(0, 30))

pp51 <- pp5 + theme(legend.key.height = unit(1, "cm"), legend.key.width = unit(0.6, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_5_copula.pdf")
plot(pp51)
if (doPDF) dev.off() 


##### Figure 6  #####
## Higher moments vs OLS: parameter estimates were obtained over 1000 simulated samples,
## each of size 2500; true parameter value is -1.

set.seed(43213)
beta3 <- beta4 <-  rep(NA, sim)

for (k1 in 1:sim) {
  N <- MASS::mvrnorm(nobs, mu = m, R)
  OV <- N[,1]
  e1 <- N[,2]
  X3 <- rt(nobs, 3)
  e2 <- rt(nobs, 10)
  
  X1 <- rnorm(nobs, 2, 0.1)
  X2 <- X1 + 0.5 * rnorm(nobs, 1.5, 0.9)
  
  P1 <- b3 * X3 + e2 + 3 * OV
  y1 <- b0 + b1 * X1 + b2 * X2 + a * P1 + e1 + OV
  
  M <- cbind(X1,X2)
  dataHigherMoments <- data.frame(y1,X1,X2,P1)
  beta3[k1] <- coef(lm(y1 ~ P1 + X1 + X2))["P1"]
  beta4[k1] <- coef(higherMomentsIV(y1 ~ P1 + X1 + X2 | P1 | IIV(iiv = yp, g = x2, X1, X2),
    data = dataHigherMoments))["P1"]
}
dHM <- data.frame(rbind(cbind(beta3, "OLS"), cbind(beta4, "Higher Moments")))
dHM$beta3 <- as.numeric(as.character(dHM$beta3))

##### Estimates ####
cat("Higher Moments vs OLS", "\n", "OLS estimate - mean:", mean(dHM$beta3[dHM$V2 == "OLS"]),"\n",
  "Higher Moments estimate - mean:", mean(dHM$beta3[dHM$V2 == "Higher Moments"]))
cat("Higher Moments vs OLS", "\n", "OLS estimate - sd:", sd(dHM$beta3[dHM$V2 == "OLS"]),"\n",
  "Higher Moments estimate - sd:", sd(dHM$beta3[dHM$V2 == "Higher Moments"]))


p6 <- ggplot(dHM, aes(x = beta3, fill = V2), size = 3) + geom_density(alpha = 0.3)

pp6 <- p6 + labs(x = "Endogenous Regressor", y = "Density") + 
  coord_cartesian(xlim = c(-1.2, -0.5)) + 
  geom_vline(aes(xintercept = -1, linetype = "-1"), show.legend = TRUE) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1")+
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2))+
  labs(fill = "Method") + xlim(c(-1.2, -0.5)) 

pp61 <- pp6 + theme(legend.key.height = unit(0.6, "cm"), 
  legend.key.width = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_6_hm.pdf")# width = 6, height = 6)
plot(pp61)
if (doPDF) dev.off() 


##### Figure 7 #####
## Heteroskedastic errors IV vs OLS: parameter estimates were obtained over 
## 1000 simulated samples, each of size 2500; true parameter value is -1.

set.seed(43214)

beta7 <- beta8 <- rep(0, sim)

for (k3 in 1:sim){
  N <- MASS::mvrnorm(nobs, mu = m, R)
  OV <- N[, 1]
  e1 <- N[, 2]
  
  X3 <- rt(nobs, 3)
  e2 <- rt(nobs, 10)
  
  X1 <- rnorm(nobs, 2, 0.1)
  X2 <- X1 + 0.5 * rnorm(nobs, 1.5, 0.9)
  
  P1 <- b3 * X3 + e2 + 3 * OV
  y1 <- b0 + b1 * X1 + b2 * X2 + a * P1 + e1 + OV
  
  M <- cbind(X1,X2)
  Data_hetErrors <- data.frame(cbind(y1, M, P1))
  beta7[k3] <- coef(lm(y1 ~ P1 + X1 + X2))["P1"]
  beta8[k3] <- coef(hetErrorsIV(y1 ~ P1 + X1 + X2| P1 | IIV(X1, X2), 
    data = Data_hetErrors))["P1"]
}

dHet <- data.frame(rbind(cbind(beta7, "OLS"), cbind(beta8, "Heteroskedastic Errors")))
dHet$beta7 <- as.numeric(as.character(dHet$beta7))

##### Estimates ####
cat("Heteroskedastic Errors vs OLS", "\n", "OLS estimate -mean:", mean(dHet$beta7[dHet$V2 == "OLS"]),"\n",
  "Heteroskedastic Errors estimate - mean:", mean(dHet$beta7[dHet$V2 == "Heteroskedastic Errors"]))
cat("Heteroskedastic Errors vs OLS", "\n", "OLS estimate - sd:", sd(dHet$beta7[dHet$V2 == "OLS"]),"\n",
  "Heteroskedastic Errors estimate - sd:", sd(dHet$beta7[dHet$V2 == "Heteroskedastic Errors"]))


p7 <- ggplot(dHet, aes(x = beta7, fill = V2), size = 3) + 
  geom_density(alpha = 0.3)

pp7 <- p7 + labs(x = "Endogenous Regressor", y = "Density") + 
  coord_cartesian(xlim = c(-1.2, -0.7)) + 
  geom_vline(aes(xintercept = -1, linetype = "-1")) + 
  theme(panel.background = element_rect(fill = "white", colour = "black"), 
    text = element_text(size = 14))+
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") + xlim(c(-1.2, -0.7)) 

pp71 <- pp7 + theme(legend.key.height = unit(0.6, "cm"),
  legend.key.width = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_7_het.pdf") # width = 6, height = 6)
plot(pp71)
if (doPDF) dev.off() 


##### Figure 8 #####
## MultilevelIV  vs  random  effects:  parameter  estimates  were  obtained  over  
## 1000  simulated samples, each of size 2500; true parameter value is -1.
set.seed(43215)

######  functions used  #####
dmat <- cmpfun(function(i) {
  j <- length(i)
  n <- sum(i)
  index <- cbind(start = cumsum(c(1, i[-j])), stop = cumsum(i))
  H <- matrix(0, nrow = n, ncol = j)
  for (i in 1:j) {
    H[index[i, 1]:index[i, 2], i] <- 1L
  }
  return(H)
})

mycut <- cmpfun(function(x, p) {
  cut(x = x, breaks = quantile(x, probs = p), labels = FALSE, include.lowest = TRUE)
})

## the coefficients to be recovered
coefGMM <- c(70, 3, 9, -2, 2, -2, -4, -3, 5, 2, 0.5, 0.1, -0.5)

## simulation replications
## sim <- 1000
## stored coefficient estimates
OLS <- GMML2 <- rep(NA, sim)

## variance-covariance matrix for level 3 variables 
R3_31 <- -0.30
R3_32 <- -0.22
R3_33 <- -0.10
R3_34  <- 0.23 
R3_35  <- -0.15 
R3_36 <- 0.19 
R3 <-  matrix(c(1, R3_31, R3_32, R3_34,
  R3_31, 1, R3_33, R3_35,
  R3_32, R3_33, 1, R3_36,
  R3_34, R3_35, R3_36, 1), 4, 4)
R3 <- cov2cor(make.positive.definite(R3))

## variance-covariance matrix for level 2 variables 
R2_12 <- -0.05
R2_13 <- 0.006
R2_14 <- 0.02
R2_23 <- -0.75
R2_24 <- -0.15
R2_34 <- -0.10
R2 <-  matrix(c(1, R2_12, R2_13, R2_14,
  R2_12, 1, R2_23, R2_24,
  R2_13, R2_23, 1, R2_34,
  R2_14, R2_24, R2_34, 1), 4, 4)
R2 <- cov2cor(make.positive.definite(R2))

# variance-covariance matrix for level 1 variables 
R1_12 <-  0.015
R1_13 <-  0.15
R1_14 <-  0.05
R1_23 <-  0.10
R1_24 <-  0.01
R1_34 <-  0.10
R1 <- matrix(c(1, R1_12, R1_13, R1_14,
  R1_12, 1, R1_23, R1_24,
  R1_13, R1_23, 1, R1_34,
  R1_14, R1_24, R1_34, 1), 4, 4)
R1 <- cov2cor(make.positive.definite(R1))

###### mean  ######
mu <- list(int=0, lev1 = c(X11 = 1.60, X12 = 0.056338, X13 = 0.06760563, X14 = 0.763122), 
  lev2 = c(X21 = 0.4688129), lev3 = c(X32 = 20.628, X31 = 12.2953, X33 = 94.84266))   

for (t in 1:sim) {
  k <- 40                       # total number of schools
  n <- sample(29:39, k, TRUE)   # number of students within each school
  j <- sum(n)                   # total number of students
  N <- sample(1:5, j, TRUE, prob = c(0.415833333, 0.246666667, 0.191666667, 0.142500000,
    0.003333333))
  i <- sum(N)                 
  
  ###### level 3 variables ####
  X3 <- as.data.table(MASS::mvrnorm(n = k, mu = c(mu$lev3, mu$int), Sigma = R3))
  X3[, SID := 1:k]
  X3 <- within(X3, { 
    X32 <- floor(X32 - abs(V4))
    X31 <- floor(X31 + rtnorm(k, mean = mu$lev3["X31"] - 0.19 * X32, sd = 3.195407, 
      lower = 5, upper = 25))
    X33 <-  floor(rtnorm(k, mean = mu$lev3["X33"] + 0.19 * X32 - 0.10 * X31, sd = 8.562959, 
      lower = 30, upper = 98))
  })
  
  X3[, err3 := rnorm(k, 0, 1)]
  X3[, OVL3 := rnorm(k, 8, 2.5) + 0.8 * abs(X3$V4)]
  
  S <- dmat(N) %*% dmat(n) %*% as.matrix(X3)
  
  ###### level 2 variables ####
  X21 <- rnorm(n = j, mu$lev2, 0.23)
  X2 <- as.data.frame(X21)
  
  RACE <- sample(0:3, j, TRUE, prob = c(0.02, 0.52, 0.35, 0.11))
  Race <- dummy_cols(RACE)
  X24 <- Race[, 1]
  X22 <- Race[, 2]
  X23 <- Race[, 3]
  
  b2 <- cbind(CID = 1:j, CRC = rlnorm(j, mean = mu$int, sd = 0.1579), err2 = rnorm(j, 0, 1))
  
  X2 <- within(X2, { 
    X21 <- mycut(X2[,"X21"], c(0, 0.53, 1)) - 1
  })
  
  b21 <- cbind(b2, X2, X24, X22, X23)
  
  C <- dmat(N) %*% as.matrix(b21)
  
  dt <- as.data.table(cbind(C, S))
  
  dt[, X21 := mycut(X21 + 0.8 * X31, c(0, 0.53, 1)) - 1]
  dt[, X22 := mycut(X22 - 0.5 * X31, c(0, 0.52, 1)) - 1]
  
  library("plyr")
  dt <- rename(dt, c("V4" = "SRC"))
  setkeyv(dt, cols = c("CID", "SID"))     
  
  ###### level 1 variables ####
  X1 <- as.data.frame(MASS::mvrnorm (n = i, mu = mu$lev1, Sigma = R1))
  X1 <- within(X1, { 
    X12 <- mycut(as.numeric(unlist(X1[, 2] + 0.1 * dt[, "SRC"] + 0.25 * dt[, "CRC"])),
      c(0, 0.99467, 1)) - 1
    X13 <- mycut(as.numeric(unlist(X1[, 3] + 0.1 * dt[, "SRC"] + 0.23 * dt[, "CRC"])),
      c(0, 0.943, 1)) - 1
    X14 <- (X1[,4] - min(X1[,4]))/(max(X1[,4]) - min(X1[,4]))
  })
  
  dt[, err1 := rnorm(i, 0, 1)]
  dt[, GRADE:= 1:.N, by = c("CID", "SID")]
  dt[, X11 := GRADE - 1]
  
  dt[,X14 := X1[,"X14"] - 0.02 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
  dt[,X13 := X1[,"X13"] + 0.02 * X22 - 0.02 * X21 + 0.03 * X24 - 0.05 * X31 + 0.05 * X32 + 
      0.0008 * X33 + rnorm(i,0,1)]
  dt[,X12 := X1[,"X12"] + 0.4 * X22 - 0.25 * X21 + 0.2 * X24 - 0.04 * X31 + 0.05 * X32 + 
      0.0008 * X33 + rnorm(i, 0, 1)]
  
  dt[, Intercept := rep(1, i)]
  dt[, c("SRC", "CRC") := NULL]
  dt[, GRADE := NULL]
  
  # create random slope
  sigma1 <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
  U <- as.data.frame(MASS::mvrnorm(n = i, mu = c(0,0), Sigma = sigma1))
  
  dt[,X15 :=  (0.2 * X21 + 0.03 * X22 + 0.04 * X23) + rnorm(i, 0, 1) + dt[, "err2"]] 
  cor(dt[, "err2"], dt[, "X15"])
  cor(dt[, "err2"], dt[, "X14"])
  cor(dt[, "err2"], dt[, "X12"])
  cor(dt[, "err2"], dt[, "X13"])
  
  dt1 <- data.table::copy(dt)
  dt[, c("SID", "CID") := NULL]
  setcolorder(dt,c("Intercept", "X11", "X12", "X13", "X14", "X21", "X22", "X23", "X24", 
    "X31", "X32", "X33", "OVL3", "X15", "err1", "err2", "err3"))
  
  ###### dependent variable ####
  dt1[, TLI:=  matrix(coefGMM %*% t(dt[, 1:13]), dim(dt)[1], 1) + (-1) * dt[, "X15"] +
      (-.5) * U[, 1] * dt[, "X11"] + 0.05 * U[, 2] + dt[, "err1"] + dt[, "err2"] + dt[, "err3"]]
  
  setcolorder(dt1,c("Intercept", "SID", "CID", "TLI", "X11", "X12", "X13", "X14", "X21",
    "X22", "X23", "X24", "X31", "X32", "X33", "OVL3", "X15", "err1", 
    "err2", "err3"))
  
  dt1[, c("OVL3", "err1", "err2", "err3") := NULL]
  
  colnames(dt1) <- c("Intercept", "SID", "CID", "y", "X11", "X12", "X13", "X14", 
    "X21", "X22", "X23", "X24", "X31", "X32", "X33", "X15")
  
  ###### model estimation ####
  resMultilevel <- REndo::multilevelIV(y ~ X11+ X12 + X13 + X14 + X15 + X21 + X22 + X23 + 
      X24 + X31 + X32 + X33 + (1 | CID) + (1 | SID) | endo(X15),
    data = dt1)
  OLS[t]   <- coef(resMultilevel)[6, 1]
  GMML2[t] <- coef(resMultilevel)[6, 4]
}

dmultilevel <- data.frame(rbind(cbind(OLS, "OLS"), cbind(GMML2, "MultilevelIV")))

dmultilevel$beta8 <- as.numeric(as.character(dmultilevel$OLS))

##### Estimates ####
cat("Multilevel IV vs OLS", "\n", "OLS estimate - mean:", mean(dmultilevel$beta8[dmultilevel$V2 == "OLS"]), "\n",
  "Multilevel IV estimate - mean:", mean(dmultilevel$beta8[dmultilevel$V2 == "MultilevelIV"]))

cat("Multilevel IV vs OLS", "\n", "OLS estimate - sd:", sd(dmultilevel$beta8[dmultilevel$V2 == "OLS"]), "\n",
  "Multilevel IV estimate - sd:", sd(dmultilevel$beta8[dmultilevel$V2 == "MultilevelIV"]))

######  plot ######
p8 <- ggplot(dmultilevel, aes(x = beta8,fill = V2), size = 3) +
  geom_density(alpha = 0.3)

pp8 <- p8 + labs(x = "Endogenous Regressor", y = "Density") + 
  coord_cartesian(xlim = c(-1.25, -0.3)) + 
  geom_vline(aes(xintercept = -1, linetype = "-1")) + 
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1")+
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2))+
  labs(fill = "Method") + xlim(c(-1.25, -0.3)) + ylim(c(0, 30))

pp81 <- pp8 + theme(legend.key.height = unit(0.6, "cm"),
  legend.key.width = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_8_ml.pdf") # width = 6, height = 6)
plot(pp81)
if (doPDF) dev.off() 


####  Chapter 5  ####
##### Code & Results: Latent IV ####
data("dataLatentIV", package = "REndo") 
resultsLIV <- latentIV(y ~ P, data = dataLatentIV) 
summary(resultsLIV)
## OLS results ont the same dataset
results_ols_liv <- lm(y ~ P, data = dataLatentIV)
summary(results_ols_liv)

##### Code & Results: Joint estimation using gaussian copula ####

###### Code & Results: One continuous endogenous regressor ####
## the bootstrapped errors and intervals might differ if on different OS or different package versions

data("dataCopCont", package = "REndo")
set.seed(1002)
resultsCC1 <- copulaCorrection(formula = y ~ X1 + X2 + P | continuous(P), 
  data = dataCopCont, num.boots = 50, verbose = FALSE) 
summary(resultsCC1)

## OLS estimates on the same datasset
results_ols_cc1 <- lm(y ~ X1 + X2 + P, data = dataCopCont)
summary(results_ols_cc1)

###### Code & Results: One discrete endogenous regressor ####
## the bootstrapped errors and intervals might differ if on different OS 
set.seed(1003)
resultsCC2 <- copulaCorrection(formula = y ~ X1 + X2 + P | discrete(P),
  data = dataCopDis, verbose = FALSE) 
summary(resultsCC2)
# confint(resultsCC2)

### OLS results on the same dataset
results_ols_cd <- lm(y ~ X1 + X2 + P, data = dataCopDis)
summary(results_ols_cd)

###### Code & Results: Discrete and continuous endogenous regressors ####
## the bootstrapped errors and intervals might differ if on different OS 

data("dataCopDisCont", package = "REndo")
set.seed(1004)
resultsCC3 <- copulaCorrection(formula = y ~ X1 + X2 + P1 + P2 | discrete(P1) + continuous(P2),
  data = dataCopDisCont, verbose = FALSE)
summary(resultsCC3)

### OLS results on the same dataset
results_ols_cdc <- lm(y ~ X1 + X2 + P1 + P2, data = dataCopDisCont)
summary(results_ols_cdc)

##### Code & Results: Higher moments  ####
data("dataHigherMoments", package = "REndo")
resultsHM <- higherMomentsIV(formula = y ~ X1 + X2 + P | P | IIV(iiv = yp),
  data = dataHigherMoments)
summary(resultsHM)
### OLS on the same data
results_ols_hm <- lm(y ~ X1 + X2 + P, data = dataHigherMoments)
summary(results_ols_hm)

##### Code & Results: Heteroskedastic errors  ####
data("dataHetIV", package = "REndo")
resultsHetIV <- hetErrorsIV(y ~ X1 + X2 + P | P  | IIV(X2), data = dataHetIV)
summary(resultsHetIV)

### OLS for the same dataset
results_ols_het <- lm(y ~ X1 + X2 + P, data = dataHetIV)
summary(results_ols_het)

##### Code & Results: Multilevel internal IV  ####
set.seed(1005)
data("dataMultilevelIV", package = "REndo")
formula1 <- y ~ X11 + X12 + X13 + X14 + X15 + X21 + X22 + X23 + X24 + X31 + X32 + 
  X33 + (1 | CID) + (1 | SID) | endo(X15) 
resultsMIV <- multilevelIV(formula = formula1, data = dataMultilevelIV)
coef(resultsMIV)             # results page 
summary(resultsMIV, "REF")   # results page
summary(resultsMIV, "FE_L2")  


#### Chapter 6 ####
##### Table 5 ##### 
## IIV model comparison.
## We use the CASchools dataset from the AER package to exemplify the internal IV methods
## and compare them with the OLS, 2SLS and the control function approaches.
data("CASchools", package = "AER") 
CASchools$stratio <- with(CASchools, students / teachers) 
school <- CASchools 
school$gr08 <- 1 
school$gr08[school$grades == "KK-06"] <- 0 
school$cc <- as.numeric(school$county)

##### OLS  ####
ols <- lm(read ~ stratio + english + lunch + grades + calworks + income + county, 
  data = school) 
summary(ols)$coefficients[1:7,] 
summary(ols)

##### Two Stage Least Squares ####
extiv <- ivreg(read ~ stratio + english + lunch + grades + income + calworks + 
    county, ~ expenditure + english + lunch + income + grades + county + 
    calworks, data = school) 
summary(extiv)$coefficients[1:7, ]
summary(extiv)

##### Control Function  ####
res_cf <- lm(read ~ expenditure, data = school)$residuals 
m_cf <- lm(read ~ stratio + res_cf + lunch + english + calworks + income + grades + 
    county , data = school) 
summary(m_cf)$coefficients[1:8, ] 
summary(m_cf)

##### CopulaCorrection  ####
## the bootstrapped errors and intervals might differ if on different OS 
set.seed(43223) 
cop.model <- copulaCorrection(read ~ stratio + english + lunch + calworks + grades +
    income + county | continuous(stratio),
  num.boots = 50, data = school, verbose = FALSE) 
summary(cop.model)$coefficients[1:7, ] 
summary(cop.model)

##### Latent IIV ####
liv <- latentIV(read ~ stratio, data = school) 
summary(liv) 

##### Heteroskedastic Errors IIV  ####
hetEr <- hetErrorsIV(read ~ stratio + english + lunch + calworks + income +
    grades + county | stratio | IIV(income, english), data = school) 
summary(hetEr)$coefficients[1:7, ] 
summary(hetEr)

##### Higher Moments IIV  ####
highMoment <- higherMomentsIV(read ~ stratio + english + lunch + calworks + 
    income + grades + county| stratio | IIV(g = x3, iiv = gp, income),
  data = school) 
summary(highMoment)$coefficients[1:7, ]
summary(highMoment)

##### Multilevel IIV ####
multilev <- multilevelIV(read ~ stratio + english + lunch + gr08 + 
    income + calworks + (1 | cc) | endo(stratio), data = school) 
coef(multilev)


#### Appendix ####
##### Appendix A ####
## Weak vs.Strong Instrumental Variables

###### Table 6  ####
## The sample size varies between 500 and 2500 observations respectively, 
## and the correlation ($\rho$) between the instrument and the endogenous regressor
## takes two values, $0.10$ and $0.40$. 
## The results are obtained running a simulation over 1000 random samples, where the
## true parameter value is equal to $-1$ and the correlation between the omitted variable and 
## the endogenous regressor takes two values, $0.1$ and $0.3$.

## Lets take a look at a simple IV model
## We will generate data and then estimate it
## The model we will use is:  
## y = b0 + b1 * X + error_y   
## x = g0 + g1 * Z + error_x
## There is an omitted variable, in both equations, driving y and x (call it "OV")

set.seed(43224)

## Simulate some data
## Set true parameters

b0_true <- 1
b1_true <- -1
g0_true <- 1
g1_true <- 1
mu <- c(0,0)

## phi = correlation between endogenous regressor (X) and error
## rho = correlation between the IV (Z) and endogenous regressor (X)
endobias <- function(nobs, phi, rho, sim){
  
  coefOLS <- rep(NA, sim)
  coefIV <- rep(NA, sim) 
  IV.df <- data.frame(obs.id = as.factor(c(1:nobs)))
  R <- matrix(c(1, rho, rho, 1), 2, 2)
  OV <- rnorm(n = nobs, mean = 0, sd = 1)
  
  ## Generate errors error_y and error_x  (both standard normal)
  error_y <- rnorm(n = nobs, mean = 0, sd = 0.5)
  error_x <- rnorm(n = nobs, mean = 0, sd = 0.5)
  
  matXZ1 <- MASS::mvrnorm(nobs, mu, R)  # cor(X,Z)
  
  ## Generate instrumental variable
  IV.df$Z <- matXZ1[, 1]
  IV.df$X1 <- matXZ1[, 2]
  
  ## Generate endogenous regressor
  IV.df$X <- g0_true + g1_true * IV.df$X1 + phi * OV + error_x
  
  ## Generate dependent variable
  IV.df$Y <- b0_true + b1_true * IV.df$X  + OV + error_y
  
  ## Run OLS with Y and X
  m1 <- lm(Y ~ X, data = IV.df)
  
  ## Run IV regression using Z as instrument
  m2 <- ivreg(Y ~ X | Z, data = IV.df)
  
  for (i in 1:sim){ 
    coefOLS[i] <- summary(m1)$coefficients[2]
    coefIV[i]  <- summary(m2)$coefficients[2]
  }
  res <- list(resOLS = coefOLS, resIV = coefIV)
  res
}

###  SAMPLE SIZE = 500 
## phi = 0.1, rho = 0.1 #
res1_1_1 <- endobias(nobs = 500, phi = 0.1, rho = 0.1, sim = 1000)
## phi = 0.1, rho = 0.4 #
res1_1_4 <- endobias(nobs = 500, phi = 0.1, rho = 0.4, sim = 1000)
## phi = 0.3, rho = 0.1 #
res1_3_1 <- endobias(nobs = 500, phi = 0.3, rho = 0.1, sim = 1000)
## phi = 0.3, rho = 0.4 #
res1_3_4 <- endobias(nobs = 500, phi = 0.3, rho = 0.4, sim = 1000)

###  SAMPLE SIZE = 2500 
## phi = 0.1, rho = 0.1 #
res2_1_1 <- endobias(nobs = 2500, phi = 0.1, rho = 0.1, sim = 1000)
## phi = 0.1, rho = 0.4 #
res2_1_4 <- endobias(nobs = 2500, phi = 0.1, rho = 0.4, sim = 1000)
## phi = 0.3, rho = 0.1 #
res2_3_1 <- endobias(nobs = 2500, phi = 0.3, rho = 0.1, sim = 1000)
## phi = 0.3, rho = 0.4 #
res2_3_4 <- endobias(nobs = 2500, phi = 0.3, rho = 0.4, sim = 1000)

## Correlation between X and the error (phi) = 0.1 and 
## rho takes values 0.1 and 0.4

## rho = 0.1
results_rho_01_1 <- data.table(phi = c(0.1, 0.1), rho = c(0.1, 0.1), true_value = c(-1, -1),
  OLS = c(round(mean(res1_1_1$resOLS), 3), round(mean(res2_1_1$resOLS), 3)), 
  IV =  c(round(mean(res1_1_1$resIV), 3), round(mean(res2_1_1$resIV), 3)),
  sample_size = c(500, 2500))
results_rho_01_1

## rho = 0.4
results_rho_04_1 <- data.table(phi = c(0.1, 0.1), rho = c(0.4, 0.4), true_value = c(-1, -1),
  OLS = c(round(mean(res1_1_4$resOLS), 3), round(mean(res2_1_4$resOLS), 3)), 
  IV = c(round(mean(res1_1_4$resIV), 3), round(mean(res2_1_4$resIV), 3)),
  sample_size = c(500, 2500))
results_rho_04_1

## Correlation between X and the error (phi) = 0.3 and 
## rho takes values 0.1 and 0.4

## rho = 0.1
results_rho_01_2 <- data.table(phi = c(0.3, 0.3), rho = c(0.1, 0.1), true_value = c(-1, -1),
  OLS = c(round(mean(res1_3_1$resOLS), 3), round(mean(res2_3_1$resOLS), 3)),
  IV = c(round(mean(res1_3_1$resIV), 3), round(mean(res2_3_1$resIV), 3)), sample_size = c(500, 2500))
results_rho_01_2

## rho = 0.4
results_rho_04_2 <- data.table(phi = c(0.3, 0.3), rho = c(0.4, 0.4), true_value = c(-1, -1),
  OLS = c(round(mean(res1_3_4$resOLS), 3), round(mean(res2_3_4$resOLS), 3)), 
  IV = c(round(mean(res1_3_4$resIV), 3), round(mean(res2_3_4$resIV), 3)),
  sample_size = c(500, 2500))
results_rho_04_2


##### Appendix B  #####
##: Bias under Endogeneity

###### Figure 9  #######
## Endogeneity and the bias of estimates
## for beta1 and beta2:  bias at different levels of 
## correlation between the endogenous regressor (P) 
## and the error.

set.seed(200)
## Set true parameters
b0_true <- 1
b1_true <- -1
b2_true <- 1
g0_true <- 1
g1_true <- 1

mu <- c(0,0)
## phi = correlation between endogenous regressor (X) and error
## rho = correlation between the IV (Z) and endogenous regressor (X)
endobias <- function(nobs, phi, rho, sim){
  
  coefOLS1 <-  coefOLS2 <- corXP <- corPE <- rep(NA , sim)
  coefIV   <- rep(NA,sim) 
  IV.df    <- data.frame(obs.id = as.factor(c(1:nobs)))
  R  <- matrix(c(1, 1.8*rho, 1.8*rho, 1), 2, 2)
  R1 <- matrix(c(1, 1.8*phi, 1.8*phi, 1), 2, 2)
  
  
  ## Generate errors error_y and error_x (both standard normal)
  error_x <- rnorm(n = nobs, mean = 0, sd = 1)
  
  matPX <- MASS::mvrnorm(nobs, mu, R)  # cor(P,X)
  matPerrx <- MASS::mvrnorm(nobs, mu, R1)  # cor(err,P)
  
  ## Generate instrumental variable
  IV.df$X <- matPX[, 1]
  IV.df$X1 <- matPX[, 2]
  
  OV <- matPerrx[,1]
  error_y <- matPerrx[,2]
  ## Generate endogenous regressor
  IV.df$P <- g0_true + g1_true * IV.df$X1  + OV + error_x
  
  ## Generate dependent variable
  IV.df$Y <- b0_true + b1_true * IV.df$P + b2_true * IV.df$X  + error_y
  
  ## Run OLS with Y and X
  m1 <- lm(Y ~ P + X, data = IV.df)
  for (i in 1:sim){ 
    coefOLS1[i] <- summary(m1)$coefficients[2]
    coefOLS2[i] <- summary(m1)$coefficients[3]
    corXP[i] <- cor(IV.df$X, IV.df$P)
    corPE[i] <- cor(error_y, IV.df$P)
    
  }
  res <- list(resOLS1 = coefOLS1, resOLS2 = coefOLS2, resIV = coefIV, corxp = corXP, corperr = corPE)
  res
}

res1 <- res2 <- corelationXP <- corelationPE <- matrix(0, 3, 3)

phi <- c(0.1, 0.3, 0.5)
rho <- c(0.1, 0.3, 0.5)

for (i in 1:3){
  for (j in 1:3){
    resbias <- endobias(nobs = 2500, phi = phi[i], rho = rho[j], sim = 1000)
    res1[i,j] <- mean(resbias$resOLS1)
    res2[i,j] <- mean(resbias$resOLS2)
    corelationXP[i,j] <- mean(resbias$corxp)
    corelationPE[i,j] <- mean(resbias$corperr)
    
  }
}

## beta1 - the coefficient of the endogenous regressor
dd_b1 <- rbind(as.matrix(res1[1, ], 3, 1), as.matrix(res1[2, ], 3, 1), as.matrix(res1[3, ], 3, 1))
colnames(dd_b1) <- c("beta1")
dd_b1 <- cbind(dd_b1, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 3))
dd_b1 <- cbind(dd_b1, corr_P_X=rep(c(0.1, 0.3, 0.5), 3))
dd_b1 <- as.data.frame(dd_b1)

## create graph
Corr_P_error <- factor(dd_b1$corr_P_error)

## beta2 - the coefficient of the covariate
dd_b2 <- rbind(as.matrix(res2[1, ], 3, 1), as.matrix(res2[2, ], 3, 1),
  as.matrix(res2[3, ], 3, 1))
colnames(dd_b2) <- c("beta2")
dd_b2 <- cbind(dd_b2, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 3))
dd_b2 <- cbind(dd_b2, corr_P_X = rep(c(0.1, 0.3, 0.5), 3))
dd_b2 <- as.data.frame(dd_b2)
## create graph
Corr_P_error <- factor(dd_b2$corr_P_error)

## beta1 at different correlation levels between error and endogenous regressor X
p1 <- ggplot(data=dd_b1, aes(x = corr_P_X, y = beta1, group = corr_P_error,
  colour = Corr_P_error)) +
  geom_line() +
  theme_grey(base_family = "serif", base_size = 10) + ggtitle("Panel A")

p11 <- p1 + xlab("Correlation between P and X") +
  ylab("Coefficient estimates for beta1") +
  geom_point(data = as.data.frame(dd_b1[1:3, ]), aes(x = corr_P_X, y = beta1, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b1[4:6, ]), aes(x = corr_P_X, y = beta1, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b1[7:9,]), aes(x = corr_P_X, y = beta1, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(family = "serif", size = 10))+
  geom_hline(aes(yintercept = -1, linetype = "dotdash"), show.legend = TRUE) + 
  guides(fill = guide_legend(override.aes = list(linetype = "blank"))) +
  scale_linetype_manual(name = "true b1 value", values = "dotdash", 
    labels = "-1") + 
  scale_fill_discrete(guide = guide_legend(order = 1)) +    
  scale_color_discrete(guide = guide_legend(order = 2)) +
  labs(colour = "corr(P, error)")

p2 <- ggplot(data = dd_b2, aes(x = corr_P_X, y = beta2, group = corr_P_error, 
  colour = Corr_P_error)) + geom_line() +
  theme_grey(base_family = "serif", base_size = 10)

p21 <- p2 + xlab("Correlation between P and X") +
  ylab("Coefficient estimates for beta2") + ggtitle("Panel B") +
  geom_point(data = as.data.frame(dd_b2[1:3, ]), aes(x = corr_P_X, y = beta2, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b2[4:6, ]), aes(x = corr_P_X, y = beta2, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b2[7:9, ]), aes(x = corr_P_X, y = beta2, 
    group = corr_P_error, colour = factor(corr_P_error))) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
    text = element_text(family = "serif", size = 10)) +
  geom_hline(aes(yintercept = 1, linetype = "dotdash"), show.legend = TRUE)+ 
  scale_linetype_manual(name = "true b2 value", values = "dotdash", labels = "1") +
  guides(fill = guide_legend(override.aes = list(linetype = "blank"))) +
  scale_fill_discrete(guide = guide_legend(order = 1)) +   
  scale_color_discrete(guide = guide_legend(order = 2)) +
  labs(colour = "corr(P, error)")

figb1 <- grid.arrange(p11, p21, ncol = 2)
colnames(res1) <- c("rho_0.1", "rho_0.3", "rho_0.5")
rownames(res1) <-  c("phi_0.1", "phi_0.3", "phi_0.5")
if (doPDF) ggsave("Figures/jss_3807_appendix_figB1.pdf", figb1)


###### Table 7  #######
## Estimates bias: beta1 - intercept, beta2 - slope

## estimates for beta1 - coefficient of P
colnames(res1) <- c("rho_0.1", "rho_0.3", "rho_0.5")
rownames(res1) <-  c("phi_0.1", "phi_0.3", "phi_0.5")

## estimates for beta2 - coefficient of X
colnames(res2) <- c("rho_0.1", "rho_0.3", "rho_0.5")
rownames(res2) <-  c("phi_0.1", "phi_0.3", "phi_0.5")

-1 - res1
1 - res2

##### Appendix C  #####

###### Figure 10  #######
## Copula example
library("rgl")
set.seed(100)
m <- 2
n <- 2000
sigma <- matrix(c(1, 0.4,
  0.4, 1), nrow = 2)
z <- mvrnorm(n, mu = rep(0, m), Sigma = sigma, empirical = T)

u <- pnorm(z)
x1 <- qt(u[, 1], df = 3)
x2 <- qnorm(u[, 2])

plot3d(x1, x2, pch = 20, col = "navyblue", size = 22, xlab = "", ylab = "x1", zlab = "x2")

t_dat <- data.frame(q = x1, cdf = pt(x1, df = 1))
c1 <- ggplot(t_dat) + geom_line(aes(x = x1, y = cdf)) + 
  theme_classic() + ylab("Cumulative distribution function of x1") +
  theme(axis.text = element_text(size = 22),
    axis.title = element_text(size =22))
c1

norm_dat <- data.frame(q = x2, cdf = pnorm(x2))
c2 <- ggplot(norm_dat) + geom_line(aes(x = x2, y = cdf)) +
  theme_classic() + ylab("Cumulative distribution function of x2") +
  theme(axis.text = element_text(size = 22),
    axis.title = element_text(size =22))

c2

####   Data generation   #############
###### Latent IV ######
library("MASS")
set.seed(1410)

# sample size
Tn <- 2500
# correlation matrix
R <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
## mean
m <- c(0, 0)
N <- mvrnorm(Tn, m, R)
eps <- N[, 1]
nu  <- N[, 2]
b0 <- 3
a <- -1
Z <- rep(c(1, 4, 7, 10, 12), c(100, 400, 1000, 700, 300))
P <- Z + 3 * nu
y <- I * b0 + a * P + eps
I <- rep(1, Tn)
dataLatentIV <- data.table(y, I, P, Z, eps)

resultsLIV <- latentIV(y ~ P, data = dataLatentIV)				   	
summary(resultsLIV)	

### OLS
ols_iv <- lm(y ~ P, data = dataLatentIV)
summary(ols_iv)


##### Higher moments #####
set.seed(1121)

#generate data 
Tn <- 2500
rho2 <- 0.5
R <-  matrix(c(1, rho2, rho2, 1), 2, 2)
m <- c(0, 0)

N <- mvrnorm(Tn, mu = m, R)
OV <- N[, 1]
e1 <- N[, 2]

I <- rep(1, Tn)
X3 <- rt(Tn, 3)

e2 <- rt(Tn, 10)

X1 <- rnorm(Tn, 2, 0.1)
X2 <- X1 + 0.5 * rnorm(Tn, 1.5, 0.9)

b0 <- 2
b1 <- 1.5
b2 <- 3

b3 <- 3
a <-  -1

P <- b3*X3 + e2 + 3*OV
y <- b0 + b1*X1 + b2*X2 + a*P + e1 + OV
X <- cbind(X1,X2)
colnames(X) <- c("X1","X2")
dataHigherMoments <- data.frame(y, X, P)

### Higher moments
resultsHM <- higherMomentsIV(formula = y ~ X1 + X2 + P | P | IIV(iiv = yp),  data = dataHigherMoments)  
summary(resultsHM)

### OLS 
ols_hm <- lm(y ~ X1 + X2 + P, data = dataHigherMoments)
summary(ols_hm)


###### Heteroskedastic errors #####
set.seed(4333) 

N <-  2500
b0 <- 2
b1 <- 1.5
b2 <- 3
a <- -1

X1 <- rnorm(N, 0, 1)
X2 <- rnorm(N, 0, 1)
U <- rnorm(N, 0, 1)
S1 <- rnorm(N, 0, 1)
S2 <- rnorm(N, 0, 2)
eps1 <- U + exp(X2) * S1 

eps2 <- U + exp(-X2) * S2

P <- b0 + b2 * X2 + eps1
y <- b0 + b1 * X1 + b2 * X2 + a * P + eps2

dataHetIV <- as.data.frame(cbind(y, X1, X2, P))

### Heteroskedastic errors 
resultsHetIV <- hetErrorsIV(y ~ X1 + X2 + P | P | IIV(X2), data = dataHetIV)
summary(resultsHetIV)

### OLS 
ols_het <- lm(y ~ X1 + X2 + P, data = dataHetIV)
summary(ols_het)


##### Copula correction ####
######  1. One continuous endogenous variable  #######
## Creates a dataset with 1 continuous endogeneous variables, non-normally distributed
## The dependent variable is y, there are two exogeneous regressors, X1 and X2,
## and one endogeneous regressor P1 which has a t-distribution with 3 degrees of freedom.    

set.seed(1110)

# sample size
Tn <- 2500
# corr(eps P) = rho1
rho1 <- 0.33

## correlation matrix
R <- matrix(c(1, rho1, rho1, 1), 2, 2)
## mean
m <- c(0, 0)

b0 <- 2
b1  <- 1.5
b2 <- -3
a.1 <- -1

I <- rep(1, Tn)
X1 <- rnorm(Tn, 4, 2.3)
X2 <- rnorm(Tn, 2, 1.6)

## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)

P  <- N[, 1]
eps <- N[, 2]

## P1 has a t-distribution with 3 df
P <- qt(pnorm(P),3)

y <-  b0 * I + b1 * X1 + b2 * X2 + a.1 * P + eps

dataCopCont <- cbind(y, I, X1, X2, P) 
dataCopCont <- as.data.frame(dataCopCont)

### Copula 1 continuous 
set.seed(1002)
resultsCC1 <- copulaCorrection(formula = y ~ X1 + X2 + P | continuous(P), 
  data = dataCopCont, num.boots = 50, verbose = FALSE)
summary(resultsCC1)

### OLS 
ols_copcont <- lm(y ~ X1 + X2 + P, data = dataCopCont)
summary(ols_copcont)


#####  2. One discrete endogenoue variable ####
## create dataset with 1 discrete endogeneous variable, with a Poisson distribution

set.seed(42100)
## sample size
Tn <- 2500
## corr(eps, P) = rho1
rho <- 0.33

## correlation matrix
R <- matrix(c(1, rho, rho, 1), 2, 2)
## mean
m <- c(0, 0)
b0 <- 2
b1  <- 1.5
b2 <- -3
a <- -1

I <- rep(1, Tn) 
X.1 <- rnorm(Tn, 4, 2.3)
X.2 <- X.1 + 0.5 * rnorm(Tn, 2, 1.6)

## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)

P  <- N[, 1]
eps <- N[, 2]

X1 <- X.1
X2 <- X.2
## P has a poisson with lambda=5
P <- qpois(pnorm(P), 5)

y <- b0 + b1 * X1 + b2 * X2 + a * P + eps
X <- cbind(X1, X2, P)

dataCopDis <- cbind(y, I, X, P)
dataCopDis <- as.data.frame(dataCopDis)

### Copula - discrete
## with intercept
set.seed(1003) 
resultsCC2 <- copulaCorrection(formula = y ~ X1 + X2 + P | discrete(P), data = dataCopDis)
summary(resultsCC2)

### OLS
ols_copdis <- lm( y ~ X1 + X2 + P, data  = dataCopDis)
summary(ols_copdis)


##### 3. One continuous and one discrete endogenous variable ####
## create dataset with 2 continuous endogeneous variables, non-normaly distributed

set.seed(42102)
## sample size
Tn <- 2500
## corr(eps, P) = rho1
rho12 <- 0.25    # correlation btw P1 and P2
rho1a <- 0.33    # correlation btw P1 and error
rho2a <- 0.25    # correlation btw P2 and error

## correlation matrix
R <- matrix(c(1, rho12, rho1a, rho12, 1, rho2a, rho1a, rho2a, 1), 3, 3)

## mean
m <- c(0, 0, 0)

b0 <- 2
b1  <- 1.5
b2 <- -3
a.1 <- -1
a.2 <- 0.8  

I <- rep(1, Tn)
X1 <- rnorm(Tn, 4, 2.3)
X2 <- X1 + 0.5 * rnorm(Tn, 2, 1.6)

## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)

P1  <- N[, 1]
P2  <- N[, 2]
eps <- N[, 3]

## P1 has a poisson distribution with 3 df
P1 <- qpois(pnorm(P1), 3)
## P2 has a t-distribution with 3 df
P2 <- qt(pnorm(P2), 3)

X <- cbind(I, X1, X2, P1, P2)
y <-  b0 + b1 * X1 + b2 * X2 + a.1 * P1 + a.2 * P2 + eps
P <- cbind(P1,P2)
dataCopDisCont <- cbind(y, I, X1, X2, P1, P2)
dataCopDisCont <- as.data.frame(dataCopDisCont)

#### Copula 1 discrete + 1 continuous
set.seed(1004) 
resultsCC3 <- copulaCorrection(formula = y ~ X1 + X2 + P1 + P2 | discrete(P1) + continuous(P2), data = dataCopDisCont)
summary(resultsCC3)

### OLS 
ols_copdis2 <- lm(y ~ X1 + X2 + P1 + P2, data =  dataCopDisCont)
summary(ols_copdis2)


####### Multilevel #####
set.seed(43410)

####
## 3-levels
####

####  functions used #########
dmat <- cmpfun(function(i) {
  j <- length(i)
  n <- sum(i)
  index <- cbind(start = cumsum(c(1, i[-j])), stop = cumsum(i))
  H <- matrix(0, nrow = n, ncol = j)
  for (i in 1:j) {
    H[index[i, 1]:index[i, 2], i] <- 1L
  }
  return(H)
})

mycut <- cmpfun(function(x, p) {
  cut(x = x, breaks = quantile(x, probs = p), labels = FALSE, include.lowest = TRUE)
})


## the coefficients to be recovered
coefGMM <- c(70, 3, 9, -2, 2, -2, -4, -3, 5, 2, 0.5, 0.1, -0.5)


## variance-covariance matrix for level 3 variables in original dataset
R3_31 <- -0.30
R3_32 <- -0.22
R3_33 <- -0.10
R3_34  <- 0.23 
R3_35  <- -0.15 
R3_36 <- 0.19 
R3 <-  matrix(c(1, R3_31, R3_32, R3_34,
  R3_31, 1, R3_33, R3_35,
  R3_32, R3_33, 1, R3_36,
  R3_34, R3_35, R3_36, 1), 4, 4)
R3 <- cov2cor(make.positive.definite(R3))


## variance-covariance matrix for level 2 variables in original dataset
R2_12 <- -0.05
R2_13 <- 0.006
R2_14 <- 0.02
R2_23 <- -0.75
R2_24 <- -0.15
R2_34 <- -0.10
R2 <-  matrix(c(1, R2_12, R2_13, R2_14,
  R2_12, 1, R2_23, R2_24,
  R2_13, R2_23, 1, R2_34,
  R2_14, R2_24, R2_34, 1), 4, 4)
R2 <- cov2cor(make.positive.definite(R2))


## variance-covariance matrix for level 1 variables in original dataset
R1_12 <-  0.015
R1_13 <-  0.15
R1_14 <-  0.05
R1_23 <-  0.10
R1_24 <-  0.01
R1_34 <-  0.10
R1 <- matrix(c(1, R1_12, R1_13, R1_14,
  R1_12, 1, R1_23, R1_24,
  R1_13, R1_23, 1, R1_34,
  R1_14, R1_24, R1_34, 1), 4, 4)
R1 <- cov2cor(make.positive.definite(R1))


mu <- list(int = 0, lev1 = c(X11 = 1.60, X12 = 0.056338, X13 = 0.06760563, X14 = 0.763122), 
  lev2 = c(X21 = 0.4688129), lev3 = c(X32 = 20.628, X31 = 12.2953, X33 = 94.84266))   

k <- 40                        ## total number of schools
n <- sample(29:39, k, TRUE)    ## number of students within each school
j <- sum(n)                    ## total number of students
N <- sample(1:5, j, TRUE, prob = c(0.415833333, 0.246666667, 0.191666667, 0.142500000, 0.003333333))
i <- sum(N)                 

## %%%%%%%%%%%%%%%
## school variables
## %%%%%%%%%%%%%%%
X3 <- as.data.table(mvrnorm(n = k, mu = c(mu$lev3, mu$int), Sigma = R3))

X3[,SID:= 1:k]


X3 <- within(X3, { 
  X32 <- floor(X32 - abs(V4))
  X31 <- floor(X31 + rtnorm(k, mean = mu$lev3["X31"] - 0.19 * X32, sd = 3.195407, lower = 5, upper = 25))
  X33 <-  floor(rtnorm(k, mean = mu$lev3["X33"] + 0.19 * X32 - 0.10 * X31, sd = 8.562959, lower = 30, upper = 98))
})

X3[,err3 := rnorm(k, 0, 1)]
X3[,OVL3 := rnorm(k, 8, 2.5) + 0.8 * abs(X3$V4)]

S <- dmat(N) %*% dmat(n) %*% as.matrix(X3)


## %%%%%%%%%%%%%%%
## child variables
## %%%%%%%%%%%%%%%
X21 <- rnorm(n = j, mu$lev2, 0.23)
X2 <- as.data.frame(X21)

RACE <-  sample(0:3, j, TRUE, prob = c(0.02, 0.52, 0.35, 0.11))
Race <- dummy_cols(RACE)
X24 <- Race[, 1]
X22 <- Race[, 2]
X23 <- Race[, 3]

b2 <- cbind(CID = 1:j, CRC = rlnorm(j, mean = mu$int, sd = 0.1579), err2 = rnorm(j, 0, 1))

X2 <- within(X2, { 
  X21 <-  mycut(X2[,"X21"],c(0, 0.53, 1)) - 1
})


b21 <- cbind(b2, X2, X24, X22, X23)

C <- dmat(N) %*% as.matrix(b21)


dt <- as.data.table(cbind(C, S))

dt[, X21:= mycut(X21 + 0.8 * X31, c(0, 0.53, 1)) - 1]
dt[, X22:= mycut(X22 - 0.5 * X31, c(0, 0.52, 1)) - 1]


library("plyr")
dt <- rename(dt, c("V4" = "SRC"))
setkeyv(dt, cols = c("CID", "SID"))     

X1 <- as.data.frame(mvrnorm (n = i, mu = mu$lev1, Sigma = R1))
X1 <- within(X1, { 
  X12 <- mycut(as.numeric(unlist(X1[, 2] + 0.1 * dt[,"SRC"] + 0.25 * dt[,"CRC"])), c(0, 0.99467, 1)) - 1
  X13 <- mycut(as.numeric(unlist(X1[, 3] + 0.1 * dt[,"SRC"] + 0.23 * dt[,"CRC"])), c(0, 0.943, 1)) - 1
  X14 <- (X1[, 4] - min(X1[, 4]))/(max(X1[, 4]) - min(X1[, 4]))
})

dt[, err1 := rnorm(i, 0, 1)]
dt[, GRADE := 1:.N, by = c("CID", "SID")]
dt[, X11 := GRADE - 1]

dt[, X14 := X1[, "X14"] - 0.02 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
dt[, X13 := X1[, "X13"] + 0.02 * X22 - 0.02 * X21 + 0.03 * X24 - 0.05 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
dt[, X12 := X1[, "X12"] + 0.4 * X22 - 0.25 * X21 + 0.2 * X24 - 0.04 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]

dt[, Intercept := rep(1, i)]
dt[, c("SRC", "CRC") := NULL]
dt[, GRADE := NULL]

## create random slope
sigma1 <- matrix(c(1, 0.25, 0.25, 1), 2, 2)

U <- as.data.frame(mvrnorm(n = i, mu = c(0, 0), Sigma = sigma1))

dt[, X15 :=  (0.2 * X21 + 0.03 * X22 + 0.04 * X23) + rnorm(i, 0, 1) + dt[, "err2"]] 

dt1 <- data.table::copy(dt)
dt[, c("SID", "CID") := NULL]
setcolorder(dt, c("Intercept", "X11", "X12", "X13", "X14", "X21", "X22", "X23",
  "X24", "X31", "X32", "X33", "OVL3", "X15", "err1", "err2", "err3"))

## dependent variable
dt1[, TLI :=  matrix(coefGMM %*% t(dt[, 1:13]), dim(dt)[1], 1) + (-1) * dt[, "X15"] +
    (-.5) * U[, 1] * dt[, "X11"] + 0.05 * U[, 2] + dt[, "err1"] + dt[, "err2"] + dt[, "err3"]]

setcolorder(dt1, c("Intercept", "SID", "CID", "TLI", "X11", "X12", "X13", "X14",
  "X21", "X22", "X23", "X24", "X31", "X32", "X33", "OVL3", "X15",
  "err1", "err2", "err3"))

dt1[, c("OVL3", "err1", "err2", "err3") := NULL]

colnames(dt1) <- c("Intercept", "SID", "CID", "y", "X11", "X12", "X13", "X14",
  "X21", "X22", "X23", "X24", "X31", "X32", "X33", "X15")

dataMultilevelIV <- dt1