## REndo: Internal Instrumental Variables to Address Endogeneity
## Authors: Gui, Meierer, Schilter, Algesheimer
## Operating system: Windows 10 Enterprise version 21H2
## Processor: Intel Core i7-6600U CPU @ 2.60GHz 2.81 GHz
## Memory: 20 GB; System type: 64-bit operating system, x64-based processor
## R version 4.3.0
## Replication Manuscript
## REndo - v.2.4.8

# list.of.packages <- c("ggplot2", "AER", "MASS", "gridExtra", "data.table",
#                       "REndo", "corpcor", "optimx", "compiler", "lme4",
#                       "fastDummies", "MCMCglmm", "rgl")
# new.packages <- list.of.packages[!(list.of.packages %in% installed.packages()[, "Package"])]
# if (length(new.packages)) install.packages(new.packages, dependencies = TRUE)

library("REndo")
library("data.table")
library("ggplot2")
library("corpcor")
library("MASS")
library("MCMCglmm")
library("AER")
library("gridExtra")
library("compiler")
library("lme4")
library("fastDummies")
library("rgl")

## save figures to PDF: TRUE/FALSE
doPDF <- FALSE
if (doPDF) dir.create("Figures")

#### Chapter 2 ####

##### Figure 2 #####
## Estimation bias at different levels of correlation between the endogenous
## regressor and the model error. Shows the inconsistency of the OLS estimates
## depending on the correlation between the endogenous regressor and the error,
## holding the correlation between the regressors fixed at 0.2.
##
## The model we use is:
##
## y = b0 + b1 * P + b2 * X + OV + error_y
## P = g0 + g1 * P1 + gamma * OV + error_p
##
## where P1 and X are correlated (rho = 0.2) and OV is an omitted variable.

set.seed(43210)

## Set true parameters
b0_true <- 1
b1_true <- -1
b2_true <- 1
g0_true <- 1
g1_true <- 1

## Number of observations
nobs <- 2500

## Generate omitted variable (standard normal)
OV <- rnorm(n = nobs, mean = 0, sd = 1)

## Generate errors error_y and error_p
error_y <- rnorm(n = nobs, mean = 0, sd = 0.5)
error_p <- rnorm(n = nobs, mean = 0, sd = 0.5)

rho   <- 0.2     # rho = cor(P1, X)
m     <- c(0, 0) # mean
gamma <- c(0.1, 0.3, 0.5)

## covariance matrix between P1 and X - the 2 regressors
R <- matrix(c(1, rho, rho, 1), 2, 2)

## Generate data frame to store observed data
df <- data.frame(obs.id = as.factor(c(1:nobs)))

sim <- 1000

## coefficients and std deviation
coef    <- matrix(NA, sim, length(gamma))
stdev   <- vector("list", sim)
confInt <- list(confInt1 = matrix(NA, sim, 2),
                confInt2 = matrix(NA, sim, 2),
                confInt3 = matrix(NA, sim, 2))

## loop over the three values of cor(P, OV): 0.1, 0.3 and 0.5
for (k in 1:length(gamma)) {
  for (i in 1:sim) {
    ## the 2 regressors P1 and X, with correlation rho between them
    regressors <- MASS::mvrnorm(nobs, m, R)
    df$P1 <- regressors[, 1]
    df$X  <- regressors[, 2]
    ## Generate endogenous regressor
    df$P <- g0_true + g1_true * df$P1 + gamma[k] * OV + error_p
    ## Generate dependent variable
    df$Y <- b0_true + b1_true * df$P + b2_true * df$X + OV + error_y
    ## run the OLS
    model1 <- lm(Y ~ P + X, data = df)
    ## save the P coefficient; column k holds the estimates for gamma[k]
    coef[i, k] <- coef(model1)["P"]
    ## confidence interval for the endogenous variable P for each value of correlation
    confInt[[k]][i, ] <- confint(model1, "P", level = 0.95)
  }
}

## corr(P, error) = 0.1
means_0.1 <- colMeans(coef)[1]
stdev_0.1 <- apply(coef, 2, sd)[1]
confInt1  <- apply(confInt[[1]], 2, mean)

## corr(P, error) = 0.3
means_0.3 <- colMeans(coef)[2]
stdev_0.3 <- apply(coef, 2, sd)[2]
confInt2  <- apply(confInt[[2]], 2, mean)

## corr(P, error) = 0.5
means_0.5 <- colMeans(coef)[3]
stdev_0.5 <- apply(coef, 2, sd)[3]
confInt3  <- apply(confInt[[3]], 2, mean)

## estimates of b1 at different correlation levels between end. regressor and error
dd_b1 <- rbind(as.matrix(means_0.1, 3, 1),
               as.matrix(means_0.3, 3, 1),
               as.matrix(means_0.5, 3, 1))
colnames(dd_b1) <- c("beta")
dd_b1 <- cbind(dd_b1, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 1))
dd_b1 <- as.data.table(dd_b1)
dd_b1[, bias := beta + 1]

## create graph
Corr_P_error <- factor(dd_b1$corr_P_error)
dd_b1[, confInt_low := c(confInt1[1] + 1, confInt2[1] + 1, confInt3[1] + 1)]
dd_b1[, confInt_up  := c(confInt1[2] + 1, confInt2[2] + 1, confInt3[2] + 1)]

## bias of b1 at different correlation levels between the error and the endogenous regressor P
p2 <- ggplot(data = dd_b1, aes(as.factor(corr_P_error), bias, group = 1)) +
  geom_line() +
  geom_point()
p22 <- p2 +
  theme_grey(base_family = "serif", base_size = 10) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  xlab("Correlation between P and error") +
  ylab("b1 Estimates Bias") +
  ylim(c(0, 0.5)) +
  geom_errorbar(aes(ymin = confInt_low, ymax = confInt_up), width = 0.2)

if (doPDF) pdf(file = "Figures/jss_3807_fig_2_new.pdf") # width = 6, height = 6
plot(p22)
if (doPDF) dev.off()
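## A rough closed-form check (a sketch added here, not part of the original
## figure): under the data-generating process above, OLS overestimates b1 by
## approximately cov(P, OV) / var(P) = gamma / (g1^2 + gamma^2 + sd(error_p)^2),
## ignoring the small correlation between P and X. These values should be close
## to the simulated biases stored in dd_b1.
round(gamma / (g1_true^2 + gamma^2 + 0.5^2), 3)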
#### Chapter 4 ####
## Figures 4 - 8 ##
## The performance of the five internal instrumental variable approaches in
## the REndo package is compared to that of ordinary least squares, over
## 1000 simulations, with a sample size of 2500 observations.
##
## To replicate the exact figures in the manuscript, the number of
## simulations (sim, set to 1000 in Chapter 2 above) must remain at 1000.

beta1 <- beta2 <- NULL
rho <- 0.5
R <- matrix(c(1, rho, rho, 1), 2, 2)
m <- c(0, 0)
b0 <- 3
b1 <- 1.5
b2 <- 3
b3 <- 3
a  <- -1
g1 <- 1
##### Figure 4 #####
## Latent instrumental variable vs. OLS: parameter estimates were obtained over
## 1000 simulated samples, each of size 2500; the true parameter value is -1.

set.seed(432111)
beta1 <- beta2 <- rep(NA, sim)
for (i in 1:sim) {
  N1  <- MASS::mvrnorm(nobs, m, R)
  eps <- N1[, 1]
  nu  <- N1[, 2]
  ## the latent instrument is discrete
  Z  <- rbinom(n = nobs, size = 1, p = 0.3)
  P1 <- g1 * Z + nu
  y1 <- b0 + a * P1 + eps
  d1 <- data.frame(y1 = y1, P1 = P1)
  beta1[i] <- coef(lm(y1 ~ P1, data = d1))["P1"]
  beta2[i] <- coef(latentIV(y1 ~ P1, data = d1))["P1"]
}
beta1 <- as.matrix(beta1)
colnames(beta1) <- c("beta1")
beta2 <- as.matrix(beta2)
colnames(beta2) <- c("beta2")
dLIV <- data.frame(rbind(cbind(beta1, "OLS"), cbind(beta2, "LIV")))
dLIV$beta1 <- as.numeric(as.character(dLIV$beta1))

##### Estimates ####
cat("Latent IV vs OLS", "\n",
    "OLS estimate - mean:", mean(dLIV$beta1[dLIV$V2 == "OLS"]), "\n",
    "LIV estimate - mean:", mean(dLIV$beta1[dLIV$V2 == "LIV"]))
cat("Latent IV vs OLS", "\n",
    "OLS estimate - sd:", sd(dLIV$beta1[dLIV$V2 == "OLS"]), "\n",
    "LIV estimate - sd:", sd(dLIV$beta1[dLIV$V2 == "LIV"]))

p4 <- ggplot(dLIV, aes(x = beta1, fill = V2), size = 3) +
  geom_density(alpha = 0.3) +
  labs(x = "Endogenous Regressor", y = "Density") +
  geom_vline(aes(xintercept = -1, linetype = "-1"), show.legend = TRUE) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") +
  xlim(c(-1.5, -0.5)) +
  ylim(c(0, 30))
## Adjust key height and width
pp4 <- p4 +
  theme(legend.key.height = unit(0.6, "cm"),
        legend.key.width  = unit(1, "cm")) +
  theme(aspect.ratio = 5/4)

if (doPDF) pdf(file = "Figures/jss_3807_fig_4_liv.pdf") # width = 6, height = 6
plot(pp4)
if (doPDF) dev.off()
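## A compact numerical companion to Figure 4 (a convenience sketch added here):
## bias and root mean squared error of the OLS and LIV estimates around the
## true value of -1, computed from the simulated draws stored in dLIV.
rmse <- function(est, truth) sqrt(mean((est - truth)^2))
with(dLIV, c(bias_OLS = mean(beta1[V2 == "OLS"]) + 1,
             bias_LIV = mean(beta1[V2 == "LIV"]) + 1,
             rmse_OLS = rmse(beta1[V2 == "OLS"], -1),
             rmse_LIV = rmse(beta1[V2 == "LIV"], -1)))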
##### Figure 5 #####
## Copula correction vs. OLS: parameter estimates were obtained over
## 1000 simulated samples, each of size 2500; the true parameter value is -1.
## The number of bootstraps used here is 2, for run-time reasons. This does not
## influence the coefficient estimates, but it means the bootstrapped
## confidence intervals are not reliable. In general, it is advisable to use
## around 1000 bootstraps.

set.seed(432112)
beta5 <- beta6 <- rep(0, sim)
rho <- 0.5
m <- c(0, 0)
R <- matrix(c(1, rho, rho, 1), 2, 2)
b0 <- 3
b1 <- 1.5
b2 <- 3
b3 <- 3
a  <- -1
I  <- rep(1, nobs)
for (k2 in 1:sim) {
  X1 <- rnorm(nobs, 2, 1)
  X2 <- rnorm(nobs, 1.5, 1)
  N  <- MASS::mvrnorm(nobs, mu = m, R)
  eps <- N[, 1]
  OV  <- N[, 2]
  ## P1 has a t-distribution
  P1 <- qt(pnorm(OV), 3)
  y  <- b0 * I + b1 * X1 + b2 * X2 + a * P1 + eps
  data_CopCorr <- data.frame(y, I, X1, X2, P1)
  beta5[k2] <- coef(lm(y ~ P1 + X1 + X2, data = data_CopCorr))["P1"]
  cc <- copulaCorrection(y ~ P1 + X1 + X2 - 1 | continuous(P1),
                         data = data_CopCorr, num.boots = 2, verbose = FALSE)
  beta6[k2] <- coef(cc)["P1"]
}
dCC <- data.frame(rbind(cbind(beta5, "OLS"), cbind(beta6, "Copula Correction")))
dCC$beta5 <- as.numeric(as.character(dCC$beta5))

##### Estimates ####
cat("Copula Correction vs OLS", "\n",
    "OLS estimate - mean:", mean(dCC$beta5[dCC$V2 == "OLS"]), "\n",
    "Copula estimate - mean:", mean(dCC$beta5[dCC$V2 == "Copula Correction"]))
cat("Copula Correction vs OLS", "\n",
    "OLS estimate - sd:", sd(dCC$beta5[dCC$V2 == "OLS"]), "\n",
    "Copula estimate - sd:", sd(dCC$beta5[dCC$V2 == "Copula Correction"]))

p5 <- ggplot(dCC, aes(x = beta5, fill = V2), size = 3) +
  geom_density(alpha = 0.3)
pp5 <- p5 +
  labs(x = "Endogenous Regressor", y = "Density") +
  coord_cartesian(xlim = c(-1.2, -0.5)) +
  geom_vline(aes(xintercept = -1, linetype = "-1"), show.legend = TRUE) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") +
  xlim(c(-1.2, -0.6)) +
  ylim(c(0, 30))
pp51 <- pp5 +
  theme(legend.key.height = unit(1, "cm"),
        legend.key.width  = unit(0.6, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_5_copula.pdf")
plot(pp51)
if (doPDF) dev.off()
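## The copula correction identifies the model only when the endogenous
## regressor is non-normally distributed (P1 is t-distributed with 3 df by
## construction). A quick diagnostic on the last simulated sample, added here
## as a sketch:
shapiro.test(data_CopCorr$P1) # should strongly reject normality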
##### Figure 6 #####
## Higher moments vs. OLS: parameter estimates were obtained over 1000
## simulated samples, each of size 2500; the true parameter value is -1.

set.seed(43213)
beta3 <- beta4 <- rep(NA, sim)
for (k1 in 1:sim) {
  N  <- MASS::mvrnorm(nobs, mu = m, R)
  OV <- N[, 1]
  e1 <- N[, 2]
  X3 <- rt(nobs, 3)
  e2 <- rt(nobs, 10)
  X1 <- rnorm(nobs, 2, 0.1)
  X2 <- X1 + 0.5 * rnorm(nobs, 1.5, 0.9)
  P1 <- b3 * X3 + e2 + 3 * OV
  y1 <- b0 + b1 * X1 + b2 * X2 + a * P1 + e1 + OV
  M  <- cbind(X1, X2)
  dataHigherMoments <- data.frame(y1, X1, X2, P1)
  beta3[k1] <- coef(lm(y1 ~ P1 + X1 + X2))["P1"]
  beta4[k1] <- coef(higherMomentsIV(y1 ~ P1 + X1 + X2 | P1 |
                                      IIV(iiv = yp, g = x2, X1, X2),
                                    data = dataHigherMoments))["P1"]
}
dHM <- data.frame(rbind(cbind(beta3, "OLS"), cbind(beta4, "Higher Moments")))
dHM$beta3 <- as.numeric(as.character(dHM$beta3))

##### Estimates ####
cat("Higher Moments vs OLS", "\n",
    "OLS estimate - mean:", mean(dHM$beta3[dHM$V2 == "OLS"]), "\n",
    "Higher Moments estimate - mean:",
    mean(dHM$beta3[dHM$V2 == "Higher Moments"]))
cat("Higher Moments vs OLS", "\n",
    "OLS estimate - sd:", sd(dHM$beta3[dHM$V2 == "OLS"]), "\n",
    "Higher Moments estimate - sd:",
    sd(dHM$beta3[dHM$V2 == "Higher Moments"]))

p6 <- ggplot(dHM, aes(x = beta3, fill = V2), size = 3) +
  geom_density(alpha = 0.3)
pp6 <- p6 +
  labs(x = "Endogenous Regressor", y = "Density") +
  coord_cartesian(xlim = c(-1.2, -0.5)) +
  geom_vline(aes(xintercept = -1, linetype = "-1"), show.legend = TRUE) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") +
  xlim(c(-1.2, -0.5))
pp61 <- pp6 +
  theme(legend.key.height = unit(0.6, "cm"),
        legend.key.width  = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_6_hm.pdf") # width = 6, height = 6
plot(pp61)
if (doPDF) dev.off()
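## For intuition, the internal instrument requested via IIV(iiv = yp) can also
## be built by hand and passed to a standard 2SLS. This is an illustrative
## sketch under the Lewbel (1997) construction q = (y - mean(y)) * (p - mean(p)),
## run on the last simulated sample; it is not REndo's internal code path.
q_yp <- with(dataHigherMoments, (y1 - mean(y1)) * (P1 - mean(P1)))
manual_hm <- AER::ivreg(y1 ~ P1 + X1 + X2 | X1 + X2 + q_yp,
                        data = dataHigherMoments)
coef(manual_hm)["P1"]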
##### Figure 7 #####
## Heteroskedastic errors IV vs. OLS: parameter estimates were obtained over
## 1000 simulated samples, each of size 2500; the true parameter value is -1.

set.seed(43214)
beta7 <- beta8 <- rep(0, sim)
for (k3 in 1:sim) {
  N  <- MASS::mvrnorm(nobs, mu = m, R)
  OV <- N[, 1]
  e1 <- N[, 2]
  X3 <- rt(nobs, 3)
  e2 <- rt(nobs, 10)
  X1 <- rnorm(nobs, 2, 0.1)
  X2 <- X1 + 0.5 * rnorm(nobs, 1.5, 0.9)
  P1 <- b3 * X3 + e2 + 3 * OV
  y1 <- b0 + b1 * X1 + b2 * X2 + a * P1 + e1 + OV
  M  <- cbind(X1, X2)
  Data_hetErrors <- data.frame(cbind(y1, M, P1))
  beta7[k3] <- coef(lm(y1 ~ P1 + X1 + X2))["P1"]
  beta8[k3] <- coef(hetErrorsIV(y1 ~ P1 + X1 + X2 | P1 | IIV(X1, X2),
                                data = Data_hetErrors))["P1"]
}
dHet <- data.frame(rbind(cbind(beta7, "OLS"),
                         cbind(beta8, "Heteroskedastic Errors")))
dHet$beta7 <- as.numeric(as.character(dHet$beta7))

##### Estimates ####
cat("Heteroskedastic Errors vs OLS", "\n",
    "OLS estimate - mean:", mean(dHet$beta7[dHet$V2 == "OLS"]), "\n",
    "Heteroskedastic Errors estimate - mean:",
    mean(dHet$beta7[dHet$V2 == "Heteroskedastic Errors"]))
cat("Heteroskedastic Errors vs OLS", "\n",
    "OLS estimate - sd:", sd(dHet$beta7[dHet$V2 == "OLS"]), "\n",
    "Heteroskedastic Errors estimate - sd:",
    sd(dHet$beta7[dHet$V2 == "Heteroskedastic Errors"]))

p7 <- ggplot(dHet, aes(x = beta7, fill = V2), size = 3) +
  geom_density(alpha = 0.3)
pp7 <- p7 +
  labs(x = "Endogenous Regressor", y = "Density") +
  coord_cartesian(xlim = c(-1.2, -0.7)) +
  geom_vline(aes(xintercept = -1, linetype = "-1")) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") +
  xlim(c(-1.2, -0.7))
pp71 <- pp7 +
  theme(legend.key.height = unit(0.6, "cm"),
        legend.key.width  = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_7_het.pdf") # width = 6, height = 6
plot(pp71)
if (doPDF) dev.off()
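## hetErrorsIV builds instruments from the heteroskedasticity of the first
## stage. As an illustrative sketch of the Lewbel (2012) construction (not
## REndo's internal code), the instruments are the demeaned exogenous
## regressors multiplied by the first-stage residuals, run here on the last
## simulated sample:
fs_res <- residuals(lm(P1 ~ X1 + X2, data = Data_hetErrors))
z1 <- (Data_hetErrors$X1 - mean(Data_hetErrors$X1)) * fs_res
z2 <- (Data_hetErrors$X2 - mean(Data_hetErrors$X2)) * fs_res
manual_het <- AER::ivreg(y1 ~ P1 + X1 + X2 | X1 + X2 + z1 + z2,
                         data = Data_hetErrors)
coef(manual_het)["P1"]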
##### Figure 8 #####
## MultilevelIV vs. random effects: parameter estimates were obtained over
## 1000 simulated samples, each of size 2500; the true parameter value is -1.

set.seed(43215)

###### functions used #####
dmat <- cmpfun(function(i) {
  j <- length(i)
  n <- sum(i)
  index <- cbind(start = cumsum(c(1, i[-j])), stop = cumsum(i))
  H <- matrix(0, nrow = n, ncol = j)
  for (i in 1:j) {
    H[index[i, 1]:index[i, 2], i] <- 1L
  }
  return(H)
})
mycut <- cmpfun(function(x, p) {
  cut(x = x, breaks = quantile(x, probs = p),
      labels = FALSE, include.lowest = TRUE)
})

## plyr::rename is used inside the simulation loop
library("plyr")

## the coefficients to be recovered
coefGMM <- c(70, 3, 9, -2, 2, -2, -4, -3, 5, 2, 0.5, 0.1, -0.5)

## simulation replications
## sim <- 1000

## stored coefficient estimates
OLS <- GMML2 <- rep(NA, sim)

## variance-covariance matrix for level 3 variables
R3_31 <- -0.30
R3_32 <- -0.22
R3_33 <- -0.10
R3_34 <- 0.23
R3_35 <- -0.15
R3_36 <- 0.19
R3 <- matrix(c(1,     R3_31, R3_32, R3_34,
               R3_31, 1,     R3_33, R3_35,
               R3_32, R3_33, 1,     R3_36,
               R3_34, R3_35, R3_36, 1), 4, 4)
R3 <- cov2cor(make.positive.definite(R3))

## variance-covariance matrix for level 2 variables
R2_12 <- -0.05
R2_13 <- 0.006
R2_14 <- 0.02
R2_23 <- -0.75
R2_24 <- -0.15
R2_34 <- -0.10
R2 <- matrix(c(1,     R2_12, R2_13, R2_14,
               R2_12, 1,     R2_23, R2_24,
               R2_13, R2_23, 1,     R2_34,
               R2_14, R2_24, R2_34, 1), 4, 4)
R2 <- cov2cor(make.positive.definite(R2))

## variance-covariance matrix for level 1 variables
R1_12 <- 0.015
R1_13 <- 0.15
R1_14 <- 0.05
R1_23 <- 0.10
R1_24 <- 0.01
R1_34 <- 0.10
R1 <- matrix(c(1,     R1_12, R1_13, R1_14,
               R1_12, 1,     R1_23, R1_24,
               R1_13, R1_23, 1,     R1_34,
               R1_14, R1_24, R1_34, 1), 4, 4)
R1 <- cov2cor(make.positive.definite(R1))

###### mean ######
mu <- list(int  = 0,
           lev1 = c(X11 = 1.60, X12 = 0.056338, X13 = 0.06760563, X14 = 0.763122),
           lev2 = c(X21 = 0.4688129),
           lev3 = c(X32 = 20.628, X31 = 12.2953, X33 = 94.84266))

for (t in 1:sim) {
  k <- 40                     # total number of schools
  n <- sample(29:39, k, TRUE) # number of students within each school
  j <- sum(n)                 # total number of students
  N <- sample(1:5, j, TRUE, prob = c(0.415833333, 0.246666667, 0.191666667,
                                     0.142500000, 0.003333333))
  i <- sum(N)

  ###### level 3 variables ####
  X3 <- as.data.table(MASS::mvrnorm(n = k, mu = c(mu$lev3, mu$int), Sigma = R3))
  X3[, SID := 1:k]
  X3 <- within(X3, {
    X32 <- floor(X32 - abs(V4))
    X31 <- floor(X31 + rtnorm(k, mean = mu$lev3["X31"] - 0.19 * X32,
                              sd = 3.195407, lower = 5, upper = 25))
    X33 <- floor(rtnorm(k, mean = mu$lev3["X33"] + 0.19 * X32 - 0.10 * X31,
                        sd = 8.562959, lower = 30, upper = 98))
  })
  X3[, err3 := rnorm(k, 0, 1)]
  X3[, OVL3 := rnorm(k, 8, 2.5) + 0.8 * abs(X3$V4)]
  S <- dmat(N) %*% dmat(n) %*% as.matrix(X3)

  ###### level 2 variables ####
  X21 <- rnorm(n = j, mu$lev2, 0.23)
  X2 <- as.data.frame(X21)
  RACE <- sample(0:3, j, TRUE, prob = c(0.02, 0.52, 0.35, 0.11))
  Race <- dummy_cols(RACE)
  X24 <- Race[, 1]
  X22 <- Race[, 2]
  X23 <- Race[, 3]
  b2 <- cbind(CID = 1:j,
              CRC = rlnorm(j, mean = mu$int, sd = 0.1579),
              err2 = rnorm(j, 0, 1))
  X2 <- within(X2, {
    X21 <- mycut(X2[, "X21"], c(0, 0.53, 1)) - 1
  })
  b21 <- cbind(b2, X2, X24, X22, X23)
  C <- dmat(N) %*% as.matrix(b21)
  dt <- as.data.table(cbind(C, S))
  dt[, X21 := mycut(X21 + 0.8 * X31, c(0, 0.53, 1)) - 1]
  dt[, X22 := mycut(X22 - 0.5 * X31, c(0, 0.52, 1)) - 1]
  dt <- rename(dt, c("V4" = "SRC"))
  setkeyv(dt, cols = c("CID", "SID"))

  ###### level 1 variables ####
  X1 <- as.data.frame(MASS::mvrnorm(n = i, mu = mu$lev1, Sigma = R1))
  X1 <- within(X1, {
    X12 <- mycut(as.numeric(unlist(X1[, 2] + 0.1 * dt[, "SRC"] +
                                     0.25 * dt[, "CRC"])), c(0, 0.99467, 1)) - 1
    X13 <- mycut(as.numeric(unlist(X1[, 3] + 0.1 * dt[, "SRC"] +
                                     0.23 * dt[, "CRC"])), c(0, 0.943, 1)) - 1
    X14 <- (X1[, 4] - min(X1[, 4])) / (max(X1[, 4]) - min(X1[, 4]))
  })
  dt[, err1 := rnorm(i, 0, 1)]
  dt[, GRADE := 1:.N, by = c("CID", "SID")]
  dt[, X11 := GRADE - 1]
  dt[, X14 := X1[, "X14"] - 0.02 * X31 + 0.05 * X32 + 0.0008 * X33 +
       rnorm(i, 0, 1)]
  dt[, X13 := X1[, "X13"] + 0.02 * X22 - 0.02 * X21 + 0.03 * X24 -
       0.05 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
  dt[, X12 := X1[, "X12"] + 0.4 * X22 - 0.25 * X21 + 0.2 * X24 -
       0.04 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
  dt[, Intercept := rep(1, i)]
  dt[, c("SRC", "CRC") := NULL]
  dt[, GRADE := NULL]

  ## create random slope
  sigma1 <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
  U <- as.data.frame(MASS::mvrnorm(n = i, mu = c(0, 0), Sigma = sigma1))
  dt[, X15 := (0.2 * X21 + 0.03 * X22 + 0.04 * X23) + rnorm(i, 0, 1) +
       dt[, "err2"]]
  cor(dt[, "err2"], dt[, "X15"])
  cor(dt[, "err2"], dt[, "X14"])
  cor(dt[, "err2"], dt[, "X12"])
  cor(dt[, "err2"], dt[, "X13"])
  dt1 <- data.table::copy(dt)
  dt[, c("SID", "CID") := NULL]
  setcolorder(dt, c("Intercept", "X11", "X12", "X13", "X14", "X21", "X22",
                    "X23", "X24", "X31", "X32", "X33", "OVL3", "X15",
                    "err1", "err2", "err3"))

  ###### dependent variable ####
  dt1[, TLI := matrix(coefGMM %*% t(dt[, 1:13]), dim(dt)[1], 1) +
        (-1) * dt[, "X15"] + (-.5) * U[, 1] * dt[, "X11"] + 0.05 * U[, 2] +
        dt[, "err1"] + dt[, "err2"] + dt[, "err3"]]
  setcolorder(dt1, c("Intercept", "SID", "CID", "TLI", "X11", "X12", "X13",
                     "X14", "X21", "X22", "X23", "X24", "X31", "X32", "X33",
                     "OVL3", "X15", "err1", "err2", "err3"))
  dt1[, c("OVL3", "err1", "err2", "err3") := NULL]
  colnames(dt1) <- c("Intercept", "SID", "CID", "y", "X11", "X12", "X13",
                     "X14", "X21", "X22", "X23", "X24", "X31", "X32", "X33",
                     "X15")

  ###### model estimation ####
  resMultilevel <- REndo::multilevelIV(y ~ X11 + X12 + X13 + X14 + X15 +
                                         X21 + X22 + X23 + X24 +
                                         X31 + X32 + X33 +
                                         (1 | CID) + (1 | SID) | endo(X15),
                                       data = dt1)
  OLS[t]   <- coef(resMultilevel)[6, 1]
  GMML2[t] <- coef(resMultilevel)[6, 4]
}
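## The coefficient matrix returned by multilevelIV has one column per
## estimator (for a three-level model the columns should correspond to REF,
## FE_L2, FE_L3, GMM_L2 and GMM_L3); row 6 above is the endogenous regressor
## X15. Printing the last fitted model's coefficient matrix (a convenience
## sketch added here) makes the [6, 1] and [6, 4] indexing explicit:
colnames(coef(resMultilevel))
coef(resMultilevel)[6, ]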
dmultilevel <- data.frame(rbind(cbind(OLS, "OLS"),
                                cbind(GMML2, "MultilevelIV")))
## the first column stacks the draws of both estimators; convert to numeric
dmultilevel$beta8 <- as.numeric(as.character(dmultilevel$OLS))

##### Estimates ####
cat("Multilevel IV vs OLS", "\n",
    "OLS estimate - mean:", mean(dmultilevel$beta8[dmultilevel$V2 == "OLS"]), "\n",
    "Multilevel IV estimate - mean:",
    mean(dmultilevel$beta8[dmultilevel$V2 == "MultilevelIV"]))
cat("Multilevel IV vs OLS", "\n",
    "OLS estimate - sd:", sd(dmultilevel$beta8[dmultilevel$V2 == "OLS"]), "\n",
    "Multilevel IV estimate - sd:",
    sd(dmultilevel$beta8[dmultilevel$V2 == "MultilevelIV"]))

###### plot ######
p8 <- ggplot(dmultilevel, aes(x = beta8, fill = V2), size = 3) +
  geom_density(alpha = 0.3)
pp8 <- p8 +
  labs(x = "Endogenous Regressor", y = "Density") +
  coord_cartesian(xlim = c(-1.25, -0.3)) +
  geom_vline(aes(xintercept = -1, linetype = "-1")) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(size = 14)) +
  scale_linetype_manual(name = "True value", values = "dotdash", labels = "-1") +
  guides(fill = guide_legend(override.aes = list(colour = NA), order = 2)) +
  labs(fill = "Method") +
  xlim(c(-1.25, -0.3)) +
  ylim(c(0, 30))
pp81 <- pp8 +
  theme(legend.key.height = unit(0.6, "cm"),
        legend.key.width  = unit(1, "cm"))

if (doPDF) pdf(file = "Figures/jss_3807_fig_8_ml.pdf") # width = 6, height = 6
plot(pp81)
if (doPDF) dev.off()

#### Chapter 5 ####

##### Code & Results: Latent IV ####
data("dataLatentIV", package = "REndo")
resultsLIV <- latentIV(y ~ P, data = dataLatentIV)
summary(resultsLIV)

## OLS results on the same dataset
results_ols_liv <- lm(y ~ P, data = dataLatentIV)
summary(results_ols_liv)
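## A compact side-by-side of the point estimates (a convenience sketch added
## here; we assume the first two elements of coef(resultsLIV) are the intercept
## and the coefficient of P, with the remaining entries being auxiliary LIV
## parameters):
cbind(LIV = coef(resultsLIV)[1:2], OLS = coef(results_ols_liv))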
##### Code & Results: Joint estimation using Gaussian copula ####

###### Code & Results: One continuous endogenous regressor ####
## the bootstrapped errors and intervals might differ on a different OS or
## with different package versions
data("dataCopCont", package = "REndo")
set.seed(1002)
resultsCC1 <- copulaCorrection(formula = y ~ X1 + X2 + P | continuous(P),
                               data = dataCopCont, num.boots = 50,
                               verbose = FALSE)
summary(resultsCC1)

## OLS estimates on the same dataset
results_ols_cc1 <- lm(y ~ X1 + X2 + P, data = dataCopCont)
summary(results_ols_cc1)

###### Code & Results: One discrete endogenous regressor ####
## the bootstrapped errors and intervals might differ on a different OS
data("dataCopDis", package = "REndo")
set.seed(1003)
resultsCC2 <- copulaCorrection(formula = y ~ X1 + X2 + P | discrete(P),
                               data = dataCopDis, verbose = FALSE)
summary(resultsCC2)
# confint(resultsCC2)

### OLS results on the same dataset
results_ols_cd <- lm(y ~ X1 + X2 + P, data = dataCopDis)
summary(results_ols_cd)

###### Code & Results: Discrete and continuous endogenous regressors ####
## the bootstrapped errors and intervals might differ on a different OS
data("dataCopDisCont", package = "REndo")
set.seed(1004)
resultsCC3 <- copulaCorrection(formula = y ~ X1 + X2 + P1 + P2 |
                                 discrete(P1) + continuous(P2),
                               data = dataCopDisCont, verbose = FALSE)
summary(resultsCC3)

### OLS results on the same dataset
results_ols_cdc <- lm(y ~ X1 + X2 + P1 + P2, data = dataCopDisCont)
summary(results_ols_cdc)

##### Code & Results: Higher moments ####
data("dataHigherMoments", package = "REndo")
resultsHM <- higherMomentsIV(formula = y ~ X1 + X2 + P | P | IIV(iiv = yp),
                             data = dataHigherMoments)
summary(resultsHM)

### OLS on the same data
results_ols_hm <- lm(y ~ X1 + X2 + P, data = dataHigherMoments)
summary(results_ols_hm)

##### Code & Results: Heteroskedastic errors ####
data("dataHetIV", package = "REndo")
resultsHetIV <- hetErrorsIV(y ~ X1 + X2 + P | P | IIV(X2), data = dataHetIV)
summary(resultsHetIV)

### OLS for the same dataset
results_ols_het <- lm(y ~ X1 + X2 + P, data = dataHetIV)
summary(results_ols_het)

##### Code & Results: Multilevel internal IV ####
set.seed(1005)
data("dataMultilevelIV", package = "REndo")
formula1 <- y ~ X11 + X12 + X13 + X14 + X15 + X21 + X22 + X23 + X24 +
  X31 + X32 + X33 + (1 | CID) + (1 | SID) | endo(X15)
resultsMIV <- multilevelIV(formula = formula1, data = dataMultilevelIV)
coef(resultsMIV)
## results page
summary(resultsMIV, "REF")
## results page
summary(resultsMIV, "FE_L2")
data("CASchools", package = "AER") CASchools$stratio <- with(CASchools, students / teachers) school <- CASchools school$gr08 <- 1 school$gr08[school$grades == "KK-06"] <- 0 school$cc <- as.numeric(school$county) ##### OLS #### ols <- lm(read ~ stratio + english + lunch + grades + calworks + income + county, data = school) summary(ols)$coefficients[1:7,] summary(ols) ##### Two Stage Least Squares #### extiv <- ivreg(read ~ stratio + english + lunch + grades + income + calworks + county, ~ expenditure + english + lunch + income + grades + county + calworks, data = school) summary(extiv)$coefficients[1:7, ] summary(extiv) ##### Control Function #### res_cf <- lm(read ~ expenditure, data = school)$residuals m_cf <- lm(read ~ stratio + res_cf + lunch + english + calworks + income + grades + county , data = school) summary(m_cf)$coefficients[1:8, ] summary(m_cf) ##### CopulaCorrection #### ## the bootstrapped errors and intervals might differ if on different OS set.seed(43223) cop.model <- copulaCorrection(read ~ stratio + english + lunch + calworks + grades + income + county | continuous(stratio), num.boots = 50, data = school, verbose = FALSE) summary(cop.model)$coefficients[1:7, ] summary(cop.model) ##### Latent IIV #### liv <- latentIV(read ~ stratio, data = school) summary(liv) ##### Heteroskedastic Errors IIV #### hetEr <- hetErrorsIV(read ~ stratio + english + lunch + calworks + income + grades + county | stratio | IIV(income, english), data = school) summary(hetEr)$coefficients[1:7, ] summary(hetEr) ##### Higher Moments IIV #### highMoment <- higherMomentsIV(read ~ stratio + english + lunch + calworks + income + grades + county| stratio | IIV(g = x3, iiv = gp, income), data = school) summary(highMoment)$coefficients[1:7, ] summary(highMoment) ##### Multilevel IIV #### multilev <- multilevelIV(read ~ stratio + english + lunch + gr08 + income + calworks + (1 | cc) | endo(stratio), data = school) coef(multilev) #### Appendix #### ##### Appendix A #### ## Weak vs.Strong Instrumental Variables ###### Table 6 #### ## The sample size varies between 500 and 2500 observations respectively, ## and the correlation ($\rho$) between the instrument and the endogenous regressor ## takes two values, $0.10$ and $0.40$. ## The results are obtained running a simulation over 1000 random samples, where the ## true parameter value is equal to $-1$ and the correlation between the omitted variable and ## the endogenous regressor takes two values, $0.1$ and $0.3$. 
#### Appendix ####

##### Appendix A ####
## Weak vs. Strong Instrumental Variables

###### Table 6 ####
## The sample size varies between 500 and 2500 observations respectively,
## and the correlation ($\rho$) between the instrument and the endogenous
## regressor takes two values, $0.10$ and $0.40$.
## The results are obtained by running a simulation over 1000 random samples,
## where the true parameter value is equal to $-1$ and the correlation between
## the omitted variable and the endogenous regressor takes two values,
## $0.1$ and $0.3$.

## Let's take a look at a simple IV model.
## We will generate data and then estimate it.
## The model we will use is:
## y = b0 + b1 * X + error_y
## x = g0 + g1 * Z + error_x
## There is an omitted variable in both equations, driving y and x (call it "OV")

set.seed(43224)

## Simulate some data
## Set true parameters
b0_true <- 1
b1_true <- -1
g0_true <- 1
g1_true <- 1
mu <- c(0, 0)

## phi = correlation between endogenous regressor (X) and error
## rho = correlation between the IV (Z) and endogenous regressor (X)
endobias <- function(nobs, phi, rho, sim) {
  coefOLS <- rep(NA, sim)
  coefIV  <- rep(NA, sim)
  R <- matrix(c(1, rho, rho, 1), 2, 2)
  ## a fresh sample is drawn and both models re-estimated in every replication
  for (i in 1:sim) {
    IV.df <- data.frame(obs.id = as.factor(c(1:nobs)))
    OV <- rnorm(n = nobs, mean = 0, sd = 1)
    ## Generate errors error_y and error_x (both mean zero)
    error_y <- rnorm(n = nobs, mean = 0, sd = 0.5)
    error_x <- rnorm(n = nobs, mean = 0, sd = 0.5)
    matXZ1 <- MASS::mvrnorm(nobs, mu, R) # cor(X, Z)
    ## Generate instrumental variable
    IV.df$Z  <- matXZ1[, 1]
    IV.df$X1 <- matXZ1[, 2]
    ## Generate endogenous regressor
    IV.df$X <- g0_true + g1_true * IV.df$X1 + phi * OV + error_x
    ## Generate dependent variable
    IV.df$Y <- b0_true + b1_true * IV.df$X + OV + error_y
    ## Run OLS with Y and X
    m1 <- lm(Y ~ X, data = IV.df)
    ## Run IV regression using Z as instrument
    m2 <- ivreg(Y ~ X | Z, data = IV.df)
    coefOLS[i] <- summary(m1)$coefficients[2]
    coefIV[i]  <- summary(m2)$coefficients[2]
  }
  list(resOLS = coefOLS, resIV = coefIV)
}

### SAMPLE SIZE = 500
## phi = 0.1, rho = 0.1
res1_1_1 <- endobias(nobs = 500, phi = 0.1, rho = 0.1, sim = 1000)
## phi = 0.1, rho = 0.4
res1_1_4 <- endobias(nobs = 500, phi = 0.1, rho = 0.4, sim = 1000)
## phi = 0.3, rho = 0.1
res1_3_1 <- endobias(nobs = 500, phi = 0.3, rho = 0.1, sim = 1000)
## phi = 0.3, rho = 0.4
res1_3_4 <- endobias(nobs = 500, phi = 0.3, rho = 0.4, sim = 1000)

### SAMPLE SIZE = 2500
## phi = 0.1, rho = 0.1
res2_1_1 <- endobias(nobs = 2500, phi = 0.1, rho = 0.1, sim = 1000)
## phi = 0.1, rho = 0.4
res2_1_4 <- endobias(nobs = 2500, phi = 0.1, rho = 0.4, sim = 1000)
## phi = 0.3, rho = 0.1
res2_3_1 <- endobias(nobs = 2500, phi = 0.3, rho = 0.1, sim = 1000)
## phi = 0.3, rho = 0.4
res2_3_4 <- endobias(nobs = 2500, phi = 0.3, rho = 0.4, sim = 1000)

## Correlation between X and the error (phi) = 0.1 and
## rho takes values 0.1 and 0.4
## rho = 0.1
results_rho_01_1 <- data.table(phi = c(0.1, 0.1), rho = c(0.1, 0.1),
                               true_value = c(-1, -1),
                               OLS = c(round(mean(res1_1_1$resOLS), 3),
                                       round(mean(res2_1_1$resOLS), 3)),
                               IV = c(round(mean(res1_1_1$resIV), 3),
                                      round(mean(res2_1_1$resIV), 3)),
                               sample_size = c(500, 2500))
results_rho_01_1

## rho = 0.4
results_rho_04_1 <- data.table(phi = c(0.1, 0.1), rho = c(0.4, 0.4),
                               true_value = c(-1, -1),
                               OLS = c(round(mean(res1_1_4$resOLS), 3),
                                       round(mean(res2_1_4$resOLS), 3)),
                               IV = c(round(mean(res1_1_4$resIV), 3),
                                      round(mean(res2_1_4$resIV), 3)),
                               sample_size = c(500, 2500))
results_rho_04_1

## Correlation between X and the error (phi) = 0.3 and
## rho takes values 0.1 and 0.4
## rho = 0.1
results_rho_01_2 <- data.table(phi = c(0.3, 0.3), rho = c(0.1, 0.1),
                               true_value = c(-1, -1),
                               OLS = c(round(mean(res1_3_1$resOLS), 3),
                                       round(mean(res2_3_1$resOLS), 3)),
                               IV = c(round(mean(res1_3_1$resIV), 3),
                                      round(mean(res2_3_1$resIV), 3)),
                               sample_size = c(500, 2500))
results_rho_01_2

## rho = 0.4
results_rho_04_2 <- data.table(phi = c(0.3, 0.3), rho = c(0.4, 0.4),
                               true_value = c(-1, -1),
                               OLS = c(round(mean(res1_3_4$resOLS), 3),
                                       round(mean(res2_3_4$resOLS), 3)),
                               IV = c(round(mean(res1_3_4$resIV), 3),
                                      round(mean(res2_3_4$resIV), 3)),
                               sample_size = c(500, 2500))
results_rho_04_2
##### Appendix B #####
## Bias under Endogeneity

###### Figure 9 #######
## Endogeneity and the bias of the estimates for beta1 and beta2: bias at
## different levels of correlation between the endogenous regressor (P)
## and the error.

set.seed(200)

## Set true parameters
b0_true <- 1
b1_true <- -1
b2_true <- 1
g0_true <- 1
g1_true <- 1
mu <- c(0, 0)

## phi = correlation between the endogenous regressor (P) and the error
## rho = correlation between the exogenous regressor (X) and the endogenous regressor (P)
endobias <- function(nobs, phi, rho, sim) {
  coefOLS1 <- coefOLS2 <- corXP <- corPE <- rep(NA, sim)
  coefIV <- rep(NA, sim)
  R  <- matrix(c(1, 1.8 * rho, 1.8 * rho, 1), 2, 2)
  R1 <- matrix(c(1, 1.8 * phi, 1.8 * phi, 1), 2, 2)
  ## a fresh sample is drawn and the model re-estimated in every replication
  for (i in 1:sim) {
    IV.df <- data.frame(obs.id = as.factor(c(1:nobs)))
    error_x <- rnorm(n = nobs, mean = 0, sd = 1)
    matPX    <- MASS::mvrnorm(nobs, mu, R)  # cor(P, X)
    matPerrx <- MASS::mvrnorm(nobs, mu, R1) # cor(err, P)
    ## Generate exogenous regressor
    IV.df$X  <- matPX[, 1]
    IV.df$X1 <- matPX[, 2]
    OV      <- matPerrx[, 1]
    error_y <- matPerrx[, 2]
    ## Generate endogenous regressor
    IV.df$P <- g0_true + g1_true * IV.df$X1 + OV + error_x
    ## Generate dependent variable
    IV.df$Y <- b0_true + b1_true * IV.df$P + b2_true * IV.df$X + error_y
    ## Run OLS with Y, P and X
    m1 <- lm(Y ~ P + X, data = IV.df)
    coefOLS1[i] <- summary(m1)$coefficients[2]
    coefOLS2[i] <- summary(m1)$coefficients[3]
    corXP[i] <- cor(IV.df$X, IV.df$P)
    corPE[i] <- cor(error_y, IV.df$P)
  }
  list(resOLS1 = coefOLS1, resOLS2 = coefOLS2, resIV = coefIV,
       corxp = corXP, corperr = corPE)
}

res1 <- res2 <- corelationXP <- corelationPE <- matrix(0, 3, 3)
phi <- c(0.1, 0.3, 0.5)
rho <- c(0.1, 0.3, 0.5)
for (i in 1:3) {
  for (j in 1:3) {
    resbias <- endobias(nobs = 2500, phi = phi[i], rho = rho[j], sim = 1000)
    res1[i, j] <- mean(resbias$resOLS1)
    res2[i, j] <- mean(resbias$resOLS2)
    corelationXP[i, j] <- mean(resbias$corxp)
    corelationPE[i, j] <- mean(resbias$corperr)
  }
}

## beta1 - the coefficient of the endogenous regressor
dd_b1 <- rbind(as.matrix(res1[1, ], 3, 1),
               as.matrix(res1[2, ], 3, 1),
               as.matrix(res1[3, ], 3, 1))
colnames(dd_b1) <- c("beta1")
dd_b1 <- cbind(dd_b1, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 3))
dd_b1 <- cbind(dd_b1, corr_P_X = rep(c(0.1, 0.3, 0.5), 3))
dd_b1 <- as.data.frame(dd_b1)

## create graph
Corr_P_error <- factor(dd_b1$corr_P_error)

## beta2 - the coefficient of the covariate
dd_b2 <- rbind(as.matrix(res2[1, ], 3, 1),
               as.matrix(res2[2, ], 3, 1),
               as.matrix(res2[3, ], 3, 1))
colnames(dd_b2) <- c("beta2")
dd_b2 <- cbind(dd_b2, corr_P_error = rep(c(0.1, 0.3, 0.5), each = 3))
dd_b2 <- cbind(dd_b2, corr_P_X = rep(c(0.1, 0.3, 0.5), 3))
dd_b2 <- as.data.frame(dd_b2)

## create graph
Corr_P_error <- factor(dd_b2$corr_P_error)
## beta1 at different correlation levels between the error and the endogenous regressor P
p1 <- ggplot(data = dd_b1, aes(x = corr_P_X, y = beta1,
                               group = corr_P_error, colour = Corr_P_error)) +
  geom_line() +
  theme_grey(base_family = "serif", base_size = 10) +
  ggtitle("Panel A")
p11 <- p1 +
  xlab("Correlation between P and X") +
  ylab("Coefficient estimates for beta1") +
  geom_point(data = as.data.frame(dd_b1[1:3, ]),
             aes(x = corr_P_X, y = beta1, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b1[4:6, ]),
             aes(x = corr_P_X, y = beta1, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b1[7:9, ]),
             aes(x = corr_P_X, y = beta1, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(family = "serif", size = 10)) +
  geom_hline(aes(yintercept = -1, linetype = "dotdash"), show.legend = TRUE) +
  guides(fill = guide_legend(override.aes = list(linetype = "blank"))) +
  scale_linetype_manual(name = "true b1 value", values = "dotdash", labels = "-1") +
  scale_fill_discrete(guide = guide_legend(order = 1)) +
  scale_color_discrete(guide = guide_legend(order = 2)) +
  labs(colour = "corr(P, error)")

p2 <- ggplot(data = dd_b2, aes(x = corr_P_X, y = beta2,
                               group = corr_P_error, colour = Corr_P_error)) +
  geom_line() +
  theme_grey(base_family = "serif", base_size = 10)
p21 <- p2 +
  xlab("Correlation between P and X") +
  ylab("Coefficient estimates for beta2") +
  ggtitle("Panel B") +
  geom_point(data = as.data.frame(dd_b2[1:3, ]),
             aes(x = corr_P_X, y = beta2, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b2[4:6, ]),
             aes(x = corr_P_X, y = beta2, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  geom_point(data = as.data.frame(dd_b2[7:9, ]),
             aes(x = corr_P_X, y = beta2, group = corr_P_error,
                 colour = factor(corr_P_error))) +
  theme(panel.background = element_rect(fill = "white", colour = "black"),
        text = element_text(family = "serif", size = 10)) +
  geom_hline(aes(yintercept = 1, linetype = "dotdash"), show.legend = TRUE) +
  scale_linetype_manual(name = "true b2 value", values = "dotdash", labels = "1") +
  guides(fill = guide_legend(override.aes = list(linetype = "blank"))) +
  scale_fill_discrete(guide = guide_legend(order = 1)) +
  scale_color_discrete(guide = guide_legend(order = 2)) +
  labs(colour = "corr(P, error)")

figb1 <- grid.arrange(p11, p21, ncol = 2)
if (doPDF) ggsave("Figures/jss_3807_appendix_figB1.pdf", figb1)

###### Table 7 #######
## Estimation bias: beta1 - coefficient of P, beta2 - coefficient of X
## estimates for beta1 - coefficient of P
colnames(res1) <- c("rho_0.1", "rho_0.3", "rho_0.5")
rownames(res1) <- c("phi_0.1", "phi_0.3", "phi_0.5")
## estimates for beta2 - coefficient of X
colnames(res2) <- c("rho_0.1", "rho_0.3", "rho_0.5")
rownames(res2) <- c("phi_0.1", "phi_0.3", "phi_0.5")
-1 - res1
1 - res2

##### Appendix C #####

###### Figure 10 #######
## Copula example
library("rgl")
set.seed(100)
m <- 2
n <- 2000
sigma <- matrix(c(1, 0.4, 0.4, 1), nrow = 2)
z <- mvrnorm(n, mu = rep(0, m), Sigma = sigma, empirical = TRUE)
u <- pnorm(z)
x1 <- qt(u[, 1], df = 3)
x2 <- qnorm(u[, 2])
plot3d(x1, x2, pch = 20, col = "navyblue", size = 22,
       xlab = "", ylab = "x1", zlab = "x2")

t_dat <- data.frame(q = x1, cdf = pt(x1, df = 3))
c1 <- ggplot(t_dat) +
  geom_line(aes(x = q, y = cdf)) +
  theme_classic() +
  ylab("Cumulative distribution function of x1") +
  theme(axis.text = element_text(size = 22),
        axis.title = element_text(size = 22))
c1

norm_dat <- data.frame(q = x2, cdf = pnorm(x2))
c2 <- ggplot(norm_dat) +
  geom_line(aes(x = q, y = cdf)) +
  theme_classic() +
  ylab("Cumulative distribution function of x2") +
  theme(axis.text = element_text(size = 22),
        axis.title = element_text(size = 22))
c2
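## The Gaussian copula keeps the dependence structure while allowing arbitrary
## marginals: because x1 and x2 are monotone transforms of the normal scores z,
## their rank correlation matches that of z exactly (a quick check added here):
cor(x1, x2, method = "spearman")
cor(z[, 1], z[, 2], method = "spearman")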
#### Data generation ####

###### Latent IV ######
library("MASS")
set.seed(1410)
## sample size
Tn <- 2500
## correlation matrix
R <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
## mean
m <- c(0, 0)
N <- mvrnorm(Tn, m, R)
eps <- N[, 1]
nu  <- N[, 2]
b0 <- 3
a  <- -1
Z <- rep(c(1, 4, 7, 10, 12), c(100, 400, 1000, 700, 300))
P <- Z + 3 * nu
I <- rep(1, Tn) # the intercept column must exist before y is built
y <- I * b0 + a * P + eps
dataLatentIV <- data.table(y, I, P, Z, eps)

resultsLIV <- latentIV(y ~ P, data = dataLatentIV)
summary(resultsLIV)

### OLS
ols_iv <- lm(y ~ P, data = dataLatentIV)
summary(ols_iv)

##### Higher moments #####
set.seed(1121)
## generate data
Tn <- 2500
rho2 <- 0.5
R <- matrix(c(1, rho2, rho2, 1), 2, 2)
m <- c(0, 0)
N <- mvrnorm(Tn, mu = m, R)
OV <- N[, 1]
e1 <- N[, 2]
I  <- rep(1, Tn)
X3 <- rt(Tn, 3)
e2 <- rt(Tn, 10)
X1 <- rnorm(Tn, 2, 0.1)
X2 <- X1 + 0.5 * rnorm(Tn, 1.5, 0.9)
b0 <- 2
b1 <- 1.5
b2 <- 3
b3 <- 3
a  <- -1
P <- b3 * X3 + e2 + 3 * OV
y <- b0 + b1 * X1 + b2 * X2 + a * P + e1 + OV
X <- cbind(X1, X2)
colnames(X) <- c("X1", "X2")
dataHigherMoments <- data.frame(y, X, P)

### Higher moments
resultsHM <- higherMomentsIV(formula = y ~ X1 + X2 + P | P | IIV(iiv = yp),
                             data = dataHigherMoments)
summary(resultsHM)

### OLS
ols_hm <- lm(y ~ X1 + X2 + P, data = dataHigherMoments)
summary(ols_hm)

###### Heteroskedastic errors #####
set.seed(4333)
N <- 2500
b0 <- 2
b1 <- 1.5
b2 <- 3
a  <- -1
X1 <- rnorm(N, 0, 1)
X2 <- rnorm(N, 0, 1)
U  <- rnorm(N, 0, 1)
S1 <- rnorm(N, 0, 1)
S2 <- rnorm(N, 0, 2)
eps1 <- U + exp(X2) * S1
eps2 <- U + exp(-X2) * S2
P <- b0 + b2 * X2 + eps1
y <- b0 + b1 * X1 + b2 * X2 + a * P + eps2
dataHetIV <- as.data.frame(cbind(y, X1, X2, P))

### Heteroskedastic errors
resultsHetIV <- hetErrorsIV(y ~ X1 + X2 + P | P | IIV(X2), data = dataHetIV)
summary(resultsHetIV)

### OLS
ols_het <- lm(y ~ X1 + X2 + P, data = dataHetIV)
summary(ols_het)
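## The identifying assumption of hetErrorsIV is heteroskedasticity of the
## first-stage errors with respect to the exogenous regressors (eps1 depends on
## exp(X2) by construction). A Breusch-Pagan test on the first stage, added as
## a quick diagnostic sketch (lmtest is attached as a dependency of AER):
lmtest::bptest(lm(P ~ X1 + X2, data = dataHetIV), ~ X2, data = dataHetIV)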
##### Copula correction ####

###### 1. One continuous endogenous variable #######
## Creates a dataset with 1 continuous, non-normally distributed endogenous
## variable. The dependent variable is y, there are two exogenous regressors,
## X1 and X2, and one endogenous regressor P which has a t-distribution with
## 3 degrees of freedom.
set.seed(1110)
## sample size
Tn <- 2500
## corr(eps, P) = rho1
rho1 <- 0.33
## correlation matrix
R <- matrix(c(1, rho1, rho1, 1), 2, 2)
## mean
m <- c(0, 0)
b0 <- 2
b1 <- 1.5
b2 <- -3
a.1 <- -1
I <- rep(1, Tn)
X1 <- rnorm(Tn, 4, 2.3)
X2 <- rnorm(Tn, 2, 1.6)
## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)
P <- N[, 1]
eps <- N[, 2]
## P has a t-distribution with 3 df
P <- qt(pnorm(P), 3)
y <- b0 * I + b1 * X1 + b2 * X2 + a.1 * P + eps
dataCopCont <- as.data.frame(cbind(y, I, X1, X2, P))

### Copula - 1 continuous
set.seed(1002)
resultsCC1 <- copulaCorrection(formula = y ~ X1 + X2 + P | continuous(P),
                               data = dataCopCont, num.boots = 50,
                               verbose = FALSE)
summary(resultsCC1)

### OLS
ols_copcont <- lm(y ~ X1 + X2 + P, data = dataCopCont)
summary(ols_copcont)

###### 2. One discrete endogenous variable ####
## create dataset with 1 discrete endogenous variable with a Poisson distribution
set.seed(42100)
## sample size
Tn <- 2500
## corr(eps, P) = rho
rho <- 0.33
## correlation matrix
R <- matrix(c(1, rho, rho, 1), 2, 2)
## mean
m <- c(0, 0)
b0 <- 2
b1 <- 1.5
b2 <- -3
a <- -1
I <- rep(1, Tn)
X.1 <- rnorm(Tn, 4, 2.3)
X.2 <- X.1 + 0.5 * rnorm(Tn, 2, 1.6)
## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)
P <- N[, 1]
eps <- N[, 2]
X1 <- X.1
X2 <- X.2
## P has a Poisson distribution with lambda = 5
P <- qpois(pnorm(P), 5)
y <- b0 + b1 * X1 + b2 * X2 + a * P + eps
X <- cbind(X1, X2)
dataCopDis <- as.data.frame(cbind(y, I, X, P))

### Copula - discrete
## with intercept
set.seed(1003)
resultsCC2 <- copulaCorrection(formula = y ~ X1 + X2 + P | discrete(P),
                               data = dataCopDis)
summary(resultsCC2)

### OLS
ols_copdis <- lm(y ~ X1 + X2 + P, data = dataCopDis)
summary(ols_copdis)

###### 3. One continuous and one discrete endogenous variable ####
## create dataset with one discrete and one continuous endogenous variable,
## both non-normally distributed
set.seed(42102)
## sample size
Tn <- 2500
rho12 <- 0.25 # correlation btw P1 and P2
rho1a <- 0.33 # correlation btw P1 and error
rho2a <- 0.25 # correlation btw P2 and error
## correlation matrix
R <- matrix(c(1,     rho12, rho1a,
              rho12, 1,     rho2a,
              rho1a, rho2a, 1), 3, 3)
## mean
m <- c(0, 0, 0)
b0 <- 2
b1 <- 1.5
b2 <- -3
a.1 <- -1
a.2 <- 0.8
I <- rep(1, Tn)
X1 <- rnorm(Tn, 4, 2.3)
X2 <- X1 + 0.5 * rnorm(Tn, 2, 1.6)
## generate multivariate normal
N <- mvrnorm(Tn, mu = m, Sigma = R)
P1 <- N[, 1]
P2 <- N[, 2]
eps <- N[, 3]
## P1 has a Poisson distribution with lambda = 3
P1 <- qpois(pnorm(P1), 3)
## P2 has a t-distribution with 3 df
P2 <- qt(pnorm(P2), 3)
y <- b0 + b1 * X1 + b2 * X2 + a.1 * P1 + a.2 * P2 + eps
dataCopDisCont <- as.data.frame(cbind(y, I, X1, X2, P1, P2))

### Copula - 1 discrete + 1 continuous
set.seed(1004)
resultsCC3 <- copulaCorrection(formula = y ~ X1 + X2 + P1 + P2 |
                                 discrete(P1) + continuous(P2),
                               data = dataCopDisCont)
summary(resultsCC3)

### OLS
ols_copdis2 <- lm(y ~ X1 + X2 + P1 + P2, data = dataCopDisCont)
summary(ols_copdis2)
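## Quick check, added here as a sketch: both generated endogenous regressors
## are correlated with the structural error eps, which is exactly the
## endogeneity the copula correction has to remove:
cor(dataCopDisCont$P1, eps)
cor(dataCopDisCont$P2, eps)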
####### Multilevel #####
set.seed(43410)

#### 3-levels ####

#### functions used #########
dmat <- cmpfun(function(i) {
  j <- length(i)
  n <- sum(i)
  index <- cbind(start = cumsum(c(1, i[-j])), stop = cumsum(i))
  H <- matrix(0, nrow = n, ncol = j)
  for (i in 1:j) {
    H[index[i, 1]:index[i, 2], i] <- 1L
  }
  return(H)
})
mycut <- cmpfun(function(x, p) {
  cut(x = x, breaks = quantile(x, probs = p),
      labels = FALSE, include.lowest = TRUE)
})

## the coefficients to be recovered
coefGMM <- c(70, 3, 9, -2, 2, -2, -4, -3, 5, 2, 0.5, 0.1, -0.5)

## variance-covariance matrix for level 3 variables in the original dataset
R3_31 <- -0.30
R3_32 <- -0.22
R3_33 <- -0.10
R3_34 <- 0.23
R3_35 <- -0.15
R3_36 <- 0.19
R3 <- matrix(c(1,     R3_31, R3_32, R3_34,
               R3_31, 1,     R3_33, R3_35,
               R3_32, R3_33, 1,     R3_36,
               R3_34, R3_35, R3_36, 1), 4, 4)
R3 <- cov2cor(make.positive.definite(R3))

## variance-covariance matrix for level 2 variables in the original dataset
R2_12 <- -0.05
R2_13 <- 0.006
R2_14 <- 0.02
R2_23 <- -0.75
R2_24 <- -0.15
R2_34 <- -0.10
R2 <- matrix(c(1,     R2_12, R2_13, R2_14,
               R2_12, 1,     R2_23, R2_24,
               R2_13, R2_23, 1,     R2_34,
               R2_14, R2_24, R2_34, 1), 4, 4)
R2 <- cov2cor(make.positive.definite(R2))

## variance-covariance matrix for level 1 variables in the original dataset
R1_12 <- 0.015
R1_13 <- 0.15
R1_14 <- 0.05
R1_23 <- 0.10
R1_24 <- 0.01
R1_34 <- 0.10
R1 <- matrix(c(1,     R1_12, R1_13, R1_14,
               R1_12, 1,     R1_23, R1_24,
               R1_13, R1_23, 1,     R1_34,
               R1_14, R1_24, R1_34, 1), 4, 4)
R1 <- cov2cor(make.positive.definite(R1))

mu <- list(int  = 0,
           lev1 = c(X11 = 1.60, X12 = 0.056338, X13 = 0.06760563, X14 = 0.763122),
           lev2 = c(X21 = 0.4688129),
           lev3 = c(X32 = 20.628, X31 = 12.2953, X33 = 94.84266))

k <- 40                     ## total number of schools
n <- sample(29:39, k, TRUE) ## number of students within each school
j <- sum(n)                 ## total number of students
N <- sample(1:5, j, TRUE, prob = c(0.415833333, 0.246666667, 0.191666667,
                                   0.142500000, 0.003333333))
i <- sum(N)

## %%%%%%%%%%%%%%%
## school variables
## %%%%%%%%%%%%%%%
X3 <- as.data.table(mvrnorm(n = k, mu = c(mu$lev3, mu$int), Sigma = R3))
X3[, SID := 1:k]
X3 <- within(X3, {
  X32 <- floor(X32 - abs(V4))
  X31 <- floor(X31 + rtnorm(k, mean = mu$lev3["X31"] - 0.19 * X32,
                            sd = 3.195407, lower = 5, upper = 25))
  X33 <- floor(rtnorm(k, mean = mu$lev3["X33"] + 0.19 * X32 - 0.10 * X31,
                      sd = 8.562959, lower = 30, upper = 98))
})
X3[, err3 := rnorm(k, 0, 1)]
X3[, OVL3 := rnorm(k, 8, 2.5) + 0.8 * abs(X3$V4)]
S <- dmat(N) %*% dmat(n) %*% as.matrix(X3)

## %%%%%%%%%%%%%%%
## child variables
## %%%%%%%%%%%%%%%
X21 <- rnorm(n = j, mu$lev2, 0.23)
X2 <- as.data.frame(X21)
RACE <- sample(0:3, j, TRUE, prob = c(0.02, 0.52, 0.35, 0.11))
Race <- dummy_cols(RACE)
X24 <- Race[, 1]
X22 <- Race[, 2]
X23 <- Race[, 3]
b2 <- cbind(CID = 1:j,
            CRC = rlnorm(j, mean = mu$int, sd = 0.1579),
            err2 = rnorm(j, 0, 1))
X2 <- within(X2, {
  X21 <- mycut(X2[, "X21"], c(0, 0.53, 1)) - 1
})
b21 <- cbind(b2, X2, X24, X22, X23)
C <- dmat(N) %*% as.matrix(b21)
dt <- as.data.table(cbind(C, S))
dt[, X21 := mycut(X21 + 0.8 * X31, c(0, 0.53, 1)) - 1]
dt[, X22 := mycut(X22 - 0.5 * X31, c(0, 0.52, 1)) - 1]
library("plyr")
dt <- rename(dt, c("V4" = "SRC"))
setkeyv(dt, cols = c("CID", "SID"))

X1 <- as.data.frame(mvrnorm(n = i, mu = mu$lev1, Sigma = R1))
X1 <- within(X1, {
  X12 <- mycut(as.numeric(unlist(X1[, 2] + 0.1 * dt[, "SRC"] +
                                   0.25 * dt[, "CRC"])), c(0, 0.99467, 1)) - 1
  X13 <- mycut(as.numeric(unlist(X1[, 3] + 0.1 * dt[, "SRC"] +
                                   0.23 * dt[, "CRC"])), c(0, 0.943, 1)) - 1
  X14 <- (X1[, 4] - min(X1[, 4])) / (max(X1[, 4]) - min(X1[, 4]))
})
dt[, err1 := rnorm(i, 0, 1)]
dt[, GRADE := 1:.N, by = c("CID", "SID")]
dt[, X11 := GRADE - 1]
dt[, X14 := X1[, "X14"] - 0.02 * X31 + 0.05 * X32 + 0.0008 * X33 +
     rnorm(i, 0, 1)]
dt[, X13 := X1[, "X13"] + 0.02 * X22 - 0.02 * X21 + 0.03 * X24 -
     0.05 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
dt[, X12 := X1[, "X12"] + 0.4 * X22 - 0.25 * X21 + 0.2 * X24 -
     0.04 * X31 + 0.05 * X32 + 0.0008 * X33 + rnorm(i, 0, 1)]
dt[, Intercept := rep(1, i)]
dt[, c("SRC", "CRC") := NULL]
dt[, GRADE := NULL]

## create random slope
sigma1 <- matrix(c(1, 0.25, 0.25, 1), 2, 2)
U <- as.data.frame(mvrnorm(n = i, mu = c(0, 0), Sigma = sigma1))
dt[, X15 := (0.2 * X21 + 0.03 * X22 + 0.04 * X23) + rnorm(i, 0, 1) +
     dt[, "err2"]]
dt1 <- data.table::copy(dt)
dt[, c("SID", "CID") := NULL]
setcolorder(dt, c("Intercept", "X11", "X12", "X13", "X14", "X21", "X22",
                  "X23", "X24", "X31", "X32", "X33", "OVL3", "X15",
                  "err1", "err2", "err3"))

## dependent variable
dt1[, TLI := matrix(coefGMM %*% t(dt[, 1:13]), dim(dt)[1], 1) +
      (-1) * dt[, "X15"] + (-.5) * U[, 1] * dt[, "X11"] + 0.05 * U[, 2] +
      dt[, "err1"] + dt[, "err2"] + dt[, "err3"]]
setcolorder(dt1, c("Intercept", "SID", "CID", "TLI", "X11", "X12", "X13",
                   "X14", "X21", "X22", "X23", "X24", "X31", "X32", "X33",
                   "OVL3", "X15", "err1", "err2", "err3"))
dt1[, c("OVL3", "err1", "err2", "err3") := NULL]
colnames(dt1) <- c("Intercept", "SID", "CID", "y", "X11", "X12", "X13", "X14", "X21", "X22", "X23", "X24", "X31", "X32", "X33", "X15") dataMultilevelIV <- dt1