## replication file for the singleRcapture 1.0.0 version ## codes are exactly the same as in the paper options(prompt = 'R> ', continue = '+ ') # ?ztpoisson # install.packages("singleRcapture") library("singleRcapture") data("netherlandsimmigrant", package = "singleRcapture") head(netherlandsimmigrant) summary(netherlandsimmigrant) table(netherlandsimmigrant$capture) basicModel <- estimatePopsize( formula = capture ~ gender + age + nation, model = ztpoisson(), data = netherlandsimmigrant, controlMethod = controlMethod(silent = TRUE) ) summary(basicModel) set.seed(123456) modelInflated <- estimatePopsize( formula = capture ~ nation, model = ztoigeom(omegaLink = "cloglog"), data = netherlandsimmigrant, controlModel = controlModel( omegaFormula = ~ gender + age ), popVar = "bootstrap", controlPopVar = controlPopVar(bootType = "semiparametric") ) summary(modelInflated) popSizeEst(basicModel) # alternative: basicModel$populationSize popSizeEst(modelInflated) # alternative: modelInflated$populationSize library("lmtest") lrtest(basicModel, modelInflated, name = function(x) { if (family(x)$family == "ztpoisson") "Basic model" else "Inflated model" }) margFreq <- marginalFreq(basicModel) summary(margFreq, df = 1, dropl5 = "group") margFreq_inf <- marginalFreq(modelInflated) summary(margFreq_inf, df = 1, dropl5 = "group") plot( basicModel, plotType = "rootogram", main = "ZT Poisson model") plot(modelInflated, plotType = "rootogram", main = "ZTOI Geometric model") # dev.off() dfb <- dfbeta(basicModel) round(t(apply(dfb, 2, quantile)*100), 4) dfi <- dfbeta(modelInflated) round(t(apply(dfi, 2, quantile)*100), 4) dfb_pop <- dfpopsize(basicModel, dfbeta = dfb) dfi_pop <- dfpopsize(modelInflated, dfbeta = dfi) summary(dfb_pop) summary(dfi_pop) plot(basicModel, plotType = "dfpopContr", dfpop = dfb_pop, xlim = c(-4500, 150)) plot(modelInflated, plotType = "dfpopContr", dfpop = dfi_pop, xlim = c(-4500, 150)) # dev.off() # ?plot.singleRStaticCountData popSizestrata <- stratifyPopsize(basicModel) cols <- c("name", "Observed", "Estimated", "logNormalLowerBound", "logNormalUpperBound") popSizestrata_report <- popSizestrata[, cols] cols_custom <- c("Name", "Obs", "Estimated", "LowerBound", "UpperBound") names(popSizestrata_report) <- cols_custom popSizestrata_report popSizestrata_inflated <- stratifyPopsize(modelInflated) popSizestrata_inflated_report <- popSizestrata_inflated[, cols] names(popSizestrata_inflated_report) <- cols_custom popSizestrata_inflated_report library(sandwich) popSizestrataCustom <- stratifyPopsize( object = basicModel, strata = ~ gender + age, alpha = rep(c(0.1, 0.05), each=2), cov = vcovHC(basicModel, type = "HC4") ) popSizestrataCustom_report <- popSizestrataCustom[, c(cols, "confLevel")] names(popSizestrataCustom_report) <- c(cols_custom, "alpha") popSizestrataCustom_report # list( # "Stratum 1" = netherlandsimmigrant$gender == "male" & # netherlandsimmigrant$nation == "Suriname", # "Stratum 2" = netherlandsimmigrant$gender == "female" & # netherlandsimmigrant$nation == "North Africa" # ) par(mar = c(2.5, 8.5, 4.1, 2.5), cex.main = .7, cex.lab = .6) plot(basicModel, plotType = "strata") plot(modelInflated, plotType = "strata") # dev.off() (popEst <- popSizeEst(basicModel)) coef(summary(basicModel)) set.seed(1234567890) N <- 10000 gender <- rbinom(N, 1, 0.2) eta <- -1 + 0.5*gender counts <- simulate(ztpoisson(), eta = cbind(eta), seed = 1) summary(data.frame(gender, eta, counts)) # estimatePopsize( # TOTAL_SUB ~ ., # data = farmsubmission, # model = oiztgeom(), # controlModel( # omegaFormula = ~ 1 + log_size + C_TYPE # ) # ) X <- matrix(data = 0, nrow = 2 * NROW(farmsubmission), ncol = 7) X[1:NROW(farmsubmission), 1:4] <- model.matrix( ~ 1 + log_size + log_distance + C_TYPE, farmsubmission ) X[-(1:NROW(farmsubmission)), 5:7] <- model.matrix( ~ 1 + log_distance + C_TYPE, farmsubmission ) attr(X, "hwm") <- c(4, 3) start <- glm.fit( y = farmsubmission$TOTAL_SUB, x = X[1:NROW(farmsubmission), 1:4], family = poisson() )$coefficients start res <- estimatePopsizeFit( y = farmsubmission$TOTAL_SUB, X = X, method = "IRLS", priorWeights = 1, family = oiztgeom(), control = controlMethod(silent = TRUE), coefStart = c(start, 0, 0, 0), etaStart = matrix(X %*% c(start, 0, 0, 0), ncol = 2), offset = cbind(rep(0, NROW(farmsubmission)), rep(0, NROW(farmsubmission))) ) ll <- oiztgeom()$makeMinusLogLike(y = farmsubmission$TOTAL_SUB, X = X) res2 <- estimatePopsizeFit( y = farmsubmission$TOTAL_SUB, X = X, method = "optim", priorWeights = 1, family = oiztgeom(), coefStart = c(start, 0, 0, 0), control = controlMethod(silent = TRUE, maxiter = 10000), offset = cbind(rep(0, NROW(farmsubmission)), rep(0, NROW(farmsubmission))) ) data.frame(IRLS = round(c(res$beta, -ll(res$beta), res$iter), 4), optim = round(c(res2$beta, -ll(res2$beta), res2$iter[1]), 4)) # Implementing a custom \pkg{singleRcapture} family function {short-title="Implementing custom singleRcapture family function"} #Suppose we want to implement a very specific zero truncated family function as presented in Appendix B myFamilyFunction <- function(lambdaLink = c("logit", "cloglog", "probit"), piLink = c("logit", "cloglog", "probit"), ...) { if (missing(lambdaLink)) lambdaLink <- "logit" if (missing(piLink)) piLink <- "logit" links <- list() attr(links, "linkNames") <- c(lambdaLink, piLink) lambdaLink <- switch(lambdaLink, "logit" = singleRcapture:::singleRinternallogitLink, "cloglog" = singleRcapture:::singleRinternalcloglogLink, "probit" = singleRcapture:::singleRinternalprobitLink ) piLink <- switch(piLink, "logit" = singleRcapture:::singleRinternallogitLink, "cloglog" = singleRcapture:::singleRinternalcloglogLink, "probit" = singleRcapture:::singleRinternalprobitLink ) links[1:2] <- c(lambdaLink, piLink) mu.eta <- function(eta, type = "trunc", deriv = FALSE, ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 if (!deriv) { switch (type, "nontrunc" = pi + 2 * lambda, "trunc" = 1 + lambda / (pi + lambda) ) } else { # Only necessary if one wishes to use standard errors in predict method switch (type, "nontrunc" = { matrix(c(2, 1) * c( lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2, piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2 ), ncol = 2) }, "trunc" = { matrix(c( pi / (pi + lambda) ^ 2, -lambda / (pi + lambda) ^ 2 ) * c( lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2, piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2 ), ncol = 2) } ) } } variance <- function(eta, type = "nontrunc", ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 switch (type, "nontrunc" = pi * (1 - pi) + 4 * lambda * (1 - lambda - pi), "trunc" = lambda * (1 - lambda) / (pi + lambda) ) } Wfun <- function(prior, y, eta, ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 G01 <- ((lambda + pi) ^ (-2)) * piLink(eta[, 2], inverse = TRUE, deriv = 1) * lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) * prior / 4 G00 <- ((lambda + pi) ^ (-2)) - (pi ^ (-2)) - lambda / ((lambda + pi) * (pi ^ 2)) G00 <- G00 * prior * (piLink(eta[, 2], inverse = TRUE, deriv = 1) ^ 2) / 4 G11 <- ((lambda + pi) ^ (-2)) - (((lambda + pi) * lambda) ^ -1) G11 <- G11 * prior * (lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) ^ 2) / 4 matrix( -c(G11, # lambda G01, # mixed G01, # mixed G00 # pi ), dimnames = list(rownames(eta), c("lambda", "mixed", "mixed", "pi")), ncol = 4 ) } funcZ <- function(eta, weight, y, prior, ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 weight <- weight / prior G0 <- (2 - y) / pi - ((lambda + pi) ^ -1) G1 <- (y - 1) / lambda - ((lambda + pi) ^ -1) G1 <- G1 * lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2 G0 <- G0 * piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2 uMatrix <- matrix(c(G1, G0), ncol = 2) weight <- lapply(X = 1:nrow(weight), FUN = function (x) { matrix(as.numeric(weight[x, ]), ncol = 2) }) pseudoResid <- sapply(X = 1:length(weight), FUN = function (x) { #xx <- chol2inv(chol(weight[[x]])) # less computationally demanding xx <- solve(weight[[x]]) # more stable xx %*% uMatrix[x, ] }) pseudoResid <- t(pseudoResid) dimnames(pseudoResid) <- dimnames(eta) pseudoResid } minusLogLike <- function(y, X, offset, weight = 1, NbyK = FALSE, vectorDer = FALSE, deriv = 0, ...) { y <- as.numeric(y) if (is.null(weight)) { weight <- 1 } if (missing(offset)) { offset <- cbind(rep(0, NROW(X) / 2), rep(0, NROW(X) / 2)) } if (!(deriv %in% c(0, 1, 2))) stop("Only score function and derivatives up to 2 are supported.") deriv <- deriv + 1 switch (deriv, function(beta) { eta <- matrix(as.matrix(X) %*% beta, ncol = 2) + offset pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 -sum(weight * ((2 - y) * log(pi) + (y - 1) * log(lambda) - log(pi + lambda))) }, function(beta) { eta <- matrix(as.matrix(X) %*% beta, ncol = 2) + offset pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 G0 <- (2 - y) / pi - ((lambda + pi) ^ -1) G1 <- (y - 1) / lambda - ((lambda + pi) ^ -1) G1 <- G1 * weight * lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2 G0 <- G0 * weight * piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2 if (NbyK) { XX <- 1:(attr(X, "hwm")[1]) return(cbind(as.data.frame(X[1:nrow(eta), XX]) * G1, as.data.frame(X[-(1:nrow(eta)), -XX]) * G0)) } if (vectorDer) { return(cbind(G1, G0)) } as.numeric(c(G1, G0) %*% X) }, function (beta) { lambdaPredNumber <- attr(X, "hwm")[1] eta <- matrix(as.matrix(X) %*% beta, ncol = 2) + offset pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 res <- matrix(nrow = length(beta), ncol = length(beta), dimnames = list(names(beta), names(beta))) # pi^2 derivative dpi <- (2 - y) / pi - (lambda + pi) ^ -1 G00 <- ((lambda + pi) ^ (-2)) - (2 - y) / (pi ^ 2) G00 <- t(as.data.frame(X[-(1:(nrow(X) / 2)), -(1:lambdaPredNumber)] * (G00 * ((piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2) ^ 2) + dpi * piLink(eta[, 2], inverse = TRUE, deriv = 2) / 2) * weight)) %*% as.matrix(X[-(1:(nrow(X) / 2)), -(1:lambdaPredNumber)]) # mixed derivative G01 <- (lambda + pi) ^ (-2) G01 <- t(as.data.frame(X[1:(nrow(X) / 2), 1:lambdaPredNumber]) * G01 * (lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2) * (piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2) * weight) %*% as.matrix(X[-(1:(nrow(X) / 2)), -(1:lambdaPredNumber)]) # lambda^2 derivative G11 <- ((lambda + pi) ^ (-2)) - (y - 1) / (lambda ^ 2) dlambda <- (y - 1) / lambda - ((lambda + pi) ^ -1) G11 <- t(as.data.frame(X[1:(nrow(X) / 2), 1:lambdaPredNumber] * (G11 * ((lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2) ^ 2) + dlambda * lambdaLink(eta[, 1], inverse = TRUE, deriv = 2) / 2) * weight)) %*% X[1:(nrow(X) / 2), 1:lambdaPredNumber] res[-(1:lambdaPredNumber), -(1:lambdaPredNumber)] <- G00 res[1:lambdaPredNumber, 1:lambdaPredNumber] <- G11 res[1:lambdaPredNumber, -(1:lambdaPredNumber)] <- t(G01) res[-(1:lambdaPredNumber), 1:lambdaPredNumber] <- G01 res } ) } validmu <- function(mu) { (sum(!is.finite(mu)) == 0) && all(0 < mu) && all(2 > mu) } # this is optional devResids <- function(y, eta, wt, ...) { 0 } pointEst <- function (pw, eta, contr = FALSE, ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 N <- pw / (lambda + pi) if(!contr) { N <- sum(N) } N } popVar <- function (pw, eta, cov, Xvlm, ...) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 bigTheta1 <- -pw / (pi + lambda) ^ 2 # w.r to pi bigTheta1 <- bigTheta1 * piLink(eta[, 2], inverse = TRUE, deriv = 1) / 2 bigTheta2 <- -pw / (pi + lambda) ^ 2 # w.r to lambda bigTheta2 <- bigTheta2 * lambdaLink(eta[, 1], inverse = TRUE, deriv = 1) / 2 # w.r to lambda bigTheta <- t(c(bigTheta2, bigTheta1) %*% Xvlm) f1 <- t(bigTheta) %*% as.matrix(cov) %*% bigTheta f2 <- sum(pw * (1 - pi - lambda) / ((pi + lambda) ^ 2)) f1 + f2 } dFun <- function (x, eta, type = c("trunc", "nontrunc")) { if (missing(type)) type <- "trunc" pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 switch (type, "trunc" = { (pi * as.numeric(x == 1) + lambda * as.numeric(x == 2)) / (pi + lambda) }, "nontrunc" = { (1 - pi - lambda) * as.numeric(x == 0) + pi * as.numeric(x == 1) + lambda * as.numeric(x == 2) } ) } simulate <- function(n, eta, lower = 0, upper = Inf) { pi <- piLink(eta[, 2], inverse = TRUE) / 2 lambda <- lambdaLink(eta[, 1], inverse = TRUE) / 2 CDF <- function(x) { ifelse(x == Inf, 1, ifelse(x < 0, 0, ifelse(x < 1, 1 - pi - lambda, ifelse(x < 2, 1 - lambda, 1)))) } lb <- CDF(lower) ub <- CDF(upper) p_u <- stats::runif(n, lb, ub) sims <- rep(0, n) cond <- CDF(sims) <= p_u while (any(cond)) { sims[cond] <- sims[cond] + 1 cond <- CDF(sims) <= p_u } sims } getStart <- expression( if (method == "IRLS") { etaStart <- cbind( family$links[[1]](mean(observed == 2) * (1 + 0 * (observed == 2))), # lambda family$links[[2]](mean(observed == 1) * (1 + 0 * (observed == 1))) # pi ) + offset } else if (method == "optim") { init <- c( family$links[[1]](weighted.mean(observed == 2, priorWeights) * 1 + .0001), family$links[[2]](weighted.mean(observed == 1, priorWeights) * 1 + .0001) ) if (attr(terms, "intercept")) { coefStart <- c(init[1], rep(0, attr(Xvlm, "hwm")[1] - 1)) } else { coefStart <- rep(init[1] / attr(Xvlm, "hwm")[1], attr(Xvlm, "hwm")[1]) } if ("(Intercept):pi" %in% colnames(Xvlm)) { coefStart <- c(coefStart, init[2], rep(0, attr(Xvlm, "hwm")[2] - 1)) } else { coefStart <- c(coefStart, rep(init[2] / attr(Xvlm, "hwm")[2], attr(Xvlm, "hwm")[2])) } } ) structure( list( makeMinusLogLike = minusLogLike, densityFunction = dFun, links = links, mu.eta = mu.eta, valideta = function (eta) {TRUE}, variance = variance, Wfun = Wfun, funcZ = funcZ, devResids = devResids, validmu = validmu, pointEst = pointEst, popVar = popVar, family = "myFamilyFunction", etaNames = c("lambda", "pi"), simulate = simulate, getStart = getStart, extraInfo = c( mean = "pi / 2 + lambda", variance = paste0("(pi / 2) * (1 - pi / 2) + 2 * lambda * (1 - lambda / 2 - pi / 2)"), popSizeEst = "(1 - (pi + lambda) / 2) ^ -1", meanTr = "1 + lambda / (pi + lambda)", varianceTr = paste0("lambda * (1 - lambda / 2) / (pi + lambda)") ) ), class = c("singleRfamily", "family") ) } # A quick tests shows us that this implementation in fact works: set.seed(123) Y <- simulate( myFamilyFunction(lambdaLink = "logit", piLink = "logit"), nsim = 1000, eta = matrix(0, nrow = 1000, ncol = 2), truncated = FALSE ) mm <- estimatePopsize( formula = Y ~ 1, data = data.frame(Y = Y[Y > 0]), model = myFamilyFunction(lambdaLink = "logit", piLink = "logit"), # the usual observed information matrix # is ill-suited for this distribution controlPopVar = controlPopVar(covType = "Fisher") ) summary(mm) #where the link functions, such as \code{singleRcapture:::singleRinternalcloglogLink}, are just internal functions in \pkg{singleRcapture} that compute link functions, their inverses and derivatives of both links and inverse links up to the third order: singleRcapture:::singleRinternalcloglogLink # One could, of course, include the code for computing them manually. # Session info sessionInfo()