######################################################################
## Section 6: Timings
######################################################################

######################################################################
## 6.1. Cross validation
######################################################################

## WARNING: TAKES A LONG TIME!

library("mvtnorm")
library("oem")
library("glmnet")
library("ncvreg")
library("gglasso")
library("grpreg")
library("microbenchmark")
set.seed(123)

k.vec <- 10
nvars.vec <- c(50, 100, 250, 500)
nobs.vec <- c(1e5, 1e6)

n.runs <- 10
rho <- 0.5

mcp.pen <- function(t, lambda, gamma) {
    ifelse(abs(t) < lambda * gamma, lambda * (abs(t) - t ^ 2 / (2 * lambda * gamma)),
           (lambda ^ 2) * gamma * 0.5)
}

grp.pen <- function(t, lambda, groups, intercept = FALSE) {
    unique.groups <- unique(groups)
    ng <- length(unique.groups)
    
    pen <- 0
    for (g in 1:ng) {
        grp.idx <- which(groups == unique.groups[g])
        grp.size <- length(grp.idx)
        if (intercept & g == 1) {
        } else {
            pen <- pen + lambda * sqrt(grp.size) * sqrt(sum((t[grp.idx]) ^ 2))
        }
    }
    pen
}

logistic.loss <- function(beta, x, y) {
    xbeta <- x %*% beta
    exb <- exp( xbeta )
    prob1 <- exb / (1 + exb)
    logexb <- log(prob1)
    
    y0 <- 1 - y
    
    logexb0 <- log(1 - prob1)
    
    logL <- as.vector(apply(y %*% logexb + y0 %*% logexb0, 2, sum))
    -logL
}

sq.err.loss <- function(beta, x, y) {
    xbeta <- x %*% beta
    0.5 * sum((y - xbeta) ^ 2) / nrow(x)
}

mcp.logistic <- function(beta, x, y, lambda, gamma) {
    nobs <- NROW(x)
    loss <- logistic.loss(beta, x, y)
    if (sd(x[, 1]) == 0) {
        pen <- sum(mcp.pen(beta[-1], lambda, gamma))
    } else {
        pen <- sum(mcp.pen(beta, lambda, gamma))
    }
    loss / nobs + pen
}

lasso.linear <- function(beta, x, y, lambda) {
    nobs <- NROW(x)
    loss <- sq.err.loss(beta, x, y)
    if (sd(x[, 1]) == 0) {
        pen <- lambda * sum(abs(beta[-1]))
    } else {
        pen <- lambda * sum(abs(beta))
    }
    loss + pen
}

lasso.logistic <- function(beta, x, y, lambda) {
    nobs <- NROW(x)
    loss <- logistic.loss(beta, x, y)
    if (sd(x[, 1]) == 0) {
        pen <- lambda * sum(abs(beta[-1]))
    } else {
        pen <- lambda * sum(abs(beta))
    }
    loss / nobs + pen
}

grp.logistic <- function(beta, x, y, lambda, groups) {
    nobs <- NROW(x)
    loss <- logistic.loss(beta, x, y)
    pen <- grp.pen(beta, lambda, groups, intercept = sd(x[, 1]) == 0)
    loss / nobs + pen
}

mcp.linear <- function(beta, x, y, lambda, gamma) {
    nobs <- NROW(x)
    loss <- sq.err.loss(beta, x, y)
    if (sd(x[, 1]) == 0) {
        pen <- sum(mcp.pen(beta[-1], lambda, gamma))
    } else {
        pen <- sum(mcp.pen(beta, lambda, gamma))
    }
    loss + pen
}

grp.linear <- function(beta, x, y, lambda, groups) {
    nobs <- NROW(x)
    loss <- sq.err.loss(beta, x, y)
    pen <- grp.pen(beta, lambda, groups, intercept = sd(x[, 1]) == 0)
    loss + pen
}

check.kkt.grp.linear <- function(beta, x, y, lambda, groups) {
    if (sd(x[, 1]) == 0) {
        intercept <- TRUE
        groups <- c(-1, groups)
    } else {
        intercept <- FALSE
    }
    y <- 2 * y - 1
    n <- NROW(y)
    xbeta <- drop(x %*% beta)
    neg.grad.loglik <- mean((1 / (1 + exp(drop(y) * xbeta))) * drop(y) * x)
    
    unique.groups <- unique(groups)
    ng <- length(unique.groups)
    
    wk <- kkt <- numeric(ng)
    pen <- 0
    
    for (g in 1:ng) {
        grp.idx <- which(groups == unique.groups[g])
        grp.size <- length(grp.idx)
        if (intercept & g == 1) {
            wk[g] <- 0
        } else {
            wk[g] <- sqrt(grp.size)
        }
        
        beta.g <- beta[grp.idx]
        norm.beta.g <- sqrt(sum((beta.g) ^ 2))
        
        if (norm.beta.g != 0) {
            kkt[g] <- all(-neg.grad.loglik + lambda * wk[g] * beta.g / norm.beta.g == 0)
        } else {
            kkt[g] <- sqrt(sum((neg.grad.loglik) ^ 2)) <= lambda * wk[g]
        }
    }
    kkt
}

res.mat <- array(NA, dim = c(length(k.vec) * length(nvars.vec) * length(nobs.vec), 17))
ct <- 0

loss.diff.mat.oem <- array(0, dim = c(1 * length(nvars.vec) * length(nobs.vec), 6))

for (j in 1:length(nvars.vec)) {
    nvars <- nvars.vec[j]
    sigma <- rho ^ abs(outer(1:nvars, 1:nvars, FUN = "-"))
    for (i in 1:length(nobs.vec)) {
        for (k in 1:length(k.vec)) {
            K <- k.vec[k]
            
            ct <- ct + 1
            nobs <- nobs.vec[i]
            x.unscaled <- rmvnorm(nobs, mean = numeric(nvars), sigma = sigma)
            
            ## need some way to get around the fact that
            ## ncvreg standardizes no matter what and gglasso
            ## does not allow standardization. we need to do
            ## this so comparisons of the loss functions
            ## are on the same scale for all methods
            x <- scale(x.unscaled)
            y <- drop(x %*% c(-0.5, -0.5, 0.5, 0.5, 1, rep(0, nvars - 5))) + rnorm(nobs, sd = 2)
            
            grp.size <- 25
            grps <- rep(1:(nvars/grp.size), each = grp.size)
            
            oefit <- oem(x = x, y = y, tol = 1e-15,
                         penalty = c("lasso", "mcp", "grp.lasso"),
                         groups = grps, gamma = 3, standardize = FALSE)
            lam <- unique(oefit$lambda)[[1]]
            
            ## need to save data so we can use the
            ## exact same datasets and the exact
            ## same tuning parameter values for
            ## the sklearn simulation in python
            write.csv(x, file = paste0("x_nvars", nvars, "_nobs", nobs, ".csv"), row.names = FALSE)
            write.csv(y, file = paste0("y_nvars", nvars, "_nobs", nobs, ".csv"), row.names = FALSE)
            write.csv(lam, file = paste0("lam_nvars", nvars, "_nobs", nobs, ".csv"), row.names = FALSE)
            
            mb <- microbenchmark(oem = {oem_lasso <- oem(x = x, y = y, penalty = "lasso", lambda = lam, tol = 1e-8, standardize = FALSE)},
                                 glmnet = {glmnet_lasso <- glmnet(x = x, y = y, lambda = lam, thresh = 1e-8, standardize = FALSE)},
                                 "cv.oem[lasso]" = {cv_oemlasso <- cv.oem(x = x, y = y, nfolds = K, penalty = "lasso", lambda = lam, tol = 1e-8, standardize = FALSE)},
                                 "xval.oem[lasso]" = {xval_oem_lasso <- xval.oem(x = x, y = y, nfolds = K, penalty = "lasso", lambda = lam, tol = 1e-8, standardize = FALSE)},
                                 "cv.oem[mcp]" = {cv_oem_mcp <- cv.oem(x = x, y = y, nfolds = K, penalty = "mcp", lambda = lam, tol = 1e-8, standardize = FALSE)},
                                 "xval.oem[mcp]" = {xval_oem_mcp <- xval.oem(x = x, y = y, nfolds = K, penalty = "mcp", lambda = lam, gamma = 3, tol = 1e-8, standardize = FALSE)},
                                 "cv.oem[grp.lasso]" = {cv_oem_grp_lasso <- cv.oem(x = x, y = y, nfolds = K, penalty = "grp.lasso", lambda = lam, groups = grps, tol = 1e-8, standardize = FALSE)},
                                 "xval.oem[grp.lasso]" = {xval_oem_grp_lasso <- xval.oem(x = x, y = y, nfolds = K, penalty = "grp.lasso", lambda = lam, groups = grps, tol = 1e-8, standardize = FALSE)},
                                 "cv.oem[all]" = {cv_oem_all <- cv.oem(x = x, y = y, nfolds = K, penalty = c("lasso", "mcp", "grp.lasso"), lambda = lam, gamma = 3, groups = grps, tol = 1e-8, standardize = FALSE)},
                                 "xval.oem[all]" = {xval_oem_all <- xval.oem(x = x, y = y, nfolds = K, penalty = c("lasso", "mcp", "grp.lasso"), lambda = lam, gamma = 3, groups = grps, tol = 1e-8, standardize = FALSE)},
                                 "cv.glmnet[lasso]" = {cv_glmnet_lasso <- cv.glmnet(x = x, y = y, nfolds = K, lambda = lam, thresh = 1e-8, standardize = FALSE)},
                                 "cv.ncvreg[mcp]" = {cv_ncvreg_mcp <- cv.ncvreg(X = x, y = y, nfolds = K, lambda = lam, gamma = 3, eps = 1e-8)},
                                 "cv.grpreg[grp.lasso]" = {cv_grpreg_grp_lasso <- cv.grpreg(X = x, y = y, nfolds = K, lambda = lam, group = grps, eps = 1e-8)},
                                 "cv.gglasso[grp.lasso]" = {cv_gglasso_grp_lasso <- cv.gglasso(x = x, y = y, nfolds = K, lambda = lam, group = grps, eps = 1e-8)},
                                 times = n.runs,
                                 unit = "s")
            
            
            mcp.ncvreg_lf <- sapply(1:length(lam), function(ii) mcp.linear(cv_ncvreg_mcp$fit$beta[,ii],
                                                                           cbind(1, x), y, lam[ii], 3))
            mcp.oem_lf <- sapply(1:length(lam), function(ii) mcp.linear(cv_oem_mcp$oem.fit$beta[[1]][,ii],
                                                                        cbind(1, x), y, lam[ii], 3))
            
            ggbeta <- rbind(cv_gglasso_grp_lasso$gglasso.fit$b0, cv_gglasso_grp_lasso$gglasso.fit$beta)
            
            grpreg_lf <- sapply(1:length(lam), function(ii) grp.linear(cv_grpreg_grp_lasso$fit$beta[,ii],
                                                                       cbind(1, x), y, lam[ii], grps))
            oem_lf <- sapply(1:length(lam), function(ii) grp.linear(cv_oem_grp_lasso$oem.fit$beta[[1]][,ii],
                                                                    cbind(1, x), y, lam[ii], grps))
            gglasso_lf <- sapply(1:length(lam), function(ii) grp.linear(ggbeta[,ii],
                                                                        cbind(1, x), y, lam[ii], grps))
            oem.m_lf <- sapply(1:length(lam), function(ii) grp.logistic(oefit$beta[[3]][,ii],
                                                                        cbind(1, x), y, lam[ii], grps))
            
            glbeta <- rbind(glmnet_lasso$a0, as.matrix(glmnet_lasso$beta))
            
            lasso.glmnet_lf <- sapply(1:length(lam), function(ii)
                lasso.linear(glbeta[, ii], cbind(1, x), y, lam[ii]))
            lasso.oem_lf <- sapply(1:length(lam), function(ii)
                lasso.linear(oem_lasso$beta[[1]][, ii], cbind(1, x), y, lam[ii]))
            lasso.oem.m_lf <- sapply(1:length(lam), function(ii)
                lasso.linear(oefit$beta[[1]][, ii], cbind(1, x), y, lam[ii]))
            
            ## LASSO
            loss.diff.mat.oem[ct, 3] <- loss.diff.mat.oem[ct, 3] + mean(lasso.oem_lf - lasso.glmnet_lf)
            
            ## grp lasso
            loss.diff.mat.oem[ct, 4] <- loss.diff.mat.oem[ct, 4] + mean(oem_lf - gglasso_lf)
            
            ## grpreg minimizes a slightly different loss function
            ## (orthogonalization occurs within groups)
            ## and thus the results are not comparable.
            loss.diff.mat.oem[ct, 5] <- loss.diff.mat.oem[ct, 5] + mean(oem_lf - grpreg_lf)
            
            ## MCP
            loss.diff.mat.oem[ct, 6] <- loss.diff.mat.oem[ct, 6] + mean(mcp.oem_lf - mcp.ncvreg_lf)
            
            sum.mb <- summary(mb)
            
            names <- levels(sum.mb$expr)[sum.mb$expr]
            
            if (ct == 1) {
                colnames(res.mat) <- c("nfolds", "nobs", "nvars", names)
                colnames(loss.diff.mat.oem) <- c("nobs", "nvars", "glmnet",
                                                 "gglasso", "grpreg", "ncvreg")
            }
            res.mat[ct, 1:3] <- c(K, nobs, nvars)
            res.mat[ct, 4:(length(sum.mb$mean) + 3)] <- sum.mb$mean
            
            print("times:")
            print(res.mat[ct, ])
            print(loss.diff.mat.oem[ct, ])
        }
        
        ## average over choices of nfolds
        loss.diff.mat.oem[ct,3] <- loss.diff.mat.oem[ct,3] / length(k.vec)
        loss.diff.mat.oem[ct,4] <- loss.diff.mat.oem[ct,4] / length(k.vec)
        loss.diff.mat.oem[ct,5] <- loss.diff.mat.oem[ct,5] / length(k.vec)
        loss.diff.mat.oem[ct,6] <- loss.diff.mat.oem[ct,6] / length(k.vec)
    }
}

## save results
save(res.mat, file = "res.mat.rda")
save(loss.diff.mat.oem , file = "loss.diff.mat.oem.rda")

