############################################################################
## Code for reproducing results in "qgam: Bayesian non-parametric
## quantile regression modelling in R"
############################################################################

## To run the following code, the following packages need to be installed:
## - mgcv version >= 1.8-28
## - qgam version 1.3.3 (on CRAN)
## - mgcViz version 0.1.6 (on CRAN)

########################################################
## Motorcycle example
########################################################

## Fit qgam model
data("mcycle", package = "MASS")
library("qgam")
fitCycle1 <- qgam(list(form = accel ~ s(times, k = 20, bs = "ad"), ~ s(times)),
                  data = mcycle, qu = 0.9)

## Predict
head(predict(fitCycle1), 5)

## Plot
par(mar = c(4.1, 4.1, 0.5, 2.1))
plot(fitCycle1)

## Extract log(sigma) and bandwidth
fitCycle1$family$getTheta()
head(fitCycle1$family$getCo(), 5)

## Fit multiple quantile GAMs
fitCycleM <- mqgam(list(form = accel ~ s(times, k = 20, bs = "ad"), ~ s(times)),
                   data = mcycle, qu = c(0.1, 0.25, 0.5, 0.75, 0.9))

## Summary
qdo(fitCycleM, qu = 0.25, fun = summary)

## Converting to mgcViz
library("mgcViz")
fitCycleM <- getViz(fitCycleM)

## Plot all quantile effects at once
plot(fitCycleM)

## Plot actual quantiles (not centered)
par(mar = c(4.1, 4.1, 0.1, 0))
xseq <- with(mcycle, seq(min(times), max(times), length.out = 100))
preds <- sapply(fitCycleM, predict, newdata = data.frame(times = xseq))
plot(mcycle, ylim = range(preds))
for (ii in 1:5) lines(xseq, preds[, ii], col = 2)

## Check distance between consecutive quantiles
min(apply(preds, 1, diff))

## Using constant bandwidth
fitCycleConst <- qgam(accel ~ s(times, k = 20, bs = "ad"),
                      data = mcycle, qu = 0.9)

## Also fit models by bootstrapping (which is stochastic, so we fix the seed)
set.seed(525)
fitCycleConst_boot1 <- qgam(list(accel ~ s(times, k = 20, bs = "ad"), ~ s(times)),
                            data = mcycle, qu = 0.9,
                            control = list(loss = "cal", K = 100))
fitCycleConst_boot <- qgam(accel ~ s(times, k = 20, bs = "ad"),
                           data = mcycle, qu = 0.9,
                           control = list(loss = "cal", K = 100))

## Comparing the fits with variable and constant bandwidth
## (this code chunk does not appear in the article)
par(mfrow = c(2, 1), mar = c(0, 4.1, 4.1, 2.1))
xSeq <- data.frame(cbind("accel" = rep(0, 1e3),
                         "times" = seq(2, 58, length.out = 1e3)))
pred <- predict(fitCycle1, newdata = xSeq, se = TRUE)
plot(mcycle$times, mcycle$accel, xlab = "", ylab = "Acceleration",
     xaxt = "n", ylim = c(-150, 92), col = "white")
grid(5, 5, lwd = 1.5)
points(mcycle$times, mcycle$accel)
lines(xSeq$times, pred$fit)
lines(xSeq$times, pred$fit + 2 * pred$se.fit, col = 2)
lines(xSeq$times, pred$fit - 2 * pred$se.fit, col = 2)
text(x = 7.5, y = 74, labels = expression(sigma(x)))
pred_boot1 <- predict(fitCycleConst_boot1, newdata = xSeq, se = TRUE)
lines(xSeq$times, pred_boot1$fit, lty = 2)
lines(xSeq$times, pred_boot1$fit + 2 * pred_boot1$se.fit, col = 2, lty = 2)
lines(xSeq$times, pred_boot1$fit - 2 * pred_boot1$se.fit, col = 2, lty = 2)

par(mar = c(4.1, 4.1, 1.5, 2.1))
xSeq <- data.frame(cbind("accel" = rep(0, 1e3),
                         "times" = seq(2, 58, length.out = 1e3)))
pred <- predict(fitCycleConst, newdata = xSeq, se = TRUE)
plot(mcycle$times, mcycle$accel, xlab = "Times", ylab = "Acceleration",
     ylim = c(-150, 92), col = "white")
grid(5, 5, lwd = 1.5)
points(mcycle$times, mcycle$accel)
lines(xSeq$times, pred$fit)
lines(xSeq$times, pred$fit + 2 * pred$se.fit, col = 2)
lines(xSeq$times, pred$fit - 2 * pred$se.fit, col = 2)
text(x = 12, y = 74, labels = expression(sigma * " = const"))
pred_boot <- predict(fitCycleConst_boot, newdata = xSeq, se = TRUE)
lines(xSeq$times, pred_boot$fit, col = 1, lty = 2)
lines(xSeq$times, pred_boot$fit + 2 * pred_boot$se.fit, col = 2, lty = 2)
lines(xSeq$times, pred_boot$fit - 2 * pred_boot$se.fit, col = 2, lty = 2)
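
## Additional sanity check (not in the article, an illustrative sketch using
## only objects defined above): the in-sample empirical coverage of each
## fitted quantile should be close to its nominal level, e.g. roughly 0.9 for
## fitCycle1 and roughly c(0.1, 0.25, 0.5, 0.75, 0.9) for fitCycleM.
mean(mcycle$accel <= predict(fitCycle1))
round(colMeans(mcycle$accel <= sapply(fitCycleM, predict)), 2)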
########################################################
## Electricity demand example
########################################################

## Load demand data
data("AUDem", package = "qgam")
meanDem <- AUDem$meanDem
head(meanDem, 3)

## Divide into learning and testing sets
cutDate <- as.Date("2011-04-01 00:00:00")
meanDemLearn <- subset(meanDem, as.Date(date) < cutDate)  ## Learning
meanDemTest <- subset(meanDem, as.Date(date) >= cutDate)  ## Testing

## Plot the demand (this code chunk does not appear in the article)
par(mar = c(4.1, 4.1, 0.5, 2.1))
meanDemPlot <- subset(meanDem, tod == 20.5)
meanDemLearnPlot <- subset(meanDemLearn, tod == 20.5)  ## Learning
meanDemTestPlot <- subset(meanDemTest, tod == 20.5)    ## Testing
plot(1:nrow(meanDemPlot), meanDemPlot$dem, type = "l", ylab = "Demand (KW)",
     col = "white", xlab = "Date", xaxt = "n")
lines(1:nrow(meanDemLearnPlot), meanDemLearnPlot$dem, col = 1)
lines((nrow(meanDemLearnPlot) + 1):nrow(meanDemPlot), meanDemTestPlot$dem,
      col = "darkgrey")
abline(v = nrow(meanDemLearnPlot), lty = 2)
axis(1, at = c(20, 120, 220, 320),
     labels = as.Date(format(as.Date(meanDemPlot$date[c(21, 104, 203, 303)],
                                     "%Y-%m-%d"))))
legend(x = "topleft", c("training", "testing"),
       col = c("black", "darkgrey"), lty = 1)

## Multiple quantile fits
library("qgam")
qusObj <- seq(0.1, 0.9, length.out = 5)
fitQ <- mqgam(list(dem ~ dow + dem48 + s(tod, k = 6) + s(temp) + s(doy, bs = "cc"),
                   ~ s(doy, bs = "cc")),
              qu = qusObj, data = meanDemLearn)

## Plotting smooth effects. This is the code appearing in the paper, BUT ...
library("mgcViz")
fitQ <- getViz(fitQ)
print(plot(fitQ, allTerms = TRUE, select = c(1:3, 5)), pages = 1)

## ... this is the code actually used to produce the figure appearing in the
## paper (we change the format a bit to get a better plot for publication)
fitQ <- mqgam(list(dem ~ dow + dem48 + s(tod, k = 6) + s(temp) + s(doy, bs = "cc"),
                   ~ s(doy, bs = "cc")),
              qu = qusObj, data = meanDemLearn)
library("mgcViz")
fitQ <- getViz(fitQ)
plQ <- plot(fitQ, allTerms = TRUE, select = c(1:3, 5))[[1]]
plQ[[1]] <- plQ[[1]] + theme(legend.position = "none") + l_fitLine()
plQ[[2]] <- plQ[[2]] + theme(legend.position = "none") + l_fitLine()
plQ[[3]] <- plQ[[3]] + theme(legend.position = "bottom") + l_fitLine()
plQ[[4]] <- plQ[[4]] + l_fitPoints() + l_ciBar()
library("gridExtra", quietly = TRUE)
grid.arrange(grobs = lapply(plQ, "[[", "ggObj"),
             layout_matrix = matrix(c(1, 1, 2, 2,
                                      1, 1, 2, 2,
                                      1, 1, 2, 2,
                                      1, 1, 2, 2,
                                      3, 3, 4, 4,
                                      3, 3, 4, 4,
                                      3, 3, 4, 4,
                                      3, 3, 4, 4,
                                      3, 3, 4, 4), 9, 4, byrow = TRUE))
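
## Illustrative sketch (not in the article, using only objects defined above):
## extract predictions and standard errors for the median fit fitQ[[3]] on the
## test set, and form approximate 95% intervals for the conditional median demand.
predMed <- predict(fitQ[[3]], newdata = meanDemTest, se = TRUE)
head(data.frame(fit = predMed$fit,
                lwr = predMed$fit - 2 * predMed$se.fit,
                upr = predMed$fit + 2 * predMed$se.fit), 3)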
## Get quantiles of lagged demand
qDem48 <- AUDem$qDem48
head(qDem48[, c(1, 2, 5, 15, 19, 20)], 3)

## Plotting the quantiles of the by-customer demand
## (this code chunk does not appear in the article)
par(mar = c(4.1, 4.1, 0.5, 2.1))
matplot(seq(0.01, 0.99, length.out = 20),
        t(qDem48[c(1, 50, 75, 100, 250), ]), type = "l",
        ylab = "Electricity demand (KW)",
        xlab = expression("Probability level " * "(p)"), lty = 1:5)

## Get matrix of probability levels
probLev <- matrix(seq(0.01, 0.99, length.out = 20),
                  nrow = nrow(meanDem), ncol = 20, byrow = TRUE)
head(probLev[, c(1, 2, 5, 15, 19, 20)], 3)

## Splitting between training and testing sets
ntrain <- nrow(meanDemLearn)
ntest <- nrow(meanDemTest)
meanDemLearn$qDem48 <- head(qDem48, ntrain)
meanDemTest$qDem48 <- tail(qDem48, ntest)
meanDemLearn$probLev <- head(probLev, ntrain)
meanDemTest$probLev <- tail(probLev, ntest)

## Fitting functional quantile GAM on several quantiles
## (NB: this will take a few minutes)
fitFunQ <- mqgamV(list(dem ~ dow + s(temp) + s(doy, bs = "cc") + s(tod, k = 6) +
                         s(probLev, by = qDem48),
                       ~ s(doy, bs = "cc")),
                  qu = qusObj, data = meanDemLearn)

## Plotting functional effect on all quantiles
plot(fitFunQ, select = 4) + labs(color = expression(tau))

## Comparing the functional effect on two quantiles
## (this code chunk does not appear in the article)
par(mar = c(4.1, 4.1, 0.5, 2.1))
pl1 <- plot(fitFunQ[[1]], select = 4) +
  l_ciPoly(alpha = 0.3, fill = "black") +
  l_fitLine(colour = "black", linetype = 1)
pl2 <- plot(fitFunQ[[5]], select = 4)
dat2 <- pl2$plots[[1]]$data$fit
dat2PO <- data.frame("x" = c(dat2$x, rev(dat2$x)),
                     "y" = c(dat2$y + 1.96 * dat2$se,
                             rev(dat2$y - 1.96 * dat2$se)))
pl1 + geom_polygon(data = dat2PO, linetype = 2, alpha = 0.3,
                   mapping = aes(x = x, y = y), inherit.aes = FALSE,
                   fill = "black") +
  geom_line(data = dat2, colour = "blue", linetype = 2) +
  ylab("s(probLev):qDem48")

## Comparing the models in terms of BIC
round(rbind(QGAM = sapply(fitQ, BIC), FuncQGAM = sapply(fitFunQ, BIC)))

## Comparing the models in terms of AIC
round(rbind(QGAM = sapply(fitQ, AIC), FuncQGAM = sapply(fitFunQ, AIC)))

## Comparing the models in terms of pinball loss on the test set
predQ <- sapply(fitQ, predict, newdata = meanDemTest)
predFunQ <- sapply(fitFunQ, predict, newdata = meanDemTest)
round(rbind("QGAM" = pinLoss(meanDemTest$dem, predQ, qusObj),
            "FuncQGAM" = pinLoss(meanDemTest$dem, predFunQ, qusObj)), 1)

## Checking the median quantile GAM
check(fitFunQ[[3]])
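
## Additional sanity check (not in the article, an illustrative sketch using
## only predFunQ and qusObj defined above): compare the out-of-sample empirical
## coverage of each fitted quantile with its nominal level; values close to
## qusObj indicate well-calibrated fits.
rbind(nominal = qusObj,
      empirical = round(colMeans(meanDemTest$dem <= predFunQ), 3))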
########################################################
## Appendix: choosing the loss bandwidth manually
########################################################

## Simulate some data
set.seed(5523)
x <- seq(-3, 3, length.out = 1e3)
X <- data.frame("x" = x, "y" = x + x^2 + rgamma(1e3, 4, 1))

## Fit 3 quantiles for different values of 'err'
fitGrid <- lapply(c(0.01, 0.05, 0.1, 0.3, 0.5), function(.errVal) {
  mqgam(y ~ s(x), data = X, qu = c(0.05, 0.5, 0.95), err = .errVal)
})

## Plot the resulting fits (this code chunk does not appear in the article)
qus <- c(0.05, 0.5, 0.95)
datPoint <- data.frame(x = X$x, y = X$y)
datLin <- data.frame(x = rep(X$x, length(fitGrid)),
                     y = do.call("c", lapply(1:length(fitGrid), function(ii)
                       qdo(fitGrid[[ii]], qus[1], predict))),
                     group = rep(as.factor(c(0.01, 0.05, 0.1, 0.3, 0.5)), each = 1e3))
datLin2 <- data.frame(x = rep(X$x, length(fitGrid)),
                      y = do.call("c", lapply(1:length(fitGrid), function(ii)
                        qdo(fitGrid[[ii]], qus[2], predict))),
                      group = rep(as.factor(c(0.01, 0.05, 0.1, 0.3, 0.5)), each = 1e3))
datLin3 <- data.frame(x = rep(X$x, length(fitGrid)),
                      y = do.call("c", lapply(1:length(fitGrid), function(ii)
                        qdo(fitGrid[[ii]], qus[3], predict))),
                      group = rep(as.factor(c(0.01, 0.05, 0.1, 0.3, 0.5)), each = 1e3))
daTRU <- data.frame(x = rep(X$x, 3),
                    y = c(X$x + X$x^2 + qgamma(0.95, 4, 1),
                          X$x + X$x^2 + qgamma(0.5, 4, 1),
                          X$x + X$x^2 + qgamma(0.05, 4, 1)),
                    group = rep(as.factor(1:3), each = length(X$x)))
library("ggplot2", quietly = TRUE)
ggplot(data = datPoint, mapping = aes(x = x, y = y)) +
  geom_point(col = "grey86") + theme_bw() +
  geom_line(data = datLin, mapping = aes(x = x, y = y, colour = group,
                                         linetype = group), inherit.aes = TRUE) +
  geom_line(data = datLin2, mapping = aes(x = x, y = y, colour = group,
                                          linetype = group), inherit.aes = TRUE) +
  geom_line(data = datLin3, mapping = aes(x = x, y = y, colour = group,
                                          linetype = group), inherit.aes = TRUE) +
  geom_line(data = daTRU, mapping = aes(x = x, y = y, group = group),
            inherit.aes = TRUE, linetype = 2) +
  theme(panel.grid.major = element_blank(),
        panel.grid.minor = element_blank()) +
  labs(color = "err", linetype = "err")

## Measuring computing times for 2 values of 'err'
system.time(fitBigErr <- qgam(y ~ s(x), data = X, qu = 0.95, err = 0.05))[[3]]
system.time(fitSmallErr <- qgam(y ~ s(x), data = X, qu = 0.95, err = 0.001))[[3]]

## Check the loss function
check(fitSmallErr$calibr, sel = 2)
check(fitBigErr$calibr, sel = 2)

## Check convergence of intermediate criterion
par(mfrow = c(1, 2))
check(fitBigErr)
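
## Illustrative sketch (not in the article, using only objects defined above):
## quantify how 'err' affects accuracy by comparing each fitted 0.95 quantile
## in fitGrid with the true quantile of the simulated model,
## x + x^2 + qgamma(0.95, 4, 1), via root mean squared error. Larger values of
## 'err' are expected to give smoother but more biased fits.
truQ95 <- X$x + X$x^2 + qgamma(0.95, 4, 1)
rmseGrid <- sapply(fitGrid, function(fit)
  sqrt(mean((qdo(fit, 0.95, predict) - truQ95)^2)))
names(rmseGrid) <- c(0.01, 0.05, 0.1, 0.3, 0.5)
round(rmseGrid, 3)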