#################################################### ### R code to reproduce the submitted manuscript ### ### "BayesMortalityPlus: ### package in R for Bayesian graduation of ### mortality modelling" ### ### Authors: Lucas M. F. Silva , Luiz F. V. Figueiredo, ### Viviana G. R. Lobo, Thais C. O. Fonseca, Mariane, B. Alves ### Data: February 2024 #################################################### # Dependencies for running the replication script can be installed via # install.packages(c("ggplot2", "dplyr", "magrittr")) # install.packages("BayesMortalityPlus") ## libraries library("ggplot2") library("BayesMortalityPlus") library("scales") library("dplyr") library("tidyr") ## plot options options(scipen=999) #---------------------------------------------------------- #' Section 2 Bayesian graduation via Heligman-Pollard model #' Figure 1: An illustration of the Heligman-Pollard curve: Progress with the age of the logarithm of the probability of dying and of the logarithms of its three components. hp_curve <- function(x, p){ a = p[1];b = p[2];c = p[3];d = p[4];e = p[5];f = p[6];g = p[7];h = p[8] k = p[9] c1 = a^((x+b)^c) c2 = d*exp(-e*log(x/f)^2) c3 = g*h^x # c3 = g*h^x/(1+k*g*h^x) curva = c1 + c2 + c3 return(list(c1 = c1, c2 = c2, c3 = c3, curva = curva)) } x = 0:90 param = c(5.04e-4, 7.49e-2, 1.16e-1, 8.89e-4, 6.28, 24.35, 5.7e-5, 1.09) cur = hp_curve(x, param) data = data.frame(Idade = x, m = unlist(cur$curva), c1 = c(cur$c1), c2 = c(cur$c2), c3 = c(cur$c3)) data = as.data.frame(data) p = ggplot(data) + scale_y_continuous(trans = "log10", breaks = 10^-seq(0,5), limits = 10^-c(5,0), labels = scales::comma) + scale_x_continuous(breaks = seq(0, 100, by = 10)) + theme_bw() + theme(legend.position = "bottom") + labs(x = "Age", y = "Death probability (qx)", title = NULL) + geom_line(aes(x = Idade, y = m, col = "1")) + geom_line(aes(x = Idade, y = c1, col = '2')) + geom_line(aes(x = Idade, y = c2, col = "3")) + geom_line(aes(x = Idade, y = c3, col = "4")) + scale_color_manual(name = NULL, values = c("black", rainbow(3)), label = c("full curve", "infant mortality", "accident hump", "old age mortality")) suppressWarnings(print(p)) #### installing and loading ## install.packages("BayesMortalityPlus") library("BayesMortalityPlus") #---------------------------------------------------------- #' Section 4 - Static graduation with BayesMortalityPlus ### loading dplyr to facilitate filtering the data data("USA") library("dplyr") data <- select(USA, Year, Age, Ex.Total, Dx.Total) %>% filter(Year %in% c(1980, 1990, 2000, 2010, 2019) & Age %in% 0:80) %>% mutate(mx = Dx.Total / Ex.Total, Age = as.numeric(Age) ) x.labs <- c("USA 1980", "USA 1990", "USA 2000", "USA 2010", "USA 2019") ggplot(data) + scale_y_continuous(trans = "log10", breaks = 10^-seq(0, 5), limits = 10^-c(5, 0), labels = scales::comma) + scale_x_continuous(breaks = seq(0, 100, by = 10) ) + theme_bw() + theme(legend.position = "bottom") + labs(x = "Age", y = "Raw Mortality Rate", title = NULL) + geom_point(aes(x = Age, y = mx, col = as.factor(Year)) ) + scale_color_manual(name = NULL, values = c(rainbow(5)), label = x.labs) #---------------------------------------------------------- #' Section 4.1 - Heligman-Pollard model ### modelling with hp() set.seed(111) fit_1980 <- hp(0:80, data$Ex.Total[data$Year == 1980], data$Dx.Total[data$Year == 1980], model = "lognormal") set.seed(112) fit_1990 <- hp(0:80, data$Ex.Total[data$Year == 1990], data$Dx.Total[data$Year == 1990], model = "lognormal") set.seed(113) fit_2000 <- hp(0:80, data$Ex.Total[data$Year == 2000], data$Dx.Total[data$Year == 2000], model = "lognormal") set.seed(114) fit_2010 <- hp(0:80, data$Ex.Total[data$Year == 2010], data$Dx.Total[data$Year == 2010], model = "lognormal") set.seed(115) fit_2019 <- hp(0:80, data$Ex.Total[data$Year == 2019], data$Dx.Total[data$Year == 2019], model = "lognormal") ### summary summary(fit_1980) ### estimated mortality rates and credible intervals set.seed(116) fitted(fit_1980, age = c(0, 20, 40, 60, 80)) #' Figure 2: Posterior summaries via HP: Median mortality curve and 95% predictive credible interval in log-scale. The United States, total population, ages 0-80 and year 1980. The black dots represent the raw mortality rates. plot(fit_1980, labels = "US 1980", plotIC = TRUE, plotData = TRUE) #' Modelling with hp() function and Poisson model set.seed(117) fit_1980_poi <- hp(0:80, data$Ex.Total[data$Year == 1980], data$Dx.Total[data$Year == 1980], model = "poisson") #' Figure 3: Posterior summaries via HP: Zoomed in median mortality curve and 99\% predictive credible interval in log-scale for the Log-Normal and Poisson models. The United States, total population, ages 0-80 and year 1980. plot(list(fit_1980, fit_1980_poi), labels = c("Log-Normal", "Poisson"), plotData = F, age = 0:40, prob = 0.99, colors = c("seagreen", "darkred") ) #' Table 2: Posterior summaries: Median and 95% credibility interval for the Log-Normal for years 1980, 1990, 2000, 2010 and 2019. summary(fit_1980) summary(fit_1990) summary(fit_2000) summary(fit_2010) summary(fit_2019) #' Figure 4: Posterior summaries via HP: Median mortality curve in log-scale. The United States, total population, ages 0-80 and years 1980, 1990, 2000, 2010, 2019. fits <- list(fit_1980, fit_1990, fit_2000, fit_2010, fit_2019) labels <- c("US 1980", "US 1990", "US 2000", "US 2010", "US 2019") plot(fits, labels = labels, plotData = FALSE, plotIC = FALSE) #---------------------------------------------------------- ### informative prior incorporation #### artificial data creation library(truncnorm) USA2010 = USA %>% filter(Year == 2010, Age <= 90) set.seed(1) sigma2 = c(rep(10, 31), rep(1, 91 - 31)) Ex = USA2010$Ex.Total / 50 Dx = USA2010$Dx.Total / 50 set.seed(14) Dx = rtruncnorm(91, a = 0, mean = Dx, sd = sqrt(Ex * (Dx / Ex) * (1 - Dx / Ex)) * sqrt(sigma2) ) #### fitting past experience USA2009 = USA %>% filter(Year == 2009, Age <= 90) set.seed(14) hp_2009 <- hp(0:90, Ex = USA2009$Ex.Total[1:91], Dx = USA2009$Dx.Total[1:91], model = "lognormal", M = 50000) prior <- summary(hp_2009)$mean prior_sd <- summary(hp_2009)$sd #### informative prior incorporation set.seed(14) hp_2010 <- hp(0:90, Ex = Ex, Dx = Dx, model = "lognormal", M = 50000, m = c(NA, prior[2], NA, prior[4], NA, NA, NA, NA), v = c(NA, prior_sd[2]^3, NA, prior_sd[4]^3, NA, NA, NA, NA)) #' Figure 5: Posterior summaries via HP: Raw mortality rates motivation for past experience incorporation through informative prior distributions (left) and resulting fitted curve with 95\% credible intervals (right) for the United States. Total population, ages 0-90 years old and year 2010 - artificial dataset. ggplot() + scale_y_continuous(trans = "log10", breaks = 10^-seq(0, 5), limits = 10^-c(5, 0), labels = scales::comma) + scale_x_continuous(breaks = seq(0, 100, by = 10) ) + theme_bw() + theme(legend.position = "bottom", legend.text = element_text(size = 12), axis.text = element_text(color = 'black', size = 12) ) + labs(x = "Age", y = "Raw Mortality Rate", title = NULL, color = "") + geom_point(data = data.frame(x = 0:90, y = Dx / Ex), aes(x = x, y = y, col = "artificial USA 2010") ) + geom_point(data = data.frame(x = 0:90, y = (USA2009$Dx.Total / USA2009$Ex.Total)[1:91]), aes(x = x, y = y, col = "USA 2009"), pch = 16) + scale_color_manual(name = NULL, values = c("seagreen", "red4"), label = c("artificial USA 2010", "USA 2009") ) plot(hp_2010, labels = "artificial USA 2010") #' Figure 6: Posterior summaries via HP: traces of the chains for the estimation of the parameters after incorporating prior information. The United States, total population, ages 0-90 and year 2010 - artificial dataset. plot_chain(hp_2010) #---------------------------------------------------------- ### life expectancy #### example for life expectancy set.seed(120) expectancy(fit_1980, age = c(0, 20, 40, 60, 80), graph = FALSE) #' Figure 7: Posterior summaries via HP: Life expectancy graduation for the United States. Total population, ages 0-80 and years 1980, 1990, 2000, 2010, and 2019. set.seed(121) ex1 <- expectancy(fit_1980, age = seq(0, 80, by = 10), graph = FALSE) %>% cbind(year = rep(1980, 9)) set.seed(122) ex2 <- expectancy(fit_1990, age = seq(0, 80, by = 10), graph = FALSE) %>% cbind(year = rep(1990, 9)) set.seed(123) ex3 <- expectancy(fit_2000, age = seq(0, 80, by = 10), graph = FALSE) %>% cbind(year = rep(2000, 9)) set.seed(124) ex4 <- expectancy(fit_2010, age = seq(0, 80, by = 10), graph = FALSE) %>% cbind(year = rep(2010, 9)) set.seed(125) ex5 <- expectancy(fit_2019, age = seq(0, 80, by = 10), graph = FALSE) %>% cbind(year = rep(2019, 9)) aux.ex <- rbind(ex1, ex2, ex3, ex4, ex5) ggplot(data=aux.ex) + theme_light() + theme(axis.title.x = element_text(color = 'black', size = 12), axis.title.y = element_text(color = 'black', size = 12), axis.text = element_text(color = 'black', size = 12), legend.text = element_text(size = 12), legend.position = "bottom") + geom_line(aes(x = age, y = expectancy, color = as.factor(year))) + geom_ribbon(aes(x = age, ymin = ci.lower, ymax = ci.upper, fill = as.factor(year)), alpha = 0.2) + labs(color = "Year", fill = "Year", x = "Age", y = "Expectancy (years)") Heatmap(fits, x_lab = labels, age = 0:80) #---------------------------------------------------------- ### Reduced model #' Figure 8: Posterior summaries via HP: Median mortality curve in log-scale via the reduced model. The United States, total population, ages 18-80 and year 2010. Black dots represent the raw mortality rates. red_ex <- filter(data, Year == "2010", Age %in% 18:80)$Ex.Total red_dx <- filter(data, Year == "2010", Age %in% 18:80)$Dx.Total set.seed(126) hp.fit2 <- hp(x = 18:80, Ex = red_ex, Dx = red_dx, model = "lognormal", reduced_model = TRUE) plot(hp.fit2, plotIC = FALSE, labels = "HP fitted") ## Modelling 15-60 ages data_aux <- data %>% filter(Age %in% 15:60, Year == 2010) set.seed(127) fit <- hp(15:60, data_aux$Ex.Total, data_aux$Dx.Total, model = "lognormal") ## Scenario 1: Modelling 15-50 ages with reduced HP model data_aux <- data %>% filter(Age %in% 15:50, Year == 2010) set.seed(128) fit <- hp(15:50, data_aux$Ex.Total, data_aux$Dx.Total, model = "lognormal", reduced_model = TRUE) #' Figure 9: Scenario 1: HP curve for young-adult mortality begins at age 15 and ends at age 50. plot(fit) ## Summary summary(fit) ## Scenario 2: Modelling 25-50 ages with reduced HP model data_aux <- data %>% filter(Age %in% 25:50, Year == 2010) set.seed(129) fit <- hp(25:50, data_aux$Ex.Total, data_aux$Dx.Total, model = "lognormal", reduced_model = TRUE) set.seed(130) fit <- hp(25:50, data_aux$Ex.Total, data_aux$Dx.Total, model = "lognormal", m = c(NA, NA, NA, .0006, 7, 22, .0003, NA), v = c(NA, NA, NA, 1e-5, .9, .3, 1e-7, NA), reduced_model = TRUE) #' Figure 10: Scenario 2: HP curve for young-adult mortality begins at age 25 and ends at age 50. plot(fit) ## Summary summary(fit) #---------------------------------------------------------- ### Mortality measurement at advanced ages and extrapolation #### hp set.seed(131) hp.close1 <- hp_close(fit_2019, method = "hp") #' Figure 11: Posterior summaries via HP: Mortality graduation with HP curve extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. plot(hp.close1, labels = "hp method") #### plateau set.seed(132) hp.close2 <- hp_close(fit_2019, method = "plateau") #' Figure 12: Posterior summaries via HP: Mortality graduation with plateau extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. plot(hp.close2, labels = "Plateau method") #### linear set.seed(133) hp.close3 <- hp_close(fit_2019, method = "linear") #' Figure 13: Posterior summaries via HP: Mortality graduation with linear extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. plot(hp.close3, labels = "Linear method") #### Gompertz set.seed(134) hp.close4 <- hp_close(fit_2019, method = "gompertz") #' Figure 14: Posterior summaries via HP: Mortality graduation with Gompertz extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. plot(hp.close4, labels = "Gompertz method") #' Figure 15: Posterior summaries via HP: Mortality graduation with Gompertz extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. The age groups are colored to illustrate the mixing procedure. df = fitted(hp.close4) %>% select(age, qx.fitted) %>% mutate(age_group = rep(c("HP model age", "Mixing age", "Closing age"), c(73, 15, 33))) ggplot(df) + theme_bw() + theme(plot.title = ggplot2::element_text(lineheight = 1.2), axis.title.x = ggplot2::element_text(color = 'black', size = 12), axis.title.y = ggplot2::element_text(color = 'black', size = 12), axis.text = ggplot2::element_text(color = 'black', size = 12), legend.text = ggplot2::element_text(size = 12), legend.position = "bottom") + scale_x_continuous(name = "Age", breaks = seq(0, 120, by = 10)) + scale_y_continuous(name = "qx", trans = "log10", labels = scales::comma, limits = c(NA, 1)) + geom_line(aes(x = age, y = qx.fitted, col = age_group), linewidth = 0.8) + labs(col = NULL) ## Adding new data new_Ex <- filter(USA, Year == 2019)$Ex.Total[82:101] new_Dx <- filter(USA, Year == 2019)$Dx.Total[82:101] set.seed(135) hp.close4.alt <- hp_close(fit_2019, method = "gompertz", new_Ex = new_Ex, new_Dx = new_Dx) #' Figure 16: Posterior summaries via HP: Comparison between Gompertz extrapolations with and without new information in log-scale. The United States, total population, ages 0-120 and year 2019. plot(list(hp.close4, hp.close4.alt), labels = c("Without new data", "With new data")) #----------------------------------------------------------- #' Section 4.2 - Mortality graduation via dynamic linear smoothers ## filtering data y = data %>% mutate(logmx = log(mx)) %>% select(Year, Age, logmx) %>% pivot_wider(names_from = Year, values_from = logmx, id_cols = Age) set.seed(136) dlm_1980 <- dlm(y[[2]], delta = 0.85) set.seed(137) dlm_1990 <- dlm(y[[3]], delta = 0.85) set.seed(138) dlm_2000 <- dlm(y[[4]], delta = 0.85) set.seed(139) dlm_2010 <- dlm(y[[5]], delta = 0.85) set.seed(140) dlm_2019 <- dlm(y[[6]], delta = 0.85) ## summary head(summary(dlm_1980) ) #' Figure 17: Posterior summaries via DLM: Traces of the chains for the mean and variance of the process. The United States, total population, ages 0, 40 and 80 for the mean chains and year 1980. params <- c("sigma2", "mu[0]", "mu[40]", "mu[80]") plot_chain(dlm_1980, param = params) #' Figure 18: Posterior summaries via DLM: Median mortality curve in log-scale. The United States, total population, ages 0-80 and years 1980, 1990, 2000, 2010, 2019. fits <- list(dlm_1980, dlm_1990, dlm_2000, dlm_2010, dlm_2019) plot(fits, labels = labels, plotData = FALSE, plotIC = FALSE) #' Figure 19: Posterior summaries via DLM: Median mortality curve and predictive credible interval 95% in log-scale. The United States, total population, ages 0-80 and year 1980. The black dots represent the raw mortality rates. plot(dlm_1980, labels = "US 1980", plotIC = TRUE, plotData = TRUE) ## expectancy set.seed(141) expectancy(dlm_1980, age = c(0, 20, 40, 60, 80), graph = FALSE) #' Figure 20: Posterior summaries via DLM: Median mortality curve in log-scale to adult ages. The United States, total population, ages 18-80 and year 2010. Black dots represent the raw mortality rates red_y <- filter(y, Age %in% 18:80)[[5]] set.seed(142) dlm.fit2 <- dlm(red_y, delta = 0.95, ages = 18:80) plot(dlm.fit2, plotIC = FALSE, labels = "DLM fitted") #---------------------------------------------------------- ### Mortality measurement at advanced ages and extrapolation new_data <- log(new_Dx / new_Ex) set.seed(143) dlm.close1 <- dlm_close(dlm_2019, method = "plateau", max_age = 100) set.seed(144) dlm.close2 <- dlm_close(dlm_2019, method = "linear", max_age = 100, new_data = new_data) set.seed(145) dlm.close3 <- dlm_close(dlm_2019, method = "gompertz", max_age = 100, new_data = new_data) #' Figure 21: Posterior summaries via DLM: Mortality graduation with different closing methods in log-scale. The United States, total population, ages 70-100 and year 2019. new_Ex <- filter(USA, Year == 2019)$Ex.Total[1:101] new_Dx <- filter(USA, Year == 2019)$Dx.Total[1:101] new_y <- log(new_Dx / new_Ex) set.seed(146) dlm.fit3 <- dlm(new_y, delta = 0.85) plot(list(dlm.fit3, dlm.close1, dlm.close2, dlm.close3), plotIC = FALSE, plotData = FALSE, age = 70:100, labels = c("DLM fitted", "Plateau", "Linear", "Gompertz"), linetype = c("twodash", "solid", "solid", "solid")) + guides(colour = guide_legend(override.aes = list(linetype = c(6, 1, 1, 1))) ) #---------------------------------------------------------- ### Extrapolation for dynamic linear smoothers set.seed(147) dlm.fit4 <- predict(dlm_2019, h = 20, prob = 0.95) head(dlm.fit4) #' Figure 22: Posterior summaries via DLM: Prediction for the United States. Total population, ages 0-100 and year 2019 plot(dlm_2019, plotIC = FALSE, plotData = FALSE) + geom_line(data = dlm.fit4, aes(x = age, y = qx.fitted, col = "Predict") ) + geom_ribbon(data = dlm.fit4, aes(x = age, ymin = qx.lower, ymax = qx.upper, fill = "Predict"), alpha = 0.4) + scale_color_manual(values = c("seagreen", "red"), label = c("DLM fitted", "Predict") ) + guides(fill = "none", linetype = "none") + labs(colour = "") ## Filtering data data <- select(USA, Year, Age, Ex.Male, Dx.Male) %>% filter(Year == 1980 & Age %in% 0:110) %>% mutate(logmx = log(Dx.Male / Ex.Male)) %>% select(Age, logmx) ## DLM with fixed discount factor set.seed(148) dlm_1980_1 = dlm(data$logmx, delta = 0.85) ## DLM with varying discount factor set.seed(149) dlm_1980_2 = dlm(data$logmx, delta = rep(c(0.99, 0.6, 0.85), c(5, 31, 75)) ) #' Figure 23: Posterior summaries via DLM: (a) the mortality curve fitted considering static discount factor per age equal 0.85 and, (b) the mortality curve fitted considering different discount factor per age, 0.80 for rage 0-35 and 0.90 after 35 years old. plot(dlm_1980_1) plot(dlm_1980_2) # ------------------------------------------------------------------------- #' Section 5 - Bayesian Lee-Carter model data("PT") Y <- PT head(Y[,1:4]) ### fitting the BLC model set.seed(150) fit.blc <- blc(Y) ### estimation set.seed(151) head(fitted(fit.blc)[[1]][, 1:4]) #' Figure 24: Posterior summaries via BLC: Median mortality curves in log-scale. Portugal, total population, ages 18-80 and years 2000-2015. plot(fit.blc, parameter = "fitted", ages = 18:80) #' Figure 25: Posterior summaries via BLC: Means and variance for each parameter of the fitted BLC model. Portugal, total population, ages 18-80 and years 2000-2015. plot(fit.blc, parameter = "all", ages = 18:80, year = 2000:2015) ### improvement estimation set.seed(152) head(improvement(fit.blc, prob = 0.95) ) ### life expectancy set.seed(153) expectancy(fit.blc, at = c(1, 21, 41, 61))$expectancy[, 1:4] # ------------------------------------------------------------------------- ### Forecast for fitted BLC models set.seed(154) fit.blc2 <- predict(fit.blc, h = 10) print(fit.blc2) ### estimated mortality rates for forecast set.seed(155) head(fitted(fit.blc2)$mean[, 1:4]) ### life expectancy for the forecast set.seed(156) expectancy(fit.blc2, at = c(1, 21, 41, 61))$expectancy[, 1:4] ### life expectancy forecast behavior at birth set.seed(157) ex.fitted <- expectancy(fit.blc, at = 1) ex.pred <- expectancy(fit.blc2, at = 1) ex.obs <- apply(Y, 2, function(x) sum(cumprod(1 - (1 - exp(-exp(x)))))) #' Figure 26: Posterior summaries via BLC: Fitted and predicted life expectancy at age 18 by year. Portugal, total population, ages 18-80 and years 2000-2015. The black dots represent the observed life expectancy. ggplot(NULL) + theme_bw() + theme(legend.text = element_text(size = 12)) + geom_point(data = data.frame(x = 2000:2015, y = ex.obs), aes(x = x, y = y), color = "black") + geom_line(data = data.frame(x = 2000:2015, y = ex.fitted$expectancy), aes(x = x, y = y), color = "seagreen") + geom_ribbon(data = data.frame(x = 2000:2015, l = ex.fitted$lower, u = ex.fitted$upper), aes(x = x, ymax = u, ymin = l), fill = "seagreen", alpha = .4) + geom_line(data = data.frame(x = 2016:2025, y = ex.pred$expectancy), aes(x = x, y = y), color = "red") + geom_ribbon(data = data.frame(x = 2016:2025, l = ex.pred$lower, u = ex.pred$upper), aes(x = x, ymax = u, ymin = l), fill = "red", alpha = .4) + labs(y = "Expectancy (years)", x = "Year") sessionInfo()