####################################################
### R code to reproduce the submitted manuscript
###
### "BayesMortalityPlus:
###  package in R for Bayesian graduation of
###  mortality modelling"
###
###  Authors: Lucas M. F. Silva , Luiz F. V. Figueiredo,
###  Viviana G. R. Lobo, Thais C. O. Fonseca, Mariane, B. Alves
###  Data: February 2024
####################################################

# Dependencies for running the replication script can be installed via
# install.packages(c("ggplot2", "dplyr", "magrittr"))
# install.packages("BayesMortalityPlus")

## libraries
library("ggplot2")
library("BayesMortalityPlus")
library("scales")
library("dplyr")
library("tidyr")

## plot options
options(scipen=999)

#----------------------------------------------------------
#' Section 2 Bayesian graduation via Heligman-Pollard model

#' Figure 1: An illustration of the Heligman-Pollard curve: Progress with the age of the logarithm of the probability of dying and of the logarithms of its three components.
hp_curve <- function(x, p){
  a = p[1];b = p[2];c = p[3];d = p[4];e = p[5];f = p[6];g = p[7];h = p[8]
  k = p[9]
  c1 = a^((x+b)^c)
  c2 = d*exp(-e*log(x/f)^2)
  c3 = g*h^x
  # c3 = g*h^x/(1+k*g*h^x)
  curva = c1 + c2 + c3
  return(list(c1 = c1, c2 = c2, c3 = c3, curva = curva))
}

x = 0:90
param = c(5.04e-4, 7.49e-2, 1.16e-1, 8.89e-4, 6.28, 24.35, 5.7e-5, 1.09)

cur = hp_curve(x, param)
data = data.frame(Idade = x, m = unlist(cur$curva), c1 = c(cur$c1), c2 = c(cur$c2), c3 = c(cur$c3))
data = as.data.frame(data)

p = ggplot(data) +
  scale_y_continuous(trans = "log10", breaks = 10^-seq(0,5),
                     limits = 10^-c(5,0), labels = scales::comma) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)) +
  theme_bw() +
  theme(legend.position = "bottom") +
  labs(x = "Age", y = "Death probability (qx)", title = NULL) +
  geom_line(aes(x = Idade, y = m, col = "1")) +
  geom_line(aes(x = Idade, y = c1, col = '2')) +
  geom_line(aes(x = Idade, y = c2, col = "3")) +
  geom_line(aes(x = Idade, y = c3, col = "4")) +
  scale_color_manual(name = NULL, values = c("black", rainbow(3)),
                     label = c("full curve", "infant mortality", "accident hump", "old age mortality"))
suppressWarnings(print(p))


#### installing and loading
## install.packages("BayesMortalityPlus")

library("BayesMortalityPlus")


#----------------------------------------------------------
#' Section 4 - Static graduation with BayesMortalityPlus

### loading dplyr to facilitate filtering the data
data("USA")
library("dplyr")

data <- select(USA, Year, Age, Ex.Total, Dx.Total) %>%
  filter(Year %in% c(1980, 1990, 2000, 2010, 2019) & Age %in% 0:80) %>%
  mutate(mx = Dx.Total / Ex.Total, Age = as.numeric(Age)
  )

x.labs <- c("USA 1980", "USA 1990", "USA 2000", "USA 2010", "USA 2019")
ggplot(data) +
  scale_y_continuous(trans = "log10", breaks = 10^-seq(0, 5), 
                     limits = 10^-c(5, 0), labels = scales::comma) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)
                     ) + 
  theme_bw() + theme(legend.position = "bottom") + 
  labs(x = "Age", y = "Raw Mortality Rate", title = NULL) +
  geom_point(aes(x = Age, y = mx, col = as.factor(Year))
             ) + 
  scale_color_manual(name = NULL, values = c(rainbow(5)), label = x.labs)


#----------------------------------------------------------
#' Section 4.1 - Heligman-Pollard model

### modelling with hp()
set.seed(111)
fit_1980 <- hp(0:80, data$Ex.Total[data$Year == 1980],
               data$Dx.Total[data$Year == 1980], model = "lognormal")
set.seed(112)
fit_1990 <- hp(0:80, data$Ex.Total[data$Year == 1990],
               data$Dx.Total[data$Year == 1990], model = "lognormal")
set.seed(113)
fit_2000 <- hp(0:80, data$Ex.Total[data$Year == 2000],
               data$Dx.Total[data$Year == 2000], model = "lognormal")
set.seed(114)
fit_2010 <- hp(0:80, data$Ex.Total[data$Year == 2010],
               data$Dx.Total[data$Year == 2010], model = "lognormal")
set.seed(115)
fit_2019 <- hp(0:80, data$Ex.Total[data$Year == 2019],
               data$Dx.Total[data$Year == 2019], model = "lognormal")

### summary
summary(fit_1980)

### estimated mortality rates and credible intervals
set.seed(116)
fitted(fit_1980, age = c(0, 20, 40, 60, 80))

#' Figure 2: Posterior summaries via HP: Median mortality curve and 95% predictive credible interval in log-scale. The United States, total population, ages 0-80 and year 1980. The black dots represent the raw mortality rates.
plot(fit_1980, labels = "US 1980", plotIC = TRUE, plotData = TRUE)

#' Modelling with hp() function and Poisson model
set.seed(117)
fit_1980_poi <- hp(0:80, data$Ex.Total[data$Year == 1980],
                   data$Dx.Total[data$Year == 1980], model = "poisson")

#' Figure 3: Posterior summaries via HP: Zoomed in median mortality curve and  99\% predictive credible interval in log-scale for the Log-Normal and Poisson models. The United States, total population, ages 0-80 and year 1980.
plot(list(fit_1980, fit_1980_poi), labels = c("Log-Normal", "Poisson"),
     plotData = F, age = 0:40, prob = 0.99, colors = c("seagreen", "darkred")
     )

#' Table 2: Posterior summaries: Median and 95% credibility interval for the Log-Normal for years 1980, 1990, 2000, 2010 and 2019.
summary(fit_1980)
summary(fit_1990)
summary(fit_2000)
summary(fit_2010)
summary(fit_2019)

#' Figure 4: Posterior summaries via HP: Median mortality curve in log-scale. The United States, total population, ages 0-80 and years 1980, 1990, 2000, 2010, 2019.
fits <- list(fit_1980, fit_1990, fit_2000, fit_2010, fit_2019)
labels <- c("US 1980", "US 1990", "US 2000", "US 2010", "US 2019")

plot(fits, labels = labels, plotData = FALSE, plotIC = FALSE)


#----------------------------------------------------------
### informative prior incorporation
#### artificial data creation
library(truncnorm)
USA2010 = USA %>% filter(Year == 2010, Age <= 90)
set.seed(1)
sigma2 = c(rep(10, 31), rep(1, 91 - 31)) 
Ex = USA2010$Ex.Total / 50 
Dx = USA2010$Dx.Total / 50
set.seed(14)
Dx = rtruncnorm(91, a = 0, mean = Dx, 
                sd = sqrt(Ex * (Dx / Ex) * (1 - Dx / Ex)) * sqrt(sigma2)
                )

#### fitting past experience
USA2009 = USA %>% filter(Year == 2009, Age <= 90)
set.seed(14)
hp_2009 <- hp(0:90, Ex = USA2009$Ex.Total[1:91],
              Dx = USA2009$Dx.Total[1:91], model = "lognormal", M = 50000)
prior <- summary(hp_2009)$mean
prior_sd <- summary(hp_2009)$sd  

#### informative prior incorporation
set.seed(14)
hp_2010 <- hp(0:90, Ex = Ex, Dx = Dx, model = "lognormal", M = 50000,
              m = c(NA, prior[2], NA, prior[4], NA, NA, NA, NA), 
              v = c(NA, prior_sd[2]^3, NA, prior_sd[4]^3, NA, NA, NA, NA))

#' Figure 5: Posterior summaries via HP: Raw mortality rates motivation for past experience incorporation through informative prior distributions (left) and resulting fitted curve with 95\% credible intervals (right) for the United States. Total population, ages 0-90 years old and year 2010 - artificial dataset.
ggplot() + 
  scale_y_continuous(trans = "log10", breaks = 10^-seq(0, 5), 
                     limits = 10^-c(5, 0), labels = scales::comma) +
  scale_x_continuous(breaks = seq(0, 100, by = 10)
                     ) + theme_bw() +
  theme(legend.position = "bottom", legend.text = element_text(size = 12),
        axis.text = element_text(color = 'black', size = 12)
        ) +
  labs(x = "Age", y = "Raw Mortality Rate", title = NULL, color = "") +
  geom_point(data = data.frame(x = 0:90, y = Dx / Ex), 
             aes(x = x, y = y, col = "artificial USA 2010")
             ) +
  geom_point(data = data.frame(x = 0:90,
                               y = (USA2009$Dx.Total / USA2009$Ex.Total)[1:91]),
             aes(x = x, y = y, col = "USA 2009"), pch = 16) +
  scale_color_manual(name = NULL, values = c("seagreen", "red4"),
                     label = c("artificial USA 2010", "USA 2009")
                     )

plot(hp_2010, labels = "artificial USA 2010")

#' Figure 6: Posterior summaries via HP: traces of the chains for the estimation of the parameters after incorporating prior information. The United States, total population, ages 0-90 and year 2010 - artificial dataset.
plot_chain(hp_2010)

#----------------------------------------------------------
### life expectancy
#### example for life expectancy
set.seed(120)
expectancy(fit_1980, age = c(0, 20, 40, 60, 80), graph = FALSE)


#' Figure 7: Posterior summaries via HP: Life expectancy graduation for the United States. Total population, ages 0-80 and years 1980, 1990, 2000, 2010, and 2019.
set.seed(121)
ex1 <- expectancy(fit_1980, age = seq(0, 80, by = 10), graph = FALSE) %>%
  cbind(year = rep(1980, 9))
set.seed(122)
ex2 <- expectancy(fit_1990, age = seq(0, 80, by = 10), graph = FALSE) %>%
  cbind(year = rep(1990, 9))
set.seed(123)
ex3 <- expectancy(fit_2000, age = seq(0, 80, by = 10), graph = FALSE) %>%
  cbind(year = rep(2000, 9))
set.seed(124)  
ex4 <- expectancy(fit_2010, age = seq(0, 80, by = 10), graph = FALSE) %>%
  cbind(year = rep(2010, 9))
set.seed(125)
ex5 <- expectancy(fit_2019, age = seq(0, 80, by = 10), graph = FALSE) %>%
  cbind(year = rep(2019, 9))

aux.ex <- rbind(ex1, ex2, ex3, ex4, ex5)

ggplot(data=aux.ex) + theme_light() +
  theme(axis.title.x = element_text(color = 'black', size = 12), 
        axis.title.y = element_text(color = 'black', size = 12),
        axis.text = element_text(color = 'black', size = 12),
        legend.text = element_text(size = 12), 
        legend.position = "bottom") +
  geom_line(aes(x = age, y = expectancy, color = as.factor(year))) +
  geom_ribbon(aes(x = age, ymin = ci.lower, ymax = ci.upper,
                  fill = as.factor(year)), alpha = 0.2) +
  labs(color = "Year", fill = "Year", x = "Age", y = "Expectancy (years)")

Heatmap(fits, x_lab = labels, age = 0:80)


#----------------------------------------------------------
### Reduced model
#' Figure 8: Posterior summaries via HP: Median mortality curve in log-scale via the reduced model. The United States, total population, ages 18-80 and year 2010. Black dots represent the raw mortality rates.
red_ex <- filter(data, Year == "2010", Age %in% 18:80)$Ex.Total
red_dx <- filter(data, Year == "2010", Age %in% 18:80)$Dx.Total
set.seed(126)
hp.fit2 <- hp(x = 18:80, Ex = red_ex, Dx = red_dx,
              model = "lognormal", reduced_model = TRUE)
plot(hp.fit2, plotIC = FALSE, labels = "HP fitted")

## Modelling 15-60 ages
data_aux <- data %>% filter(Age %in% 15:60, Year == 2010)

set.seed(127)
fit <- hp(15:60, data_aux$Ex.Total,
          data_aux$Dx.Total, model = "lognormal")

## Scenario 1: Modelling 15-50 ages with reduced HP model
data_aux <- data %>% filter(Age %in% 15:50, Year == 2010)
set.seed(128)
fit <- hp(15:50, data_aux$Ex.Total,
          data_aux$Dx.Total, model = "lognormal", reduced_model = TRUE)

#' Figure 9: Scenario 1: HP curve for young-adult mortality begins at age 15 and ends at age 50.
plot(fit)

## Summary
summary(fit)

## Scenario 2: Modelling 25-50 ages with reduced HP model
data_aux <- data %>% filter(Age %in% 25:50, Year == 2010)
set.seed(129)
fit <- hp(25:50, data_aux$Ex.Total,
          data_aux$Dx.Total, model = "lognormal", reduced_model = TRUE)

set.seed(130)
fit <- hp(25:50, data_aux$Ex.Total, data_aux$Dx.Total, model = "lognormal",
          m = c(NA, NA, NA, .0006, 7, 22, .0003, NA),
          v = c(NA, NA, NA, 1e-5, .9, .3, 1e-7, NA),
          reduced_model = TRUE)

#' Figure 10: Scenario 2: HP curve for young-adult mortality begins at age 25 and ends at age 50.
plot(fit)

## Summary
summary(fit)

#----------------------------------------------------------
### Mortality measurement at advanced ages and extrapolation
#### hp
set.seed(131)
hp.close1 <- hp_close(fit_2019, method = "hp")

#' Figure 11: Posterior summaries via HP: Mortality graduation with HP curve extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019.
plot(hp.close1, labels = "hp method")

#### plateau
set.seed(132)
hp.close2 <- hp_close(fit_2019, method = "plateau")
#' Figure 12: Posterior summaries via HP: Mortality graduation with plateau extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019.
plot(hp.close2, labels = "Plateau method")

#### linear
set.seed(133)
hp.close3 <- hp_close(fit_2019, method = "linear")
#' Figure 13: Posterior summaries via HP: Mortality graduation with linear extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019.
plot(hp.close3, labels = "Linear method")

#### Gompertz
set.seed(134)
hp.close4 <- hp_close(fit_2019, method = "gompertz")
#' Figure 14: Posterior summaries via HP: Mortality graduation with Gompertz extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019.
plot(hp.close4, labels = "Gompertz method")

#' Figure 15: Posterior summaries via HP: Mortality graduation with Gompertz extrapolation in log-scale. The United States, total population, ages 0-120 and year 2019. The age groups are colored to illustrate the mixing procedure.
df = fitted(hp.close4) %>%
  select(age, qx.fitted) %>%
  mutate(age_group = rep(c("HP model age", "Mixing age", "Closing age"),
                         c(73, 15, 33)))
ggplot(df) +
  theme_bw() +
  theme(plot.title = ggplot2::element_text(lineheight = 1.2),
        axis.title.x = ggplot2::element_text(color = 'black', size = 12),
        axis.title.y = ggplot2::element_text(color = 'black', size = 12),
        axis.text = ggplot2::element_text(color = 'black', size = 12),
        legend.text = ggplot2::element_text(size = 12),
        legend.position = "bottom") +
  scale_x_continuous(name = "Age", breaks = seq(0, 120, by = 10)) +
  scale_y_continuous(name = "qx", trans = "log10", labels = scales::comma,
                     limits = c(NA, 1)) +
  geom_line(aes(x = age, y = qx.fitted, col = age_group),
            linewidth = 0.8) +
  labs(col = NULL)


## Adding new data
new_Ex <- filter(USA, Year == 2019)$Ex.Total[82:101]
new_Dx <- filter(USA, Year == 2019)$Dx.Total[82:101]
set.seed(135)
hp.close4.alt <- hp_close(fit_2019, method = "gompertz", new_Ex = new_Ex,
                          new_Dx = new_Dx)

#' Figure 16: Posterior summaries via HP: Comparison between Gompertz extrapolations with and without new information in log-scale. The United States, total population, ages 0-120 and year 2019.
plot(list(hp.close4, hp.close4.alt),
     labels = c("Without new data", "With new data"))


#-----------------------------------------------------------
#' Section 4.2 - Mortality graduation via dynamic linear smoothers
## filtering data
y = data %>%
  mutate(logmx = log(mx)) %>%
  select(Year, Age, logmx) %>%
  pivot_wider(names_from = Year, values_from = logmx, id_cols = Age)

set.seed(136)
dlm_1980 <- dlm(y[[2]], delta = 0.85)
set.seed(137)
dlm_1990 <- dlm(y[[3]], delta = 0.85)
set.seed(138)
dlm_2000 <- dlm(y[[4]], delta = 0.85)
set.seed(139)
dlm_2010 <- dlm(y[[5]], delta = 0.85)
set.seed(140)
dlm_2019 <- dlm(y[[6]], delta = 0.85)

## summary
head(summary(dlm_1980)
)

#' Figure 17: Posterior summaries via DLM: Traces of the chains for the mean and variance of the process. The United States, total population, ages 0, 40 and 80 for the mean chains and year 1980.
params <- c("sigma2", "mu[0]", "mu[40]", "mu[80]")
plot_chain(dlm_1980, param = params)

#' Figure 18: Posterior summaries via DLM: Median mortality curve in log-scale. The United States, total population, ages 0-80 and years 1980, 1990, 2000, 2010, 2019.
fits <- list(dlm_1980, dlm_1990, dlm_2000, dlm_2010, dlm_2019)
plot(fits, labels = labels, plotData = FALSE, plotIC = FALSE)

#' Figure 19: Posterior summaries via DLM: Median mortality curve and predictive credible interval 95% in log-scale. The United States, total population, ages 0-80 and year 1980. The black dots represent the raw mortality rates.
plot(dlm_1980, labels = "US 1980", plotIC = TRUE, plotData = TRUE)


## expectancy
set.seed(141)
expectancy(dlm_1980, age = c(0, 20, 40, 60, 80), graph = FALSE)

#' Figure 20: Posterior summaries via DLM: Median mortality curve in log-scale to adult ages. The United States, total population, ages 18-80 and year 2010. Black dots represent the raw mortality rates
red_y <- filter(y, Age %in% 18:80)[[5]]
set.seed(142)
dlm.fit2 <- dlm(red_y, delta = 0.95, ages = 18:80)
plot(dlm.fit2, plotIC = FALSE, labels = "DLM fitted")


#----------------------------------------------------------
### Mortality measurement at advanced ages and extrapolation
new_data <- log(new_Dx / new_Ex)
set.seed(143)
dlm.close1 <- dlm_close(dlm_2019, method = "plateau", max_age = 100)
set.seed(144)
dlm.close2 <- dlm_close(dlm_2019, method = "linear", max_age = 100,
                        new_data = new_data)
set.seed(145)
dlm.close3 <- dlm_close(dlm_2019, method = "gompertz", max_age = 100,
                        new_data = new_data)

#' Figure 21: Posterior summaries via DLM: Mortality graduation with different closing methods in log-scale. The United States, total population, ages 70-100 and year 2019.
new_Ex <- filter(USA, Year == 2019)$Ex.Total[1:101]
new_Dx <- filter(USA, Year == 2019)$Dx.Total[1:101]
new_y <- log(new_Dx / new_Ex)
set.seed(146)
dlm.fit3 <- dlm(new_y, delta = 0.85)

plot(list(dlm.fit3, dlm.close1, dlm.close2, dlm.close3), plotIC = FALSE,
     plotData = FALSE, age = 70:100,
     labels = c("DLM fitted", "Plateau", "Linear", "Gompertz"),
     linetype = c("twodash", "solid", "solid", "solid")) +
  guides(colour = guide_legend(override.aes = list(linetype = c(6, 1,
                                                                1, 1)))
  )

#----------------------------------------------------------
### Extrapolation for dynamic linear smoothers
set.seed(147)
dlm.fit4 <- predict(dlm_2019, h = 20, prob = 0.95)
head(dlm.fit4)

#' Figure 22: Posterior summaries via DLM: Prediction for the United States. Total population, ages 0-100 and year 2019
plot(dlm_2019, plotIC = FALSE, plotData = FALSE) +
  geom_line(data = dlm.fit4,
            aes(x = age, y = qx.fitted, col = "Predict")
  ) +
  geom_ribbon(data = dlm.fit4,
              aes(x = age, ymin = qx.lower, ymax = qx.upper, fill = "Predict"),
              alpha = 0.4) +
  scale_color_manual(values = c("seagreen", "red"),
                     label = c("DLM fitted", "Predict")
  ) +
  guides(fill = "none", linetype = "none") + labs(colour = "")

## Filtering data
data <- select(USA, Year, Age, Ex.Male, Dx.Male) %>%
  filter(Year == 1980 & Age %in% 0:110) %>%
  mutate(logmx = log(Dx.Male / Ex.Male)) %>%
  select(Age, logmx)

## DLM with fixed discount factor
set.seed(148)
dlm_1980_1 = dlm(data$logmx, delta = 0.85)

## DLM with varying discount factor
set.seed(149)
dlm_1980_2 = dlm(data$logmx, delta = rep(c(0.99, 0.6, 0.85), c(5, 31, 75))
                 )

#' Figure 23: Posterior summaries via DLM: (a) the mortality curve fitted considering static discount factor per age equal 0.85 and,  (b) the mortality curve fitted considering different discount factor per age, 0.80 for rage 0-35 and 0.90 after 35 years old.
plot(dlm_1980_1)
plot(dlm_1980_2)


# -------------------------------------------------------------------------
#' Section 5 - Bayesian Lee-Carter model
data("PT")
Y <- PT
head(Y[,1:4])

### fitting the BLC model
set.seed(150)
fit.blc <- blc(Y)

### estimation
set.seed(151)
head(fitted(fit.blc)[[1]][, 1:4])

#' Figure 24: Posterior summaries via BLC: Median mortality curves in log-scale. Portugal, total population, ages 18-80 and years 2000-2015.
plot(fit.blc, parameter = "fitted", ages = 18:80)

#' Figure 25: Posterior summaries via BLC: Means and variance for each parameter of the fitted BLC model. Portugal, total population, ages 18-80 and years 2000-2015.
plot(fit.blc, parameter = "all", ages = 18:80, year = 2000:2015)

### improvement estimation
set.seed(152)
head(improvement(fit.blc, prob = 0.95)
)

### life expectancy
set.seed(153)
expectancy(fit.blc, at = c(1, 21, 41, 61))$expectancy[, 1:4]


# -------------------------------------------------------------------------
### Forecast for fitted BLC models
set.seed(154)
fit.blc2 <- predict(fit.blc, h = 10)
print(fit.blc2)

### estimated mortality rates for forecast
set.seed(155)
head(fitted(fit.blc2)$mean[, 1:4])

### life expectancy for the forecast
set.seed(156)
expectancy(fit.blc2, at = c(1, 21, 41, 61))$expectancy[, 1:4]

### life expectancy forecast behavior at birth
set.seed(157)
ex.fitted <- expectancy(fit.blc, at = 1)
ex.pred <- expectancy(fit.blc2, at = 1)
ex.obs <- apply(Y, 2, function(x) sum(cumprod(1 - (1 - exp(-exp(x))))))

#' Figure 26: Posterior summaries via BLC: Fitted and predicted life expectancy at age 18 by year. Portugal, total population, ages 18-80 and years 2000-2015. The black dots represent the observed life expectancy.
ggplot(NULL) + theme_bw() +
  theme(legend.text = element_text(size = 12)) +
  geom_point(data = data.frame(x = 2000:2015, y = ex.obs),
             aes(x = x, y = y), color = "black") +
  geom_line(data = data.frame(x = 2000:2015, y = ex.fitted$expectancy),
            aes(x = x, y = y), color = "seagreen") + 
  geom_ribbon(data = data.frame(x = 2000:2015, l = ex.fitted$lower, 
                                u = ex.fitted$upper),
              aes(x = x, ymax = u, ymin = l), fill = "seagreen", alpha = .4) +
  geom_line(data = data.frame(x = 2016:2025, y = ex.pred$expectancy),
            aes(x = x, y = y), color = "red") + 
  geom_ribbon(data = data.frame(x = 2016:2025, l = ex.pred$lower,
                                u = ex.pred$upper),
              aes(x = x, ymax = u, ymin = l), fill = "red", alpha = .4) +
  labs(y = "Expectancy (years)", x = "Year") 


sessionInfo()