## ---- include = FALSE-----------------------------------------------------------------------------------
library("deepregression")
library("keras")
library("tfprobability")
library("mgcv")

## -----------------------------------------------------------------------------------------------------
url <- "https://github.com/davidruegamer/airbnb/raw/main/munich_clean_text.RDS"
destfile <- file.path(getwd(), "munich.RDS")
download.file(url, destfile, mode = "wb")
airbnb <- readRDS("munich.RDS")
airbnb$days_since_last_review <- as.numeric(difftime(airbnb$date, airbnb$last_review))

list_of_formulas <- list(
  loc = ~ 1 + x + s(z1, bs = "tp"),
  scale = ~ 0 + s(z2, bs = "ps") + dnn(u)
)

## pictures
url1 <- "https://github.com/davidruegamer/deepregression_tutorial/raw/main/32_1.zip"
destfile <- file.path(getwd(), "32_1.zip")
download.file(url1, destfile, mode = "wb")
unzip("32_1.zip")
url2 <- "https://github.com/davidruegamer/deepregression_tutorial/raw/main/32_2.zip"
destfile <- file.path(getwd(), "32_2.zip")
download.file(url2, destfile, mode = "wb")
unzip("32_2.zip")


## -----------------------------------------------------------------------------------------------------
y <- log(airbnb$price)


## -----------------------------------------------------------------------------------------------------
list_of_formulas <- list(
  loc = ~ 1 + te(latitude, longitude, df = 5),
  scale = ~ 1
)


## -----------------------------------------------------------------------------------------------------
mod_simple <- deepregression(
  y = y, 
  data = airbnb, 
  list_of_formulas = list_of_formulas,
  list_of_deep_models = NULL
)


## -----------------------------------------------------------------------------------------------------
class(mod_simple$model)


## -----------------------------------------------------------------------------------------------------
str(mod_simple$init_params$parsed_formulas_contents$loc,1)
sapply(mod_simple$init_params$parsed_formulas_contents$loc, "[[", "term")


## -----------------------------------------------------------------------------------------------------
deep_model <- function(x)
{
  x %>% 
    layer_dense(units = 5, activation = "relu", use_bias = FALSE) %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 3, activation = "relu") %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 1, activation = "linear")
}


## -----------------------------------------------------------------------------------------------------
options(identify_intercept = TRUE)
mod <- deepregression(
  y = y, 
  data = airbnb,
  list_of_formulas = list(
    location = ~ 1 + beds + s(accommodates, bs = "ps") +
      s(days_since_last_review, bs = "tp") + 
      deep(review_scores_rating, reviews_per_month),
    scale = ~1
  ),
  list_of_deep_models = list(deep = deep_model)
)


## -----------------------------------------------------------------------------------------------------
mod %>% fit(
  epochs = 100, 
  verbose = FALSE, 
  view_metrics = FALSE,
  validation_split = 0.2
  )


## -----------------------------------------------------------------------------------------------------
fitted_vals <- mod %>% fitted()
cor(fitted_vals, y)


## ---- cv_plot-----------------------------------------------------------------------------------------
mod_cv <- deepregression(
  y = y, 
  data = airbnb,
  list_of_formulas = list(
    location = ~ 1 + beds + s(accommodates, bs = "ps") +
      s(days_since_last_review, bs = "tp") + 
      deep(review_scores_rating, reviews_per_month),
    scale = ~1
  ),
  list_of_deep_models = list(deep = deep_model)
)

res_cv <- mod_cv %>% cv(
  plot = FALSE, 
  cv_folds = 3,
  epochs = 100
  )
plot_cv(res_cv)


## -----------------------------------------------------------------------------------------------------
coef(mod, type = "linear")


## ---- plot_mod, fig.height = 4, fig.width = 8-------------------------------------------------------------
par(mfrow = c(1,2))
plot(mod)


## -----------------------------------------------------------------------------------------------------
dist <- mod %>% get_distribution()
str(dist, 1)


## ---- quant_mean_plot---------------------------------------------------------------------------------
first_obs_airbnb <- as.data.frame(airbnb)[1, , drop = FALSE]
dist1 <- mod %>% get_distribution(first_obs_airbnb)
meanval <- mod %>% mean(first_obs_airbnb) %>% c
q05 <- mod %>% quant(data = first_obs_airbnb, probs = 0.05) %>% c
q95 <- mod %>% quant(data = first_obs_airbnb, probs = 0.95) %>% c
xseq <- seq(q05 - 1, q95 + 1, l = 1000)
plot(xseq, sapply(xseq, function(x) c(as.matrix(
     dist1$tensor_distribution$prob(x)))), 
     type = "l", ylab = "density(price)", xlab = "price")
abline(v = c(q05, meanval, q95), col = "red", lty = 2)


## -----------------------------------------------------------------------------------------------------
str(mod_simple$model$trainable_weights, 1)


## -----------------------------------------------------------------------------------------------------
lambda <- 0.5
addpen <- function(x) lambda * tf$reduce_sum(tf$abs(mod_simple$model$trainable_weights[[1]]))
mod_simple_pen <- deepregression(
  y = y, 
  data = airbnb, 
  list_of_formulas = list_of_formulas,
  list_of_deep_models = NULL, 
  additional_penalty = addpen
)


## ---- smooth_plot_comparison, fig.height = 3, fig.width = 8-----------------------------------------------
form_df_3 <- list(loc = ~ 1 + s(days_since_last_review, df = 3), scale = ~ 1)
form_df_6 <- list(loc = ~ 1 + s(days_since_last_review, df = 6), scale = ~ 1)
form_df_10 <- list(loc = ~ 1 + s(days_since_last_review, df = 10), scale = ~ 1)
args <- list(y = y, data = airbnb, 
             list_of_deep_models = NULL)

mod_df_low <- do.call("deepregression", c(args, list(list_of_formulas = form_df_3)))
mod_df_med <- do.call("deepregression", c(args, list(list_of_formulas = form_df_6)))
mod_df_max <- do.call("deepregression", c(args, list(list_of_formulas = form_df_10)))

mod_df_low %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE)
mod_df_med %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE)
mod_df_max %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE)

par(mfrow = c(1,3))
plot(mod_df_low)
plot(mod_df_med)
plot(mod_df_max)


## -----------------------------------------------------------------------------------------------------
embd_mod <- function(x) x %>%
  layer_embedding(input_dim = 1000,
                  output_dim = 100) %>%
  layer_lambda(f = function(x) k_mean(x, axis = 2)) %>%
  layer_dense(20, activation = "tanh") %>% 
  layer_dropout(0.3) %>% 
  layer_dense(2) 


## -----------------------------------------------------------------------------------------------------
mod <- deepregression(
  y = y,
  list_of_formulas = 
    list(
      location = ~ 1,
      scale = ~ 1,
      both = ~ 0 + embd_mod(texts)
    ),
  list_of_deep_models = 
    list(embd_mod = embd_mod), 
  mapping = list(1,2,1:2),
  data = airbnb
)


## -----------------------------------------------------------------------------------------------------
embd_mod <- function(x) x %>%
  layer_embedding(input_dim = 1000,
                  output_dim = 100) %>%
  layer_lambda(f = function(x) k_mean(x, axis = 2)) %>%
  layer_dense(20, activation = "tanh") %>% 
  layer_dropout(0.3) %>% 
  layer_dense(1) 

form_lists <- list(
  location = ~ 1 + embd_mod(texts),
  scale = ~ 1
)
  
mod <- deepregression(
  y = y,
  list_of_formulas = form_lists,
  list_of_deep_models = 
    list(embd_mod = embd_mod), 
  data = airbnb
  )


## -----------------------------------------------------------------------------------------------------
set.seed(42)
n <- 1000
toyX <- rnorm(n)
toyY <- 2 * toyX + rnorm(n)    


## -----------------------------------------------------------------------------------------------------
deep_model <- function(x)
{
  x %>% 
    layer_dense(units = 100, activation = "relu", use_bias = FALSE) %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 50, activation = "relu") %>%
    layer_dropout(rate = 0.2) %>%
    layer_dense(units = 1, activation = "linear")
}


## -----------------------------------------------------------------------------------------------------
forms <- list(loc = ~ -1 + toyX + deep_model(toyX), scale = ~ 1)
args <- list(
  y = toyY, 
  data = data.frame(toyX = toyX), 
  list_of_formulas = forms, 
  list_of_deep_models = list(deep_model = deep_model)
)

w_oz <- orthog_control(orthogonalize = TRUE)
wo_oz <- orthog_control(orthogonalize = FALSE)
  
mod_w_oz <- do.call("deepregression", c(args, list(orthog_options = w_oz)))
mod_wo_oz <- do.call("deepregression", c(args, list(orthog_options = wo_oz)))

mod_w_oz %>% fit(epochs = 1000, early_stopping = TRUE, batch_size = 50, verbose = FALSE)
mod_wo_oz %>% fit(epochs = 1000, early_stopping = TRUE, batch_size = 50, verbose = FALSE)

cbind(
  with = c(coef(mod_w_oz, which_param = 1)[[1]]),
  without = c(coef(mod_wo_oz, which_param = 1)[[1]]),
  linmod = coef(lm(toyY ~ 0 + toyX))
)

create_family(tfd_dist = tfd_normal, trafo_list = list(
  function(x) tf$math$reciprocal(x),
  function(x) tf$math$square(x)
  )
)


## -----------------------------------------------------------------------------------------------------
function(x){ 
  do.call(your_tfd_dist, 
    lapply(1:ncol(x)[[1]],
      function(i) your_trafo_list_on_inputs[[i]](
        x[,i,drop = FALSE])
    )
  )
}


## -----------------------------------------------------------------------------------------------------
toyXinDisguise <- toyX


## -----------------------------------------------------------------------------------------------------
form_known <- list(loc = ~ -1 + toyX + deep_model(toyX), scale = ~ 1)
form_unknown <- list(loc = ~ -1 + toyX + deep_model(toyXinDisguise), scale = ~ 1)
form_manual <- list(loc = ~ -1 + toyX + deep_model(toyXinDisguise) %OZ% (toyXinDisguise), scale = ~ 1)
args <- list(
  y = toyY, 
  data = data.frame(toyX = toyX, toyXinDisguise = toyXinDisguise), 
  list_of_deep_models = list(deep_model = deep_model)
)

mod_known <- do.call("deepregression", c(args, list(list_of_formulas = form_known)))
mod_unknown <- do.call("deepregression", c(args, list(list_of_formulas = form_unknown)))
mod_manual <- do.call("deepregression", c(args, list(list_of_formulas = form_manual)))

mod_known %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE)
mod_unknown %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE)
mod_manual %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE)

cbind(
  known = coef(mod_known, which_param = 1)[[1]],
  unknown = coef(mod_unknown, which_param = 1)[[1]],
  manual = coef(mod_manual, which_param = 1)[[1]]
)


## -----------------------------------------------------------------------------------------------------
airbnb$image <- paste0(getwd(), "/32/", airbnb$id, ".jpg")


## -----------------------------------------------------------------------------------------------------
cnn_block <- function(filters, kernel_size, pool_size, rate, input_shape = NULL){
    function(x){
      x %>% 
        layer_conv_2d(filters, kernel_size, padding = "same", input_shape = input_shape) %>% 
        layer_activation(activation = "relu") %>% 
        layer_batch_normalization() %>% 
        layer_max_pooling_2d(pool_size = pool_size) %>% 
        layer_dropout(rate = rate)
    }
  }


## -----------------------------------------------------------------------------------------------------
cnn <- cnn_block(filters = 16, kernel_size = c(3,3), pool_size = c(3,3), rate = 0.25,
                 shape(200, 200, 3))
deep_model_cnn <- function(x){
    x %>% 
    cnn() %>%
    layer_flatten() %>% 
    layer_dense(32) %>% 
    layer_activation(activation = "relu") %>% 
    layer_batch_normalization() %>% 
    layer_dropout(rate = 0.5) %>% 
    layer_dense(1)
}


## -----------------------------------------------------------------------------------------------------
mod_cnn <- deepregression(
  y = y,
  list_of_formulas = list(
    ~1 + room_type + bedrooms + beds + 
      deep_model_cnn(image), 
    ~1 + room_type), 
  data = airbnb,
  list_of_deep_models = list(deep_model_cnn = list(deep_model_cnn, c(200,200,3))),
  optimizer = optimizer_adam(learning_rate = 0.0001)
)


## ---- eval = FALSE--------------------------------------------------------------------------------------
# mod_cnn %>% fit(
#   epochs = 100,
#  early_stopping = TRUE,
#   patience = 5,
#   verbose = FALSE)

## -----------------------------------------------------------------------------------------------------
coef(mod_cnn)


## -----------------------------------------------------------------------------------------------------
mod <- deepregression(
  y = y, 
  data = airbnb,
  list_of_formulas = list(
    location = ~ 1 + s(longitude),
    scale = ~ 1 + s(longitude)
  ),
  list_of_deep_models = NULL,
  optimizer = optimizer_adam(learning_rate = 0.1, decay = 1e-4)
)

ens <- ensemble(mod, n_ensemble = 5, epochs = 1000, early_stopping = TRUE,
                validation_split = 0.2)


## -----------------------------------------------------------------------------------------------------
mems <- deepregression:::.call_for_all_members(ens, get_distribution)
ensd <- get_ensemble_distribution(ens)

obs_idx <- 1

ys <- seq(min(y), max(y), length.out = 1e3)
mem_preds <- do.call("cbind", lapply(mems, function(x) {
  tfd_prob(x$tensor_distribution, ys)$numpy()[obs_idx, ]
}))
ens_preds <- tfd_prob(ensd, ys)$numpy()[obs_idx, ]

avg_nll <- mean(unlist(lapply(mems, function(x) - mean(as.matrix(tfd_log_prob(x, y))))))
ens_nll <- - mean(as.matrix(tfd_log_prob(ensd, y)))

# pdat <- data.frame(y = ys, preds = mem_preds, obs = y[obs_idx])
# write.csv(pdat, "ens-dens.csv", row.names = FALSE, quote = FALSE)

# pdf("ens-dens.pdf", height = 4, width = 6)
matplot(ys, mem_preds, type = "l", lty = 2, col = "gray80", xlab = "log-price",
        ylab = "predicted density", las = 1, lwd = 1.3)
abline(v = y[obs_idx], col = "cornflowerblue")
lines(ys, ens_preds, lwd = 1.3)
legend("topright", c("Individual prediction", "Ensemble prediction", "Observation"),
       col = c("gray80", "black", "cornflowerblue"), lty = c(2, 1, 1), lwd = 2, bty = "n")
# legend("right", paste(c("Average    NLL:", "Ensemble NLL:"), 
                      # round(c(avg_nll, ens_nll), 3)), bty = "n")
# dev.off()


## -----------------------------------------------------------------------------------------------------
lons <- seq(min(airbnb$longitude), max(airbnb$longitude), length.out = 1e3)
nd <- data.frame(longitude = lons)
mems <- deepregression:::.call_for_all_members(ens, get_distribution, data = nd)
ensd <- get_ensemble_distribution(ens, data = nd)
mem_preds <- do.call("cbind", lapply(mems, function(x) as.matrix(tfd_mean(x))))
ens_preds <- as.matrix(tfd_mean(ensd))
alea <- as.matrix(tfd_stddev(ensd))
unc <- apply(mem_preds, 1, sd)

# pdat <- data.frame(lons = lons, preds = mem_preds, ens_preds = ens_preds, unc = unc)
# write.csv(pdat, "ens-pred.csv", row.names = FALSE, quote = FALSE)
# 
# lons <- pdat$lons
# mem_preds <- pdat[, 2:6]
# ens_preds <- pdat$ens_preds

# pdf("ens-pred.pdf", height = 4, width = 6)
matplot(lons, mem_preds, lty = 2, col = "gray50", type = "l", las = 1,
        xlab = "longitude", ylab = "Expected log-price", lwd = 1.3)
lines(lons, ens_preds, lwd = 1.3)
lines(lons, ens_preds - unc, lwd = 1.3)
lines(lons, ens_preds + unc, lwd = 1.3)
polygon(c(lons, rev(lons)), c(ens_preds - unc, rev(ens_preds + unc)),
        col = rgb(.1, .1, .1, .1), border = NA)
# points(airbnb$longitude, y, pch = 20, cex = 0.5, col = rgb(.1, .1, .1, .1))
legend("topright", c("Individual prediction", "Ensemble prediction"),
       col = c("gray50", "black"), lty = c(2, 1), lwd = 2, bty = "n")
# dev.off()

## -----------------------------------------------------------------------------------------------------
mod <- deepregression(
  y = y, 
  data = airbnb,
  list_of_formulas = list(
    location = ~ 1 + s(longitude),
    scale = ~ 1 + s(longitude)
  ),
  list_of_deep_models = NULL,
  optimizer = optimizer_adam(learning_rate = 0.1, decay = 1e-4)
)

ens <- ensemble(mod, n_ensemble = 5, epochs = 1000, early_stopping = TRUE,
                validation_split = 0.2)


## -----------------------------------------------------------------------------------------------------
deep_model <- function(x)
{
  x %>% 
    layer_dense(units = 5, activation = "relu", use_bias = FALSE) %>%
    layer_dense(units = 3, activation = "linear", name = "penultimate_layer") %>%
    layer_dense(units = 1, activation = "linear")
}


## -----------------------------------------------------------------------------------------------------
mod <- deepregression(
  y = y, 
  data = airbnb,
  list_of_formulas = list(
    location = ~ 1 + beds + s(accommodates, bs = "ps") +
      s(days_since_last_review, bs = "tp") + 
      deep(review_scores_rating, reviews_per_month),
    scale = ~1
  ),
  list_of_deep_models = list(deep = deep_model)
)


## -----------------------------------------------------------------------------------------------------
mod %>% fit(
  epochs = 100, 
  verbose = FALSE, 
  view_metrics = FALSE,
  validation_split = 0.2
  )


## -----------------------------------------------------------------------------------------------------
intermediate_mod <- keras_model(mod$model$input, mod$model$get_layer(
  name = "penultimate_layer")$output
  )
newdata_processed <- deepregression:::prepare_newdata(
  mod$init_params$parsed_formulas_contents, 
  airbnb, 
  gamdata = mod$init_params$gamdata$data_trafos
  )
tilde_u <- as.data.frame(intermediate_mod$predict(newdata_processed))
str(tilde_u, 1)


## -----------------------------------------------------------------------------------------------------
gam_mod <- gam(log(price) ~ 
  1 + beds + s(accommodates, bs = "ps") + 
  s(days_since_last_review, bs = "tp") + 
    V1 + V2 + V3,
  data = 
    cbind(as.data.frame(airbnb), 
      tilde_u)
  )
plot(gam_mod, pages = 1)

sessionInfo()