## ---- include = FALSE----------------------------------------------------------------------------------- library("deepregression") library("keras") library("tfprobability") library("mgcv") ## ----------------------------------------------------------------------------------------------------- url <- "https://github.com/davidruegamer/airbnb/raw/main/munich_clean_text.RDS" destfile <- file.path(getwd(), "munich.RDS") download.file(url, destfile, mode = "wb") airbnb <- readRDS("munich.RDS") airbnb$days_since_last_review <- as.numeric(difftime(airbnb$date, airbnb$last_review)) list_of_formulas <- list( loc = ~ 1 + x + s(z1, bs = "tp"), scale = ~ 0 + s(z2, bs = "ps") + dnn(u) ) ## pictures url1 <- "https://github.com/davidruegamer/deepregression_tutorial/raw/main/32_1.zip" destfile <- file.path(getwd(), "32_1.zip") download.file(url1, destfile, mode = "wb") unzip("32_1.zip") url2 <- "https://github.com/davidruegamer/deepregression_tutorial/raw/main/32_2.zip" destfile <- file.path(getwd(), "32_2.zip") download.file(url2, destfile, mode = "wb") unzip("32_2.zip") ## ----------------------------------------------------------------------------------------------------- y <- log(airbnb$price) ## ----------------------------------------------------------------------------------------------------- list_of_formulas <- list( loc = ~ 1 + te(latitude, longitude, df = 5), scale = ~ 1 ) ## ----------------------------------------------------------------------------------------------------- mod_simple <- deepregression( y = y, data = airbnb, list_of_formulas = list_of_formulas, list_of_deep_models = NULL ) ## ----------------------------------------------------------------------------------------------------- class(mod_simple$model) ## ----------------------------------------------------------------------------------------------------- str(mod_simple$init_params$parsed_formulas_contents$loc,1) sapply(mod_simple$init_params$parsed_formulas_contents$loc, "[[", "term") ## ----------------------------------------------------------------------------------------------------- deep_model <- function(x) { x %>% layer_dense(units = 5, activation = "relu", use_bias = FALSE) %>% layer_dropout(rate = 0.2) %>% layer_dense(units = 3, activation = "relu") %>% layer_dropout(rate = 0.2) %>% layer_dense(units = 1, activation = "linear") } ## ----------------------------------------------------------------------------------------------------- options(identify_intercept = TRUE) mod <- deepregression( y = y, data = airbnb, list_of_formulas = list( location = ~ 1 + beds + s(accommodates, bs = "ps") + s(days_since_last_review, bs = "tp") + deep(review_scores_rating, reviews_per_month), scale = ~1 ), list_of_deep_models = list(deep = deep_model) ) ## ----------------------------------------------------------------------------------------------------- mod %>% fit( epochs = 100, verbose = FALSE, view_metrics = FALSE, validation_split = 0.2 ) ## ----------------------------------------------------------------------------------------------------- fitted_vals <- mod %>% fitted() cor(fitted_vals, y) ## ---- cv_plot----------------------------------------------------------------------------------------- mod_cv <- deepregression( y = y, data = airbnb, list_of_formulas = list( location = ~ 1 + beds + s(accommodates, bs = "ps") + s(days_since_last_review, bs = "tp") + deep(review_scores_rating, reviews_per_month), scale = ~1 ), list_of_deep_models = list(deep = deep_model) ) res_cv <- mod_cv %>% cv( plot = FALSE, cv_folds = 3, epochs = 100 ) plot_cv(res_cv) ## ----------------------------------------------------------------------------------------------------- coef(mod, type = "linear") ## ---- plot_mod, fig.height = 4, fig.width = 8------------------------------------------------------------- par(mfrow = c(1,2)) plot(mod) ## ----------------------------------------------------------------------------------------------------- dist <- mod %>% get_distribution() str(dist, 1) ## ---- quant_mean_plot--------------------------------------------------------------------------------- first_obs_airbnb <- as.data.frame(airbnb)[1, , drop = FALSE] dist1 <- mod %>% get_distribution(first_obs_airbnb) meanval <- mod %>% mean(first_obs_airbnb) %>% c q05 <- mod %>% quant(data = first_obs_airbnb, probs = 0.05) %>% c q95 <- mod %>% quant(data = first_obs_airbnb, probs = 0.95) %>% c xseq <- seq(q05 - 1, q95 + 1, l = 1000) plot(xseq, sapply(xseq, function(x) c(as.matrix( dist1$tensor_distribution$prob(x)))), type = "l", ylab = "density(price)", xlab = "price") abline(v = c(q05, meanval, q95), col = "red", lty = 2) ## ----------------------------------------------------------------------------------------------------- str(mod_simple$model$trainable_weights, 1) ## ----------------------------------------------------------------------------------------------------- lambda <- 0.5 addpen <- function(x) lambda * tf$reduce_sum(tf$abs(mod_simple$model$trainable_weights[[1]])) mod_simple_pen <- deepregression( y = y, data = airbnb, list_of_formulas = list_of_formulas, list_of_deep_models = NULL, additional_penalty = addpen ) ## ---- smooth_plot_comparison, fig.height = 3, fig.width = 8----------------------------------------------- form_df_3 <- list(loc = ~ 1 + s(days_since_last_review, df = 3), scale = ~ 1) form_df_6 <- list(loc = ~ 1 + s(days_since_last_review, df = 6), scale = ~ 1) form_df_10 <- list(loc = ~ 1 + s(days_since_last_review, df = 10), scale = ~ 1) args <- list(y = y, data = airbnb, list_of_deep_models = NULL) mod_df_low <- do.call("deepregression", c(args, list(list_of_formulas = form_df_3))) mod_df_med <- do.call("deepregression", c(args, list(list_of_formulas = form_df_6))) mod_df_max <- do.call("deepregression", c(args, list(list_of_formulas = form_df_10))) mod_df_low %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE) mod_df_med %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE) mod_df_max %>% fit(epochs = 1000, early_stopping = TRUE, verbose = FALSE) par(mfrow = c(1,3)) plot(mod_df_low) plot(mod_df_med) plot(mod_df_max) ## ----------------------------------------------------------------------------------------------------- embd_mod <- function(x) x %>% layer_embedding(input_dim = 1000, output_dim = 100) %>% layer_lambda(f = function(x) k_mean(x, axis = 2)) %>% layer_dense(20, activation = "tanh") %>% layer_dropout(0.3) %>% layer_dense(2) ## ----------------------------------------------------------------------------------------------------- mod <- deepregression( y = y, list_of_formulas = list( location = ~ 1, scale = ~ 1, both = ~ 0 + embd_mod(texts) ), list_of_deep_models = list(embd_mod = embd_mod), mapping = list(1,2,1:2), data = airbnb ) ## ----------------------------------------------------------------------------------------------------- embd_mod <- function(x) x %>% layer_embedding(input_dim = 1000, output_dim = 100) %>% layer_lambda(f = function(x) k_mean(x, axis = 2)) %>% layer_dense(20, activation = "tanh") %>% layer_dropout(0.3) %>% layer_dense(1) form_lists <- list( location = ~ 1 + embd_mod(texts), scale = ~ 1 ) mod <- deepregression( y = y, list_of_formulas = form_lists, list_of_deep_models = list(embd_mod = embd_mod), data = airbnb ) ## ----------------------------------------------------------------------------------------------------- set.seed(42) n <- 1000 toyX <- rnorm(n) toyY <- 2 * toyX + rnorm(n) ## ----------------------------------------------------------------------------------------------------- deep_model <- function(x) { x %>% layer_dense(units = 100, activation = "relu", use_bias = FALSE) %>% layer_dropout(rate = 0.2) %>% layer_dense(units = 50, activation = "relu") %>% layer_dropout(rate = 0.2) %>% layer_dense(units = 1, activation = "linear") } ## ----------------------------------------------------------------------------------------------------- forms <- list(loc = ~ -1 + toyX + deep_model(toyX), scale = ~ 1) args <- list( y = toyY, data = data.frame(toyX = toyX), list_of_formulas = forms, list_of_deep_models = list(deep_model = deep_model) ) w_oz <- orthog_control(orthogonalize = TRUE) wo_oz <- orthog_control(orthogonalize = FALSE) mod_w_oz <- do.call("deepregression", c(args, list(orthog_options = w_oz))) mod_wo_oz <- do.call("deepregression", c(args, list(orthog_options = wo_oz))) mod_w_oz %>% fit(epochs = 1000, early_stopping = TRUE, batch_size = 50, verbose = FALSE) mod_wo_oz %>% fit(epochs = 1000, early_stopping = TRUE, batch_size = 50, verbose = FALSE) cbind( with = c(coef(mod_w_oz, which_param = 1)[[1]]), without = c(coef(mod_wo_oz, which_param = 1)[[1]]), linmod = coef(lm(toyY ~ 0 + toyX)) ) create_family(tfd_dist = tfd_normal, trafo_list = list( function(x) tf$math$reciprocal(x), function(x) tf$math$square(x) ) ) ## ----------------------------------------------------------------------------------------------------- function(x){ do.call(your_tfd_dist, lapply(1:ncol(x)[[1]], function(i) your_trafo_list_on_inputs[[i]]( x[,i,drop = FALSE]) ) ) } ## ----------------------------------------------------------------------------------------------------- toyXinDisguise <- toyX ## ----------------------------------------------------------------------------------------------------- form_known <- list(loc = ~ -1 + toyX + deep_model(toyX), scale = ~ 1) form_unknown <- list(loc = ~ -1 + toyX + deep_model(toyXinDisguise), scale = ~ 1) form_manual <- list(loc = ~ -1 + toyX + deep_model(toyXinDisguise) %OZ% (toyXinDisguise), scale = ~ 1) args <- list( y = toyY, data = data.frame(toyX = toyX, toyXinDisguise = toyXinDisguise), list_of_deep_models = list(deep_model = deep_model) ) mod_known <- do.call("deepregression", c(args, list(list_of_formulas = form_known))) mod_unknown <- do.call("deepregression", c(args, list(list_of_formulas = form_unknown))) mod_manual <- do.call("deepregression", c(args, list(list_of_formulas = form_manual))) mod_known %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE) mod_unknown %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE) mod_manual %>% fit(epochs = 1000, early_stopping = FALSE, verbose = FALSE) cbind( known = coef(mod_known, which_param = 1)[[1]], unknown = coef(mod_unknown, which_param = 1)[[1]], manual = coef(mod_manual, which_param = 1)[[1]] ) ## ----------------------------------------------------------------------------------------------------- airbnb$image <- paste0(getwd(), "/32/", airbnb$id, ".jpg") ## ----------------------------------------------------------------------------------------------------- cnn_block <- function(filters, kernel_size, pool_size, rate, input_shape = NULL){ function(x){ x %>% layer_conv_2d(filters, kernel_size, padding = "same", input_shape = input_shape) %>% layer_activation(activation = "relu") %>% layer_batch_normalization() %>% layer_max_pooling_2d(pool_size = pool_size) %>% layer_dropout(rate = rate) } } ## ----------------------------------------------------------------------------------------------------- cnn <- cnn_block(filters = 16, kernel_size = c(3,3), pool_size = c(3,3), rate = 0.25, shape(200, 200, 3)) deep_model_cnn <- function(x){ x %>% cnn() %>% layer_flatten() %>% layer_dense(32) %>% layer_activation(activation = "relu") %>% layer_batch_normalization() %>% layer_dropout(rate = 0.5) %>% layer_dense(1) } ## ----------------------------------------------------------------------------------------------------- mod_cnn <- deepregression( y = y, list_of_formulas = list( ~1 + room_type + bedrooms + beds + deep_model_cnn(image), ~1 + room_type), data = airbnb, list_of_deep_models = list(deep_model_cnn = list(deep_model_cnn, c(200,200,3))), optimizer = optimizer_adam(learning_rate = 0.0001) ) ## ---- eval = FALSE-------------------------------------------------------------------------------------- # mod_cnn %>% fit( # epochs = 100, # early_stopping = TRUE, # patience = 5, # verbose = FALSE) ## ----------------------------------------------------------------------------------------------------- coef(mod_cnn) ## ----------------------------------------------------------------------------------------------------- mod <- deepregression( y = y, data = airbnb, list_of_formulas = list( location = ~ 1 + s(longitude), scale = ~ 1 + s(longitude) ), list_of_deep_models = NULL, optimizer = optimizer_adam(learning_rate = 0.1, decay = 1e-4) ) ens <- ensemble(mod, n_ensemble = 5, epochs = 1000, early_stopping = TRUE, validation_split = 0.2) ## ----------------------------------------------------------------------------------------------------- mems <- deepregression:::.call_for_all_members(ens, get_distribution) ensd <- get_ensemble_distribution(ens) obs_idx <- 1 ys <- seq(min(y), max(y), length.out = 1e3) mem_preds <- do.call("cbind", lapply(mems, function(x) { tfd_prob(x$tensor_distribution, ys)$numpy()[obs_idx, ] })) ens_preds <- tfd_prob(ensd, ys)$numpy()[obs_idx, ] avg_nll <- mean(unlist(lapply(mems, function(x) - mean(as.matrix(tfd_log_prob(x, y)))))) ens_nll <- - mean(as.matrix(tfd_log_prob(ensd, y))) # pdat <- data.frame(y = ys, preds = mem_preds, obs = y[obs_idx]) # write.csv(pdat, "ens-dens.csv", row.names = FALSE, quote = FALSE) # pdf("ens-dens.pdf", height = 4, width = 6) matplot(ys, mem_preds, type = "l", lty = 2, col = "gray80", xlab = "log-price", ylab = "predicted density", las = 1, lwd = 1.3) abline(v = y[obs_idx], col = "cornflowerblue") lines(ys, ens_preds, lwd = 1.3) legend("topright", c("Individual prediction", "Ensemble prediction", "Observation"), col = c("gray80", "black", "cornflowerblue"), lty = c(2, 1, 1), lwd = 2, bty = "n") # legend("right", paste(c("Average NLL:", "Ensemble NLL:"), # round(c(avg_nll, ens_nll), 3)), bty = "n") # dev.off() ## ----------------------------------------------------------------------------------------------------- lons <- seq(min(airbnb$longitude), max(airbnb$longitude), length.out = 1e3) nd <- data.frame(longitude = lons) mems <- deepregression:::.call_for_all_members(ens, get_distribution, data = nd) ensd <- get_ensemble_distribution(ens, data = nd) mem_preds <- do.call("cbind", lapply(mems, function(x) as.matrix(tfd_mean(x)))) ens_preds <- as.matrix(tfd_mean(ensd)) alea <- as.matrix(tfd_stddev(ensd)) unc <- apply(mem_preds, 1, sd) # pdat <- data.frame(lons = lons, preds = mem_preds, ens_preds = ens_preds, unc = unc) # write.csv(pdat, "ens-pred.csv", row.names = FALSE, quote = FALSE) # # lons <- pdat$lons # mem_preds <- pdat[, 2:6] # ens_preds <- pdat$ens_preds # pdf("ens-pred.pdf", height = 4, width = 6) matplot(lons, mem_preds, lty = 2, col = "gray50", type = "l", las = 1, xlab = "longitude", ylab = "Expected log-price", lwd = 1.3) lines(lons, ens_preds, lwd = 1.3) lines(lons, ens_preds - unc, lwd = 1.3) lines(lons, ens_preds + unc, lwd = 1.3) polygon(c(lons, rev(lons)), c(ens_preds - unc, rev(ens_preds + unc)), col = rgb(.1, .1, .1, .1), border = NA) # points(airbnb$longitude, y, pch = 20, cex = 0.5, col = rgb(.1, .1, .1, .1)) legend("topright", c("Individual prediction", "Ensemble prediction"), col = c("gray50", "black"), lty = c(2, 1), lwd = 2, bty = "n") # dev.off() ## ----------------------------------------------------------------------------------------------------- mod <- deepregression( y = y, data = airbnb, list_of_formulas = list( location = ~ 1 + s(longitude), scale = ~ 1 + s(longitude) ), list_of_deep_models = NULL, optimizer = optimizer_adam(learning_rate = 0.1, decay = 1e-4) ) ens <- ensemble(mod, n_ensemble = 5, epochs = 1000, early_stopping = TRUE, validation_split = 0.2) ## ----------------------------------------------------------------------------------------------------- deep_model <- function(x) { x %>% layer_dense(units = 5, activation = "relu", use_bias = FALSE) %>% layer_dense(units = 3, activation = "linear", name = "penultimate_layer") %>% layer_dense(units = 1, activation = "linear") } ## ----------------------------------------------------------------------------------------------------- mod <- deepregression( y = y, data = airbnb, list_of_formulas = list( location = ~ 1 + beds + s(accommodates, bs = "ps") + s(days_since_last_review, bs = "tp") + deep(review_scores_rating, reviews_per_month), scale = ~1 ), list_of_deep_models = list(deep = deep_model) ) ## ----------------------------------------------------------------------------------------------------- mod %>% fit( epochs = 100, verbose = FALSE, view_metrics = FALSE, validation_split = 0.2 ) ## ----------------------------------------------------------------------------------------------------- intermediate_mod <- keras_model(mod$model$input, mod$model$get_layer( name = "penultimate_layer")$output ) newdata_processed <- deepregression:::prepare_newdata( mod$init_params$parsed_formulas_contents, airbnb, gamdata = mod$init_params$gamdata$data_trafos ) tilde_u <- as.data.frame(intermediate_mod$predict(newdata_processed)) str(tilde_u, 1) ## ----------------------------------------------------------------------------------------------------- gam_mod <- gam(log(price) ~ 1 + beds + s(accommodates, bs = "ps") + s(days_since_last_review, bs = "tp") + V1 + V2 + V3, data = cbind(as.data.frame(airbnb), tilde_u) ) plot(gam_mod, pages = 1) sessionInfo()