###################################################
### Sec.3.1 Logistic regression
###################################################
titanic <- as.data.frame(Titanic)
titanic

### generate raw data from frequency

library("bbl")
titanic_raw <- freq2raw(data = titanic, freq = Freq)
head(titanic_raw)
summary(titanic_raw)

### fit glm model

gfit0 <- glm(Survived ~ Class + Sex + Age, family = binomial(), 
  data = titanic, weights = Freq)
coef(summary(gfit0))

### fit glm with interactions

gfit1 <- glm(Survived ~ (Class + Sex + Age)^2, family = binomial(), 
  data = titanic, weights = Freq)
coef(summary(gfit1))

### divide sample into training and validation sets

set.seed(159)
nsample <- NROW(titanic_raw)
flag <- rep(TRUE, nsample)
flag[sample(nsample, nsample/2)] <- FALSE
dtrain <- titanic_raw[flag, ]
dtest <- titanic_raw[!flag, ]

### fit glm into training set

gfit2 <- glm(Survived ~ Class * Sex + Sex * Age, family = binomial(), 
  data = dtrain)
prl <- predict(gfit2, newdata = dtest)
yhat <- ifelse(prl > 0, "Yes", "No")
mean(yhat == dtest$Survived)
gauc <- pROC::roc(response = dtest$Survived, predictor = prl, 
  direction = "<")$auc
gauc

### fit glmnet

if (!require("glmnet")) {
    install.packages("glmnet")
}    
library("glmnet")
xdat <- model.matrix(~ Class + Sex + Age, data = dtrain)[, -1]
y <- dtrain[, 4]
gnet <- cv.glmnet(x = xdat, y = y, family = "binomial", alpha = 1, 
  nfolds = 5, type.measure = "auc")
plot(gnet)

head(xdat)

###################################################
### Sec.3.2 Boltmann Bayes learning

### Non-interacting model (naive Bayes)
bfit0 <- bbl(Survived ~ Class + Sex + Age, data = titanic, weights = Freq, 
  prior.count = 0)

### print method

bfit0

### summary method
summary(bfit0)

### comparison of Bayesian- and regresion-derived coefficients

cb0 <- coef(bfit0)
beta <- list(Class = cb0$h$Yes$Class - cb0$h$No$Class, 
  Sex = cb0$h$Yes$Sex - cb0$h$No$Sex, Age = cb0$h$Yes$Age - cb0$h$No$Age)
unlist(beta)
coef(summary(gfit0))[, "Estimate"]

### interacting model fit

bfit <- bbl(Survived ~ Class * Sex + Sex * Age, data = titanic, 
  weights = Freq)
bfit

### plot method

oldpar <- par(mar = c(6, 4.5, 3, 4), tck = -0.05, cex.axis = 0.8)
plot(bfit)
par(oldpar)

### fit on training set and predict on test set

bfit2 <- bbl(Survived ~ Class * Sex + Sex * Age, data = dtrain)
pr <- predict(bfit2, newdata = dtest, type = "prob")
head(pr)
auc <- pROC::roc(response = dtest$Survived, predictor = pr[, 2], 
  direction = "<")$auc
auc

### cross-validation to determine penalizer

cv <- crossVal(Survived ~ .^2, data = dtrain, method = "pseudo", 
  lambda = 10^seq(-5, -2, 0.2), verbose = 0)
cv

### plot method applied to bbl.cv object

plot(cv, mar = c(4, 4, 3, 3), tck = -0.04, bty = "n")

### predict using optimal conditon determined

model <- bbl(Survived ~ .^2, data = dtrain, lambda = cv$regstar)
pr2 <- predict(model, newdata = dtest)
bscore <- mean(dtest$Survived == pr2$yhat)
bscore
bauc <- pROC::roc(response = dtest$Survived, predictor = pr2[, 2], 
  direction = "<")$auc
bauc

### estimate most probable configurations in each response group

map <- mcSample(bfit, nstep = 1000, progress.bar = FALSE)
map

###################################################
### Sec.3.3 Simulated data

### generate random parameters

predictors <- list()
m <- 5
L <- 3
for (i in 1:m) predictors[[i]] <- seq(0, L-1)
par <- randompar(predictors)
names(par)

### generate random samples

xi <- sample_xi(nsample = 10000, predictors = predictors, h = par$h, 
  J = par$J, code_out = TRUE)
head(xi)

### fit using mlestimate for single response group

fit <- mlestimate(xi = xi, method = "pseudo", lambda = 0)

### compare inferred and true parameters

oldpar <- par(mar = c(4, 4, 1, 2), lwd = 0.5, cex.axis = 0.8, 
  cex.lab = 1.0, mgp = c(2.2 , 0.9, 0), tck = -0.03)
range <- range(par$h, par$J, fit$h, fit$J)
plot(x = unlist(par$h), y = unlist(fit$h), bg = "cornflowerblue", 
  xlim = range, ylim = range, pch = 21, cex = 0.8, xlab = "True", 
  ylab = "Inferred", lwd = 0.7, xaxt = "n", yaxt = "n", bty = "n")
axis(side = 1, at = seq(-1.5, 1.5, 0.5), lwd = 0.5, las = 1)
axis(side = 2, at = seq(-1.5, 1.5, 0.5), lwd = 0.5, las = 1)
segments(x0 = -1, x1 = 1, y0 = -1, y1 = 1, lty = 2, lwd = 0.7)
points(x = unlist(par$J), y = unlist(fit$J), pch = 24, bg = "orange", 
  cex = 0.8, lwd = 0.7)
legend(x = 0.5, y = -0.5, legend = expression(italic(h), italic(J)), 
  cex = 0.8, pch = c(21, 24), pt.bg = c("cornflowerblue", "orange"))
par(oldpar)

### simulate full binary response data with four levels

nt <- c("a", "c", "g", "t")
set.seed(135)
for (i in 1:m) predictors[[i]] <- nt
names(predictors) <- paste0("v", 1:m)
par <- list()
par[[1]] <- randompar(predictors)
par[[2]] <- randompar(predictors, h0 = 0.1, J0 = 0.1)
dat <- randomsamp(predictors, response = c("ctrl", "case"), par = par, 
  nsample = 1000)

### perform cross-validation to optimize eps

cv <- crossVal(y ~ .^2, data = dat, method = "mf", eps = seq(0, 1, 0.1), 
  verbose = 0)
cv

### select three fitted models with different epsilon

fit <- list()
eps <- c(0.2, 0.7, 1.0)
for (i in seq_along(eps))
  fit[[i]] <- bbl(y ~ .^2, data = dat, method = "mf", eps = eps[i], 
    verbose = 0)

### compare inferred and true parameters from three fitted models

oldpar <- par(mfrow = c(2, 2), mar = c(4, 4, 2, 2), lwd = 0.5, 
  cex.axis = 0.8, cex.lab = 0.9, mgp = c(2.2, 0.8, 0), tck = -0.03, 
  las = 1)
estar <- cv$regstar
plot(cv, xlab = expression(epsilon), ylab = "AUC", lwd = 0.7, cex = 0.7, 
  bty = "n", log = "")
segments(x0 = estar, x1 = estar, y0 = 0, y1 = cv$maxscore, lty = 2, 
  lwd = 0.5, col = "red")
title(adj = 0, cex.main = 1.2, font = 2, main = "a")

for (i in 1:3) {
  plot(x = c(unlist(par[[2]]$h), unlist(par[[1]]$h)), 
    y = unlist(coef(fit[[i]])$h), bg = "cornflowerblue", 
    xlim = c(-1.5, 1.5), ylim = c(-1.5, 1.5), pch = 21, cex = 0.7, 
    xlab = "True", ylab = "Inferred", lwd = 0.7, xaxt = "n", yaxt = "n", 
    bty = "n")
  axis(side = 1, at = seq(-1.5, 1.5, 0.5), lwd = 0.5, las = 1)
  axis(side = 2, at = seq(-1.5, 1.5, 0.5), lwd = 0.5, las = 1)
  segments(x0 = -2, x1 = 2 , y0 = -2, y1 = 2, lty = 2, lwd = 0.7)
  points(x = c(unlist(par[[2]]$J), unlist(par[[1]]$J)), 
    y = unlist(coef(fit[[i]])$J), pch = 24, bg = "orange", cex = 0.7, 
    lwd = 0.7)
  if (i == 1) legend(x = 0.5, y = -0.5, legend = expression(italic(h), 
    italic(J)), cex = 0.8, pch = c(21, 24), 
    pt.bg = c("cornflowerblue", "orange"))
  title(adj = 0, main = letters[i + 1], cex.main = 1.1, font = 2)
  mtext(side = 3, line = 1.0, cex = 0.8, bquote(epsilon == .(eps[i])), 
    adj = 0.5)
}
par(oldpar)

###################################################
### Sec.3.4 Genetic code

set.seed(351)
n <- 2000
dat <- data.frame(b1 = sample(nt, size = n, replace = TRUE), 
  b2 = sample(nt, size = n, replace = TRUE), 
  b3 = sample(nt, size = n, replace = TRUE))
head(dat)

### use package Biostrings to translate codons into amino acids

if (!require("Biostrings")) {
  if (!require("BiocManager"))
    install.packages("BiocManager")
  BiocManager::install("Biostrings")
}
aa <- Biostrings::DNAString(paste(t(dat), collapse = ""))
aa
aa <- strsplit(as.character(Biostrings::translate(aa)), split = "")[[1]]
xdat <- cbind(data.frame(aa = aa), dat)
head(xdat)

### perform cross-validation to optimize model

cv <- crossVal(aa ~ .^2, data = xdat, lambda = 10^seq(-3, 1, 0.5), 
  verbose = 0)
cv

### test prediction on panel of all possible codons

panel <- expand.grid(b1 = nt, b2 = nt, b3 = nt)
head(panel)
dim(panel)
p <- predict(cv, panel)
ap <- Biostrings::DNAString(paste(t(panel), collapse = ""))
ap <- strsplit(as.character(Biostrings::translate(ap)), split = "")[[1]]
accuracy <- mean(ap == p$yhat)
accuracy

###################################################
### Sec.3.5 Image data

### read image data and cross-validate (WARNING: ~8 min. to run)

dat0 <- read.csv(system.file(file.path("extdata", "mnist_train.csv"),
                             package = "bbl"))
dat <- removeConst(dat0)
dat[1:5, 1:10]
cv <- crossVal(y ~ .^2, data = dat, method = "mf", eps = 0.05)

### re-fit using optimal eps and predict (WARNING: ~3 min. to run)

mnist <- bbl(y ~ .^2, data = dat, method = "mf", eps = 0.05)
dtest <- read.csv(system.file(file.path("extdata", "mnist_test.csv"),
                              package = "bbl"))
dtest <- dtest[, colnames(dtest) %in% colnames(dat)]
pr <- predict(mnist, newdata = dtest[, -1], progress.bar = TRUE)
accuracy <- mean(pr$yhat == dtest$y)
accuracy

### estimate most probable configurations in each response group
### WARNING(: ~12 min. to run)

mnist_map <- mcSample(mnist, nstep = 20, progress.bar = TRUE)
oldpar <- par(mfrow = c(2, 5), mar = c(1, 1, 1, 1))
xvar <- colnames(dat0[, -1])
xmap <- apply(mnist_map$xmax, 1:2, as.numeric)
xf <- matrix(0, nrow = length(xvar), ncol = 10)
rownames(xf) <- xvar
for (i in 1:10) xf[rownames(xmap), i] <- xmap[, i]
for (i in 1:10) {
  mat <- matrix(t(xf[, i]), nrow = 28, ncol = 28)
  image(x = 1:28, y = 1:28, z = mat[, 28:1], col = c("white", "black"), 
        xaxt = "n", yaxt = "n", xlab = "", ylab = "")
}
par(oldpar)

###################################################
### Cross-validation of bbl inference on MNIST data
### WARNING: ~10 hr to run

eps <- c(c(1e-4, 2e-4, 5e-4, 1e-3, 2e-3, 5e-3, 1e-2, 2e-2, 5e-2), 
         seq(0.1, 0.9, 0.1))
cv6 <- crossVal(y ~ .^2, data = dat, method = "mf", eps = eps, verbose = 4)
if (!dir.exists("fig6")) dir.create("fig6")

oldpar <- par(las = 1, mar = c(4, 4, 4, 2))
plot(cv6$cvframe, type = "b", log = "x", ylim = c(0, 1), ylab = "Score", xlab = bquote(epsilon), 
  bty = "n", xaxt = "n")
axis(side = 1, at = c(1e-4, 1e-3, 1e-2, 1e-1, 1), label = expression(10^{-4}, 10^{-3}, 10^{-2}, 10^{-1}, 1))
par(oldpar)

###################################################
### Table 1. Performance comparision of bbl and other models on MNIST data
### WARNING: ~5 hr to run

eps <- 0.02 # optimal epsilon
dtrain <- readRDS(file.path("table1-data", "mnist_train_binary.rds"))
dtrain <- removeConst(dtrain)
nb <- bbl(y ~ .^2, data = dtrain, method = "mf", eps = 0)  # naive Bayes
mnist <- bbl(y ~ .^2, data = dtrain, method = "mf", eps = eps)

dtest <- readRDS(file.path("table1-data", "mnist_test_binary.rds"))
dtest <- dtest[, colnames(dtest) %in% colnames(dtrain)]
pred0 <- predict(nb, newdata = dtest, progress.bar = TRUE)
score0 <- mean(pred0$yhat == dtest$y)
cat("NB score = ", score0, "\n", sep = "")

pred <- predict(mnist, newdata = dtest, progress.bar = TRUE)
score <- mean(pred$yhat == dtest$y)
cat("eps = 0.02 score = ", score, "\n", sep = "")

###################################################
### Sec.3.6 Transcription factor binding site

### read tf binding set data

seq <- readFasta(system.file(file.path("extdata", "MA0014.3.fasta"),
                             package = "bbl"))
head(seq)
dim(seq)

### generate mutated data sets

set.seed(561)
nsample <- NROW(seq)
m <- NCOL(seq)
nt <- c("A", "C", "G", "T")
ctrl <- as.matrix(seq)
for (k in seq_len(nsample))
  ctrl[k, sample(m, 3)] <- sample(nt, 3, replace = TRUE)
colnames(ctrl) <- 1:m
data <- rbind(data.frame(y = rep("Binding", nsample), seq), 
  data.frame(y = rep("Non-binding", nsample), ctrl))
data <- data[sample(NROW(data)), ]

### perform cross-validation (WARNING: ~1 min. to run)

ps <- crossVal(y ~ .^2, data = data, method = "pseudo", lambda = 10^seq(-2, -1, 0.2), verbose = 0)
ps
mf <- crossVal(y ~ .^2, data = data, method = "mf", eps = seq(0.1, 0.4, 0.1), verbose = 0)
mf

###################################################
### Cross-validation with TF binding site data
### WARNING: ~7 min. to run

cvps <- crossVal(y ~ .^2, data = data, method = "pseudo", lambda = 10^seq(1, -4, -0.2))
cvmf <- crossVal(y ~ .^2, data = data, method = "mf", eps = seq(0, 0.95, 0.05))
oldpar <- par(mar = c(4, 4, 4, 1), tck = -0.04, mgp = c(2.2, 0.8, 0), cex.axis = 0.8, cex.lab = 1.0, 
              mfrow = c(1, 2), las = 1, mgp = c(2.5, 0.9, 0))

plot(cvps, log = "x", bty = "n", pch = 21, cex = 0.5)
title(adj = 0, main = "a", font = 2, cex.main = 1.2)
plot(cvmf, log = "", bty = "n", pch = 21, cex = 0.5)
title(adj = 0, main = "b", font = 2, cex.main = 1.2)

par(oldpar)
