## -------------------------------------------------------------------------------
## R script for reproducing results from paper: 
## "Additive Bayesian Network Modelling with the R Package abn"
##
## -------------------------------------------------------------------------------
##
## Authors: Gilles Kratzer, Reinhard Furrer, with input from Virgilio Rubio
## Date: 12.06.2022
## contact: gilles.kratzer@gmail.com, reinhard.furrer@math.uzh.ch
##
## -------------------------------------------------------------------------------
##
## The variable fullstudy is set to FALSE to reduce the computational load.
## To reproduce the results of the paper, set `fullstudy <- TRUE` before
## launching the script.
##
## The script uses the bug file  `./input/model8vPois.bug`
## Intermediate results are stored in the directory `./Rdata/`
##
## As provided, the script does not save the figures, tables, and other code
## elements. To do so, you need to uncomment the lines consisting of - 
## essentially - calls like `pdf(...)`, `dev.off(...)`, `capture.output(...)`.
## The output is saved in the directory `./output/`, with the name convention 
## for figures ("FIG_X.pdf"), tables ("TAB_X.txt"), and code output 
## ("CODE_X.txt"), where X is a description of the object. 

## Getting started:  Create / clean output structure
ptm <- proc.time()

## In case a fresh output directory is needed:
# if (dir.exists(paths = "output/")) {
#   do.call(file.remove, list(list.files("output/", full.names = TRUE)))
# } else {
#   dir.create(path = "output/")
# }
if (dir.exists(paths = "Rdata/")) {
  do.call(file.remove, list(list.files("Rdata/", full.names = TRUE)))
} else {
  dir.create(path = "Rdata/")
}


## ---------------------------------------------------------------------
## loading libraries:
library("Rgraphviz")  ## visualizations
library("ggplot2")    ## various functions
library("fields")     ## function `image.plot`
library("rjags")      ## link to JAGS engine
library("reshape2")   ## function `melt`, ...
library("akima")      ## function `bilinear.grid` 
library("doParallel") ## to use `foreach`
library("bnlearn")    ## `asia` dataset 
library("knitr")      ## function `kable`
library("abn")        ## we need version abn 2.7 or higher
if (compareVersion(as.character(packageVersion("abn")), "2.7") < 0)
    stop("Newer version of 'abn' is required.")


if (!exists("verbose")) {
    verbose <- FALSE  ## Passed to all functions.
}    
if (!verbose) options(jags.pb = "none")

# fullstudy <- TRUE
if (!exists("fullstudy")) {
    fullstudy <- FALSE
}

if (fullstudy) {
    nBootstrap <- 2500
    codaThinning <- 10
    Rrep <- 100
} else {
    nBootstrap <- 250
    codaThinning <- 1
    Rrep <- 10
}

ncores <- 0 ## number of cores within the abn functions. 
registerDoParallel(cores = min(detectCores(), 10) - 1) ## Number of cores to use (all but 1), except multi machine


## ---------------------------------------------------------------------
## Section 1: Introduction ---------------------------------------------
## ---------------------------------------------------------------------

## Chunk 1: loading data 
library("abn")
data("asia", package = "bnlearn")
colnames(asia) <- c("Asia", "Smoking", "Tuberculosis", "LungCancer", 
    "Bronchitis", "Either", "XRay", "Dyspnea")


## Chunk 2: first dag structure
distrib <- as.list( rep("binomial", 8))
names(distrib) <- names(asia)
mycache <- buildScoreCache(data.df = asia, data.dist = distrib, 
    max.parents = 4, verbose = verbose)
mp.dag <- mostProbable(score.cache = mycache, verbose = verbose)


## Chunk 3: fitting and plotting
fabn <- fitAbn(object = mp.dag, verbose = verbose)
plot(fabn, fitted.values = NULL)

## Output: Construct left panel of Figure 1:
# pdf("output/FIG_dag_asia_wo_constraints.pdf", width = 4.5)
plot(fabn, fitted.values = NULL, node.fontsize = 14)
# dev.off()


## Chunk 4: constraint fitting and plotting
mycache <- buildScoreCache(data.df = asia, data.dist = distrib, 
    max.parents = 4, dag.retained = ~ LungCancer|Smoking)
mp.dag <- mostProbable(score.cache = mycache, verbose = verbose)
fabn <- fitAbn(object = mp.dag)


## Chunk 5: fabn object
fabn

## Output: 
# capture.output(fabn, file = "output/CODE_fabn_asia.txt")

## Output: Construct left panel of Figure 1:
# pdf("output/FIG_dag_asia_w_constraints.pdf", width = 4.5)
plotAbn(dag = mp.dag$dag, data.dists = distrib, node.fillcolor = "lightyellow", node.fontsize = 14)
# dev.off()


## Values in text: example, node Smoking -> LungCancer
round(fabn$modes$LungCancer[2], 2)
round(exp(fabn$modes$LungCancer[2]), 2)

## time comparison with bnlearn 
system.time({
    mycache1 <- buildScoreCache(data.df = asia, data.dist = distrib, 
        max.parents = 4, dag.retained = ~ LungCancer | Smoking)
    mp.dag1 <- mostProbable(score.cache = mycache, verbose = verbose)
    fabn1 <- fitAbn(object = mp.dag1)
})
system.time({
    fbn <- bn.fit(hc(asia, whitelist = data.frame(from = c("Smoking"),
        to = c("LungCancer"))), asia, method = "bayes")
})

## ---------------------------------------------------------------------
## Section 4: Case study: adg
## ---------------------------------------------------------------------

## Chunk 1: loading adg data
library("abn")
dim(adg)
head(adg, 2)
(n.obs <- nrow(adg))


## output: Figure 3: Descriptive distributions of the variables.
# pdf("output/FIG_descr_distr_case_study.pdf", width = 7, height = 4.5)
par(mfrow = c(3, 3), mar = c(2, 4, 1.5, 1))
xx <- barplot(table(adg$AR) / n.obs, ylim = c(0, 1), main = "AR", ylab = "Proportion")
text(x = xx, y = table(adg$AR) / n.obs, label = table(adg$AR), pos = 3, cex = 1.2, col = 4)
xx <- barplot(table(adg$pneumS) / n.obs, ylim = c(0, 1.04), main = "pneumS", ylab = "Proportion")
text(x = xx, y = table(adg$pneumS) / n.obs, label = table(adg$pneumS), pos = 3, cex = 1.1, col = 4)
xx <- barplot(table(adg$female) / n.obs, ylim = c(0, 1.04), main = "female", ylab = "Proportion")
text(x = xx, y = table(adg$female) / n.obs, label = table(adg$female), pos = 3, cex = 1.2, col = 4)
xx <- barplot(table(adg$livdam) / n.obs, ylim = c(0, 1.04), main = "livdam", ylab = "Proportion")
text(x = xx, y = table(adg$livdam) / n.obs, label = table(adg$livdam), pos = 3, cex = 1.2, col = 4)
xx <- barplot(table(adg$eggs) / n.obs, ylim = c(0, 1.04), main = "eggs", ylab = "Proportion")
text(x = xx, y = table(adg$eggs) / n.obs, label = table(adg$eggs), pos = 3, cex = 1.2, col = 4)
hist(adg$wormCount, main = "worms", prob = TRUE, col = "grey", border = "white",
    ylim = c(0, 0.27), breaks = 15)
lines(density(adg$wormCount, bw = 1), lwd = 1.)
hist(adg$age, xlab = "", main = "age", prob = TRUE, col = "grey", border = "white")
lines(density(adg$age), lwd = 1.)
hist(adg$adg, xlab = "", main = "adg", prob = TRUE, col = "grey", border = "white")
lines(density(adg$adg), lwd = 1.)
barplot(table(adg$farm), main = "Farm ID", ylim = c(0, 40), ylab = "Count")
# dev.off()


## Chunk 2: setting-up-dataframe
dists <- list(AR = "binomial", pneumS = "binomial", female = "binomial", 
    livdam = "binomial", eggs = "binomial", wormCount = "poisson",
    age = "gaussian", adg = "gaussian")
df <- adg[, which("farm" !=  names(adg))]
df[, 1:5] <- lapply(df[, 1:5], function(x) factor(x))


## Chunk 3: print banned
banned <- matrix(0, ncol(df), ncol(df))
colnames(banned) <- rownames(banned) <- names(df)
banned["female", which("female" !=  names(df))] <- 1  
banned


## Chunk 4: incremental search
result <- list()
for (i in 1:7) {
  mycache <- buildScoreCache(data.df = df, data.dists = dists, 
    dag.banned = banned, max.parents = i, method = "bayes", verbose = verbose)
  mydag <- mostProbable(score.cache = mycache, verbose = verbose)
  result[[i]] <- fitAbn(object = mydag)
}
result.mlik <- sapply(result, function(x) x$mlik)

## output: Figure 4: Total network log marginal likelihood as a function of the number of parents
# pdf("output/FIG_parents_limit.pdf", width = 6, height = 3.2)
plot(1:7, result.mlik, xlab = "Parent limit", ylab = "Log marginal likelihood", 
     type = "b", col = "red", ylim = range(result.mlik)) 
abline(v = which(result.mlik ==  max(result.mlik))[1], col = "grey", lty = 2)
# dev.off()


## Chunk 5: summary of DAG selected using an exact search with a model complexity of four parents.
max.par <- which(result.mlik ==  max(result.mlik))[1]
fabn <- result[[max.par]]
unlist(infoDag(fabn))

## output: Figure: DAG selected using an exact search with a model complexity of four parents.
# pdf(file = "output/FIG_dag_4p.pdf", width = 5, height = 5)
plotAbn(fabn, node.fontsize = 14)
# dev.off()


## Chunk 6: fit marginal densities over a fixed grid
marg.f <- fitAbn(object = mydag, compute.fixed = TRUE)

## Assign marginal densities (blocks of 2 columns):
AR.p <- do.call(cbind.data.frame, marg.f$marginals[["AR"]])
pneumS.p <- do.call(cbind.data.frame, marg.f$marginals[["pneumS"]])
female.p <- do.call(cbind.data.frame, marg.f$marginals[["female"]])
livdam.p <- do.call(cbind.data.frame, marg.f$marginals[["livdam"]])
eggs.p <- do.call(cbind.data.frame, marg.f$marginals[["eggs"]])
wormCount.p <- do.call(cbind.data.frame, marg.f$marginals[["wormCount"]])
age.p <- do.call(cbind.data.frame, marg.f$marginals[["age"]][1:2])
adg.p <- do.call(cbind.data.frame, marg.f$marginals[["adg"]][1:2])

prec.age.p <- marg.f$marginals[["age"]][[ "age|precision" ]]
prec.adg.p <- marg.f$marginals[["adg"]][[ "adg|precision" ]]


## run model once:
jj <- jags.model(file = "input/model8vPois.bug", data = list(
    "AR.p" = AR.p, "pneumS.p" = pneumS.p, "female.p" = female.p, 
    "livdam.p" = livdam.p, "eggs.p" = eggs.p, "wormCount.p" = wormCount.p, 
    "age.p" = age.p,"prec.age.p" = prec.age.p, "adg.p" = adg.p, 
    "prec.adg.p" = prec.adg.p),
    inits = list(".RNG.name" = "base::Mersenne-Twister", ".RNG.seed" = 122),
    n.chains = 1,  n.adapt = 50000, quiet = !verbose)

## sample data (same size as original) & thinning to reduce autocorrelation:
samp <- coda.samples(jj, c("AR", "pneumS", "female", "livdam", "eggs", 
    "wormCount", "age", "prec.age", "adg", "prec.adg"),
    n.iter = n.obs * codaThinning, thin = codaThinning)

## extract posterior densities and put in a data frame:
post.df <- data.frame(samp[[1]])[, names(dists)]

## output: Figure: simulated versus original data
# pdf("output/FIG_simulated_data_by_bootstrapping.pdf", height = 5, width = 8) 
par(mfrow = c(4, 4), mar = c(2, 4, 1.5, 1))

xx <- barplot(table(adg$AR) / n.obs, ylim = c(0,1), main = "AR - original")
text(x = xx, y = table(adg$AR) / n.obs, label = table(adg$AR), pos = 3, cex = 1.2)
xx <- barplot(table(post.df$AR) / n.obs,  ylim = c(0,1), main = "AR - simulated", 
        col.main = "blue", border = "blue") 
text(x = xx, y = table(post.df$AR) / n.obs, label = table(post.df$AR), pos = 3, cex = 1.2, col = "blue")

xx <- barplot(table(adg$pneumS) / n.obs, ylim = c(0,1), main = "pneumS - original")
text(x = xx, y = table(adg$pneumS) / n.obs, label = table(adg$pneumS), pos = 3, cex = 1.2)
xx <- barplot(table(post.df$pneumS) / n.obs,  ylim = c(0,1), main = "pneumS - simulated", 
        col.main = "blue", border = "blue") 
text(x = xx, y = table(post.df$pneumS) / n.obs, label = table(post.df$pneumS), pos = 3, cex = 1.2, col = "blue")

xx <- barplot(table(adg$female) / n.obs, ylim = c(0,1), main = "female - original")
text(x = xx, y = table(adg$female) / n.obs, label = table(adg$female), pos = 3, cex = 1.2)
xx <- barplot(table(post.df$female) / n.obs,  ylim = c(0,1), main = "female - simulated", 
        col.main = "blue", border = "blue") 
text(x = xx, y = table(post.df$female) / n.obs, label = table(post.df$female), pos = 3, cex = 1.2, col = "blue")

xx <- barplot(table(adg$livdam) / n.obs, ylim = c(0, 1), main = "livdam - original")
text(x = xx, y = table(adg$livdam) / n.obs, label = table(adg$livdam), pos = 3, cex = 1.2)
xx <- barplot(table(post.df$livdam) / n.obs,  ylim = c(0, 1), main = "livdam - simulated", 
        col.main = "blue", border = "blue")
text(x = xx, y = table(post.df$livdam) / n.obs, label = table(post.df$livdam), pos = 3, cex = 1.2, col = "blue")

xx <- barplot(table(adg$eggs) / n.obs, ylim = c(0, 1), main = "eggs - original")
text(x = xx, y = table(adg$eggs) / n.obs, label = table(adg$eggs), pos = 3, cex = 1.2)
xx <- barplot(table(post.df$eggs) / n.obs, ylim = c(0, 1), main = "eggs - simulated", 
        col.main = "blue", border = "blue") 
text(x = xx, y = table(post.df$eggs) / n.obs, label = table(post.df$eggs), pos = 3, cex = 1.2, col = "blue")

hist(df$wormCount, xlab = "", main = "wormCount - original",
     prob = TRUE, col = "grey", border = "white", xlim = c(0, 75), ylim = c(0, 0.27), breaks = 15)
lines(density(df$wormCount, bw = 1), lwd = 1.5)
hist(post.df$wormCount, xlab = "", main = "wormCount - simulated", col.main = "blue", 
     prob = TRUE, col = "grey", border = "white", xlim = c(0, 75), ylim = c(0, 0.27), breaks = 15)
lines(density(post.df$wormCount, bw = 1), lwd = 1.5, col = "blue")

## use centered version of age and adg to compare to bootstrap data
hist(scale(df$age), xlab = "", main = "age - original",
     prob = TRUE,col = "grey",border = "white", xlim = c(-3, 3), ylim = c(0, 0.5), breaks = 13)
lines(density(scale(df$age)), lwd = 1.5)
hist(post.df$age, xlab = "", main = "age - simulated", col.main = "blue",
     prob = TRUE,col = "grey",border = "white", xlim = c(-3, 3), ylim = c(0, 0.5), breaks = 13)
lines(density(post.df$age), lwd = 1.5, col = "blue")

hist(scale(df$adg), xlab = "", main = "adg - original",
     prob = TRUE,col = "grey",border = "white", xlim = c(-4, 4), ylim = c(0, 0.5), breaks = 13)
lines(density(scale(df$adg)), lwd = 1.5)
hist(post.df$adg, xlab = "", main = "adg - simulated", col.main = "blue",
     prob = TRUE,col = "grey",border = "white", xlim = c(-4, 4), ylim = c(0, 0.5), breaks = 13)
lines(density(post.df$adg), lwd = 1.5, col = "blue")
# dev.off()

## -------------------------------------------------------------------
## Section:  Parametric bootstraping
## -------------------------------------------------------------------

## get `nBootstrap` random numbers to set different initial values
set.seed(123)
seeds <- sample(1:100000, nBootstrap)

## Simulate data and run ABN on such dataset
bootout <- foreach(i = 1:length(seeds)) %dopar% {
   jj <- jags.model(file = "input/model8vPois.bug", data = list(
       "AR.p" = AR.p, "pneumS.p" = pneumS.p, "female.p" = female.p,
       "livdam.p" = livdam.p, "eggs.p" = eggs.p, "wormCount.p"  = 
       wormCount.p, "age.p" = age.p,"prec.age.p" = prec.age.p,
       "adg.p" = adg.p, "prec.adg.p" = prec.adg.p),
       inits = list(".RNG.name" = "base::Mersenne-Twister",
       ".RNG.seed" = seeds[i]), n.chains = 1, n.adapt = 50000, quiet = !verbose)

   ## Sample data (same size as original with thinning):
   samp <- coda.samples(jj, c("AR", "pneumS", "female",  "livdam", "eggs",
       "wormCount", "age", "prec.age",  "adg", "prec.adg"),
       n.iter = n.obs * codaThinning, thin = codaThinning)

  ## Build data frame in the same shape as the original one:
  df.boot <- data.frame(samp[[1]])[,names(dists)] ## careful with ordering
  df.boot[, 1:5] <- lapply(df.boot[, 1:5], function(x) factor(x))

  ## Build a cache of all local computations
  mycache <- buildScoreCache(data.df = df.boot, data.dists = dists,
      dag.banned = banned, max.parents = max.par, verbose = verbose)
 
  ## Run an exact search and return DAG only:
  mp.dag <- mostProbable(score.cache = mycache, verbose = verbose)
  mp.dag$dag
  ## or if more information needs to be retrieved later:
  # fabn <- fitAbn(object = mp.dag, verbose = verbose)
  # list(seeds[i], df.boot, mycache, mp.dat, fabn)
  }

save(bootout, file = paste0("Rdata/BootDAGs", nBootstrap, ".RData"))
dags <- bootout ## only dags passed back.


## output: Figure: Histogram of the number of arcs in the bootstrapped searches
# pdf("output/FIG_hist_nb_searches.pdf", height = 3.2, width = 6)
arcs <- sapply(dags, sum)
barplot(table(arcs))
# dev.off()

## Chunk: count arc appeareance
## The function `Reduce()` sums each element of each matrix in the list and stores 
## the result in a new matrix of same size. We keep it as percentage
percdag <- Reduce("+", dags) / length(dags) * 100
trim.dag <- (percdag > 50) * 1
round(percdag, digits = 0)

## output: 
# capture.output(print(round(percdag, digits = 0)), file = "output/CODE_percentage_dag.txt")


## -------------------------------------------------------------------
## Section: Control for clustering 
## -------------------------------------------------------------------

## Marginal densities of the model parameter NOT corrected for random effect
marg.f <- fitAbn(dag = trim.dag, 
    data.df = df, data.dists = dists, compute.fixed = TRUE)

## Adjustments at the regression step. CAUTION: takes several minutes:
df.gr <- adg
df.gr[, c(1:5,9)] <- lapply(df.gr[, c(1:5, 9)], function(x) factor(x))
marg.f.grouped <- fitAbn(dag = trim.dag, data.df = df.gr, data.dists = dists,
    cor.vars = c("AR", "livdam", "eggs", "wormCount", "age", "adg"),
    group.var = "farm", compute.fixed = TRUE)

save(file = "Rdata/margfgrouped.RData", marg.f.grouped, marg.f, trim.dag, df.gr, dists)


## output: Figure: with and without correction and with variance parameter
# pdf("output/FIG_marginal_densities_with_without_correction.pdf", width = 7, height = 4.5)
par(mfrow = c(5, 4), mar = c(2, 2, 1.5, 1))
for(i in 1:length(marg.f$marginals)){
  ## get the marginal for current node, which is a matrix [x, f(x)]
  cur.node <- marg.f$marginals[i][[1]]
  cur.node.gr <- marg.f.grouped$marginals[i][[1]]
  nom1 <- names(marg.f$marginals)[i]
  for(j in 1:length(cur.node) ) {
    nom2 <- names(cur.node)[j]
    cur.param <- cur.node[[j]]
    cur.param.gr <- cur.node.gr[[j]]
    if (grepl("precision", nom2)) {    ## not necessary for variances
      cur.param[, 1] <- 1 / cur.param[, 1]
      cur.param.gr[, 1] <- 1 / cur.param.gr[, 1]
      nom2 <- gsub("precision", "variance", nom2)
    }
    plot(cur.param, type = "l", main = paste(nom1, ":", nom2), cex = 0.7, 
        xlim = range(cur.param[, 1], cur.param.gr[, 1]),
        ylim = c(0, max(cur.param[, 2], cur.param.gr[, 2])))
    lines(cur.param.gr, type = "l", col = "blue")
  }
}
# dev.off()


## Chunk: (last chunk of the paper)
AUC <- function(x)  sum( diff(x[, 1]) * (x[-1, 2] + x[-length(x[, 1]), 2]) / 2)
aucadg <- lapply(marg.f.grouped$marginals, function(x) lapply(x, AUC))
summary(unlist(aucadg))


## extract marginals from non corrected model
marg.dens <- marg.f$marginals[[1]]
for (i in 2:length(marg.f$marginals)) {
  marg.dens <- c(marg.dens, marg.f$marginals[[i]])
}

## extract marginals adjusted for grouped data
marg.dens.grouped <- marg.f.grouped$marginals[[1]]
for (i in 2:length(marg.f.grouped$marginals)) {
  marg.dens.grouped <- c(marg.dens.grouped, marg.f.grouped$marginals[[i]])
}

mat <- matrix(0, nrow = length(marg.dens.grouped), ncol = 3)
rownames(mat) <- names(marg.dens.grouped)
colnames(mat) <- c("2.5Q", "median", "97.5Q")
ignore.me <- union(union(grep("\\(Int", names(marg.dens.grouped)),
    grep("prec", names(marg.dens.grouped))), grep("group.precision", names(marg.dens.grouped))) 
comment <- rep("", length(marg.dens.grouped))
for (i in 1:length(marg.dens.grouped)) {
  tmp <- marg.dens.grouped[[i]]
  tmp2 <- cumsum(tmp[, 2]) / sum(tmp[, 2])
  mat[i, ] <- c(tmp[which(tmp2 > 0.025)[1] - 1, 1], ## -1 to use the value on the left of 2.5%
                tmp[which(tmp2 > 0.5)[1], 1],
                tmp[which(tmp2 > 0.975)[1], 1])
  vec <- mat[i,]

  if (!(i %in% ignore.me) && (vec[1] < 0 && vec[3] > 0)) {
    comment[i] <- "not sig. at 5%"
  }

  ## truncate for printing
  mat[i,] <- as.numeric(formatC(mat[i,], digits = 3, format = "f"))
}

mar <- data.frame(mat[-ignore.me,], check.names = FALSE)
mar[1:7,] <- exp(mar[1:7,])
mar$interpretation <- rep(c("odds ratio", "rate ratio", "correlation"), c(3, 4, 2))

## output: Marginals posterior distribution of the parameter estimates
# capture.output(mar, file = "output/TABLE_marginal_posterior_densities.txt")

## Adjustment at the learning phase
# mycache.l <- buildScoreCache(data.df = df.gr, data.dists = dists,
#     dag.banned = banned, max.parents = 4, group.var = "farm",
#     cor.vars = c("AR", "livdam", "eggs", "wormCount", "age", "adg"))


## -------------------------------------------------------------------
## Section: Accounting for uncertainty
## -------------------------------------------------------------------

## Chunk: print LS
PLS <- linkStrength(dag = trim.dag, data.df = df, data.dists = dists, 
    method = "ls.pc")
rownames(PLS) <- colnames(PLS) <- names(dists)
print(PLS, digits = 3)

## output:
# capture.output(print(PLS, digits = 3), file = "output/CODE_link_strength.txt")

## -------------------------------------------------------------------
## Section: Presentation of the results
## -------------------------------------------------------------------

## output: Figure: Final ABN model.
# pdf("output/FIG_final_ABN_model.pdf", width = 5, height = 5)
PLSplot <- plotAbn(trim.dag, data.dists = dists, edge.strength = PLS, 
    plot = FALSE, fitted.values = marg.f$modes, node.fontsize = 14,
    edge.fontsize = 14, edge.arrowsize = 1.2)
edgelty <- rep(c("dashed", "solid", "dotted"), c(3, 4, 2))
names(edgelty) <- names( edgeRenderInfo(PLSplot, "col"))
edgeRenderInfo(PLSplot) <- list(lty = edgelty)
renderGraph(PLSplot)
# dev.off()


## Chunk: final marginals
mar$PLS <- c(PLS["AR", "age"], PLS["livdam", "eggs"], PLS["eggs", "adg"],
    PLS["wormCount", "AR"], PLS["wormCount", "eggs"], PLS["wormCount", "age"], 
    PLS["wormCount", "adg"], PLS["age", "female"], PLS["adg", "age"])

## output: 
out <- kable(mar, row.names = TRUE, digits = 2, align = "rrrrr", "latex", booktabs = TRUE) 
out
# capture.output(out, file = "output/TABLE_marg_post_distr.txt")


## -------------------------------------------------------------------
## Section 5: Simulation study ---------------------------------------
## -------------------------------------------------------------------

nNode <- 10
dists <- as.list(rep("gaussian", nNode))
names(dists) <- letters[1:nNode]

## defining parameter matrix
data.param <- matrix(data = c(0, 0.2, 0.5, 0, 0, 0, 0, 0.8, 0.9, 0.5,
    0, 0, 0.3, 0.1, 0, 0.8, 0.6, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0,
    0.5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.1, 0, 0, 0, 0, 0, 0.8, 0.2, 0.3,
    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0.3, 0.8, 0, 0.8, 0, 0, 0,
    0.6, 0, 0, 0.3, 0.6, 0, 0.8, 0.6, 0, 0, 0.6, 0, 0, 0.3, 0.7, 0, 0.8,
    0.6, 0, 0, 0.6, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0),
    nrow = nNode, ncol = nNode, byrow = TRUE)

## defining precision matrix
data.param.var <- diag(c(10, 20, 30, 40, 10, 10, 20, 20, 10, 10))
colnames(data.param) <- rownames(data.param) <- colnames(data.param.var) <-
    rownames(data.param.var) <- names(dists)

grid_abn <- expand.grid(nobs = c(5, 10, 50, 100, 500, 1000, 2000, 3000, 
  5000, 7500, 10000), rep = 1:Rrep) 

seed <- 35674
seeds <- sample(1:100000, dim(grid_abn)[1])
out <- foreach(i = 1:dim(grid_abn)[1], .combine = rbind) %dopar% {

  simulated.data <- simulateAbn(data.dists = dists, data.param = data.param,
    data.param.var = data.param.var, data.param.mult = data.param.mult,
    n.chains = 1, n.thin = 1, n.iter = grid_abn[i,1],
    bug.file = tempfile(), seed = seeds[i], verbose = verbose)

  mycache.b <- buildScoreCache(data.df = simulated.data, data.dists = dists,
    max.parents = 5, dry.run = FALSE, centre = FALSE, control = list(ncores = ncores))
  dag.b <- mostProbable(score.cache = mycache.b, verbose = verbose)
  t0 <- essentialGraph(dag = (dag.b$dag))
  
  mycache.computed.mle <- buildScoreCache(data.df = simulated.data, data.dists = dists,
    max.parents = 5, method = "mle", centre = FALSE, control = list(ncores = ncores))
  dag.mle.mlik <- mostProbable(score.cache = mycache.computed.mle, score = "mlik", verbose = verbose)
  t1 <- essentialGraph(dag.mle.mlik$dag)

  dag.mle.aic <- mostProbable(score.cache = mycache.computed.mle,score = "aic", verbose = verbose)
  t2 <- essentialGraph(dag.mle.aic$dag)

  dag.mle.bic <- mostProbable(score.cache = mycache.computed.mle, score = "bic", verbose = verbose)
  t3 <- essentialGraph(dag.mle.bic$dag)

  dag.mle.mdl <- mostProbable(score.cache = mycache.computed.mle, score = "mdl", verbose = verbose)
  t4 <- essentialGraph(dag.mle.mdl$dag)

  abn <- compareEG(ref = essentialGraph(data.param), test = t0)
  mlik <- compareEG(ref = essentialGraph(data.param), test = t1)
  aic <- compareEG(ref = essentialGraph(data.param), test = t2)
  bic <- compareEG(ref = essentialGraph(data.param), test = t3)
  mdl <- compareEG(ref = essentialGraph(data.param), test = t4)

  unlist( c(seeds[i], abn, mlik, aic, bic, mdl))
}

df <- data.frame(out, grid_abn)

## analysis
df.names <- df[,grep("TPR|FPR|Accuracy|nobs|rep", names(df))]

names(df.names) <- c("abn.TP", "abn.FP", "abn.acc", "mlik.TP", "mlik.FP", 
  "mlik.acc", "aic.TP", "aic.FP", "aic.acc", "bic.TP", "bic.FP",
  "bic.acc", "mdl.TP", "mdl.FP", "mdl.acc", "nobs", "rep")

df.long.scores.Acc <- df.names[, c(3, 6, 9, 12, 15, 16, 17)]
df.long.scores.FP <- df.names[, c(2, 5, 8, 11, 14, 16, 17)]
df.long.scores.TP <- df.names[, c(1, 4, 7, 10, 13, 16, 17)]

df.long.scores.Acc <- melt(as.data.frame(df.long.scores.Acc), id = c("nobs", "rep"))
df.long.scores.FP <- melt(as.data.frame(df.long.scores.FP), id = c("nobs", "rep"))
df.long.scores.TP <- melt(as.data.frame(df.long.scores.TP), id = c("nobs", "rep"))

p1 <- ggplot(data = df.long.scores.Acc) + xlab("") + ylab("") +
  geom_boxplot(aes(x = as.factor(nobs), y = value, fill = variable)) +
  ylim(c(0.4,1)) + ggtitle("Percentage of arcs retrieved (30% connected BN)") +
  theme(legend.title = element_blank(), plot.title = element_text(size = 12))

p2 <- ggplot(data = df.long.scores.FP) + xlab("") + ylab("") +
  geom_boxplot(aes(x = as.factor(nobs), y = value, fill = variable)) +
  ylim(c(0,0.5)) + ggtitle("Percentage of false positive") +
  theme(legend.title = element_blank(), plot.title = element_text(size = 12))

p3 <- ggplot(data = df.long.scores.TP) + xlab("Number of Observations") + ylab("") +
  geom_boxplot(aes(x = as.factor(nobs), y = value, fill = variable)) +
  ylim(c(0,1)) + ggtitle("Percentage of true positive") +
  theme(legend.title = element_blank(), plot.title = element_text(size = 12))

## Multiple plot function, taken from
## http://www.cookbook-r.com/Graphs/Multiple_graphs_on_one_page_(ggplot2)/ 
multiplot <- function(..., plotlist = NULL, file, cols = 1, layout = NULL) {
  library("grid")
  plots <- c(list(...), plotlist)
  numPlots = length(plots)
  if (is.null(layout)) {
    layout <- matrix(seq(1, cols * ceiling(numPlots / cols)),
      ncol = cols, nrow = ceiling(numPlots / cols))
  }
  if (numPlots ==  1) {
    print(plots[[1]])
  } else {
    grid.newpage()
    pushViewport(viewport(layout = grid.layout(nrow(layout), ncol(layout))))
   for (i in 1:numPlots) {
      matchidx <- as.data.frame(which(layout ==  i, arr.ind = TRUE))
      print(plots[[i]], vp = viewport(layout.pos.row = matchidx$row,
        layout.pos.col = matchidx$col))
    }
  }
}
multiplot(p1, p2, p3)


# pdf("output/FIG_boxplot_score_accuracy.pdf", width = 7, height = 7.7)
multiplot(p1, p2, p3)
# dev.off()


##-------------------------------------------------------------------
## Appendix 
##-------------------------------------------------------------------
## ------------------------------------------------------------------
## Section A: Dag simulation: more technical details  ---------------
## ------------------------------------------------------------------

## Chunk: structural metrics
library("ggplot2")
library("reshape2")
set.seed(567)
grid_abn <- expand.grid(val = seq(0, 1, 0.1), rep = 1:(Rrep * 10))
seeds <- sample(1:1000000, dim(grid_abn)[1])

## simulate BN and store metrics
out <- foreach(i = 1:dim(grid_abn)[1], .combine = rbind)  %dopar%  {
  set.seed(seeds[i])
  sim <- simulateDag(node.name = c(letters, paste0("a", letters[1:14])),
    edge.density = grid_abn$val[i])

  c(seed = seeds[i], unlist(infoDag(sim[[1]], node.names = names(sim[[2]]))))
}

## metrics normalization and reshaping
out.norm <- cbind(grid_abn, sweep(out[, -c(1:2)], 2, c(780, 39, 39, 19.5, 19.5), FUN = "/"))
out.long <- melt(out.norm, id.vars = c("val", "rep"))

## plotting
p <- ggplot(data = out.long) + ylim(c(0, 1)) +
  geom_boxplot(aes(x = as.factor(val), y = value, fill = variable)) +
  ggtitle(paste0("Density metric of DAG with 40 nodes (n = ", Rrep*10,")")) +
  xlab("Normalized density of the DAGs") + ylab("Normalized maximum [%]") +
  theme(legend.title = element_blank(), plot.title = element_text(size = 12))

## output
# pdf("output/FIG_normalized_DAG_metrics.pdf", width = 8, height = 5)
print(p)
# dev.off()


## Chunk: regression-coefficients
library("fields")
library("akima")
Nnode <- 20  ## Number of nodes
dist <- as.list(rep("gaussian", Nnode))
names(dist) <- letters[1:Nnode]

varseq <- exp(seq(log(0.01), log(1000), length.out = 6))
obsseq <- c(50, 100, 200, 400, 800, 1600, 3200)

grid_abn <- expand.grid(nobs = obsseq, vars = varseq, rep = 1:Rrep)
seed <- 247913
seeds <- sample(1:100000, dim(grid_abn)[1])

##low density network
seed <- 247913
data.param <- simulateDag(node.name = letters[1:Nnode], data.dists = dist,
  edge.density = 0.2)$dag

data.param.var <- matrix(data = 0, nrow = Nnode, ncol = Nnode)
colnames(data.param.var) <- rownames(data.param.var) <- names(dist)

outforeachsparse <- foreach(i = 1:dim(grid_abn)[1]) %dopar% {

  diag(data.param.var) <- rep(grid_abn[i, 2], 20)

  simulated.data <- simulateAbn(data.dists = dist, data.param = data.param,
    data.param.var = data.param.var, data.param.mult = data.param.mult,
    n.chains = 1, n.thin = 1, n.iter = grid_abn[i, 1],
    bug.file = tempfile(), verbose = verbose, seed = seeds[i])

  dag.tmp <- plotAbn(dag = data.param, data.dists = dist, plot = FALSE)@adjMat
  rownames(dag.tmp) <- colnames(dag.tmp)

  data.bayes <- fitAbn(dag = dag.tmp, data.df = simulated.data, data.dists = dist,
    method = "bayes", centre = FALSE, verbose = verbose)
  data.mle <- fitAbn(dag = dag.tmp, data.df = simulated.data, data.dists = dist,
    method = "mle", verbose = verbose)

  list(unlist(data.bayes$modes),                      ## named vectors
    unlist(sapply(data.mle$coef, function(x) x[1,])), ## named matrices
    seeds[i])
}

out.bayes <- sapply(outforeachsparse, function(x) x[[1]])
out.mle <- sapply(outforeachsparse, function(x) x[[2]])

df.bayes <- out.bayes[ -grep("precision|Intercept", rownames(out.bayes)), ]
df.mle <- out.mle[ -grep("intercept", rownames(out.mle)), ]

df.bayes <- cbind(grid_abn, t(df.bayes))
df.mle <- cbind(grid_abn, t(df.mle))

df_agg.bayes <- aggregate(x = df.bayes, by = list(df.bayes$nobs, df.bayes$var),
    FUN = function(x) sqrt(mean((1 - x)^2)))
df_agg.mle <- aggregate(x = df.mle, by = list(df.mle$nobs, df.mle$var),
    FUN = function(x) sqrt(mean((1 - x)^2)))

df_agg_mean.bayes <- apply(df_agg.bayes[,-(1:5)], 1, mean)
df_agg_mean.mle <- apply(df_agg.mle[,-(1:5)], 1, mean)

df_var.bayes <- cbind(df_agg_mean.bayes, df_agg.bayes[,(1:2)])
df_var.mle <- cbind(df_agg_mean.mle, df_agg.mle[,(1:2)])

s.lowdens.bayes <- bilinear.grid(x = log10(obsseq), y = log10(varseq),
    z = matrix(df_var.bayes$df_agg_mean, 7, 6), dx = 0.05, dy = 0.1)
s.lowdens.diff <- bilinear.grid(x = log10(obsseq), y = log10(varseq),
    z = matrix(df_var.bayes$df_agg_mean - df_var.mle$df_agg_mean, 7, 6), dx = 0.05, dy = 0.1)


## high density network
seed <- 24791
data.param <- simulateDag(node.name = letters[1:Nnode], data.dists = dist,
  edge.density = 0.8)[[1]]

data.param.var <- matrix(data = 0, nrow = Nnode, ncol = Nnode)
colnames(data.param.var) <- rownames(data.param.var) <- names(dist)

outforeachdense <- foreach(i = 1:dim(grid_abn)[1]) %dopar% {
  diag(data.param.var) <- rep(grid_abn[i, 2], Nnode)

  simulated.data <- simulateAbn(data.dists = dist, data.param = data.param,
    data.param.var = data.param.var,data.param.mult = data.param.mult,
    n.chains = 1, n.thin = 1, n.iter = grid_abn[i, 1],
    bug.file = tempfile(), seed = seeds[i], verbose = verbose)

  dag.tmp <- plotAbn(dag = data.param,data.dists = dist,plot = FALSE)@adjMat
  rownames(dag.tmp) <- colnames(dag.tmp)

  data.bayes <- fitAbn(dag = dag.tmp, data.df = simulated.data, data.dists = dist,
    method = "bayes", centre = FALSE, verbose = verbose)
  data.mle <- fitAbn(dag = dag.tmp, data.df = simulated.data, data.dists = dist,
    method = "mle", verbose = verbose)

  list(unlist(data.bayes$modes),                      ## named vectors
    unlist(sapply( data.mle$coef, function(x) x[1,])), ## named matrices
    seeds[i])
}

out.bayes <- sapply(outforeachdense, function(x) x[[1]])
out.mle <- sapply(outforeachdense, function(x) x[[2]])

df.bayes <- out.bayes[ -grep("precision|Intercept", rownames(out.bayes)), ]
df.mle <- out.mle[ -grep("intercept", rownames(out.mle)), ]

df.bayes <- cbind(grid_abn, t(df.bayes))
df.mle <- cbind(grid_abn, t(df.mle))

df_agg.bayes <- aggregate(x = df.bayes, by = list(df.bayes$nobs, df.bayes$var),
  FUN = function(x) sqrt(mean((1 - x)^2)))
df_agg.mle <- aggregate(x = df.mle,by = list(df.mle$nobs, df.mle$var),
  FUN = function(x) sqrt(mean((1 - x)^2)))

df_agg_mean.bayes <- apply(df_agg.bayes[,-(1:5)], 1, mean)
df_agg_mean.mle <- apply(df_agg.mle[,-(1:5)], 1, mean)

df_var.bayes <- cbind(df_agg_mean.bayes, df_agg.bayes[,(1:2)])
df_var.mle <- cbind(df_agg_mean.mle, df_agg.mle[,(1:2)])

## plotting
s.highdens.bayes <- bilinear.grid(x = log10(obsseq), y = log10(varseq),
  z = matrix(df_var.bayes$df_agg_mean, 7, 6), dx = 0.05, dy = 0.1)
s.highdens.diff <- bilinear.grid(x = log10(obsseq), y = log10(varseq),
  z = matrix(df_var.bayes$df_agg_mean - df_var.mle$df_agg_mean, 7, 6), dx = 0.05, dy = 0.1)


# pdf(file = "output/FIG_coef.pdf", width = 9, height = 8)
par(mfrow = c(2, 2), oma = c(0.2, 0.2, 0.2, 0.2) - 0.2, mar = c(4.1, 4, 3, 1))
image.plot(s.lowdens.bayes, main = "Low density BN: Bayesian estimation",
  xlab = "Number of observations [log10]", ylab = "Coef of Var [log10]",
  legend.lab = "mean RMSE", legend.line = 4, col = hcl.colors(5, "YlOrRd", rev = TRUE), legend.mar = 15)
image.plot(s.lowdens.diff, main = "Difference Bayesian - MLE",
  xlab = "Number of observations [log10]", ylab = "Coef of Var [log10]",
  zlim = range(s.lowdens.diff$z, -s.lowdens.diff$z), legend.lab = "difference in RMSE",
  legend.line = 4, col = hcl.colors(9, "spectral", rev = TRUE))

image.plot(s.highdens.bayes, main = "High density BN: Bayesian estimation",
  xlab = "Number of observations [log10]", ylab = "Coef of Var [log10]",
  legend.lab = "mean RMSE", legend.line = 4, col = hcl.colors(5, "YlOrRd", rev = TRUE))
image.plot(s.highdens.diff, main = "Difference Bayesian - MLE",
  xlab = "Number of observations [log10]", ylab = "Coef of Var [log10]",
  zlim = range(s.highdens.diff$z, -s.highdens.diff$z), legend.lab = "difference in RMSE",
  legend.line = 4, col = hcl.colors(9, "spectral", rev = TRUE))
# dev.off()

## ==================================================================
## End of Code
## ==================================================================

sessionInfo()
abn.Version("system")
proc.time() - ptm
