## Structure learning benchmarks in Scutari, Marquis and Azzimonti, Proceedings of Machine Learning Research (2022)

This is a short HOWTO describing the relevant R code used to perform the simulation experiments in “Using Mixed-Effects Models to Learn Bayesian Networks from Related Data Sets” by Scutari, Marquis and Azzimonti (2022, PMLR volume for PGM 2022).

### Generating the Ground-Truth Models

The function `rlmebnfit()` below generates both the DAG and the parameters of the ground-truth BNs used to generate the data sets used in the simulation study. Its arguments are the number of nodes (`nnodes`), the number of related data sets (`ngroups`), the arc inclusion probability (`prob`) and whether the grouping variable should have a uniform distribution or not (`homogeneous`). The BNs it produces are stored in a data structure that is effectively a `bn.fit` for a CGBN.

```> rlmebnfit = function(nnodes, ngroups, prob, homogeneous = FALSE) {

+   nodes = paste0("X", seq(nnodes))

+   graph = random.graph(nodes, num = 1, method = "ordered", prob = prob)

+   while(igraph::is_connected(as.igraph(graph)) == FALSE)
+     graph = random.graph(nodes, num = 1, method = "ordered", prob = prob)

+   local.distributions =
+     structure(vector(nnodes, mode = "list"), names = nodes)

+   for (node in nodes) {

+     par = parents(graph, node)

+     betas = matrix(rep(0, ngroups * (length(par) + 1)),
+                  nrow = length(par) + 1, ncol = ngroups,
+                  dimnames = list(c("(Intercept)", par), seq(ngroups) - 1))

+     if (homogeneous) {

+       for (p in c("(Intercept)", par))
+         betas[p, ] = rnorm(1, mean = rnorm(1, mean = 2, sd = 1),
+                            sd = sqrt(rchisq(1, 1)))
+     }#THEN
+     else {

+       for (p in c("(Intercept)", par))
+         betas[p, ] = rnorm(ngroups, mean = rnorm(1, mean = 2, sd = 1),
+                            sd = sqrt(rchisq(1, 1)))

+     }#ELSE

+     sds = rep(sqrt(rchisq(1, 1)), ngroups)

+     local.distributions[[node]] = list(coef = betas, sd = sds)

+   }#FOR

+   for (node in nodes)
+     graph = set.arc(graph, from = "G", to = node)

+   local.distributions\$G =
+     matrix(rep(1/ngroups, ngroups), ncol = ngroups,
+            dimnames = list(NULL, paste0("g", seq(ngroups))))

+   bn = custom.fit(graph, local.distributions)

+   for (node in setdiff(node.ordering(bn), "G")) {

+     pp = setdiff(parents(bn, node), "G")
+     if (length(pp) == 0)
+       next
+     else
+       ff = paste(node, "~", paste(pp, collapse = "+"))

+     data = rbn(bn, 100 * nparams(bn))

+     for (g in levels(data\$G)) {

+       model = lm(as.formula(ff), data[data\$G == g, ])
+       exp.var = var(resid(model)) / var(data[data\$G == g, node])
+       local.distributions[[node]]\$sd[levels(data\$G) == g] =
+         sqrt(var(fitted(model)) / 0.85 * 0.15)

+     }#FOR

+     bn = custom.fit(graph, local.distributions)

+   }#FOR

+   return(bn)

+ }#RLMEBNFIT
```

### Scoring Local Distributions that are Linear Mixed-Effects Models

Structure learning is implemented using the `"custom"` score function: we fit a linear mixed-effect model with the `lmer()` function from the lme4 package, and we use the `BIC()` method from the same package to compute the score component for the node. The `group` argument is the label of the F variable that identifies the different related data sets.

```> # calls lme4 to estimate the parameters of a local distribution.
> fitME = function(node, parents, group, data) {

+   rhs = paste(c("1", parents), collapse = " + ")
+   formula = paste(node, "~", rhs, "+", paste0("(", rhs, " | ", group, ")"))

+   model = try(lmer(formula, data = data, control = lmerControl(calc.derivs = FALSE)))

+   if (is(model, "try-error"))
+     return(NULL)

+   return(model)

+ }#FITME
>
> scoreME =  function(node, parents, data, args = list()) {

+   if (node == args\$group)
+     return(0)

+   # mixed effects models with performance tuning to speed up learning.
+   require(lme4)
+   environment(nloptwrap)\$defaultControl\$maxeval = 5 * 1e3
+   environment(nloptwrap)\$defaultControl\$xtol_abs = 1e-6
+   environment(nloptwrap)\$defaultControl\$ftol_abs = 1e-6

+   local.distribution =
+     fitME(node = node, parents = setdiff(parents, args\$group),
+           group = args\$group, data = data)

+   if (is.null(local.distribution))
+     return(-Inf)

+   return(- BIC(local.distribution))

+ }#SCOREME
```

Note that we reduce the maximum number of iterations and the tolerance of `lmer()` to ensure model estimation completes within a reasonable time frame even when the local distribution it is trying to fit is nearly singular. The maximum number of iterations is never reached in the final experimental design we included in the paper, but that was an issues in earlier simulations in which we did not set the proportion of explained variance.

### Learning the Structures of the Different Types of BNs

Below is the R script that, for each data set, learns a GBN (complete pooling), a CGBN (no pooling) and a BN with mixed-effects models (partial pooling) and saves them for later analyses.

```> library(bnlearn)
> source("lme.score.R")
> source("lme.fit.R")
>
> data.file = commandArgs()
> out.file = commandArgs()
>
> inbound.arcs = data.frame(from = setdiff(names(data), "G"), to = "G")
> outbound.arcs = data.frame(from = "G", to = setdiff(names(data), "G"))
>
>
> dag.lme = hc(data, score = "custom", fun = scoreME, args = list(group = "G"),
+              whitelist = outbound.arcs)
> dag.cgbn = hc(data, score = "bic-cg", whitelist = outbound.arcs)
> dag.gbn = hc(data, score = "bic-cg",
+              blacklist = rbind(inbound.arcs, outbound.arcs))
>
> local.distributions =
+   structure(vector(nnodes(dag.lme), mode = "list"), names = nodes(dag.lme))
> for (node in nodes(dag.lme)) {

+   if (node == "G")
+     local.distributions[["G"]] = prop.table(table(data\$G))
+   else
+     local.distributions[[node]] =
+       ldistME(node = node, parents = setdiff(parents(dag.lme, node), "G"),
+             group = "G", data = data)

+ }#FOR
>
> bn.lme = custom.fit(dag.lme, local.distributions)
> bn.cgbn = bn.fit(dag.cgbn, data, replace.unidentifiable = TRUE)
> bn.gbn = bn.fit(dag.gbn, data, replace.unidentifiable = TRUE)
>
> saveRDS(list(dag.lme = dag.lme,
+              dag.cgbn = dag.cgbn,
+              dag.gbn = dag.gbn,
+              fit.lme = bn.lme,
+              fit.cgbn = bn.cgbn,
+              fit.gbn = bn.gbn,
+              sample.size = nrow(data)),
+         file = out.file)
```

The parameters of the BN with mixed-effects models as local distributions are saved in a data structure that is identical to that of a CGBN for ease processing by the `ldistME` below. Note again that its code has been made robust to singular models in early experiments, but these precautions were redundant in the simulations we included in the paper.

```> ldistME = function(node, parents, group, data) {

+   # mixed effectes models with performance tuning to speed up learning.
+   require(lme4)
+   environment(nloptwrap)\$defaultControl\$maxeval = 5 * 1e3
+   environment(nloptwrap)\$defaultControl\$xtol_abs = 1e-6
+   environment(nloptwrap)\$defaultControl\$ftol_abs = 1e-6

+   model = fitME(node = node, parents = parents, group = group, data = data)
+   ngroups = nlevels(data[, group])

+   ldist = list(
+      coef = array(0, dim = c(length(parents) + 1, ngroups),
+                    dimnames = list(c("(Intercept)", parents), NULL)),
+      sd = rep(0, ngroups))

+   if (is.null(model)) {

+     # retry with more iterations and a more forgiving precision.
+     environment(nloptwrap)\$defaultControl\$maxeval = 1e5
+     environment(nloptwrap)\$defaultControl\$xtol_abs = 1e-5
+     environment(nloptwrap)\$defaultControl\$ftol_abs = 1e-5

+     model = fitME(node = node, parents = parents, group = group, data = data)

+     # reset performance tuning.
+     environment(nloptwrap)\$defaultControl\$maxeval = 5 * 1e3
+     environment(nloptwrap)\$defaultControl\$xtol_abs = 1e-6
+     environment(nloptwrap)\$defaultControl\$ftol_abs = 1e-6

+     if (is.null(model)) {

+       warning("failed to fit node ", node, " with lme4.")
+       return(ldist)

+     }#THEN

+   }#THEN

+   # add the fixed effects to all columns.
+   fixefs = fixef(model)
+   for (i in seq(ncol(ldist\$coef)))
+     ldist\$coef[, i] = fixefs

+   # update each column with the random effects from the corresponding group.
+   ranefs = ranef(model)[[group]]
+   for (i in seq(ncol(ldist\$coef)))
+     ldist\$coef[, i] = ldist\$coef[, i] + as.numeric(ranefs[i, ])

+   # compute the standard errors for the residuals in the groups.
+   resids = resid(model)
+   g = levels(data[, group])
+   for (i in seq(ngroups))
+     ldist\$sd[i] = sd(resids[data[, group] == g[i]])

+   # replace NA coefficients and standard errors with zeroes to avoid errors in
+   # custom.fit().
+   ldist\$coef[is.na(ldist\$coef)] = 0
+   ldist\$sd[is.na(ldist\$sd)] = 0

+   return(ldist)

+   }#LDISTME
```

### Comparing the Learned Networks

The complete R script to compare the learned networks is below. It leverages the parallel package to perform comparison in parallel, the caret package to compute the F1 score, and the bnlearn package to compute the SHD distance as well as the number of arcs, nodes and parameters.

```> library(bnlearn)
> library(parallel)
>
> cl = makeCluster(30)
> clusterEvalQ(cl, library(bnlearn))
> clusterEvalQ(cl, library(caret))
> clusterEvalQ(cl, source("kl.R"))
>
> data.dir = commandArgs()
> out.file = commandArgs()
> clusterExport(cl, "data.dir")
>
> learned = list.files(data.dir)
>
> stats = parLapply(cl, learned, function(file) {

+   model = readRDS(paste0(data.dir, "/", file))
+   dag.lme = model\$dag.lme
+   dag.gbn = model\$dag.gbn
+   dag.cgbn = model\$dag.cgbn
+   golden = readRDS(Sys.glob(paste0("networks/", sub("-.*", "", file), "-*")))
+   dag.golden = bn.net(golden)

+   sub.golden = subgraph(golden, setdiff(nodes(golden), "G"))
+   sub.lme = subgraph(dag.lme, setdiff(nodes(dag.lme), "G"))
+   sub.gbn = subgraph(dag.gbn, setdiff(nodes(dag.gbn), "G"))
+   sub.cgbn = subgraph(dag.cgbn, setdiff(nodes(dag.cgbn), "G"))

+   fit.lme = model\$fit.lme
+   fit.gbn = model\$fit.gbn
+   fit.cgbn = model\$fit.cgbn

+   validation = rbn(golden, 1000)

+   clustering.error = function(fitted) {

+     pred = predict(fitted, node = "G", data = validation, method = "bayes-lw")
+     obs = validation\$G

+     if (nlevels(pred) == 2)
+       F1s = confusionMatrix(pred, obs)[["byClass"]]["F1"]
+     else
+       F1s = confusionMatrix(pred, obs)[["byClass"]][, "F1"]

+     return(mean(F1s, na.rm = TRUE))

+   }#CLUSTERING.ERROR

+   prediction.error = function(fitted, with.group = TRUE) {

+     error = 0

+     for (node in setdiff(nodes(fitted), "G")) {

+       if (with.group)
+         from = setdiff(nodes(fitted), node)
+       else
+         from = setdiff(nodes(fitted), c(node, "G"))

+       pred = predict(fitted, node = node, data = validation,
+                method = "bayes-lw", from = from)
+       obs = validation[, node]

+       error = error + mean(abs((pred - obs) / obs))

+     }#FOR

+     return(error)

+   }#PREDICTION.ERROR

+   c(nnodes = nnodes(golden) - 1,
+     ngroups = length(golden\$G\$prob),
+     density = attr(golden, "density"),
+     sample.size = model\$sample.size,
+     nparams.golden = nparams(golden),
+     nparams.lme = nparams(fit.lme),
+     nparams.gbn = nparams(fit.gbn),
+     nparams.cgbn = nparams(fit.cgbn),
+     narcs.golden = narcs(sub.golden),
+     narcs.lme = narcs(sub.lme),
+     narcs.gbn = narcs(sub.gbn),
+     narcs.cgbn = narcs(sub.cgbn),
+     shd.lme = shd(sub.lme, sub.golden),
+     shd.gbn = shd(sub.gbn, sub.golden),
+     shd.cgbn = shd(sub.cgbn, sub.golden),
+     kl.lme = KL(golden, fit.lme),
+     kl.gbn = KL(golden, fit.gbn),
+     kl.cgbn = KL(golden, fit.cgbn),
+     clustering.lme = clustering.error(fit.lme),
+     clustering.cgbn = clustering.error(fit.cgbn),
+     with.group.lme = prediction.error(fit.lme, with.group = TRUE),
+     with.group.cgbn = prediction.error(fit.cgbn, with.group = TRUE),
+     without.group.lme = prediction.error(fit.lme, with.group = FALSE),
+     without.group.cgbn = prediction.error(fit.cgbn, with.group = FALSE)
+   )

+ })
>
> stats = data.frame(do.call("rbind", stats))
>
> stopCluster(cl)
>
> saveRDS(stats, file = out.file)
```

The limited implementation of the Kullback-Leibler distance we needed for this simulation, and that is sourced in the code above, is the following.

```> KL = function(P, Q) {

+   Pdag = remove.node(bn.net(P), "G")
+   Qdag = remove.node(bn.net(Q), "G")
+   klvalue = 0

+   for (group in seq_along(coef(P\$G))) {

+     Pldist = structure(vector(nnodes(Pdag), mode = "list"), names = nodes(Pdag))
+     for (node in nodes(Pdag)) {

+       if (is(P[[node]], "bn.fit.cgnode"))
+         Pldist[[node]] = list(coef = coef(P[[node]])[, group],
+                               sd = sigma(P[[node]])[group])
+       else
+         Pldist[[node]] = list(coef = coef(P[[node]]), sd = sigma(P[[node]]))

+     }#FOR
+     Pgroup = custom.fit(Pdag, Pldist)

+     Qldist = structure(vector(nnodes(Qdag), mode = "list"), names = nodes(Qdag))
+     for (node in nodes(Qdag)) {

+       if (is(Q[[node]], "bn.fit.cgnode"))
+         Qldist[[node]] = list(coef = coef(Q[[node]])[, group],
+                               sd = sigma(Q[[node]])[group])
+       else
+         Qldist[[node]] = list(coef = coef(Q[[node]]), sd = sigma(Q[[node]]))

+     }#FOR
+     Qgroup = custom.fit(Qdag, Qldist)

+     Pmvn = gbn2mvnorm(Pgroup)
+     Qmvn = gbn2mvnorm(Qgroup)

+     if (isTRUE(all.equal(Pmvn, Qmvn)))
+       next

+     svdQ = svd(Qmvn\$sigma)
+     det.sigmaQ = prod(svdQ\$d)
+     pos = svdQ\$d > svdQ\$d * .Machine\$double.eps
+     inv.sigmaQ = svdQ\$v[, pos, drop = FALSE] %*%
+                    diag(1 / svdQ\$d[pos], nrow = length(svdQ\$d[pos])) %*%
+                    t(svdQ\$u[, pos, drop = FALSE])

+     svdP = svd(Pmvn\$sigma)
+     det.sigmaP = prod(svdP\$d)

+     if ((det.sigmaP == 0) || (det.sigmaQ == 0))
+       return(Inf)

+     klvalue = klvalue +
+       0.5 * as.numeric(
+         sum(log(svdQ\$d)) - sum(log(svdP\$d)) +
+         sum(diag(inv.sigmaQ %*% Pmvn\$sigma)) - nrow(Qmvn\$sigma) +
+         t(Pmvn\$mu - Qmvn\$mu) %*% inv.sigmaQ %*% (Pmvn\$mu - Qmvn\$mu)
+       )

+   }#FOR

+   return(klvalue)

+ }#KL
```
Last updated on `Thu Aug 4 12:34:58 2022` with bnlearn `4.8-20220314` and `R version 4.2.1 (2022-06-23)`.