Structure learning benchmarks in Scutari, Marquis and Azzimonti, Proceedings of Machine Learning Research (2022)
This is a short HOWTO describing the relevant R code used to perform the simulation experiments in “Using Mixed-Effects Models to Learn Bayesian Networks from Related Data Sets” by Scutari, Marquis and Azzimonti (2022, PMLR volume for PGM 2022).
Generating the Ground-Truth Models
The function rlmebnfit()
below generates both the DAG and the parameters of the ground-truth BNs used to
generate the data sets used in the simulation study. Its arguments are the number of nodes (nnodes
), the
number of related data sets (ngroups
), the arc inclusion probability (prob
) and whether the
grouping variable should have a uniform distribution or not (homogeneous
). The BNs it produces are stored
in a data structure that is effectively a bn.fit
for a CGBN.
> rlmebnfit = function(nnodes, ngroups, prob, homogeneous = FALSE) { + nodes = paste0("X", seq(nnodes)) + graph = random.graph(nodes, num = 1, method = "ordered", prob = prob) + while(igraph::is_connected(as.igraph(graph)) == FALSE) + graph = random.graph(nodes, num = 1, method = "ordered", prob = prob) + local.distributions = + structure(vector(nnodes, mode = "list"), names = nodes) + for (node in nodes) { + par = parents(graph, node) + betas = matrix(rep(0, ngroups * (length(par) + 1)), + nrow = length(par) + 1, ncol = ngroups, + dimnames = list(c("(Intercept)", par), seq(ngroups) - 1)) + if (homogeneous) { + for (p in c("(Intercept)", par)) + betas[p, ] = rnorm(1, mean = rnorm(1, mean = 2, sd = 1), + sd = sqrt(rchisq(1, 1))) + }#THEN + else { + for (p in c("(Intercept)", par)) + betas[p, ] = rnorm(ngroups, mean = rnorm(1, mean = 2, sd = 1), + sd = sqrt(rchisq(1, 1))) + }#ELSE + sds = rep(sqrt(rchisq(1, 1)), ngroups) + local.distributions[[node]] = list(coef = betas, sd = sds) + }#FOR + graph = add.node(graph, "G") + for (node in nodes) + graph = set.arc(graph, from = "G", to = node) + local.distributions$G = + matrix(rep(1/ngroups, ngroups), ncol = ngroups, + dimnames = list(NULL, paste0("g", seq(ngroups)))) + bn = custom.fit(graph, local.distributions) + for (node in setdiff(node.ordering(bn), "G")) { + pp = setdiff(parents(bn, node), "G") + if (length(pp) == 0) + next + else + ff = paste(node, "~", paste(pp, collapse = "+")) + data = rbn(bn, 100 * nparams(bn)) + for (g in levels(data$G)) { + model = lm(as.formula(ff), data[data$G == g, ]) + exp.var = var(resid(model)) / var(data[data$G == g, node]) + local.distributions[[node]]$sd[levels(data$G) == g] = + sqrt(var(fitted(model)) / 0.85 * 0.15) + }#FOR + bn = custom.fit(graph, local.distributions) + }#FOR + return(bn) + }#RLMEBNFIT
Scoring Local Distributions that are Linear Mixed-Effects Models
Structure learning is implemented using the "custom"
score function: we fit a linear mixed-effect model
with the lmer()
function from the lme4 package, and we use the BIC()
method
from the same package to compute the score component for the node. The group
argument is the label of the
F variable that identifies the different related data sets.
> # calls lme4 to estimate the parameters of a local distribution. > fitME = function(node, parents, group, data) { + rhs = paste(c("1", parents), collapse = " + ") + formula = paste(node, "~", rhs, "+", paste0("(", rhs, " | ", group, ")")) + model = try(lmer(formula, data = data, control = lmerControl(calc.derivs = FALSE))) + if (is(model, "try-error")) + return(NULL) + return(model) + }#FITME > > scoreME = function(node, parents, data, args = list()) { + if (node == args$group) + return(0) + # mixed effects models with performance tuning to speed up learning. + require(lme4) + environment(nloptwrap)$defaultControl$maxeval = 5 * 1e3 + environment(nloptwrap)$defaultControl$xtol_abs = 1e-6 + environment(nloptwrap)$defaultControl$ftol_abs = 1e-6 + local.distribution = + fitME(node = node, parents = setdiff(parents, args$group), + group = args$group, data = data) + if (is.null(local.distribution)) + return(-Inf) + return(- BIC(local.distribution)) + }#SCOREME
Note that we reduce the maximum number of iterations and the tolerance of lmer()
to ensure model
estimation completes within a reasonable time frame even when the local distribution it is trying to fit is nearly
singular. The maximum number of iterations is never reached in the final experimental design we included in the paper,
but that was an issues in earlier simulations in which we did not set the proportion of explained variance.
Learning the Structures of the Different Types of BNs
Below is the R script that, for each data set, learns a GBN (complete pooling), a CGBN (no pooling) and a BN with mixed-effects models (partial pooling) and saves them for later analyses.
> library(bnlearn) > source("lme.score.R") > source("lme.fit.R") > > data.file = commandArgs()[7] > data = readRDS(data.file) > out.file = commandArgs()[8] > > inbound.arcs = data.frame(from = setdiff(names(data), "G"), to = "G") > outbound.arcs = data.frame(from = "G", to = setdiff(names(data), "G")) > > > dag.lme = hc(data, score = "custom", fun = scoreME, args = list(group = "G"), + whitelist = outbound.arcs) > dag.cgbn = hc(data, score = "bic-cg", whitelist = outbound.arcs) > dag.gbn = hc(data, score = "bic-cg", + blacklist = rbind(inbound.arcs, outbound.arcs)) > > local.distributions = + structure(vector(nnodes(dag.lme), mode = "list"), names = nodes(dag.lme)) > for (node in nodes(dag.lme)) { + if (node == "G") + local.distributions[["G"]] = prop.table(table(data$G)) + else + local.distributions[[node]] = + ldistME(node = node, parents = setdiff(parents(dag.lme, node), "G"), + group = "G", data = data) + }#FOR > > bn.lme = custom.fit(dag.lme, local.distributions) > bn.cgbn = bn.fit(dag.cgbn, data, replace.unidentifiable = TRUE) > bn.gbn = bn.fit(dag.gbn, data, replace.unidentifiable = TRUE) > > saveRDS(list(dag.lme = dag.lme, + dag.cgbn = dag.cgbn, + dag.gbn = dag.gbn, + fit.lme = bn.lme, + fit.cgbn = bn.cgbn, + fit.gbn = bn.gbn, + sample.size = nrow(data)), + file = out.file)
The parameters of the BN with mixed-effects models as local distributions are saved in a data structure that is
identical to that of a CGBN for ease processing by the ldistME
below. Note again that its code has been
made robust to singular models in early experiments, but these precautions were redundant in the simulations we included
in the paper.
> ldistME = function(node, parents, group, data) { + # mixed effectes models with performance tuning to speed up learning. + require(lme4) + environment(nloptwrap)$defaultControl$maxeval = 5 * 1e3 + environment(nloptwrap)$defaultControl$xtol_abs = 1e-6 + environment(nloptwrap)$defaultControl$ftol_abs = 1e-6 + model = fitME(node = node, parents = parents, group = group, data = data) + ngroups = nlevels(data[, group]) + ldist = list( + coef = array(0, dim = c(length(parents) + 1, ngroups), + dimnames = list(c("(Intercept)", parents), NULL)), + sd = rep(0, ngroups)) + if (is.null(model)) { + # retry with more iterations and a more forgiving precision. + environment(nloptwrap)$defaultControl$maxeval = 1e5 + environment(nloptwrap)$defaultControl$xtol_abs = 1e-5 + environment(nloptwrap)$defaultControl$ftol_abs = 1e-5 + model = fitME(node = node, parents = parents, group = group, data = data) + # reset performance tuning. + environment(nloptwrap)$defaultControl$maxeval = 5 * 1e3 + environment(nloptwrap)$defaultControl$xtol_abs = 1e-6 + environment(nloptwrap)$defaultControl$ftol_abs = 1e-6 + if (is.null(model)) { + warning("failed to fit node ", node, " with lme4.") + return(ldist) + }#THEN + }#THEN + # add the fixed effects to all columns. + fixefs = fixef(model) + for (i in seq(ncol(ldist$coef))) + ldist$coef[, i] = fixefs + # update each column with the random effects from the corresponding group. + ranefs = ranef(model)[[group]] + for (i in seq(ncol(ldist$coef))) + ldist$coef[, i] = ldist$coef[, i] + as.numeric(ranefs[i, ]) + # compute the standard errors for the residuals in the groups. + resids = resid(model) + g = levels(data[, group]) + for (i in seq(ngroups)) + ldist$sd[i] = sd(resids[data[, group] == g[i]]) + # replace NA coefficients and standard errors with zeroes to avoid errors in + # custom.fit(). + ldist$coef[is.na(ldist$coef)] = 0 + ldist$sd[is.na(ldist$sd)] = 0 + return(ldist) + }#LDISTME
Comparing the Learned Networks
The complete R script to compare the learned networks is below. It leverages the parallel package to perform comparison in parallel, the caret package to compute the F1 score, and the bnlearn package to compute the SHD distance as well as the number of arcs, nodes and parameters.
> library(bnlearn) > library(parallel) > > cl = makeCluster(30) > clusterEvalQ(cl, library(bnlearn)) > clusterEvalQ(cl, library(caret)) > clusterEvalQ(cl, source("kl.R")) > > data.dir = commandArgs()[7] > out.file = commandArgs()[8] > clusterExport(cl, "data.dir") > > learned = list.files(data.dir) > > stats = parLapply(cl, learned, function(file) { + model = readRDS(paste0(data.dir, "/", file)) + dag.lme = model$dag.lme + dag.gbn = model$dag.gbn + dag.cgbn = model$dag.cgbn + golden = readRDS(Sys.glob(paste0("networks/", sub("-.*", "", file), "-*"))) + dag.golden = bn.net(golden) + sub.golden = subgraph(golden, setdiff(nodes(golden), "G")) + sub.lme = subgraph(dag.lme, setdiff(nodes(dag.lme), "G")) + sub.gbn = subgraph(dag.gbn, setdiff(nodes(dag.gbn), "G")) + sub.cgbn = subgraph(dag.cgbn, setdiff(nodes(dag.cgbn), "G")) + fit.lme = model$fit.lme + fit.gbn = model$fit.gbn + fit.cgbn = model$fit.cgbn + validation = rbn(golden, 1000) + clustering.error = function(fitted) { + pred = predict(fitted, node = "G", data = validation, method = "bayes-lw") + obs = validation$G + if (nlevels(pred) == 2) + F1s = confusionMatrix(pred, obs)[["byClass"]]["F1"] + else + F1s = confusionMatrix(pred, obs)[["byClass"]][, "F1"] + return(mean(F1s, na.rm = TRUE)) + }#CLUSTERING.ERROR + prediction.error = function(fitted, with.group = TRUE) { + error = 0 + for (node in setdiff(nodes(fitted), "G")) { + if (with.group) + from = setdiff(nodes(fitted), node) + else + from = setdiff(nodes(fitted), c(node, "G")) + pred = predict(fitted, node = node, data = validation, + method = "bayes-lw", from = from) + obs = validation[, node] + error = error + mean(abs((pred - obs) / obs)) + }#FOR + return(error) + }#PREDICTION.ERROR + c(nnodes = nnodes(golden) - 1, + ngroups = length(golden$G$prob), + density = attr(golden, "density"), + sample.size = model$sample.size, + nparams.golden = nparams(golden), + nparams.lme = nparams(fit.lme), + nparams.gbn = nparams(fit.gbn), + nparams.cgbn = nparams(fit.cgbn), + narcs.golden = narcs(sub.golden), + narcs.lme = narcs(sub.lme), + narcs.gbn = narcs(sub.gbn), + narcs.cgbn = narcs(sub.cgbn), + shd.lme = shd(sub.lme, sub.golden), + shd.gbn = shd(sub.gbn, sub.golden), + shd.cgbn = shd(sub.cgbn, sub.golden), + kl.lme = KL(golden, fit.lme), + kl.gbn = KL(golden, fit.gbn), + kl.cgbn = KL(golden, fit.cgbn), + clustering.lme = clustering.error(fit.lme), + clustering.cgbn = clustering.error(fit.cgbn), + with.group.lme = prediction.error(fit.lme, with.group = TRUE), + with.group.cgbn = prediction.error(fit.cgbn, with.group = TRUE), + without.group.lme = prediction.error(fit.lme, with.group = FALSE), + without.group.cgbn = prediction.error(fit.cgbn, with.group = FALSE) + ) + }) > > stats = data.frame(do.call("rbind", stats)) > > stopCluster(cl) > > saveRDS(stats, file = out.file)
The limited implementation of the Kullback-Leibler distance we needed for this simulation, and that is sourced in the code above, is the following.
> KL = function(P, Q) { + Pdag = remove.node(bn.net(P), "G") + Qdag = remove.node(bn.net(Q), "G") + klvalue = 0 + for (group in seq_along(coef(P$G))) { + Pldist = structure(vector(nnodes(Pdag), mode = "list"), names = nodes(Pdag)) + for (node in nodes(Pdag)) { + if (is(P[[node]], "bn.fit.cgnode")) + Pldist[[node]] = list(coef = coef(P[[node]])[, group], + sd = sigma(P[[node]])[group]) + else + Pldist[[node]] = list(coef = coef(P[[node]]), sd = sigma(P[[node]])) + }#FOR + Pgroup = custom.fit(Pdag, Pldist) + Qldist = structure(vector(nnodes(Qdag), mode = "list"), names = nodes(Qdag)) + for (node in nodes(Qdag)) { + if (is(Q[[node]], "bn.fit.cgnode")) + Qldist[[node]] = list(coef = coef(Q[[node]])[, group], + sd = sigma(Q[[node]])[group]) + else + Qldist[[node]] = list(coef = coef(Q[[node]]), sd = sigma(Q[[node]])) + }#FOR + Qgroup = custom.fit(Qdag, Qldist) + Pmvn = gbn2mvnorm(Pgroup) + Qmvn = gbn2mvnorm(Qgroup) + if (isTRUE(all.equal(Pmvn, Qmvn))) + next + svdQ = svd(Qmvn$sigma) + det.sigmaQ = prod(svdQ$d) + pos = svdQ$d > svdQ$d[1] * .Machine$double.eps + inv.sigmaQ = svdQ$v[, pos, drop = FALSE] %*% + diag(1 / svdQ$d[pos], nrow = length(svdQ$d[pos])) %*% + t(svdQ$u[, pos, drop = FALSE]) + svdP = svd(Pmvn$sigma) + det.sigmaP = prod(svdP$d) + if ((det.sigmaP == 0) || (det.sigmaQ == 0)) + return(Inf) + klvalue = klvalue + + 0.5 * as.numeric( + sum(log(svdQ$d)) - sum(log(svdP$d)) + + sum(diag(inv.sigmaQ %*% Pmvn$sigma)) - nrow(Qmvn$sigma) + + t(Pmvn$mu - Qmvn$mu) %*% inv.sigmaQ %*% (Pmvn$mu - Qmvn$mu) + ) + }#FOR + return(klvalue) + }#KL
Thu Aug 4 12:34:58 2022
with bnlearn
4.8-20220314
and R version 4.2.1 (2022-06-23)
.