Learning, selecting and evaluating a state-space Bayesian network model

Import the data prepared in the previous notebook (available here) and learn a useful spatio-temporal Bayesian network from it.

Structure learning with model averaging

> ttdata = readRDS("ttdata.rds")

The following two functions are the core of the analysis:

  • blacklist() creates a blacklist for the causal directions that do not make sense in the context of the data. In addition to forbidding instantaneous arcs and arcs that go back in time, we impose that:
    • Arcs can only point from socio-demographic variables to other variables: socio-demographic variables are static, and they have been recorded before any other variable.
    • Arcs can point from the weather to the pollutants and the conditions but not the other way around.
    • Pollutants are not connected by arcs because, in reality, they are effects of the local economy and not of each other.
    • Arcs can point to the conditions from other types of variables, but arcs starting from the conditions can only point to other conditions.
    • There are no arcs between the weather and socio-demographic variables.
    The guiding principle for establishing which causal directions should be banned is the implicit asymmetry between cause and effect: if we effect a change in the cause, the effect changes as well, but the reverse is not true.
    > blacklist = function(data) {
    
    +   t0.vars = grep("_0$", colnames(data), value = TRUE)
    +   t1.vars = grep("_1$", colnames(data), value = TRUE)
    
    +   social = c("UNI", "UNEMPLOYMENT", "INCOME", "POPULATION", "DENSITY", "POVERTY")
    +   pollutants = c("NO2", "SO2", "PM25")
    +   conditions = c("ANX", "DEP", "DER", "OBE", "SLD")
    +   weather = c("TEMP", "WIND", "RAIN", "RANGETEMP", "RANGEWIND")
    
    +   # basic blacklist for dynamic BNs.
    +   bl = rbind(tiers2blacklist(list(t0.vars, t1.vars)),
    +              set2blacklist(t0.vars), set2blacklist(t1.vars))
    +   # domain blacklists.
    +   bl = rbind(bl,
    +     # no arcs between socio-demographics.
    +     tiers2blacklist(list(paste0(social, "_1"), paste0(social, "_0"))),
    +     # no arcs from pollutants to socio-demographics.
    +     tiers2blacklist(list(paste0(social, "_1"), paste0(pollutants, "_0"))),
    +     # no arcs from weather to socio-demographics.
    +     tiers2blacklist(list(paste0(social, "_1"), paste0(weather, "_0"))),
    +     # no arcs from socio-demographics to the weather.
    +     tiers2blacklist(list(paste0(weather, "_1"), paste0(social, "_0"))),
    +     # no arcs from the pollution to the weather.
    +     tiers2blacklist(list(paste0(weather, "_1"), paste0(pollutants, "_0"))),
    +     # no arcs from the conditions to the weather.
    +     tiers2blacklist(list(paste0(weather, "_1"), paste0(conditions, "_0"))),
    +     # no arcs from the conditions to the socio-demographics.
    +     tiers2blacklist(list(paste0(social, "_1"), paste0(conditions, "_0"))),
    +     # no arcs from the conditions to the pollutants.
    +     tiers2blacklist(list(paste0(pollutants, "_1"), paste0(conditions, "_0")))
    +   )
    
    +   # no arcs between different pollutants (but autoregressive arcs are still allowed).
    +   for (var1 in pollutants)
    +     for (var2 in pollutants)
    +       if (var1 != var2)
    +         bl = rbind(bl, c(from = paste0(var1, "_0"), to = paste0(var2, "_1")))
    
    +   return(bl)
    
    + }#BLACKLIST
    
  • learn.dbn() takes the data, builds the blacklist of arc directions that make no sense (including the arcs that run backwards in time) and learns a two-time BN, which is the raw form of a dynamic BN.
  • averaging() runs learn.dbn() on multiple perturbed data sets (subsampling 80% of the counties and permuting the columns) and then constructs a consensus model from the arc strengths, that is, the frequencies with which each arc appears across the learned networks.

NOTE: We use the penalised node-average log-likelihood to paper over the missing values, even though we know that they are not missing at random. We also do not specifically model the spatial dependence structure of the data.

> learn.dbn = function(data, penalty = 2) {

+   nodes = grep("_[01]$", colnames(data), value = TRUE)

+   tabu(data[, nodes], blacklist = blacklist(data[, nodes]), score = "pnal-g",
+     k = penalty * log(nrow(data)) / 2)

+ }#LEARN.DBN
> 
> averaging = function(data, penalty = 2, reps = 100) {

+   # bootstrap, column permutation and subsampling.
+   bagging = function(i, data, counties, penalty) {

+     keep = sample(levels(counties), 0.80 * nlevels(counties))
+     boot.sample = data[counties %in% keep, ]
+     boot.sample = boot.sample[, sample(ncol(data), ncol(data))]

+     learn.dbn(boot.sample, penalty = penalty)

+   }#BAGGING

+   # export the function and the variables we need to the slaves.
+   clusterExport(cl, c("learn.dbn", "blacklist"))
+   # distribute the bootstrapping to the slaves.
+   dags = parLapply(cl, seq(reps), bagging, data = data,
+                    counties = data[, "COUNTY"], penalty = penalty)
+   # compute the arc strengths.
+   strength = custom.strength(dags, nodes = nodes(dags[[1]]))
+   # construct the consensus network.
+   consensus = averaged.network(strength)
+   consensus$learning$args[["strength"]] = strength

+   return(consensus)

+ }#AVERAGING
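
Both averaging() above and the spatial variants later in the notebook rely on a parallel cluster named cl that is created outside the code shown here. A minimal setup sketch (the number of workers is arbitrary):

> # not in the original notebook: the cluster that averaging() expects to find.
> library(bnlearn)
> library(parallel)
> cl = makeCluster(4)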

The following function measures the predictive accuracy of a dynamic BN using the R2 measured on a separate validation set for each node. It also computes the average over all the nodes as a summary statistic.

> dbn.accuracy = function(dag, training, validation) {

+   # ignore the accuracy of socio-demographic variables, they are not really
+   # temporal and should not be evaluated along with the others.
+   social = c("UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1", "DENSITY_1",
+              "POVERTY_1")
+   t1.vars = grep("_1$", nodes(dag), value = TRUE)

+   fitted = bn.fit(dag, training[, nodes(dag)])

+   to.evaluate = setdiff(t1.vars, social)
+   accuracy = structure(vector(length(to.evaluate), mode = "numeric"),
+                        names = to.evaluate)

+   for (node in to.evaluate) {

+     pred = predict(fitted, data = validation[, nodes(fitted)],
+                    node = node, method = "parents")
+     accuracy[node] = summary(lm(validation[, node] ~ pred))$adj.r.squared

+   }#FOR

+   accuracy["AVERAGE"] = mean(accuracy)

+   return(accuracy)

+ }#DBN.ACCURACY

To ensure a fair assessment of the dynamic BNs, we leverage the six-month gap between "2022-05-23" and "2022-12-26" to separate the training and testing sets. (This gap is created when the different data sets are merged and subsetted to reduce the number of missing values, not when the data are preprocessed for the structure learning.)

The training and test split leaves about 2000 observations in the validation set. We estimate the predictive accuracy for dynamic BNs learned with increasing penalties (which are increasingly sparse as a result).

> training = subset(ttdata, WEEK <= "2022-05-23")
> validation = subset(ttdata, WEEK >= "2022-12-26")
> 
> 
> penalty = c(2, 4, 8, 16, 32, 64, 128, 256, 512)
> narcs = vector(length(penalty), mode = "numeric")
> accuracy = vector(length(penalty), mode = "list")
> dags = vector(length(penalty), mode = "list")
> 
> for (i in seq_along(penalty)) {

+   dags[[i]] = averaging(training, penalty = penalty[i])
+   narcs[i] = narcs(dags[[i]])
+   accuracy[[i]] = dbn.accuracy(dags[[i]], training, validation)

+ }#FOR
plot of chunk plot_predictive_accuracy_decay

The number of arcs (absolute) and the density (relative) of the dynamic BNs.

penalty arcs density AVERAGE CONDITIONS
2 60 3.158 0.581 0.755
4 53 2.789 0.584 0.759
8 49 2.579 0.583 0.756
16 44 2.316 0.584 0.757
32 41 2.158 0.583 0.758
64 38 2.000 0.585 0.762
128 34 1.789 0.575 0.735
256 21 1.105 0.560 0.711
512 15 0.789 0.570 0.755

The curve of predictive accuracy for the conditions is quite flat: we choose penalty = 16, which gives a sparse (but connected) network with about two arcs per node.

> dbn.iid = averaging(ttdata, penalty = 16)
plot of chunk plot_best_dynamic_bn

This model does not appear realistic: it lacks structural features we know exist, such as the three-way feedback loop linking anxiety, depression and sleep disorder.
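
In the two-time representation, a feedback loop between two variables shows up as a pair of arcs crossing time in both directions (for instance, an arc from ANX_0 to DEP_1 together with one from DEP_0 to ANX_1). A quick check along these lines (a sketch, not part of the original notebook) makes the missing loops easy to spot:

> # sketch: TRUE/FALSE for each arc that would make up the expected feedback loops.
> c(ANX.to.DEP = "DEP_1" %in% children(dbn.iid, "ANX_0"),
+   DEP.to.ANX = "ANX_1" %in% children(dbn.iid, "DEP_0"),
+   ANX.to.SLD = "SLD_1" %in% children(dbn.iid, "ANX_0"),
+   SLD.to.ANX = "ANX_1" %in% children(dbn.iid, "SLD_0"))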

For completeness, we look at predictive accuracy for unobserved locations for this level of sparsity. Similarly to what we did earlier, we remove the observations in the validation set that are too close to those of the training set to ensure a fair assessment. The threshold is set to one longitude degree, which is about 110 km.

> remove.too.close = function(training, validation, threshold = 1) {

+   countiesTR = unique(training[, c("FIPS", "LON", "LAT")])
+   countiesVA = unique(validation[, c("FIPS", "LON", "LAT")])

+   for (i in seq(nrow(countiesVA))) {

+     cross.dist = apply(countiesTR[, c("LON", "LAT")], 1,
+                 function(cc) sqrt(sum((cc - countiesVA[i, c("LON", "LAT")])^2)))

+     if (any(cross.dist < threshold))
+       validation = subset(validation, FIPS != countiesVA[i, "FIPS"])

+   }#FOR

+   return(validation)

+ }#REMOVE.TOO.CLOSE

For the training-validation split, we divide the data into six geographical areas, each containing 15–20% of the sample. Therefore, a leave-one-area-out cross-validation uses 80–85% of the data for training.

> regions = list(
+   SE = c("Florida", "Georgia", "Alabama", "South Carolina", "North Carolina",
+          "Tennessee", "Virginia"),
+   CS = c("Texas", "Oklahoma", "Arkansas", "Louisiana", "Kansas", "Missouri",
+          "New Mexico", "Mississippi"),
+   CE = c("Wisconsin", "Michigan", "Illinois", "Indiana", "Ohio", "Kentucky"),
+   NW = c("Minnesota", "Iowa", "Nebraska", "North Dakota", "South Dakota",
+          "Idaho", "Montana", "Wyoming", "Utah", "Colorado"),
+   WC = c("California", "Nevada", "Arizona", "Washington", "Oregon"),
+   NE = c("Maine", "Vermont", "New Hampshire", "Massachusetts", "Rhode Island",
+          "Connecticut", "New York", "New Jersey", "Pennsylvania", "West Virginia")
+ )
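> 
> # sketch (not in the original notebook): check that each geographical area holds
> # roughly 15-20% of the sample, as stated above.
> sapply(regions, function(area) mean(ttdata$STATE %in% area))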
> 
> accuracy = vector(length(regions), mode = "list")
> names(accuracy) = names(regions)
> 
> for (r in names(regions)) {

+   # split training and validation...
+   training = subset(ttdata, !(STATE %in% regions[[r]]))
+   validation = subset(ttdata, (STATE %in% regions[[r]]))
+   # ... remove the counties in the validation set that are too close to those in
+   # the training set...
+   validation = remove.too.close(training, validation)
+   # ... learn the structure of the dynamic BN...
+   dag = averaging(training, penalty = 8)
+   # ... and predict the validation set to compute the associated accuracy.
+   accuracy[[r]] = dbn.accuracy(dag, training, validation)

+ }#FOR

The R2 values for the conditions over the different regions and on average.

ANX DEP DER OBE SLD
SE 0.697 0.742 0.913 0.730 0.609
CS 0.639 0.647 0.851 0.646 0.575
CE 0.763 0.803 0.914 0.744 0.627
NW 0.677 0.674 0.820 0.717 0.480
WC 0.714 0.697 0.842 0.828 0.706
NE 0.785 0.784 0.874 0.828 0.674
AVERAGE 0.713 0.724 0.869 0.749 0.612

Model validation: the residuals

Firstly, we test whether there is temporal autocorrelation in the residuals to verify that the AR(1) assumption is satisfied, at least approximately. Ideally, we would need a portmanteau test for all variables simultaneously, but there is no such test in R that can deal with time series with NAs in the middle. Therefore, we resort to an automated check of the autocorrelation plots. We check each county independently to avoid confounding with any residual spatial dependence pattern.

> check.autocor = function(fitted, node, data) {

+   # frame the residuals together with the county and the week.
+   pars = fitted[[node]]$parents
+   use = complete.cases(data[, c(node, pars)])

+   if (is(fitted, "bn.fit")) {

+     res = data.frame(
+       values = fitted[[node]]$residuals[use],
+       county = data[use, "COUNTY"],
+       week = data[use, "WEEK"]
+     )

+   }#THEN
+   else {

+     res = data.frame(
+       values = fitted[[node]]$residuals,
+       county = data[use, "COUNTY"],
+       week = data[use, "WEEK"]
+     )

+   }#ELSE

+   # define a function to check autocorrelation at lags 1 to 8 for a county.
+   out.of.bounds = function(data, county) {

+     # extract the residuals.
+     res = data[data$county == county, "values"]

+     acf = numeric(8)
+     nsamples = numeric(8)

+     # for each lag...
+     for (lag in 1:8) {

+       # ... check whether we have enough samples...
+       nsamples[lag] = length(res) - lag

+       if (nsamples[lag] <= 3) {

+         acf[lag] = 0
+         next

+       }#THEN
+       # ... align the residuals of the two time points...
+       r0 = res[-(seq(0, lag))]
+       rl = res[-(length(res) - seq(0, lag - 1))]
+       # ... and compute the correlation and sample size from complete pairs.
+       acf[lag]  = cor(r0,  rl, use = "pairwise.complete")

+     }#FOR

+     # compute the p-values for the correlation using the exact t-test.
+     df = pmax(nsamples - 2, 1)
+     statistics = acf * sqrt(df / (1 - acf^2))
+     pvalues = 2 * pt(abs(statistics), df = df, lower.tail = FALSE)
+     # paper over the few p-values we cannot compute.
+     pvalues[is.na(pvalues)] = 1

+     return(as.vector(pvalues))

+   }#OUT.OF.BOUNDS

+   # find out which counties have residuals to check...
+   all.counties = table(res[!is.na(res[, "values"]), "county"])
+   observed.counties = names(all.counties[all.counties > 0])

+   # ... check them...
+   all.pvalues =
+    t(sapply(observed.counties, out.of.bounds, data = res))
+   # ... apply multiplicity correction...
+   all.pvalues[] = p.adjust(all.pvalues, method = "BY")
+   # ... and compute the proportion of p-values that suggest a temporal pattern.
+   proportions = matrix(colMeans(all.pvalues < 0.05), nrow = 1,
+                dimnames = list(node, c("lag 1", "lag 2", "lag 3", "lag 4",
+                                        "lag 5", "lag 6", "lag 7", "lag 8")))

+   return(proportions)

+ }#CHECK.AUTOCOR
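
For reference, the per-condition proportions reported below can be produced by fitting the chosen network and calling check.autocor() once per condition. A sketch under that assumption (fitted.iid and autocor.iid are hypothetical names, not objects from the original code):

> # sketch: one row of autocorrelation proportions per condition.
> fitted.iid = bn.fit(dbn.iid, ttdata[, nodes(dbn.iid)])
> autocor.iid = do.call(rbind,
+   lapply(c("ANX_1", "DEP_1", "DER_1", "OBE_1", "SLD_1"),
+          function(node) check.autocor(fitted.iid, node = node, data = ttdata)))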

As expected, the overall proportion of p-values smaller than 0.05 is less than 0.05 for each condition. The best network we chose earlier also appears to have well-behaved residuals: there are somewhat larger traces of autocorrelation in the residuals at lag 1 but not at higher lags. This justifies the choice of limiting the dynamic BN to an AR(1) temporal dependence structure.

lag 1 lag 2 lag 3 lag 4 lag 5 lag 6 lag 7 lag 8
ANX 0.008 0.000 0.000 0.008 0.000 0.000 0.000 0.000
DEP 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
DER 0.024 0.000 0.000 0.000 0.008 0.000 0.000 0.000
OBE 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
SLD 0.078 0.007 0.007 0.000 0.000 0.000 0.000 0.000

Secondly, we check whether there is spatial autocorrelation in the residuals. As before, we check each week individually because there is no test for spatial dependence in R that allows for multiple observations with the same coordinates.

> check.spatialcor = function(ldist) {

+   # frame the residuals together with the county and the week.
+   coords = attr(ldist, "coords")

+   res = data.frame(
+     values = normalized.residuals(ldist),
+     week = droplevels(coords[, "WEEK"]),
+     longitude = coords[, "LON"],
+     latitude = coords[, "LAT"]
+   )

+   # define a function to check spatial correlation in each week with Moran's I.
+   out.of.bounds = function(data, week) {

+     # extract the residuals and the corresponding coordinates.
+     current.week = (data$week == week) & !is.na(data[, "values"])
+     values = data[current.week, "values"]

+     # if there are enough residuals...
+     if (length(values) <= 3)
+       return(1)

+     dmat = as.matrix(dist(data[current.week, c("longitude", "latitude")]))
+     dmat = 1 / dmat
+     diag(dmat) = 0

+     # ... compute Moran's I test.
+     ape::Moran.I(values, dmat, alternative = "greater")$p.value

+   }#OUT.OF.BOUNDS

+   # find out which weeks have residuals to check...
+   all.weeks = table(res[!is.na(res[, "values"]), "week"])
+   observed.weeks = names(all.weeks[all.weeks > 0])
+   # ... check them...
+   pvalues = sapply(observed.weeks, out.of.bounds, data = res)
+   # ... apply multiplicity correction...
+   multiplicity = p.adjust(pvalues, method = "BY")
+   # ... and compute the proportion of p-values that suggest a spatial pattern.
+   proportions = mean(multiplicity < 0.05)

+   return(proportions)

+ }#CHECK.SPATIALCOR
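
The building block of check.spatialcor() is Moran's I with inverse-distance weights, as implemented in ape::Moran.I(). A self-contained toy example of that inner test (purely illustrative, not part of the analysis):

> # toy example: Moran's I with inverse-distance weights on random noise, which
> # should show no spatial pattern.
> set.seed(42)
> xy = data.frame(longitude = runif(30), latitude = runif(30))
> w = 1 / as.matrix(dist(xy))
> diag(w) = 0
> ape::Moran.I(rnorm(30), w, alternative = "greater")$p.value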

Unfortunately, there are clear signs of spatial correlation in the residuals in the dynamic BN we chose above, with 30–70% of the tests being statistically significant for various conditions.

proportion
ANX_1 0.468
DEP_1 0.397
DER_1 0.754
OBE_1 0.579
SLD_1 0.381

The Moran's I tests above confirmed that spatial correlation is statistically significant. In a simpler analysis, we can determine what proportion of the residual variance can be explained by the spatial structure of the data by plotting their variogram and looking at the (height) difference between the nugget and the sill. However, here, we would have to inspect 134 variograms (one for each week). Instead, we fit a parametric variogram model with an exponential correlation decay function, extract the nugget and the sill from the variogram model parameters, and compute the proportion of variance explained in each week as their scaled difference, that is, the partial sill divided by the total sill (the partial sill plus the nugget).

> assess.spatial.variance = function(ldist) {

+   # extract the residuals, the coordinates and the weights...
+   if (is(ldist, "bn.fit.gnode")) {

+     residuals = residuals(ldist)
+     ccc = attr(ldist, "coords")
+     weights = rep(1, length(residuals))

+     # remove any missing values.
+     weights = weights[!is.na(residuals)]
+     ccc = ccc[!is.na(residuals), ]
+     residuals = residuals[!is.na(residuals)]

+   }#THEN
+   else {

+     residuals = normalized.residuals(ldist)
+     ccc = attr(ldist, "coords")
+     weights = attr(ldist, "weights")

+   }#ELSE

+   # ... and for each week, independently of the others:
+   weeks = levels(droplevels(ccc$WEEK))
+   proportions = structure(numeric(length(weeks)), names = weeks)

+   for (w in weeks) {

+     # subset the associated (scaled) residuals and coordinates...
+     rrr = residuals[ccc$WEEK == w]
+     ccc2 = ccc[ccc$WEEK == w, ]
+     # ... and if there are enough of them...
+     if (length(rrr) < 10) {

+       proportions[w] = NA
+       next

+     }#THEN

+     # ... construct the variogram...
+     geoR = geoR::as.geodata(data.frame(rrr, ccc2), data.col = 1, coords.col = 2:3)
+     vario = geoR::variog(geoR, max.dist = 50, option = "cloud", messages =  FALSE)

+     # ... and fit it.
+     fitted = geoR::variofit(vario, nugget = 0, max.dist = 50,
+                                 cov.model = "exponential", messages = FALSE)

+     # finally, save the proportion of spatial variance.
+     proportions[w] =
+       fitted$cov.pars[1] / (fitted$cov.pars[1] + fitted$nugget)

+   }#FOR

+   return(proportions)

+ }#ASSESS.SPATIAL.VARIANCE
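
The counts reported further below ("less than 20%", "less than 50%", "total") can be obtained by thresholding the per-week proportions returned by assess.spatial.variance(), reading the thresholds as cumulative. A sketch (summarise.proportions is a hypothetical helper, not part of the original code):

> # sketch: summarise the per-week proportions of spatial variance into counts.
> summarise.proportions = function(p)
+   c("less than 20%" = sum(p <= 0.2, na.rm = TRUE),
+     "less than 50%" = sum(p <= 0.5, na.rm = TRUE),
+     "total" = sum(!is.na(p)))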

In about half of the weeks, the proportion of residual variance that can be attributed to the spatial structure of the data is 20% or less. In about one-third of the weeks, it is more than 50%. Therefore, we can expect the causal effects measured by the dynamic BN to be strongly biased. The strongly significant result from Moran's test we observed earlier did not necessarily imply that: the sample size is so large that even a small departure from the null hypothesis will produce a near-zero p-value.

less than 20% less than 50% total
ANX_1 64 77 121
DEP_1 68 94 121
DER_1 51 67 121
OBE_1 52 74 121
SLD_1 72 99 121

Add a spatial correlation structure

Estimating a gls() model with a spatial correlation structure inside structure learning is computationally prohibitive, because it scales as O(n^3) in the sample size. A faster option is to:

  1. Estimate the nugget and the range by refitting the local distributions in the model from dbn.iid with gls() and a spatial correlation matrix.
    > estimate.nugget.and.range = function(dag, node, data) {
    
    +   # get the parents of the node...
    +   pars = parents(dag, node)
    +   # ... find the subset of the data that is locally complete...
    +   full = data[complete.cases(data[, c(node, pars)]), ]
    +   # ... refit the local distribution...
    +   f = paste(node, "~", paste(c("1", pars), collapse = "+"))
    
    +   ldist = nlme::gls(as.formula(f), data = full,
    +             cor = nlme::corExp(form = ~ LAT + LON | WEEK, nugget = TRUE),
    +             control = list(apVar = FALSE))
    
    +   nlme:::coef.corSpatial(ldist$modelStruct$corStruct, unconstrained = FALSE)
    
    + }#ESTIMATE.NUGGET.AND.RANGE
    > 
    > t1.vars = grep("_1$", colnames(ttdata), value = TRUE)
    > 
    > spatial.params =
    +   parSapply(cl, t1.vars, estimate.nugget.and.range, data = ttdata, dag = dbn.iid)
    
  2. Note the nugget and the range for each node.
  3. Run structure learning while keeping the nugget and the range fixed to the values we noted. To further improve speed, we can also increase the numeric tolerances for convergence inside gls().
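
Regarding the last point, nlme exposes the numeric tolerances through glsControl(), whose output can be passed to gls() via its control argument. A sketch with illustrative values (not those used in the notebook):

> # sketch: looser convergence tolerances to speed up gls(); values are illustrative.
> loose = nlme::glsControl(tolerance = 1e-4, msTol = 1e-4, maxIter = 25)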

To do this, first define the model for the local distributions. There is no point in considering spatial correlation for root nodes (all t0.vars and the socio-demographic variables). For the other variables, we consider spatial correlation but fix the nugget and range parameters.

> custom.local.distribution = function(node, parents, data, args) {

+   t0.vars = grep("_0", colnames(data), value = TRUE)
+   t1.social = c("UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
+                 "DENSITY_1", "POVERTY_1")

+   # add back the coordinates to the data.
+   data = cbind(data, args$coords)
+   # find the subset of the data that is locally complete.
+   full = data[complete.cases(data[, c(node, parents)]), ]
+   # create the formula for gls().
+   f = paste(node, "~", paste(c("1", parents), collapse = "+"))

+   if ((node %in% t0.vars) || (node %in% t1.social)) {

+     # these nodes are always root nodes, so there is no point in modelling their
+     # spatial correlation structure.
+     model = nlme::gls(as.formula(f), data = full)
+     np = length(parents) + 2

+   }#THEN
+   else {

+     model = nlme::gls(as.formula(f), data = full,
+                 cor = nlme::corExp(value = args$spatial[, node],
+                                    form = ~ LAT + LON | WEEK,
+                                    nugget = TRUE, fixed = TRUE),
+           control = list(singular.ok = TRUE, returnObject = TRUE, apVar = FALSE))

+     np = length(parents) + 4

+     # save additional information to evaluate heterogeneity.
+     attr(model, "states") = full[, "STATE"]
+     attr(model, "coords") = full[, c("LAT", "LON", "WEEK")]

+   }#ELSE

+   # save what we need to compute the node-averaged log-likelihood.
+   attr(model, "pnal.nall") = nrow(data)
+   attr(model, "pnal.ncomplete") = nrow(full)
+   attr(model, "pnal.nparams") = np

+   return(model)

+ }#CUSTOM.LOCAL.DISTRIBUTION

Redefine the penalised NAL score for the custom local distribution above.

> custom.pnal = function(node, parents, data, args) {

+   ldist = custom.local.distribution(node, parents, data, args)

+   nall = attr(ldist, "pnal.nall")
+   ncomp = attr(ldist, "pnal.ncomplete")
+   np = attr(ldist, "pnal.nparams")

+   pnal = nlme:::logLik.gls(ldist, REML = FALSE) / ncomp -
+            args$w * log(nall) / 2 * np / nall

+   return(as.numeric(pnal))

+ }#CUSTOM.PNAL

Redefine the tabu search wrapper that performs structure learning and the wrapper that performs the model averaging.

> learn.spatial.dbn = function(data, penalty = 2) {

+   nodes = grep("_[01]$", colnames(data), value = TRUE)

+   tabu(data[, nodes], blacklist = blacklist(data[, nodes]),
+     score = "custom", fun = custom.pnal,
+     args = list(spatial = spatial.params, w = penalty,
+                 coords = data[, c("LAT", "LON", "WEEK", "STATE")]))

+ }#LEARN.SPATIAL.DBN
> 
> spatial.averaging = function(data, penalty = 2, reps = 100) {

+   # bootstrap, column permutation and subsampling.
+   bagging = function(i, data, counties, penalty) {

+     keep = sample(levels(counties), 0.80 * nlevels(counties))
+     boot.sample = data[counties %in% keep, ]
+     boot.sample = boot.sample[, sample(ncol(data), ncol(data))]

+     learn.spatial.dbn(boot.sample, penalty = penalty)

+   }#BAGGING

+   # export the function and the variables we need to the slaves.
+   clusterExport(cl, c("custom.pnal", "custom.local.distribution",
+                       "learn.spatial.dbn", "spatial.params", "blacklist"))
+   # distribute the bootstrapping to the slaves.
+   dags = parLapply(cl, seq(reps), bagging, data = data,
+                    counties = data[, "COUNTY"], penalty = penalty)
+   # compute the arc strengths.
+   strength = custom.strength(dags, nodes = nodes(dags[[1]]))
+   # construct the consensus network, saving the arc strengths.
+   consensus = averaged.network(strength)
+   consensus$learning$args[["strength"]] = strength

+   return(consensus)

+ }#SPATIAL.AVERAGING
> spatial.dbn = spatial.averaging(ttdata, penalty = 16)
> spatial.dbn

  Random/Generated Bayesian network

  model:
   [ANX_0][DENSITY_0][DENSITY_1][DEP_0][DER_0][INCOME_0][INCOME_1][NO2_0][OBE_0]
   [PM25_0][POPULATION_0][POPULATION_1][POVERTY_0][POVERTY_1][RAIN_0]
   [RANGETEMP_0][RANGEWIND_0][SLD_0][SO2_0][TEMP_0][UNEMPLOYMENT_0]
   [UNEMPLOYMENT_1][UNI_0][UNI_1][WIND_0]
   [ANX_1|ANX_0:DEP_0:DER_0:NO2_0:OBE_0:PM25_0:SLD_0:SO2_0]
   [DEP_1|ANX_0:DEP_0:DER_0:NO2_0:PM25_0:SLD_0:SO2_0:TEMP_0]
   [DER_1|ANX_0:DER_0:NO2_0:PM25_0:RANGEWIND_0][NO2_1|DENSITY_0:NO2_0]
   [OBE_1|ANX_0:NO2_0:OBE_0:PM25_0:SLD_0:SO2_0:TEMP_0][PM25_1|PM25_0]
   [RAIN_1|RAIN_0][RANGETEMP_1|RANGETEMP_0][RANGEWIND_1|RANGEWIND_0]
   [SLD_1|ANX_0:DEP_0:NO2_0:OBE_0:PM25_0:SLD_0:SO2_0:TEMP_0][SO2_1|SO2_0]
   [TEMP_1|RANGETEMP_0:TEMP_0][WIND_1|RANGEWIND_0:WIND_0]
  nodes:                                 38
  arcs:                                  47
    undirected arcs:                     0
    directed arcs:                       47
  average markov blanket size:           4.74
  average neighbourhood size:            2.47
  average branching factor:              1.24

  generation algorithm:                  Model Averaging
  significance threshold:                0.53
plot of chunk plot_compare_spatial_dynamic_bns

This model looks better in terms of structural features while still being reasonably sparse (2.47 arcs per node). We have the feedback loops connecting sleep disorder, anxiety and depression; and that linking sleep disorder and obesity.

The dynamic BN with a spatial correlation structure has a better PNAL score than the model that assumes that observations are independent over space.

> pnal.iid = score(dbn.iid, type = "pnal-g", data = ttdata[, nodes(dbn.iid)],
+               k = 16 * log(nrow(ttdata)) / 2)
> pnal.spatial = score(spatial.dbn, type = "custom",
+                  data = ttdata[, nodes(spatial.dbn)], fun = custom.pnal,
+                  args = list(spatial = spatial.params, w = 16,
+                              coords = ttdata[, c("LAT", "LON", "WEEK", "STATE")]))

We can compute the Bayes factor from these PNALs, which are the equivalent of BICs for use with incomplete data: it comes out overwhelmingly in favour of the dynamic BN with a spatial correlation structure.
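
The exact computation is not shown. On the log scale, the comparison is driven by the difference between the two penalised scores; since the PNAL is on a node-averaged, per-observation scale, reading that difference as a log-Bayes factor requires rescaling it to the whole sample. The rescaling below is an assumption, not the notebook's computation:

> # sketch: the spatial model is preferred if this difference is positive.
> pnal.spatial - pnal.iid
> # rough log-Bayes factor (an assumption): treat the difference as per-observation
> # and rescale by the sample size.
> (pnal.spatial - pnal.iid) * nrow(ttdata)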

Bayes factor for the heterogeneity model

We have now incorporated spatial correlation into the dynamic BN, which already models (temporal) autocorrelation by construction. Is there any evidence of heterogeneity left in the residuals of the variables with a spatial correlation structure, say, in different states? We look at states for two reasons: firstly, Google normalised the search frequencies for the conditions by state; secondly, investigating heterogeneity at the level of individual counties would require an excessive number of parameters.

> heterogeneity = function(ldist) {

+   # compute the normalized residuals...
+   res = normalized.residuals(ldist, decorrelate = FALSE)
+   # ... find the matching state labels...
+   states = attr(ldist, "states")
+   # ... and test them for heterogeneity between states.
+   p.value = bartlett.test(res ~ states)$p.value

+   return(p.value)

+ }#HETEROGENEITY

The answer is positive: even after adjusting for multiplicity, all the Bartlett test p-values for the spatially correlated variables are strongly significant.

variable p-value
PM25_1 0
OBE_1 6.42e-100
SO2_1 0
DEP_1 9.71e-217
DER_1 0
ANX_1 1.95e-182
RANGETEMP_1 0
RANGEWIND_1 0
WIND_1 0
TEMP_1 0
SLD_1 1.1e-154
RAIN_1 0
NO2_1 0

On the other hand, spatial correlation is now modelled correctly.

proportion
ANX_1 0.024
DEP_1 0.016
DER_1 0.016
OBE_1 0.024
SLD_1 0.016

Model heterogeneity between states

We can redefine the custom local distribution to have different residual variances for the various states. However, estimating the new variance parameters is as computationally challenging as estimating the parameters of the data correlation structure. To perform structure learning in a reasonable time frame, we model heterogeneity by nesting gls() inside iteratively reweighted least squares (IRLS) to adjust the residual variance by state. The standard errors for the different states in the local distributions are treated as nuisance parameters.

We redefine custom.local.distribution() and learn.spatial.dbn() to implement this extended model.

> custom.local.distribution = function(node, parents, data, args) {

+   t0.vars = grep("_0", colnames(data), value = TRUE)
+   t1.social = c("UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
+                 "DENSITY_1", "POVERTY_1")

+   # add back the coordinates to the data.
+   data = cbind(data, args$coords)
+   # find the subset of the data that is locally complete.
+   full = data[complete.cases(data[, c(node, parents)]), ]
+   full[, "STATE"] = droplevels(full[, "STATE"])
+   # create the formula for gls().
+   f = paste(node, "~", paste(c("1", parents), collapse = "+"))

+   if ((node %in% t0.vars) || (node %in% t1.social)) {

+     # these nodes are always root nodes, so there is no point in modelling their
+     # spatial correlation structure nor their heteroscedasticity.
+     model = nlme::gls(as.formula(f), data = full)
+     np = length(parents) + 2

+   }#THEN
+   else {

+     # add weights to the model.
+     full[, "w"] = numeric(nrow(full))
+     # provide an initial estimate.
+     model = nlme::gls(as.formula(f), data = full, method = "ML",
+                 cor = nlme::corExp(value = args$spatial[, node],
+                                    form = ~ LAT + LON | WEEK,
+                                    nugget = TRUE, fixed = TRUE))

+     old.logl = as.numeric(nlme:::logLik.gls(model, REML = FALSE))

+     # iteratively reweighted least squares.
+     for (iter in 1:(args$irls.max.iter)) {

+       # compute the per-state variances...
+       weights = sapply(levels(full[, "STATE"]),
+                        function(s) var(resid(model)[full[, "STATE"] == s]) )
+       # ... make sure that they are not zero...
+       weights = pmax(weights, 10^-4)
+       # ... associate it with the observations...
+       for (i in seq(nrow(full)))
+         full[i, "w"] = weights[names(weights) == full[i, "STATE"]]
+       # ... and re-estimate the model.
+       model = nlme::gls(as.formula(f), data = full, method = "ML",
+                   cor = nlme::corExp(value = args$spatial[, node],
+                                      form = ~ LAT + LON | WEEK,
+                                      nugget = TRUE, fixed = TRUE),
+                   weights = nlme::varFixed(~ w))

+       new.logl = as.numeric(nlme:::logLik.gls(model, REML = FALSE))

+       if (isTRUE(all.equal(old.logl, new.logl)))
+         break
+       else
+         old.logl = new.logl

+      }#FOR

+     np = length(parents) + 4 + nlevels(full[, "STATE"])

+     # save additional information to evaluate heterogeneity.
+     attr(model, "weights") = full[, "w"]
+     attr(model, "variances") = weights
+     attr(model, "states") = full[, "STATE"]
+     attr(model, "coords") = full[, c("LAT", "LON", "WEEK")]

+   }#ELSE

+   # save what we need to compute the node-averaged log-likelihood.
+   attr(model, "pnal.nall") = nrow(data)
+   attr(model, "pnal.ncomplete") = nrow(full)
+   attr(model, "pnal.nparams") = np

+   return(model)

+ }#CUSTOM.LOCAL.DISTRIBUTION
> 
> learn.spatial.dbn = function(data, penalty = 2) {

+   nodes = grep("_[01]$", colnames(data), value = TRUE)

+   dag = tabu(data[, nodes], blacklist = blacklist(data[, nodes]),
+           score = "custom", fun = custom.pnal,
+           args = list(spatial = spatial.params, w = penalty, irls.max.iter = 4,
+                       coords = data[, c("LAT", "LON", "WEEK", "STATE")]))

+    return(dag)

+ }#LEARN.SPATIAL.DBN

Then, we redefine spatial.averaging() to call them and to pass the additional arguments we now need.

> spatial.averaging = function(data, penalty = 2, reps = 100) {

+   # bootstrap, column permutation and subsampling.
+   bagging = function(i, data, counties, penalty) {

+     keep = sample(levels(counties), 0.80 * nlevels(counties))
+     boot.sample = data[counties %in% keep, ]
+     boot.sample = boot.sample[, sample(ncol(data), ncol(data))]

+     learn.spatial.dbn(boot.sample, penalty = penalty)

+   }#BAGGING

+   # export the function and the variables we need to the slaves.
+   clusterExport(cl, c("custom.pnal", "custom.local.distribution",
+                       "learn.spatial.dbn", "spatial.params", "blacklist"))
+   # distribute the bootstrapping to the slaves.
+   dags = parLapply(cl, seq(reps), bagging, data = data,
+                    counties = data[, "COUNTY"], penalty = penalty)
+   # compute the arc strengths.
+   strength = custom.strength(dags, nodes = nodes(dags[[1]]))
+   # construct the consensus network, saving the arc strengths.
+   consensus = averaged.network(strength)
+   consensus$learning$args[["strength"]] = strength

+   return(consensus)

+ }#SPATIAL.AVERAGING
> 
> hetero.dbn = spatial.averaging(ttdata, penalty = 16)

The resulting network is shown below. Compared to the earlier one, it lacks quite a few feedback loops that we know exist, for instance, that between anxiety and depression. This suggests that we are imposing too much sparsity, forcing the learning process to leave relevant arcs out. Modelling the heterogeneity of the residuals requires additional parameters, so it makes sense that the BIC penalty is now too high.

plot of chunk plot_compare_spatial_heteroscedastic_dbns

As we did earlier, we can compare the new causal network with the earlier one by approximating their Bayes factor.
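
The comparison presumably scores hetero.dbn with the custom PNAL defined above, following the same pattern used later for the denser network. A sketch under that assumption (the name pnal.hetero does not appear in the code shown):

> # sketch: score the heterogeneity model on the same penalised node-averaged
> # log-likelihood scale as pnal.spatial.
> pnal.hetero = score(hetero.dbn, type = "custom",
+                  data = ttdata[, nodes(hetero.dbn)], fun = custom.pnal,
+                  args = list(spatial = spatial.params, w = 16, irls.max.iter = 8,
+                              coords = ttdata[, c("LAT", "LON", "WEEK", "STATE")]))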

Bayes factor for the heterogeneity model

Increase the density of the network and validate the residuals

Reducing the sparsity from penalty = 16 to penalty = 4 results in the dynamic BN below.

> hetero3.dbn = spatial.averaging(ttdata, penalty = 4)
> hetero3.dbn

  Random/Generated Bayesian network

  model:
   [ANX_0][DENSITY_0][DENSITY_1][DEP_0][DER_0][INCOME_0][INCOME_1][NO2_0][OBE_0]
   [PM25_0][POPULATION_0][POPULATION_1][POVERTY_0][POVERTY_1][RAIN_0]
   [RANGETEMP_0][RANGEWIND_0][SLD_0][SO2_0][TEMP_0][UNEMPLOYMENT_0]
   [UNEMPLOYMENT_1][UNI_0][UNI_1][WIND_0]
   [ANX_1|ANX_0:DEP_0:DER_0:NO2_0:OBE_0:PM25_0:SLD_0:SO2_0:WIND_0]
   [DEP_1|ANX_0:DEP_0:DER_0:NO2_0:PM25_0:RANGETEMP_0:SLD_0:SO2_0:TEMP_0]
   [DER_1|ANX_0:DER_0:NO2_0:OBE_0:PM25_0:RANGETEMP_0:SO2_0]
   [NO2_1|DENSITY_0:NO2_0:POPULATION_0:POVERTY_0]
   [OBE_1|ANX_0:DEP_0:DER_0:NO2_0:OBE_0:PM25_0:SLD_0:SO2_0]
   [PM25_1|INCOME_0:PM25_0:POPULATION_0][RAIN_1|RAIN_0:RANGEWIND_0]
   [RANGETEMP_1|RANGETEMP_0:TEMP_0]
   [RANGEWIND_1|RAIN_0:RANGEWIND_0:TEMP_0:WIND_0]
   [SLD_1|ANX_0:DEP_0:DER_0:NO2_0:OBE_0:PM25_0:RANGETEMP_0:SLD_0:SO2_0:TEMP_0:WIND_0]
   [SO2_1|SO2_0:TEMP_0][TEMP_1|RANGETEMP_0:TEMP_0]
   [WIND_1|RANGETEMP_0:RANGEWIND_0:TEMP_0:WIND_0]
  nodes:                                 38
  arcs:                                  67
    undirected arcs:                     0
    directed arcs:                       67
  average markov blanket size:           7.21
  average neighbourhood size:            3.53
  average branching factor:              1.76

  generation algorithm:                  Empty
plot of chunk plot_compare_spatial_heteroscedastic_dbns_3

The model learned with penalty = 4 is preferable to that learned with penalty = 16.

> pnal.hetero3 = score(hetero3.dbn, type = "custom",
+                  data = ttdata[, nodes(hetero3.dbn)], fun = custom.pnal,
+                  args = list(spatial = spatial.params, w = 4, irls.max.iter = 8,
+                              coords = ttdata[, c("LAT", "LON", "WEEK", "STATE")]))
Bayes factor for the final model

We define a function that fits the parameters of all the local distributions in one go, much like bn.fit().

> spatial.fit = function(dag, data, spatial.params) {

+   clusterExport(cl, "custom.local.distribution")

+   parSapply(cl, nodes(dag), simplify = FALSE,
+                               function(node, dag, data, spatial.params) {

+     pars = parents(dag, node)

+     ldist = custom.local.distribution(node, pars, data,
+               args = list(spatial = spatial.params, irls.max.iter = 8,
+                           coords = data[, c("LAT", "LON", "WEEK", "STATE")]))

+     ldist$parents = pars

+     return(ldist)

+   }, dag = dag, data = data, spatial.params = spatial.params)

+ }#SPATIAL.FIT
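
The fitted model hetero3.fitted used in the residual checks below does not appear in the code shown; presumably it is produced by spatial.fit() from the denser network. A sketch under that assumption:

> # assumption: fit the parameters of the denser network for the residual checks.
> hetero3.fitted = spatial.fit(hetero3.dbn, ttdata, spatial.params)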

We confirm that this model does not have any unmodelled heterogeneity in the residuals of the nodes with a spatio-temporal correlation structure.

We also confirm that the model has no autocorrelation in the residuals. It has even less than before, suggesting that other violations of the model assumptions inflated the p-values for the earlier network.

lag 1 lag 2 lag 3 lag 4 lag 5 lag 6 lag 7 lag 8
ANX 0.000 0.000 0.000 0.016 0.000 0.000 0.000 0.000
DEP 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
DER 0.032 0.000 0.008 0.000 0.008 0.000 0.000 0.000
OBE 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000
SLD 0.000 0.000 0.000 0.000 0.000 0.000 0.000 0.000

Finally, we confirm that the model has little spatial correlation left in the residuals. Even though the spatial correlation structure we assume is extremely simple (uniform, isotropic, etc.), only very little is left unmodelled after adjusting for multiplicity. In turn, this result validates our choice to estimate the spatial correlation matrix separately, outside of the structure learning process.

> local({

+   # for all the conditions (at time 1, where they can have parents)...
+   conditions = c("ANX_1", "ANX_1", "DEP_1", "DER_1", "OBE_1", "SLD_1")
+   # ... test spatial correlation with Moran's test (hot-fixing caching issues).
+   values = sapply(hetero3.fitted[conditions], check.spatialcor)

+   kable(data.frame(proportion = values[-1]), digits = 3)

+ })
proportion
ANX_1 0.024
DEP_1 0.016
DER_1 0.016
OBE_1 0.032
SLD_1 0.040

We then tabulate the proportions of residual variance that can be explained by the spatial structure of the data. Again, only a small proportion of that variance can be attributed to the unmodelled spatial structure in the data.

less than 20% less than 50% total
ANX_1 102 114 121
DEP_1 100 112 121
DER_1 83 105 121
OBE_1 96 111 121
SLD_1 83 100 121

Assessing predictive accuracy

We assess the predictive accuracy for existing locations but new time points.

> spatial.accuracy = function(fitted, validation) {

+   # ignore the accuracy of socio-demographic variables, they are not really
+   # temporal and should not be evaluated along with the others.
+   social = c("UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
+              "DENSITY_1", "POVERTY_1")
+   t1.vars = grep("_1$", names(fitted), value = TRUE)
+   to.evaluate = setdiff(t1.vars, social)

+   accuracy = structure(vector(length(to.evaluate), mode = "numeric"),
+                        names = to.evaluate)

+   # for each node to evaluate...
+   for (node in to.evaluate) {

+     # ... take the locally-complete data points...
+     pars = fitted[[node]]$parents
+     full = validation[complete.cases(validation[, c(node, pars)]), , drop = FALSE]
+     # ... compute the predicted values...
+     pred = nlme:::predict.gls(fitted[[node]], newdata = full)
+     # ... and calculate the R^2 coefficient with the true values.
+     accuracy[node] = summary(lm(full[, node] ~ pred))$adj.r.squared

+   }#FOR

+   accuracy["AVERAGE"] = mean(accuracy)

+   return(accuracy)

+ }#SPATIAL.ACCURACY

We repeat the assessment of predictive accuracy that we used to tune the sparsity of the dynamic networks at the beginning of the notebook. Again, we leverage the six-month gap between "2022-05-23" and "2022-12-26" to separate the training and testing sets.

> training = subset(ttdata, WEEK <= "2022-05-23")
> validation = subset(ttdata, WEEK >= "2022-12-26")
> 
> penalty = c(2, 4, 8, 16)
> narcs = vector(length(penalty), mode = "numeric")
> accuracy = vector(length(penalty), mode = "list")
> dags = vector(length(penalty), mode = "list")
> fitted = vector(length(penalty), mode = "list")
> 
> for (i in seq_along(penalty)) {

+   # structure learning with model averaging...
+   dags[[i]] = spatial.averaging(training, penalty = penalty[i])
+   narcs[i] = narcs(dags[[i]])
+   # ... parameter learning...
+   fitted[[i]] = spatial.fit(dags[[i]], training, spatial.params)
+   # ... prediction and the associated accuracy.
+   accuracy[[i]] = spatial.accuracy(fitted[[i]], validation)

+ }#FOR
plot of chunk plot_predictive_accuracy_decay_again

The number of arcs (absolute) and the density (relative) of the dynamic BNs. For the same penalty, the networks appear denser than those at the beginning of the notebook. However, the Bayes factors confirm our earlier conclusion: both w = 8 and w = 4 result in slightly better models than w = 16.

penalty arcs density AVERAGE CONDITIONS
2 69 3.632 0.581 0.758
4 65 3.421 0.581 0.758
8 58 3.053 0.582 0.761
16 52 2.737 0.579 0.760

To complement these results, we assess the predictive accuracy for existing time points but new locations. The setup is the same as when we assumed observations were i.i.d. earlier.

The R2 values for the conditions over the different regions and on average.

> accuracy = vector(length(regions), mode = "list")
> names(accuracy) = names(regions)
> 
> for (r in names(regions)) {

+   # split training and validation...
+   training = subset(ttdata, !(STATE %in% regions[[r]]))
+   validation = subset(ttdata, (STATE %in% regions[[r]]))
+   # ... remove the counties in the validation set that are too close to those in
+   # the training set...
+   validation = remove.too.close(training, validation)
+   # ... learn the structure of the dynamic BN...
+   dag = spatial.averaging(training, penalty = 4)
+   # ... and predict the validation set to compute the associated accuracy.
+   accuracy[[r]] = dbn.accuracy(dag, training, validation)

+ }#FOR
ANX DEP DER OBE SLD
SE 0.666 0.737 0.892 0.701 0.611
CS 0.598 0.645 0.862 0.610 0.491
CE 0.736 0.804 0.907 0.714 0.570
NW 0.680 0.672 0.814 0.704 0.484
WC 0.697 0.695 0.837 0.805 0.647
NE 0.774 0.782 0.884 0.809 0.643
AVERAGE 0.692 0.723 0.866 0.724 0.574
Last updated on Fri May 9 21:24:55 2025 with bnlearn 5.0-20240418 and R version 4.3.2 (2023-10-31).