Data preprocessing
Import the data (from the earlier data fusion notebook, available here) and reformat it to learn a two-time network.
# Load the fused data (produced by the earlier data fusion notebook) and set up
# the key variables that identify each observation.
data = readRDS("prepd.useful.data.rds")
key.vars = c("fips_county_code", "week", "state", "county")

# Reshape the data into a two-time format: each row pairs the measurements of a
# county at one week ("_0" variables) with those at the following week ("_1").
two.time.data.format = function(data) {

  # map an ordered factor one level forward (week t becomes week t + 1).
  next.week = function(x) {
    ordered(as.integer(x) + 1, levels = seq_along(levels(x)), labels = levels(x))
  }#NEXT.WEEK

  # build the paired rows for a single county.
  format.one.county = function(county.id, data) {

    id.vars = c("fips_county_code", "week", "state", "county",
                "latitude", "longitude")
    # take the rows for this county, sorted by the week of collection...
    rows = subset(data, fips_county_code == county.id)
    rows = rows[order(rows$week), ]
    # ... list the candidate starting weeks (all but the last one) and the
    # weeks that immediately follow them...
    from.week = rows$week[-length(rows$week)]
    to.week = next.week(from.week)
    # ... keep only the pairs in which both weeks were actually observed
    # (gaps in the collection break consecutive pairs)...
    observed = (from.week %in% rows$week) & (to.week %in% rows$week)
    from.week = from.week[observed]
    to.week = to.week[observed]
    # ... extract the data at time 0 and at time 1...
    t0 = subset(rows, week %in% from.week)
    t1 = subset(rows, week %in% to.week)
    # ... drop the ID variables from time 1 (they are kept at time 0)...
    t1 = t1[, !(colnames(t1) %in% id.vars)]
    # ... and postfix the measured variables with their time point.
    measured = !(names(t0) %in% id.vars)
    names(t0)[measured] = paste0(names(t0)[measured], "_0")
    names(t1) = paste0(names(t1), "_1")
    cbind(t0, t1)

  }#FORMAT.ONE.COUNTY

  # collect the data from all the counties in a single data frame.
  per.county = lapply(levels(data$fips_county_code), format.one.county,
                      data = data)
  do.call(rbind, per.county)

}#TWO.TIME.DATA.FORMAT

ttdata = two.time.data.format(data)
Clean up and rename variables:
- The threshold variables (`*_b_*`) are deterministic functions of some quantile of the corresponding continuous variables, making them redundant. `Metro` and `RUCC` contain information that overlaps with the population density, and they are discrete: we would have to use a CGBN to model them, which is difficult when trying to do causal modelling because only some arc directions are allowed. Furthermore, neither variable explains much of the variability of the pollutants or the conditions. `PM10` contains the most missing values in the data, and `O3` might not be relevant.
- Remove unobserved levels from `week` and `county`.
- Remove the maximum level of pollutants, which are strongly correlated to the average levels and, therefore, are not very informative in the models. They also get in the way of the latter being connected to the conditions.
- Remove the maximum and minimum temperature and wind speed, which are strongly correlated with the means. Replace them with a normalised range over the week.
- Minimum and maximum precipitations are not strongly correlated with each other and with the average only because they are equal to zero so often. Therefore, they are redundant in the analysis, and we can remove them. And so are the maximum and minimum temperatures and wind speed, now that we have introduced the ranges.
- Two variables will always be isolated in every model we try: `vote_rate` and `household_size`. Remove them.

> ttdata = ttdata[, !grepl("vote_rate|household_size", colnames(ttdata))]
- There is a substantial overlap between college and university graduate frequencies; merge them into a single variable.
- Replace variable names with abbreviations to make the code more compact and the figures more readable.
- Remote territories have large geographical distances from the continental US, and they can easily skew the tail of the variograms we examine below.
- Remove counties that, at the end of this data-cleaning process, have less than two months' worth of data.
| | Metro | RUCC |
|---|---|---|
PM10_mean_1 | 0.005 | 0.018 |
PM25_mean_1 | 0.005 | 0.006 |
SO2_mean_1 | 0.001 | 0.037 |
NO2_mean_1 | 0.088 | 0.149 |
O3_mean_1 | 0.013 | 0.019 |
anxiety_1 | 0.003 | 0.039 |
depression_1 | 0.000 | 0.055 |
atopic_dermatitis_1 | 0.004 | 0.031 |
obesity_1 | 0.015 | 0.031 |
sleep_disorder_1 | 0.002 | 0.053 |
# Weekly weather ranges, normalised so that they are comparable across counties:
# range.ratio() computes (max - min) scaled by a baseline (+1 avoids division by
# zero). The temperature baseline is the distance of the weekly minimum from the
# coldest minimum on record (temperatures can be negative); wind speed is bounded
# below by zero, so its own weekly minimum is used as the baseline. Factoring out
# the shared formula replaces four near-identical hand-written statements.
range.ratio = function(mx, mn, baseline) (mx - mn) / (baseline + 1)

# NOTE: the four columns are created in this exact order (RANGETEMP_0,
# RANGETEMP_1, RANGEWIND_0, RANGEWIND_1), which later renaming code relies on.
ttdata$RANGETEMP_0 =
  range.ratio(ttdata$temperature_max_0, ttdata$temperature_min_0,
    ttdata$temperature_min_0 - min(ttdata$temperature_min_0, na.rm = TRUE))
ttdata$RANGETEMP_1 =
  range.ratio(ttdata$temperature_max_1, ttdata$temperature_min_1,
    ttdata$temperature_min_1 - min(ttdata$temperature_min_1, na.rm = TRUE))
ttdata$RANGEWIND_0 =
  range.ratio(ttdata$windspeed_max_0, ttdata$windspeed_min_0,
    ttdata$windspeed_min_0)
ttdata$RANGEWIND_1 =
  range.ratio(ttdata$windspeed_max_1, ttdata$windspeed_min_1,
    ttdata$windspeed_min_1)
# Drop the variables discussed above: the "_b_" threshold indicators, the
# weather extremes superseded by the RANGE* variables, the redundant
# precipitation extremes, the discrete urbanisation measures, and the pollutant
# variables that are mostly missing (PM10), possibly irrelevant (O3) or strongly
# correlated with the mean levels (the remaining *_max_* pollutants).
redundant = c(grep("_b_", names(ttdata), value = TRUE),
              c(outer(c("temperature", "windspeed", "precipitations"),
                      c("_min_0", "_max_0", "_min_1", "_max_1"), paste0)))
discrete = paste0(rep(c("Metro_", "RUCC_"), each = 2), 0:1)
irrelevant = c(paste0(rep(c("O3", "PM10"), each = 4),
                      c("_mean_0", "_max_0", "_mean_1", "_max_1")),
               paste0(rep(c("SO2", "NO2", "PM25"), each = 2),
                      c("_max_0", "_max_1")))

ttdata = ttdata[, !(colnames(ttdata) %in% c(redundant, discrete, irrelevant))]
# College/associate and bachelor-or-higher graduate frequencies overlap
# substantially: fold them into a single university-level variable at each time
# point, then drop the now-redundant college variables.
merge.degrees = function(suffix) {
  ttdata[[paste0("bachelor_degree_or_higher", suffix)]] +
    ttdata[[paste0("college_or_associate_degree", suffix)]]
}#MERGE.DEGREES

ttdata$bachelor_degree_or_higher_0 = merge.degrees("_0")
ttdata$bachelor_degree_or_higher_1 = merge.degrees("_1")

ttdata = ttdata[, !grepl("college_or_associate_degree", colnames(ttdata))]
# Replace the original variable names with compact abbreviations. Renaming by
# matching names (instead of assigning new.names positionally, as before)
# guards against silently mislabelling columns if the column order ever changes
# upstream; stopifnot() fails loudly when the expected variables are not all
# present. old.names[i] corresponds to new.names[i].
old.names = c(
  "fips_county_code", "week", "state", "county",
  "bachelor_degree_or_higher_0", "unemployment_0", "household_income_0",
  "population_0", "population_density_0", "POVALL_0", "NO2_mean_0",
  "SO2_mean_0", "PM25_mean_0", "anxiety_0", "depression_0",
  "atopic_dermatitis_0", "obesity_0", "sleep_disorder_0",
  "temperature_mean_0", "windspeed_mean_0", "precipitations_mean_0",
  "longitude", "latitude",
  "bachelor_degree_or_higher_1", "unemployment_1", "household_income_1",
  "population_1", "population_density_1", "POVALL_1", "NO2_mean_1",
  "SO2_mean_1", "PM25_mean_1", "anxiety_1", "depression_1",
  "atopic_dermatitis_1", "obesity_1", "sleep_disorder_1",
  "temperature_mean_1", "windspeed_mean_1", "precipitations_mean_1",
  "RANGETEMP_0", "RANGETEMP_1", "RANGEWIND_0", "RANGEWIND_1")
new.names = c(
  "FIPS", "WEEK", "STATE", "COUNTY", "UNI_0", "UNEMPLOYMENT_0", "INCOME_0",
  "POPULATION_0", "DENSITY_0", "POVERTY_0", "NO2_0", "SO2_0", "PM25_0",
  "ANX_0", "DEP_0", "DER_0", "OBE_0", "SLD_0", "TEMP_0", "WIND_0", "RAIN_0",
  "LON", "LAT", "UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
  "DENSITY_1", "POVERTY_1", "NO2_1", "SO2_1", "PM25_1", "ANX_1", "DEP_1",
  "DER_1", "OBE_1", "SLD_1", "TEMP_1", "WIND_1", "RAIN_1", "RANGETEMP_0",
  "RANGETEMP_1", "RANGEWIND_0", "RANGEWIND_1")

stopifnot(setequal(names(ttdata), old.names))
names(ttdata) = new.names[match(names(ttdata), old.names)]

# remove the factor levels that no longer appear in the data.
ttdata$WEEK = droplevels(ttdata$WEEK)
ttdata$COUNTY = droplevels(ttdata$COUNTY)
ttdata$STATE = droplevels(ttdata$STATE)
# Keep only the continental US: the first two digits of the (zero-padded) FIPS
# county code are the state code, and the mask below excludes the outlying
# territories (codes >= 60), Puerto Rico (43), the Virgin Islands (52),
# American Samoa (03), Hawaii (15), Alaska (02) and the Panama canal (07).
# NOTE(review): the original applied these filters one at a time while reusing
# the `codes` vector computed before the first subset, so once any row was
# dropped the logical indices no longer lined up with the rows of `ttdata`
# (a logical subscript longer than nrow() yields NA rows); computing a single
# mask and subsetting once avoids that misalignment.
# NOTE(review): this assumes ttdata$FIPS is numeric -- if it is still a factor,
# sprintf("%05d", .) does not format the FIPS labels; confirm upstream.
codes = as.numeric(substr(sprintf("%05d", ttdata$FIPS), 1, 2))
ttdata = ttdata[(codes < 60) & !(codes %in% c(2, 3, 7, 15, 43, 52)), ]
# Keep only the counties that, after all the cleaning above, still have at
# least two months' worth of data (8 paired weeks).
weeks.per.county = table(droplevels(ttdata$FIPS))
long.enough = names(weeks.per.county)[weeks.per.county >= 8]
ttdata = ttdata[ttdata$FIPS %in% long.enough, ]
The mapping between the original variable names and the abbreviations is as follows.
old.variable.names | abbreviations |
---|---|
fips_county_code | FIPS |
week | WEEK |
state | STATE |
county | COUNTY |
bachelor_degree_or_higher_0 | UNI_0 |
unemployment_0 | UNEMPLOYMENT_0 |
household_income_0 | INCOME_0 |
population_0 | POPULATION_0 |
population_density_0 | DENSITY_0 |
POVALL_0 | POVERTY_0 |
NO2_mean_0 | NO2_0 |
SO2_mean_0 | SO2_0 |
PM25_mean_0 | PM25_0 |
anxiety_0 | ANX_0 |
depression_0 | DEP_0 |
atopic_dermatitis_0 | DER_0 |
obesity_0 | OBE_0 |
sleep_disorder_0 | SLD_0 |
temperature_mean_0 | TEMP_0 |
windspeed_mean_0 | WIND_0 |
precipitations_mean_0 | RAIN_0 |
longitude | LON |
latitude | LAT |
bachelor_degree_or_higher_1 | UNI_1 |
unemployment_1 | UNEMPLOYMENT_1 |
household_income_1 | INCOME_1 |
population_1 | POPULATION_1 |
population_density_1 | DENSITY_1 |
POVALL_1 | POVERTY_1 |
NO2_mean_1 | NO2_1 |
SO2_mean_1 | SO2_1 |
PM25_mean_1 | PM25_1 |
anxiety_1 | ANX_1 |
depression_1 | DEP_1 |
atopic_dermatitis_1 | DER_1 |
obesity_1 | OBE_1 |
sleep_disorder_1 | SLD_1 |
temperature_mean_1 | TEMP_1 |
windspeed_mean_1 | WIND_1 |
precipitations_mean_1 | RAIN_1 |
RANGETEMP_0 | RANGETEMP_0 |
RANGETEMP_1 | RANGETEMP_1 |
RANGEWIND_0 | RANGEWIND_0 |
RANGEWIND_1 | RANGEWIND_1 |
After this preprocessing, the data contain 52538 observations over 474 counties and 134 weeks. Each time point has records for 19 variables.
Last updated on
Mon Apr 28 16:20:49 2025
with bnlearn
5.0-20240418
and R version 4.3.2 (2023-10-31)
.