Data preprocessing

Import the data (from the earlier data fusion notebook, available here) and reformat them to learn a two-time network.

> data = readRDS("prepd.useful.data.rds")
> key.vars = c("fips_county_code", "week", "state", "county")
> 
> two.time.data.format = function(data) {
+ 
+   reshaped = lapply(levels(data$fips_county_code),
+                     function(each.county, data) {
+ 
+     # shift a vector of weeks one level forward, to the following week.
+     shift.level = function(x) {
+ 
+       ordered(as.integer(x) + 1, levels = seq_along(levels(x)), labels = levels(x))
+ 
+     }#SHIFT.LEVEL
+ 
+     # ID variables that should not be duplicated across the two time points.
+     key.vars = c("fips_county_code", "week", "state", "county", "latitude",
+                  "longitude")
+ 
+     # extract the data from a specific county...
+     county.data = subset(data, fips_county_code == each.county)
+     # ... order the observations to follow the time of collection...
+     county.data = county.data[order(county.data$week), ]
+     # ... identify the time ranges available for times 0 and 1...
+     available.t0 = county.data$week[-length(county.data$week)]
+     available.t1 = shift.level(available.t0)
+ 
+     both.t0.and.t1 =
+       (available.t0 %in% county.data$week) & (available.t1 %in% county.data$week)
+ 
+     available.t0 = available.t0[both.t0.and.t1]
+     available.t1 = available.t1[both.t0.and.t1]
+ 
+     # ... extract the data at time 0 and time 1...
+     t0 = subset(county.data, week %in% available.t0)
+     t1 = subset(county.data, week %in% available.t1)
+     # ... remove the ID variables from the data at time 1...
+     t1 = t1[, !(colnames(t1) %in% key.vars)]
+     # ... and postfix the variable names associated with different time points.
+     names(t0)[!(names(t0) %in% key.vars)] =
+       paste0(names(t0)[!(names(t0) %in% key.vars)], "_0")
+     names(t1) = paste0(names(t1), "_1")
+ 
+     return(cbind(t0, t1))
+ 
+   }, data = data)
+ 
+   # collect the data from all the counties in a single data frame.
+   return(do.call(rbind, reshaped))
+ 
+ }#TWO.TIME.DATA.FORMAT
> 
> ttdata = two.time.data.format(data)
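
As a quick illustration of what this reshaping produces, here is a minimal sketch on a made-up two-county, three-week data set (all names and values are invented for illustration): each row pairs a week with the following one, and each measurement variable appears twice, postfixed with "_0" and "_1".

> toy = data.frame(
+   fips_county_code = factor(rep(c("01001", "01003"), each = 3)),
+   week = ordered(rep(c("w1", "w2", "w3"), times = 2)),
+   state = factor("AL"),
+   county = factor(rep(c("Autauga", "Baldwin"), each = 3)),
+   latitude = rep(c(32.5, 30.7), each = 3),
+   longitude = rep(c(-86.6, -87.7), each = 3),
+   PM25_mean = rnorm(6))
> toy.two.time = two.time.data.format(toy)
> dim(toy.two.time)      # four rows: two consecutive-week pairs per county.
> names(toy.two.time)    # the ID variables, then PM25_mean_0 and PM25_mean_1.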

Clean up and rename variables:

  • The threshold variables "*_b_*" are deterministic functions of some quantile of the corresponding continuous variables, making them redundant.
  • Metro and RUCC contain information that overlaps with the population density, and they are discrete: we would have to use a conditional Gaussian Bayesian network (CGBN) to model them, which is problematic for causal modelling because only some arc directions are allowed. Furthermore, neither variable explains much of the variability of the pollutants or the conditions, as the proportions of variance explained below show (one way of computing them is sketched after this list).

                          Metro  RUCC
    PM10_mean_1           0.005  0.018
    PM25_mean_1           0.005  0.006
    SO2_mean_1            0.001  0.037
    NO2_mean_1            0.088  0.149
    O3_mean_1             0.013  0.019
    anxiety_1             0.003  0.039
    depression_1          0.000  0.055
    atopic_dermatitis_1   0.004  0.031
    obesity_1             0.015  0.031
    sleep_disorder_1      0.002  0.053
  • PM10 contains the most missing values in the data, and O3 might not be relevant.
  • Remove unobserved levels from week, county and state.
  • Remove the maximum levels of the pollutants, which are strongly correlated with the average levels and, therefore, not very informative in the models. They also get in the way of the average levels being connected to the conditions.
  • Remove the maximum and minimum temperature and wind speed, which are strongly correlated with the means. Replace them with a normalised range over the week.
  • > ttdata$RANGETEMP_0 =
    +   (ttdata$temperature_max_0 - ttdata$temperature_min_0) /
    +   (ttdata$temperature_min_0 - min(ttdata$temperature_min_0, na.rm = TRUE) + 1)
    > ttdata$RANGETEMP_1 =
    +   (ttdata$temperature_max_1 - ttdata$temperature_min_1) /
    +   (ttdata$temperature_min_1 - min(ttdata$temperature_min_1, na.rm = TRUE) + 1)
    > ttdata$RANGEWIND_0 =
    +   (ttdata$windspeed_max_0 - ttdata$windspeed_min_0) /
    +   (ttdata$windspeed_min_0 + 1)
    > ttdata$RANGEWIND_1 =
    +   (ttdata$windspeed_max_1 - ttdata$windspeed_min_1) /
    +   (ttdata$windspeed_min_1 + 1)
    
    Correlations of the minima, maxima and (normalised) ranges with the
    corresponding weekly means (a way of checking these is sketched after
    this list):

                     min    max   range
    temperature_0  0.982  0.976  -0.298
    temperature_1  0.982  0.975  -0.290
    windspeed_0    0.991  0.982   0.034
    windspeed_1    0.991  0.981   0.036
  • Minimum and maximum precipitation appear only weakly correlated with each other and with the average, but merely because they are equal to zero so often; they are redundant in the analysis, and we can remove them. So are the maximum and minimum temperature and wind speed, now that we have introduced the ranges.
  • > redundant = grep("_b_", names(ttdata), value = TRUE)
    > redundant = c(redundant, "temperature_min_0", "temperature_max_0",
    +                "temperature_min_1", "temperature_max_1",
    +                "windspeed_min_0", "windspeed_max_0",
    +                "windspeed_min_1", "windspeed_max_1",
    +                "precipitations_min_0", "precipitations_max_0",
    +                "precipitations_min_1", "precipitations_max_1")
    > discrete = c("Metro_0", "Metro_1", "RUCC_0", "RUCC_1")
    > irrelevant = c("O3_mean_0", "O3_max_0", "PM10_mean_0", "PM10_max_0",
    +                "O3_mean_1", "O3_max_1", "PM10_mean_1", "PM10_max_1",
    +                "SO2_max_0", "NO2_max_0", "PM25_max_0",
    +                "SO2_max_1", "NO2_max_1", "PM25_max_1")
    > 
    > ttdata = ttdata[, !(colnames(ttdata) %in% c(redundant, discrete, irrelevant))]
    
  • Two variables will always be isolated in every model we try: vote_rate and household_size. Remove them.
    > ttdata = ttdata[, !grepl("vote_rate|household_size", colnames(ttdata))]
    
  • There is a substantial overlap between college and university graduate frequencies; merge them into a single variable.
  • > ttdata$bachelor_degree_or_higher_0 =
    +   ttdata$bachelor_degree_or_higher_0 + ttdata$college_or_associate_degree_0
    > ttdata$bachelor_degree_or_higher_1 =
    +   ttdata$bachelor_degree_or_higher_1 + ttdata$college_or_associate_degree_1
    > 
    > ttdata = ttdata[, !grepl("college_or_associate_degree", colnames(ttdata))]
    
  • Replace variable names with abbreviations to make the code more compact and the figures more readable.
  • > old.names = names(ttdata)
    > new.names =
    +   c("FIPS", "WEEK", "STATE", "COUNTY", "UNI_0", "UNEMPLOYMENT_0", "INCOME_0",
    +     "POPULATION_0", "DENSITY_0", "POVERTY_0", "NO2_0", "SO2_0", "PM25_0",
    +     "ANX_0", "DEP_0", "DER_0", "OBE_0", "SLD_0", "TEMP_0", "WIND_0", "RAIN_0",
    +     "LON", "LAT", "UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
    +     "DENSITY_1", "POVERTY_1", "NO2_1", "SO2_1", "PM25_1", "ANX_1", "DEP_1",
    +     "DER_1", "OBE_1", "SLD_1", "TEMP_1", "WIND_1", "RAIN_1", "RANGETEMP_0",
    +     "RANGETEMP_1", "RANGEWIND_0", "RANGEWIND_1")
    > 
    > names(ttdata) = new.names
    > 
    > ttdata$WEEK = droplevels(ttdata$WEEK)
    > ttdata$COUNTY = droplevels(ttdata$COUNTY)
    > ttdata$STATE = droplevels(ttdata$STATE)
    
  • Remote territories are geographically distant from the continental US, and they can easily skew the tail of the variograms we examine below.
  • > codes = as.numeric(substr(sprintf("%05d",
    +                      as.numeric(as.character(ttdata$FIPS))), 1, 2))
    > # apply all the filters in a single step, so that codes stays aligned
    > # with the rows of ttdata.
    > ttdata = ttdata[(codes < 60) &     # continental US...
    +                 (codes != 43) &    # ... without Puerto Rico...
    +                 (codes != 52) &    # ... without the Virgin Islands...
    +                 (codes != 3) &     # ... without American Samoa...
    +                 (codes != 15) &    # ... without Hawaii...
    +                 (codes != 2) &     # ... without Alaska...
    +                 (codes != 7), ]    # ... without the Panama Canal Zone.
    
  • Remove counties that, at the end of this data-cleaning process, have fewer than eight weeks (roughly two months) of data.
  • > ttdata = ttdata[ttdata$FIPS %in% names(which(table(droplevels(ttdata$FIPS)) >= 8)), ]
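
The two tables quoted in the list above can be reproduced as follows. For the proportions of variance explained by Metro and RUCC, a minimal sketch: regress each variable on the discrete predictor and read off the R-squared. Here before.dropping stands for a hypothetical copy of ttdata saved just before the redundant, discrete and irrelevant columns were removed, and prop.explained is a made-up helper name.

> # hypothetical helper: R-squared of a variable regressed on a predictor.
> prop.explained = function(response, predictor, data)
+   summary(lm(data[, response] ~ data[, predictor]))$r.squared
> vars = c("PM10_mean_1", "PM25_mean_1", "SO2_mean_1", "NO2_mean_1",
+          "O3_mean_1", "anxiety_1", "depression_1", "atopic_dermatitis_1",
+          "obesity_1", "sleep_disorder_1")
> round(cbind(
+   Metro = sapply(vars, prop.explained, predictor = "Metro_1",
+                  data = before.dropping),
+   RUCC = sapply(vars, prop.explained, predictor = "RUCC_1",
+                 data = before.dropping)), 3)

For the correlations of the minima, maxima and ranges with the weekly means of temperature and wind speed, a sketch of the first row of that table (the time-0 temperature variables), using the same hypothetical copy of the data:

> with(before.dropping,
+      cor(cbind(min = temperature_min_0, max = temperature_max_0,
+                range = RANGETEMP_0),
+          temperature_mean_0, use = "pairwise.complete.obs"))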
    

The mapping between the original variable names and the abbreviations is as follows.

old.variable.names           abbreviations
fips_county_code             FIPS
week                         WEEK
state                        STATE
county                       COUNTY
bachelor_degree_or_higher_0  UNI_0
unemployment_0               UNEMPLOYMENT_0
household_income_0           INCOME_0
population_0                 POPULATION_0
population_density_0         DENSITY_0
POVALL_0                     POVERTY_0
NO2_mean_0                   NO2_0
SO2_mean_0                   SO2_0
PM25_mean_0                  PM25_0
anxiety_0                    ANX_0
depression_0                 DEP_0
atopic_dermatitis_0          DER_0
obesity_0                    OBE_0
sleep_disorder_0             SLD_0
temperature_mean_0           TEMP_0
windspeed_mean_0             WIND_0
precipitations_mean_0        RAIN_0
longitude                    LON
latitude                     LAT
bachelor_degree_or_higher_1  UNI_1
unemployment_1               UNEMPLOYMENT_1
household_income_1           INCOME_1
population_1                 POPULATION_1
population_density_1         DENSITY_1
POVALL_1                     POVERTY_1
NO2_mean_1                   NO2_1
SO2_mean_1                   SO2_1
PM25_mean_1                  PM25_1
anxiety_1                    ANX_1
depression_1                 DEP_1
atopic_dermatitis_1          DER_1
obesity_1                    OBE_1
sleep_disorder_1             SLD_1
temperature_mean_1           TEMP_1
windspeed_mean_1             WIND_1
precipitations_mean_1        RAIN_1
RANGETEMP_0                  RANGETEMP_0
RANGETEMP_1                  RANGETEMP_1
RANGEWIND_0                  RANGEWIND_0
RANGEWIND_1                  RANGEWIND_1
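
This mapping table can be printed directly from the two name vectors defined above; a minimal sketch, assuming old.names and new.names are still in the R session:

> print(data.frame(old.variable.names = old.names, abbreviations = new.names),
+       row.names = FALSE, right = FALSE)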

After this preprocessing, the data contain 52538 observations over 474 counties and 134 weeks. Each time point has records for 19 variables.
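
These counts can be verified on the final data frame; a minimal sketch (droplevels() is needed because the row filters above can leave unobserved factor levels behind):

> nrow(ttdata)                          # 52538 observations.
> nlevels(droplevels(ttdata$FIPS))      # 474 counties.
> nlevels(droplevels(ttdata$WEEK))      # 134 weeks.
> sum(grepl("_0$", colnames(ttdata)))   # 19 variables per time point.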

Last updated on Mon Apr 28 16:20:49 2025 with bnlearn 5.0-20240418 and R version 4.3.2 (2023-10-31).