Data preprocessing
Import the data (from the earlier data fusion notebook, available here) and reformat it to learn a two-time network.
# Load the fused data (produced by the earlier data fusion notebook) and set up
# the key variables that identify each observation.
data = readRDS("prepd.useful.data.rds")
key.vars = c("fips_county_code", "week", "state", "county")

# Reshape the data into a two-time format: each row pairs the measurements of a
# county at one week ("_0" variables) with those at the following week ("_1").
two.time.data.format = function(data) {

  # map an ordered factor one level forward (week t becomes week t + 1).
  next.week = function(x) {
    ordered(as.integer(x) + 1, levels = seq_along(levels(x)), labels = levels(x))
  }#NEXT.WEEK

  # build the paired rows for a single county.
  format.one.county = function(county.id, data) {

    id.vars = c("fips_county_code", "week", "state", "county",
                "latitude", "longitude")
    # take the rows for this county, sorted by the week of collection...
    rows = subset(data, fips_county_code == county.id)
    rows = rows[order(rows$week), ]
    # ... list the candidate starting weeks (all but the last one) and the
    # weeks that immediately follow them...
    from.week = rows$week[-length(rows$week)]
    to.week = next.week(from.week)
    # ... keep only the pairs in which both weeks were actually observed
    # (gaps in the collection break consecutive pairs)...
    observed = (from.week %in% rows$week) & (to.week %in% rows$week)
    from.week = from.week[observed]
    to.week = to.week[observed]
    # ... extract the data at time 0 and at time 1...
    t0 = subset(rows, week %in% from.week)
    t1 = subset(rows, week %in% to.week)
    # ... drop the ID variables from time 1 (they are kept at time 0)...
    t1 = t1[, !(colnames(t1) %in% id.vars)]
    # ... and postfix the measured variables with their time point.
    measured = !(names(t0) %in% id.vars)
    names(t0)[measured] = paste0(names(t0)[measured], "_0")
    names(t1) = paste0(names(t1), "_1")
    cbind(t0, t1)

  }#FORMAT.ONE.COUNTY

  # collect the data from all the counties in a single data frame.
  per.county = lapply(levels(data$fips_county_code), format.one.county,
                      data = data)
  do.call(rbind, per.county)

}#TWO.TIME.DATA.FORMAT

ttdata = two.time.data.format(data)
Clean up and rename variables:
- The threshold variables (`*_b_*`) are deterministic functions of some quantile of the corresponding continuous variables, making them redundant. `Metro` and `RUCC` contain information that overlaps with the population density, and they are discrete: we would have to use a CGBN to model them, which is difficult when trying to do causal modelling because only some arc directions are allowed. Furthermore, neither variable explains much of the variability of the pollutants or the conditions. `PM10` contains the most missing values in the data, and `O3` might not be relevant.
- Remove unobserved levels from `week` and `county`.
- Remove the maximum level of pollutants, which are strongly correlated to the average levels and, therefore, are not very informative in the models. They also get in the way of the latter being connected to the conditions.
- Remove the maximum and minimum temperature and wind speed, which are strongly correlated with the means. Replace them with a normalised range over the week.
- Minimum and maximum precipitations are not strongly correlated with each other and with the average only because they are equal to zero so often. Therefore, they are redundant in the analysis, and we can remove them. And so are the maximum and minimum temperatures and wind speed, now that we have introduced the ranges.
- Two variables will always be isolated in every model we try: `vote_rate` and `household_size`. Remove them.

> ttdata = ttdata[, !grepl("vote_rate|household_size", colnames(ttdata))]
- There is a substantial overlap between college and university graduate frequencies; merge them into a single variable.
- Replace variable names with abbreviations to make the code more compact and the figures more readable.
- Remote territories have large geographical distances from the continental US, and they can easily skew the tail of the variograms we examine below.
- Remove counties that, at the end of this data-cleaning process, have less than two months' worth of data.
| | Metro | RUCC |
|---|---|---|
PM10_mean_1 | 0.005 | 0.018 |
PM25_mean_1 | 0.005 | 0.006 |
SO2_mean_1 | 0.001 | 0.037 |
NO2_mean_1 | 0.088 | 0.149 |
O3_mean_1 | 0.013 | 0.019 |
anxiety_1 | 0.003 | 0.039 |
depression_1 | 0.000 | 0.055 |
atopic_dermatitis_1 | 0.004 | 0.031 |
obesity_1 | 0.015 | 0.031 |
sleep_disorder_1 | 0.002 | 0.053 |
# Weekly weather ranges, normalised so that they are comparable across counties:
# range.ratio() computes (max - min) scaled by a baseline (+1 avoids division by
# zero). The temperature baseline is the distance of the weekly minimum from the
# coldest minimum on record (temperatures can be negative); wind speed is bounded
# below by zero, so its own weekly minimum is used as the baseline. Factoring out
# the shared formula replaces four near-identical hand-written statements.
range.ratio = function(mx, mn, baseline) (mx - mn) / (baseline + 1)

# NOTE: the four columns are created in this exact order (RANGETEMP_0,
# RANGETEMP_1, RANGEWIND_0, RANGEWIND_1), which later renaming code relies on.
ttdata$RANGETEMP_0 =
  range.ratio(ttdata$temperature_max_0, ttdata$temperature_min_0,
    ttdata$temperature_min_0 - min(ttdata$temperature_min_0, na.rm = TRUE))
ttdata$RANGETEMP_1 =
  range.ratio(ttdata$temperature_max_1, ttdata$temperature_min_1,
    ttdata$temperature_min_1 - min(ttdata$temperature_min_1, na.rm = TRUE))
ttdata$RANGEWIND_0 =
  range.ratio(ttdata$windspeed_max_0, ttdata$windspeed_min_0,
    ttdata$windspeed_min_0)
ttdata$RANGEWIND_1 =
  range.ratio(ttdata$windspeed_max_1, ttdata$windspeed_min_1,
    ttdata$windspeed_min_1)
# Drop the variables discussed above: the "_b_" threshold indicators, the
# weather extremes superseded by the RANGE* variables, the redundant
# precipitation extremes, the discrete urbanisation measures, and the pollutant
# variables that are mostly missing (PM10), possibly irrelevant (O3) or strongly
# correlated with the mean levels (the remaining *_max_* pollutants).
redundant = c(grep("_b_", names(ttdata), value = TRUE),
              c(outer(c("temperature", "windspeed", "precipitations"),
                      c("_min_0", "_max_0", "_min_1", "_max_1"), paste0)))
discrete = paste0(rep(c("Metro_", "RUCC_"), each = 2), 0:1)
irrelevant = c(paste0(rep(c("O3", "PM10"), each = 4),
                      c("_mean_0", "_max_0", "_mean_1", "_max_1")),
               paste0(rep(c("SO2", "NO2", "PM25"), each = 2),
                      c("_max_0", "_max_1")))

ttdata = ttdata[, !(colnames(ttdata) %in% c(redundant, discrete, irrelevant))]
# College/associate and bachelor-or-higher graduate frequencies overlap
# substantially: fold them into a single university-level variable at each time
# point, then drop the now-redundant college variables.
merge.degrees = function(suffix) {
  ttdata[[paste0("bachelor_degree_or_higher", suffix)]] +
    ttdata[[paste0("college_or_associate_degree", suffix)]]
}#MERGE.DEGREES

ttdata$bachelor_degree_or_higher_0 = merge.degrees("_0")
ttdata$bachelor_degree_or_higher_1 = merge.degrees("_1")

ttdata = ttdata[, !grepl("college_or_associate_degree", colnames(ttdata))]
# Replace the original variable names with compact abbreviations. Renaming by
# matching names (instead of assigning new.names positionally, as before)
# guards against silently mislabelling columns if the column order ever changes
# upstream; stopifnot() fails loudly when the expected variables are not all
# present. old.names[i] corresponds to new.names[i].
old.names = c(
  "fips_county_code", "week", "state", "county",
  "bachelor_degree_or_higher_0", "unemployment_0", "household_income_0",
  "population_0", "population_density_0", "POVALL_0", "NO2_mean_0",
  "SO2_mean_0", "PM25_mean_0", "anxiety_0", "depression_0",
  "atopic_dermatitis_0", "obesity_0", "sleep_disorder_0",
  "temperature_mean_0", "windspeed_mean_0", "precipitations_mean_0",
  "longitude", "latitude",
  "bachelor_degree_or_higher_1", "unemployment_1", "household_income_1",
  "population_1", "population_density_1", "POVALL_1", "NO2_mean_1",
  "SO2_mean_1", "PM25_mean_1", "anxiety_1", "depression_1",
  "atopic_dermatitis_1", "obesity_1", "sleep_disorder_1",
  "temperature_mean_1", "windspeed_mean_1", "precipitations_mean_1",
  "RANGETEMP_0", "RANGETEMP_1", "RANGEWIND_0", "RANGEWIND_1")
new.names = c(
  "FIPS", "WEEK", "STATE", "COUNTY", "UNI_0", "UNEMPLOYMENT_0", "INCOME_0",
  "POPULATION_0", "DENSITY_0", "POVERTY_0", "NO2_0", "SO2_0", "PM25_0",
  "ANX_0", "DEP_0", "DER_0", "OBE_0", "SLD_0", "TEMP_0", "WIND_0", "RAIN_0",
  "LON", "LAT", "UNI_1", "UNEMPLOYMENT_1", "INCOME_1", "POPULATION_1",
  "DENSITY_1", "POVERTY_1", "NO2_1", "SO2_1", "PM25_1", "ANX_1", "DEP_1",
  "DER_1", "OBE_1", "SLD_1", "TEMP_1", "WIND_1", "RAIN_1", "RANGETEMP_0",
  "RANGETEMP_1", "RANGEWIND_0", "RANGEWIND_1")

stopifnot(setequal(names(ttdata), old.names))
names(ttdata) = new.names[match(names(ttdata), old.names)]

# remove the factor levels that no longer appear in the data.
ttdata$WEEK = droplevels(ttdata$WEEK)
ttdata$COUNTY = droplevels(ttdata$COUNTY)
ttdata$STATE = droplevels(ttdata$STATE)
# Keep only the continental US: the first two digits of the (zero-padded) FIPS
# county code are the state code, and the mask below excludes the outlying
# territories (codes >= 60), Puerto Rico (43), the Virgin Islands (52),
# American Samoa (03), Hawaii (15), Alaska (02) and the Panama canal (07).
# NOTE(review): the original applied these filters one at a time while reusing
# the `codes` vector computed before the first subset, so once any row was
# dropped the logical indices no longer lined up with the rows of `ttdata`
# (a logical subscript longer than nrow() yields NA rows); computing a single
# mask and subsetting once avoids that misalignment.
# NOTE(review): this assumes ttdata$FIPS is numeric -- if it is still a factor,
# sprintf("%05d", .) does not format the FIPS labels; confirm upstream.
codes = as.numeric(substr(sprintf("%05d", ttdata$FIPS), 1, 2))
ttdata = ttdata[(codes < 60) & !(codes %in% c(2, 3, 7, 15, 43, 52)), ]
# Keep only the counties that, after all the cleaning above, still have at
# least two months' worth of data (8 paired weeks).
weeks.per.county = table(droplevels(ttdata$FIPS))
long.enough = names(weeks.per.county)[weeks.per.county >= 8]
ttdata = ttdata[ttdata$FIPS %in% long.enough, ]
The mapping between the original variable names and the abbreviations is as follows.
old.variable.names | abbreviations |
---|---|
fips_county_code | FIPS |
week | WEEK |
state | STATE |
county | COUNTY |
bachelor_degree_or_higher_0 | UNI_0 |
unemployment_0 | UNEMPLOYMENT_0 |
household_income_0 | INCOME_0 |
population_0 | POPULATION_0 |
population_density_0 | DENSITY_0 |
POVALL_0 | POVERTY_0 |
NO2_mean_0 | NO2_0 |
SO2_mean_0 | SO2_0 |
PM25_mean_0 | PM25_0 |
anxiety_0 | ANX_0 |
depression_0 | DEP_0 |
atopic_dermatitis_0 | DER_0 |
obesity_0 | OBE_0 |
sleep_disorder_0 | SLD_0 |
temperature_mean_0 | TEMP_0 |
windspeed_mean_0 | WIND_0 |
precipitations_mean_0 | RAIN_0 |
longitude | LON |
latitude | LAT |
bachelor_degree_or_higher_1 | UNI_1 |
unemployment_1 | UNEMPLOYMENT_1 |
household_income_1 | INCOME_1 |
population_1 | POPULATION_1 |
population_density_1 | DENSITY_1 |
POVALL_1 | POVERTY_1 |
NO2_mean_1 | NO2_1 |
SO2_mean_1 | SO2_1 |
PM25_mean_1 | PM25_1 |
anxiety_1 | ANX_1 |
depression_1 | DEP_1 |
atopic_dermatitis_1 | DER_1 |
obesity_1 | OBE_1 |
sleep_disorder_1 | SLD_1 |
temperature_mean_1 | TEMP_1 |
windspeed_mean_1 | WIND_1 |
precipitations_mean_1 | RAIN_1 |
RANGETEMP_0 | RANGETEMP_0 |
RANGETEMP_1 | RANGETEMP_1 |
RANGEWIND_0 | RANGEWIND_0 |
RANGEWIND_1 | RANGEWIND_1 |
After this preprocessing, the data contain 52538 observations over 474 counties and 134 weeks. Each time point has records for 19 variables.
Last updated on
Mon Apr 28 16:20:49 2025
with bnlearn
5.0-20240418
and R version 4.3.2 (2023-10-31)
.