Data fusion

In this first notebook, we load the data taken from Google, explore them and fuse them into a single data set.

Loading all the data sets

> census = read.csv("census.csv.bz2", header = TRUE)
> pollution = read.csv("pollution.weekly.csv.bz2", header = TRUE)
> searches = read.csv("search.trends.csv.bz2", header = TRUE)
> weather = read.csv("weather.week.csv.bz2", header = TRUE)
> coords = read.csv("county.coordinates.csv.bz2", header = TRUE)

The raw data files are available from here:

The census data

The census data contain 3064 observations and 20 variables, which are imported as follows by default.

LABEL TYPE MISSING VALUES
fips_county_code integer 0
sub_region_1 character 0
sub_region_2 character 0
bachelor_degree_or_higher numeric 0
college_or_associate_degree numeric 0
POVALL_2019 integer 0
Unemployment_rate_2020 numeric 0
Median_Household_Income_2019 numeric 0
Metro_2013 numeric 0
RUCC_2013 integer 0
Pop integer 0
vote_rate numeric 0
Household_Size numeric 0
Density numeric 0
bachelor_degree_or_higher_b integer 0
college_or_associate_degree_b integer 0
POVALL_2019_p numeric 0
POVALL_2019_b integer 0
Unemployment_rate_2020_b integer 0
Median_Household_Income_2019_b integer 0

The unique county identifier is fips_county_code. Each fips_county_code appears multiple times in the data with different vote_rate records. Summing up the vote rates for each fips_county_code and keeping only the unique records gives one entry for each county and vote rates that are all strictly less than 1.

> # identify duplicate ids which appear multiple times.
> ttt = table(census$fips_county_code)
> dup_fips_id = names(ttt[ttt > 1])
> 
> for (id in dup_fips_id) {

+   # sum up the vote rates for each id.
+   all_votes = sum(census[census$fips_county_code == id, "vote_rate"])
+   census[census$fips_county_code == id, "vote_rate"] = all_votes

+ }#FOR
> 
> # keep only the unique records.
> census = unique(census)
> # double-check that each id appears only once.
> ttt = table(census$fips_county_code)
> length(ttt[ttt > 1])
[1] 0
> # double check that all vote rates are in [0, 1].
> summary(census$vote_rate)
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max.
 0.0873  0.5121  0.6316  0.6090  0.7258  0.9035

The summary confirms that the vote rates are already proportions on a [0, 1] scale, like the other continuous variables, so they do not need any further rescaling.
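As an aside, the aggregation performed by the loop above can also be written without an explicit loop; an equivalent sketch (not part of the original code):

# sum the vote rates within each county, then keep only the unique records.
census$vote_rate = ave(census$vote_rate, census$fips_county_code, FUN = sum)
census = unique(census)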

The variable sub_region_1 is the US state, and sub_region_2 is the county name. We cannot use sub_region_2 for anything other than data exploration because there are counties with the same name in different states.

> names(census)[names(census) == "sub_region_2"] = "county"
> names(census)[names(census) == "sub_region_1"] = "state"
> # an example:
> census[census$county == "Adams County", c("county", "state")]
           county        state
279  Adams County     Colorado
746  Adams County     Illinois
802  Adams County      Indiana
1334 Adams County  Mississippi
1445 Adams County     Nebraska
1916 Adams County         Ohio
2149 Adams County Pennsylvania
3002 Adams County    Wisconsin

Both these variables can be encoded as factors for convenience.

> census[, "county"] = as.factor(census[, "county"])
> census[, "state"] = as.factor(census[, "state"])

The variables bachelor_degree_or_higher and college_or_associate_degree encode the proportions of the population with the respective educational achievements.

> standard.histogram(census[, "bachelor_degree_or_higher"],
+   legend = "Proportion of the population with a bachelor degree or better")
plot of chunk education
> standard.histogram(census[, "college_or_associate_degree"],
+   legend = "Proportion of the population with a college/associate degree")
plot of chunk education

Both percentages can be normalised to a [0, 1] scale to match other variables.

> census[, "bachelor_degree_or_higher"] = census[, "bachelor_degree_or_higher"] / 100
> census[, "college_or_associate_degree"] = census[, "college_or_associate_degree"] / 100

The bachelor_degree_or_higher_b and college_or_associate_degree_b are indicators that identify the proportions of the population with a specific educational achievement that are above or below a certain threshold.

> range(census[census[, c("bachelor_degree_or_higher_b")] == 0,
+                c("bachelor_degree_or_higher")])
[1] 0.05386811 0.29220734
> range(census[census[, c("bachelor_degree_or_higher_b")] == 1,
+                c("bachelor_degree_or_higher")])
[1] 0.2923395 0.7529932
> range(census[census[, c("college_or_associate_degree_b")] == 0,
+                c("college_or_associate_degree")])
[1] 0.1124183 0.3325122
> range(census[census[, c("college_or_associate_degree_b")] == 1,
+                c("college_or_associate_degree")])
[1] 0.3325383 0.4733221
> standard.xyplot(bachelor_degree_or_higher ~ seq(nrow(census)),
+   data = census, xlab = "", ylab = "Proportion with a bachelor degree or higher",
+   col = c("skyblue", "orange")[census[, c("bachelor_degree_or_higher_b")] + 1])
plot of chunk education_proportion_range
> standard.xyplot(college_or_associate_degree ~ seq(nrow(census)),
+   data = census, xlab = "", ylab = "Proportion with a college/associate degree",
+   col = c("skyblue", "orange")[census[, c("college_or_associate_degree_b")] + 1])
plot of chunk education_proportion_range

They are best encoded as factors.

> census[, c("bachelor_degree_or_higher_b")] =
+   factor(census[, c("bachelor_degree_or_higher_b")], levels = c(0, 1),
+          labels = c("LOW", "HIGH"))
> census[, c("college_or_associate_degree_b")] =
+   factor(census[, c("college_or_associate_degree_b")], levels = c(0, 1),
+          labels = c("LOW", "HIGH"))

The variable POVALL_2019 is the number of people of all ages in poverty. It comes both as an absolute value and a proportion under the name POVALL_2019_p. The variable POVALL_2019_b then identifies the proportions (not the absolute values!) above a certain threshold.

> range(census[census[, "POVALL_2019_b"] == 0, "POVALL_2019_p"])
[1] 0.03370641 0.17268516
> range(census[census[, "POVALL_2019_b"] == 1, "POVALL_2019_p"])
[1] 0.1726897 0.3401949
> standard.xyplot(POVALL_2019_p ~ seq(nrow(census)),
+   data = census, xlab = "", ylab = "POVALL",
+   col = c("skyblue", "orange")[census[, c("POVALL_2019_b")] + 1])
plot of chunk povall

There is no point in keeping POVALL_2019 in the data when we have both the corresponding proportion and the population size. The proportion is already on a [0, 1] scale, so it only needs a shorter name; the binarised variable is again best encoded as a factor.

> all.equal(census[, "POVALL_2019"] / census[, "Pop"], census[, "POVALL_2019_p"])
[1] TRUE
> census[, names(census) == "POVALL_2019"] = NULL
> census[, c("POVALL_2019_b")] =
+   factor(census[, c("POVALL_2019_b")], levels = c(0, 1), labels = c("LOW", "HIGH"))
> names(census)[names(census) == "POVALL_2019_p"] = "POVALL"
> names(census)[names(census) == "POVALL_2019_b"] = "POVALL_b"

The variable Unemployment_rate_2020 is the percentage of unemployed people in the county, and Unemployment_rate_2020_b is the corresponding binarised transform that identifies unemployment rates above a certain threshold.

> standard.histogram(census[, "Unemployment_rate_2020"],
+   legend = "Unemployment Rate (2020)")
plot of chunk unemployment
> standard.xyplot(Unemployment_rate_2020 ~ seq(nrow(census)),
+   data = census, xlab = "", ylab = "Unemployment rate",
+   col = c("skyblue", "orange")[census[, c("Unemployment_rate_2020_b")] + 1])
plot of chunk unemployment

We should rescale the rate to [0, 1] for consistency, rename it for brevity, and transform the binarised variable to a factor.

> census[, "Unemployment_rate_2020"] = census[, "Unemployment_rate_2020"] / 100
> census[, c("Unemployment_rate_2020_b")] =
+   factor(census[, c("Unemployment_rate_2020_b")], levels = c(0, 1),
+          labels = c("LOW", "HIGH"))
> names(census)[names(census) == "Unemployment_rate_2020"] = "unemployment"
> names(census)[names(census) == "Unemployment_rate_2020_b"] = "unemployment_b"

The variable Median_Household_Income_2019 is the median household income in dollars; Median_Household_Income_2019_b is its binarised transform.

> standard.histogram(log10(census[, "Median_Household_Income_2019"]),
+   legend = "Median Household Income (2019, on a log10 scale)")
plot of chunk income
> standard.xyplot(Median_Household_Income_2019 ~ seq(nrow(census)),
+   data = census, xlab = "", ylab = "Median Household Income",
+   col = c("skyblue", "orange")[census[, c("Median_Household_Income_2019_b")] + 1])
plot of chunk income

Household income does not appear to correlate with the unemployment rate, but it does correlate with the proportion of university graduates.

> standard.xyplot(Median_Household_Income_2019 ~ unemployment,
+   data = census, xlab = "Unemployment", ylab = "Median Household Income")
plot of chunk income_vs_unemployment
> standard.xyplot(Median_Household_Income_2019 ~ bachelor_degree_or_higher,
+   data = census, xlab = "bachelor_degree_or_higher", ylab = "Median Household Income",
+   regression = TRUE)
plot of chunk income_vs_unemployment
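The two relationships can also be summarised numerically; a minimal sketch (not part of the original analysis, output omitted):

# linear correlations matching the two scatter plots above.
cor(census$Median_Household_Income_2019, census$unemployment)
cor(census$Median_Household_Income_2019, census$bachelor_degree_or_higher)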

We shorten the variable names for brevity. We should also change the income to a log10() scale for consistency with other numeric variables and transform the binarised version into a factor.

> census[, "Median_Household_Income_2019"] =
+   log10(census[, "Median_Household_Income_2019"])
> census[, "Median_Household_Income_2019_b"] =
+   factor(census[, c("Median_Household_Income_2019_b")], levels = c(0, 1),
+          labels = c("LOW", "HIGH"))
> names(census)[names(census) == "Median_Household_Income_2019"] = "household_income"
> names(census)[names(census) == "Median_Household_Income_2019_b"] = "household_income_b"

Metro_2013 distinguishes between metropolitan and non-metropolitan counties. The variable RUCC_2013 contains the Rural-Urban Continuum Code (RUCC), which further classifies how rural or urban a county is. The RUCC is a score from 1 to 9, defined as follows:

  • Metropolitan counties
    • 1: Counties in metro areas of 1 million population or more.
    • 2: Counties in metro areas of 250,000 to 1 million population.
    • 3: Counties in metro areas of fewer than 250,000 population.
  • Non-metropolitan counties
    • 4: Urban population of 20,000 or more, adjacent to a metro area.
    • 5: Urban population of 20,000 or more, not adjacent to a metro area.
    • 6: Urban population of 2,500 to 19,999, adjacent to a metro area.
    • 7: Urban population of 2,500 to 19,999, not adjacent to a metro area.
    • 8: Completely rural or less than 2,500 urban population, adjacent to a metro area.
    • 9: Completely rural or less than 2,500 urban population, not adjacent to a metro area.
> table(census[, c("Metro_2013", "RUCC_2013")])
          RUCC_2013
Metro_2013   1   2   3   4   5   6   7   8   9
         0   0   0   0 213  91 345 173  16  13
         1 346 327 268   0   0   0   0   0   0

Both variables are best encoded as factors.

> census[, "Metro_2013"] =
+   factor(census[, "Metro_2013"], levels = c(0, 1), labels = c("non-metro", "metro"))
> census[, "RUCC_2013"] = factor(census[, "RUCC_2013"], levels = as.character(1:9))
> names(census)[names(census) == "Metro_2013"] = "Metro"
> names(census)[names(census) == "RUCC_2013"] = "RUCC"

Pop is the county population size.

> standard.histogram(log10(census[, "Pop"]), legend = "Population (on a log10 scale)")
plot of chunk population_size

We should put it on a log10() scale for consistency with other numeric variables.

> census[, "Pop"] = log10(census[, "Pop"])
> names(census)[names(census) == "Pop"] = "population"

Household_Size is the average number of people living together in each household.

> standard.histogram(census[, "Household_Size"], legend = "Household size")
plot of chunk size

Renamed for consistency with household_income.

> names(census)[names(census) == "Household_Size"] = "household_size"

Density is the population density, most likely in people/sqft.

> standard.histogram(log10(census[, "Density"]), legend = "Population Density (on a log10 scale)")
plot of chunk density

Renamed for clarity and rescaled to a log10() scale for consistency with other numeric variables.

> census[, "Density"] = log10(census[, "Density"])
> names(census)[names(census) == "Density"] = "population_density"

The final form of the census data set.
LABEL TYPE MISSING VALUES
fips_county_code integer 0
state factor 0
county factor 0
bachelor_degree_or_higher numeric 0
college_or_associate_degree numeric 0
unemployment numeric 0
household_income numeric 0
Metro factor 0
RUCC factor 0
population numeric 0
vote_rate numeric 0
household_size numeric 0
population_density numeric 0
bachelor_degree_or_higher_b factor 0
college_or_associate_degree_b factor 0
POVALL numeric 0
POVALL_b factor 0
unemployment_b factor 0
household_income_b factor 0

The pollution data

The pollution data contain 1742937 observations and 22 variables, which are imported as follows by default:

LABEL TYPE MISSING VALUES
state_code integer 0.000
county_code integer 0.000
fips_county_code integer 0.000
week_start_date character 0.000
weekly_avg_CO numeric 0.799
weekly_max_CO numeric 0.799
weekly_aqi_CO numeric 0.799
weekly_avg_NO2 numeric 0.781
weekly_max_NO2 numeric 0.781
weekly_aqi_NO2 numeric 0.781
weekly_avg_SO2 numeric 0.683
weekly_max_SO2 numeric 0.683
weekly_aqi_SO2 numeric 0.683
weekly_avg_O3 numeric 0.445
weekly_max_O3 numeric 0.445
weekly_aqi_O3 numeric 0.445
weekly_avg_PM25 numeric 0.581
weekly_max_PM25 numeric 0.581
weekly_aqi_PM25 numeric 0.581
weekly_avg_PM10 numeric 0.549
weekly_max_PM10 numeric 0.549
weekly_aqi_PM10 numeric 0.549

The pollutant variables are homogeneous: each pollutant comes as a triplet of weekly mean, weekly maximum and air quality index (AQI). However, their ranges are very different across pollutants.

> bwplot(variable ~ value, data = melt(pollution[, -(1:4)]),
+        scales = list(x = list(log = 10)))
plot of chunk plot_pollutant_summaries
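A numeric counterpart of this plot can be obtained by tabulating the range of each pollutant column; a minimal sketch (not part of the original code, output omitted):

# per-variable ranges, skipping the four identifier columns and missing values.
sapply(pollution[, -(1:4)], range, na.rm = TRUE)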

Therefore, we only need to remove the county_code and state_code, which are redundant given the census data, and to shorten the variable names for brevity.

> pollution[, names(pollution) %in% c("county_code", "state_code")] = NULL
> names(pollution) = sub("^weekly_", "", names(pollution))

For convenience and to match the other data sets, we rename week_start_date, the date of the first day of each week, to week.

> names(pollution)[names(pollution) == "week_start_date"] = "week"

Finally, we rename the pollutants to follow the same naming pattern as the weather variables.

> names(pollution) = gsub("avg_(.*)", "\\1_mean", names(pollution))
> names(pollution) = gsub("max_(.*)", "\\1_max", names(pollution))
> names(pollution) = gsub("aqi_(.*)", "\\1_aqi", names(pollution))

The air quality index for a pollutant is defined as the mean concentration divided by the national standard and multiplied by 100. Therefore, it correlates almost perfectly with the mean concentrations.
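A sketch of this stated definition in R (the concentration and standard values below are made up for illustration):

# AQI as a percentage of the national standard (illustrative values only).
aqi = function(mean.concentration, national.standard)
  mean.concentration / national.standard * 100
aqi(mean.concentration = 4.5, national.standard = 9)  # 50: half the standard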

> cor(pollution$CO_mean, pollution$CO_aqi, use = "pairwise.complete")
[1] 0.9718042
> cor(pollution$NO2_mean, pollution$NO2_aqi, use = "pairwise.complete")
[1] 0.9518696
> cor(pollution$SO2_mean, pollution$SO2_aqi, use = "pairwise.complete")
[1] 0.8392479
> cor(pollution$O3_mean, pollution$O3_aqi, use = "pairwise.complete")
[1] 0.8365865
> cor(pollution$PM10_mean, pollution$PM10_aqi, use = "pairwise.complete")
[1] 0.9856848
> cor(pollution$PM25_mean, pollution$PM25_aqi, use = "pairwise.complete")
[1] 0.9606402
> pollution[, grepl("_aqi", names(pollution))] = NULL

The final form of the pollution data set.
LABEL TYPE MISSING VALUES
fips_county_code integer 0.000
week character 0.000
CO_mean numeric 0.799
CO_max numeric 0.799
NO2_mean numeric 0.781
NO2_max numeric 0.781
SO2_mean numeric 0.683
SO2_max numeric 0.683
O3_mean numeric 0.445
O3_max numeric 0.445
PM25_mean numeric 0.581
PM25_max numeric 0.581
PM10_mean numeric 0.549
PM10_max numeric 0.549

Google searches for the medical conditions

The searches data contain 327758 observations and 13 variables, which are imported as follows by default:

LABEL TYPE MISSING VALUES
country_region_code character 0
country_region character 0
sub_region_1 character 0
sub_region_1_code character 0
sub_region_2 character 0
fips_county_code integer 0
place_id character 0
date character 0
symptom_anxiety numeric 0
symptom_depression numeric 0
symptom_atopic_dermatitis numeric 0
symptom_obesity numeric 0
symptom_sleep_disorder numeric 0

First, we rename the date.

> names(searches)[names(searches) == "date"] = "week"

Then we remove the location variables that are not needed to merge the data sets.

> searches[, c("country_region_code", "country_region", "sub_region_1")] = NULL
> searches[, c("sub_region_1_code", "sub_region_2", "place_id")] = NULL

Finally, we rename the conditions for brevity.

> names(searches) = gsub("^symptom_", "", names(searches))

The weather data

The weather data contain 327920 observations and 18 variables, which by default are imported as:

LABEL TYPE MISSING VALUES
statefp integer 0.000
countyfp integer 0.000
countyns integer 0.000
county_name character 0.000
fips_county_code integer 0.000
date character 0.000
temp_mean numeric 0.009
temp_sum numeric 0.000
temp_max numeric 0.009
temp_min numeric 0.009
wdsp_mean numeric 0.009
wdsp_sum numeric 0.000
wdsp_max numeric 0.009
wdsp_min numeric 0.009
prcp_mean numeric 0.009
prcp_sum numeric 0.000
prcp_max numeric 0.009
prcp_min numeric 0.009

We can remove all redundant location variables, leaving only fips_county_code.

> weather$statefp = weather$countyfp = weather$county_name = weather$countyns = NULL

We also rename the date to be consistent with the other data sets and give more descriptive names to the other variables.

> names(weather)[names(weather) == "date"] = "week"
> names(weather) = gsub("^temp_", "temperature_", names(weather))
> names(weather) = gsub("^wdsp_", "windspeed_", names(weather))
> names(weather) = gsub("^prcp_", "precipitations_", names(weather))

The _mean and _sum variables are, unsurprisingly, almost perfectly correlated: each weekly sum aggregates the seven daily values, so the mean is essentially the sum divided by 7. We only keep the former but, before we remove the sums over the whole week, we use them to fill in the missing values in the means.

> cor(weather$windspeed_sum, weather$windspeed_mean, use = "pairwise.complete")
[1] 0.9954285
> cor(weather$precipitations_sum, weather$precipitations_mean, use = "pairwise.complete")
[1] 0.9924422
> cor(weather$temperature_sum, weather$temperature_mean, use = "pairwise.complete")
[1] 0.96267
> weather[is.na(weather$windspeed_mean), "windspeed_mean"] =
+   weather[is.na(weather$windspeed_mean), "windspeed_sum"] / 7
> weather[is.na(weather$precipitations_mean), "precipitations_mean"] =
+   weather[is.na(weather$precipitations_mean), "precipitations_sum"] / 7
> weather[is.na(weather$temperature_mean), "temperature_mean"] =
+   weather[is.na(weather$temperature_mean), "temperature_sum"] / 7
> 
> weather[, grepl("_sum", names(weather))] = NULL
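As a quick sanity check (a sketch, not in the original code), we can confirm that the imputation leaves no missing values in the weekly means:

# remaining missing values in each *_mean column, which should all be zero.
colSums(is.na(weather[, grepl("_mean", names(weather))]))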

A quick overview of the weather variables.

> bwplot(variable ~ value, data = melt(weather[, -(1:2)]))
plot of chunk plot_weather_summaries

The final form of the weather data set.
LABEL TYPE MISSING VALUES
fips_county_code integer 0.000
week character 0.000
temperature_mean numeric 0.000
temperature_max numeric 0.009
temperature_min numeric 0.009
windspeed_mean numeric 0.000
windspeed_max numeric 0.009
windspeed_min numeric 0.009
precipitations_mean numeric 0.000
precipitations_max numeric 0.009
precipitations_min numeric 0.009

The county coordinates

The coords data contain 3233 observations and 4 variables, which are imported as follows by default:

LABEL TYPE MISSING VALUES
fips_code integer 0
name character 0
lng numeric 0
lat numeric 0

The only variable that is redundant is the county name; the other variables could use names that are consistent with other data sets or just more descriptive.

> coords = coords[, names(coords) != "name"]
> names(coords) = c("fips_county_code", "longitude", "latitude")

Merging all the data sets

The variables that act as keys to merge the different data sets are fips_county_code and week. Not all codes appear in all data sets, and neither do all weeks.

> all.weeks = sort(unique(c(pollution$week, searches$week, weather$week)))
> pollution$week = ordered(pollution$week, levels = all.weeks)
> searches$week = ordered(searches$week, levels = all.weeks)
> weather$week = ordered(weather$week, levels = all.weeks)
  • The pollution data range from 1990-01-01 to 2023-05-01.
  • The searches data range from 2020-03-02 to 2023-10-23.
  • The weather data range from 2020-01-06 to 2023-11-06.
  • The census data have no time dimension.
  • The county coordinates obviously have no time dimension.
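The date ranges listed above can be verified directly, since the weeks are now ordered factors; a minimal sketch (not part of the original code, output omitted):

# first and last week observed in each data set.
range(pollution$week)
range(searches$week)
range(weather$week)
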
> all.fips = unique(c(pollution$fips_county_code,
+                     searches$fips_county_code,
+                     census$fips_county_code,
+                     weather$fips_county_code,
+                     coords$fips_county_code))
> pollution$fips_county_code = factor(pollution$fips_county_code, levels = all.fips)
> searches$fips_county_code = factor(searches$fips_county_code, levels = all.fips)
> census$fips_county_code = factor(census$fips_county_code, levels = all.fips)
> weather$fips_county_code = factor(weather$fips_county_code, levels = all.fips)
> coords$fips_county_code = factor(coords$fips_county_code, levels = all.fips)
  • The pollution data cover 1470 counties.
  • The searches data cover 2021 counties.
  • The weather data cover 1652 counties.
  • The census data cover 1792 counties.
  • The coords data cover 3233 counties.
> ps = merge(pollution, searches, by = c("fips_county_code", "week"), all = TRUE)
> psc = merge(census, ps, by = "fips_county_code", all = TRUE)
> pscw = merge(psc, weather, by = c("fips_county_code", "week"), all = TRUE)
> all.data = merge(pscw, coords, by = "fips_county_code", all = TRUE)

The final merged data set contains 2053067 observations and 48 variables.

We restrict ourselves to the period and the counties where we observe the medical conditions and the pollutants we would like to study.

> # keep only the rows that observe at least two of the five pollutants and two of the five conditions.
> missing.pollutants =
+   rowSums(is.na(all.data[, c("NO2_mean", "SO2_mean", "O3_mean", "PM25_mean", "PM10_mean")]))
> missing.conditions =
+   rowSums(is.na(all.data[, c("anxiety", "depression", "atopic_dermatitis", "obesity", "sleep_disorder")]))
> 
> useful.data = all.data[missing.pollutants <= 3 & missing.conditions <= 3, ]
> 
> # drop CO as it is missing > 50% of the time.
> useful.data = useful.data[, setdiff(colnames(useful.data), c("CO_mean", "CO_max"))]

This reduces the number of observations to 53797 over 566 counties and 136 weeks.
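These figures can be double-checked directly; a minimal sketch (output omitted):

# rows, distinct counties and distinct weeks remaining after the filtering above.
nrow(useful.data)
length(unique(useful.data$fips_county_code))
length(unique(useful.data$week))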

Finally, we can fill in the missing county and state labels. Both variables come from the census data, and there are counties for which we have no census data but for which we do have pollution or search data.

> patch = data.frame(
+   code = c("2090", "4013", "6001", "6013", "6027", "6037", "6043", "6051",
+            "6059", "6065", "6067", "6071", "6073", "6077", "6085", "12011",
+            "12057", "12086", "12095", "12099", "17031", "25017", "26125",
+            "26163", "27053", "32003", "36005", "36061", "36081", "36103",
+            "39035", "39049", "42003", "42101", "46127", "48029", "48113",
+            "48201", "48439", "48453", "49013", "49035", "51059", "53033",
+            "56023", "36085", "37173", "51071", "2170", "55003", "2110"),
+   name = c("Fairbanks North Star", "Maricopa", "Alameda", "Contra Costa", "Inyo",
+            "Los Angeles", "Mariposa", "Mono", "Orange", "Riverside", "Sacramento",
+            "San Bernardino", "San Diego", "San Joaquin", "Santa Clara",
+            "Broward", "Hillsborough", "Miami-Dade", "Orange", "Palm Beach",
+            "Cook", "Middlesex", "Oakland", "Wayne", "Hennepin", "Clark",
+            "Bronx", "New York", "Queens", "Suffolk", "Cuyahoga", "Franklin",
+            "Allegheny", "Philadelphia", "Union", "Bexar", "Dallas", "Harris",
+            "Tarrant", "Travis", "Duchesne", "Salt Lake", "Fairfax", "King",
+            "Lincoln", "Richmond", "Swain", "Giles", "Matanuska-Susitna",
+            "Ashland", "Juneau"),
+   state = c("Alaska", "Arizona", rep("California", 13), rep("Florida", 5),
+             "Illinois", "Massachusetts", rep("Michigan", 2), "Minnesota",
+             "Nevada", rep("New York", 4), rep("Ohio", 2), rep("Pennsylvania", 2),
+             "South Dakota", rep("Texas", 5), rep("Utah", 2), "Virginia",
+             "Washington", "Wyoming", "New York", "North Carolina", "Virginia",
+             "Alaska", "Wisconsin", "Alaska")
+ )
> 
> for (i in seq(nrow(patch))) {

+   if (!(patch$name[i] %in% levels(useful.data$county)))
+     levels(useful.data$county) = c(levels(useful.data$county), patch$name[i])
+   useful.data[useful.data$fips_county_code == patch$code[i], "county"] = patch$name[i]
+   useful.data[useful.data$fips_county_code == patch$code[i], "state"] = patch$state[i]

+ }#FOR

The final form of the fused data set.
LABEL TYPE MISSING VALUES
fips_county_code factor 0.000
week ordered 0.000
state factor 0.000
county factor 0.000
bachelor_degree_or_higher numeric 0.086
college_or_associate_degree numeric 0.086
unemployment numeric 0.086
household_income numeric 0.086
Metro factor 0.086
RUCC factor 0.086
population numeric 0.086
vote_rate numeric 0.086
household_size numeric 0.086
population_density numeric 0.086
bachelor_degree_or_higher_b factor 0.086
college_or_associate_degree_b factor 0.086
POVALL numeric 0.086
POVALL_b factor 0.086
unemployment_b factor 0.086
household_income_b factor 0.086
NO2_mean numeric 0.555
NO2_max numeric 0.555
SO2_mean numeric 0.532
SO2_max numeric 0.532
O3_mean numeric 0.135
O3_max numeric 0.135
PM25_mean numeric 0.112
PM25_max numeric 0.112
PM10_mean numeric 0.502
PM10_max numeric 0.502
anxiety numeric 0.000
depression numeric 0.000
atopic_dermatitis numeric 0.000
obesity numeric 0.000
sleep_disorder numeric 0.000
temperature_mean numeric 0.143
temperature_max numeric 0.145
temperature_min numeric 0.145
windspeed_mean numeric 0.143
windspeed_max numeric 0.145
windspeed_min numeric 0.145
precipitations_mean numeric 0.143
precipitations_max numeric 0.145
precipitations_min numeric 0.145
longitude numeric 0.000
latitude numeric 0.000

The observations are spread across different weeks as follows.

plot of chunk observations_per_week
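The code behind this plot is not shown in the notebook; a similar figure can be produced, for instance, with lattice's barchart():

# number of observations recorded in each week, as a bar chart.
barchart(table(droplevels(useful.data$week)))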

The observations are spread across different counties as follows. Observations from outlying territories (Puerto Rico, Northern Mariana Islands, American Samoa) are excluded from the plot and appear as NA in the legend. The largest possible number of observations for a county is the number of weeks still present in the data (136).

plot of chunk observations_per_county
Last updated on Mon Apr 28 13:54:54 2025 with bnlearn 5.0-20240418 and R version 4.3.2 (2023-10-31).