where \(StoreCount_i\) is the number of stores in county \(i\), \(MedianIncome_i\) is median household income collected from the US Census for county \(i\), \(\beta_0\) is the intercept, and \(\beta_1\) is the slope.
What is the conceptual interpretation of \(\beta_0\)?
What is the conceptual interpretation of \(\beta_1\)?
Data Processing
We will continue working with the convenience store data from the previous weeks. You do not need to run the following code; it is included here as a reference.
R script that generates the dataframe county_demo to be used in the regression analysis.
# These are the lab notes for Week 14
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

setwd("Set your working directory")
getwd() # Confirm you are in the correct working directory

# =============== Option 1 =============== #
# ------- Working with Census Data ------- #
# ========================================= #

# Research Question:
# What is the relationship between **median income** and **total store count** at the county level?

# Part 1: Read in Spatial Data

# Read in convenience store location dataset
store_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_info.csv.gz")

# Read in the shopper sales dataset
shopper_info <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/shopper_info.csv.gz")

# Convert data frame to a points simple feature object
store_geo <- store_raw %>%
  # First, we subset the variables `store_id` and the coordinates.
  select(store_id, latitude, longitude) %>%
  # Tells R that these are essentially x and y coordinates in a certain projection (represented by the code 4326).
  st_as_sf(coords = c("longitude", "latitude"), crs = st_crs(4326))

# Fetch data for county polygons using the 'tigris' package
us_co <- tigris::counties(cb = T, class = "sf") %>%
  janitor::clean_names() %>%
  mutate(aland = aland / 2.59e+6) %>%
  st_transform(4326)

# Look at the distinct state list by statefp code
unique_statefp <- us_co %>%
  st_set_geometry(NULL) %>%   # Remove geometry column
  select(stusps, statefp) %>% # Select the columns of interest
  distinct() %>%              # Remove duplicates, keeping only unique rows
  arrange(stusps)             # Arrange alphabetically by stusps

# Filter out counties from American Samoa (60), Guam (66), Saipan Municipality (69), Puerto Rico (72), Virgin Islands (78), Alaska (02), and Hawaii (15)
us_co_filtered <- us_co %>%
  filter(!statefp %in% c("60", "66", "72", "02", "15", "69", "78"))

# Join county to store_geo
store_co_geo <- st_join(store_geo, us_co_filtered, join = st_intersects)

# Aggregate store count by county
store_count_by_county <- store_co_geo %>%
  group_by(geoid) %>%
  summarize(store_count = n(), # Count the number of stores in each county
            .groups = 'drop')  # Drop groups to prevent regrouping

# Join aggregated data back with county geometries for mapping
county_store_map <- st_join(us_co_filtered, store_count_by_county, join = st_intersects)

# Create store sales by county
sales_by_county <- shopper_info %>%
  select(store_id, unit_price, unit_quantity) %>%
  mutate(sales = unit_price * unit_quantity) %>%
  group_by(store_id) %>%
  summarize(total_sales = sum(sales), .groups = 'drop') %>%
  left_join(store_co_geo, by = "store_id") %>%
  group_by(geoid) %>%
  summarize(county_sales = sum(total_sales), .groups = 'drop')

# Part 2: Bring in Census Data

# Load variable metadata for the 2022 ACS 5-year estimates (used to look up variable codes and labels)
census_22 <- load_variables(2022, "acs5", cache = TRUE)

# --- Household income --- #
# Download median household income data from the 2022 ACS 5-year estimates at the county level
census_hhi <- get_acs(
  geography = "county",                    # Get data for all U.S. counties
  survey = "acs5",                         # Use 5-year ACS data (more reliable for small areas)
  variables = c(medincome = "B19013_001"), # B19013_001 = median household income
  state = NULL,                            # NULL = include all states (not just one)
  year = 2022                              # Use the most recent available year
)

# Clean column names and keep only GEOID and the income estimate, renamed as 'hhi'
hhi <- census_hhi %>%
  clean_names() %>%
  select(geoid, hhi = estimate)

# --- Population --- #
# Download population data from the 2022 ACS 5-year estimates at the county level
census_pop <- get_acs(
  geography = "county",              # Get data for all U.S. counties
  survey = "acs5",                   # Use 5-year ACS data (more reliable for small areas)
  variables = c(pop = "B01003_001"), # B01003_001 = total population
  state = NULL,                      # NULL = include all states (not just one)
  year = 2022                        # Use the most recent available year
)

# Clean column names and keep only GEOID and the population estimate, renamed as 'pop'
pop <- census_pop %>%
  clean_names() %>%
  select(geoid, pop = estimate)

# Join total population (pop) to median household income (hhi) using county GEOID
demo <- hhi %>%
  left_join(pop, by = "geoid")

# Join median household income (hhi) and total population (pop) to the county-store map using county GEOID
county_demo <- county_store_map %>%
  select(-geoid.y) %>%
  rename(geoid = geoid.x) %>%
  inner_join(demo, by = "geoid") %>%           # Join on county identifier (GEOID)
  mutate(popden = pop / aland) %>%             # Calculate population density (population over land in square miles)
  left_join(sales_by_county, by = "geoid") %>%
  st_set_geometry(NULL)                        # Drop geometry to simplify data manipulation

length(county_demo$geoid)

write_csv(county_demo, "county_demo.csv")
Exploratory Data Analysis
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

# Read in data
county_demo <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/county_demo.csv")
Let’s briefly explore the data:
Summary statistics
# Data summary
datasummary_skim(county_demo, type = "numeric")
datasummary_skim(county_demo, type = "categorical")
Plot distributions
county_demo %>%
  mutate(store_count = replace_na(store_count, 0)) %>% # Replace counties that have "NA" values with "0"
  filter(!is.na(hhi)) %>%                              # Drop counties with missing household income
  select(hhi, store_count) %>%
  ggpairs()
Zoom in to see the scatter plot of points.
# Zoom in on scatter plot
county_demo_slim <- county_demo %>%
  mutate(store_count = replace_na(store_count, 0)) %>%
  filter(!is.na(hhi)) %>% # Drop counties with missing household income
  select(hhi, popden, store_count)

# Basic scatter plot with ggplot
scatter_plot <- ggplot(county_demo_slim, aes(x = hhi, y = store_count)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Median Household Income and Number of Stores",
       x = "Median Household Income (2022)",
       y = "Store Count (County Level)") +
  theme_bw(base_size = 15)

scatter_plot # Print scatter plot
Univariate Regression Analysis
Now we can estimate a regression model to test our hypothesis. The lm() function (linear model) estimates a linear regression. lm() requires a formula and a dataset. The formula mirrors how we would write the equation, with ~ standing in for the equals sign, and lm() estimates the coefficients of that model. We specify data = county_demo_slim to indicate which dataset we want it to use to estimate the model coefficients.
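Before running the model on our county data, here is a minimal, self-contained sketch of the lm() syntax using made-up placeholder data (the object names toy and toy_fit are illustrative only and are not part of the lab):

# A toy example of the lm() interface (placeholder data, not the lab data)
toy <- data.frame(x = 1:10, y = 2 + 3 * (1:10) + rnorm(10)) # Simulate y as a linear function of x plus noise
toy_fit <- lm(y ~ x, data = toy) # "y ~ x" reads: model y as a function of x
summary(toy_fit)                 # Coefficient estimates, standard errors, and R-squared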
model1 <- lm(store_count ~ hhi, county_demo_slim) # Modeling the relationship between household income and store count at the county level. Store it as "model1"

# Create a formatted regression table
uni_out <- modelsummary(
  model1,
  fmt = 5,         # Show 5 decimal places for estimates
  stars = TRUE,    # Add significance stars to estimates
  coef_rename = c( # Rename coefficients for clarity
    "(Intercept)" = "Intercept",
    "hhi" = "Median Income"
  ),
  gof_map = c("nobs", "r.squared") # Include only number of observations and R2
)

uni_out # Print regression output
The modelsummary() function is used to create clean, customizable regression tables from one or more model objects (like those from lm()). It formats the output in a way that’s clean and supports multiple output types, including MS Word.
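For example, if you want the table saved as a Word document, modelsummary() accepts an output argument with a .docx file path; the file name below is just an example.

# Write the regression table directly to a Word document (example file name)
modelsummary(model1, stars = TRUE, output = "model1_results.docx")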
Interpret the coefficients. What does the \(\beta_0\) coefficient (Intercept) mean? What does the \(\beta_1\) coefficient mean?
How credible are the estimates? What are the results of the statistical hypothesis test?
You can export your regression results as a nicer-looking table using the modelsummary() function. See the documentation for additional options, such as including p-values or confidence intervals.
# Output the regression table
modelsummary(model1)
modelsummary(model1, statistic = "conf.int")
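As one example of those options, setting the statistic argument to "p.value" reports p-values beneath the estimates instead of the default standard errors:

# Report p-values instead of standard errors
modelsummary(model1, statistic = "p.value")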
Multivariate Regression Analysis
Based on our results for the univariate regression, our model fit is low. Let's try adding another explanatory variable, population density, and estimate the following regression:

\[ StoreCount_i = \beta_0 + \beta_1 ~ MedianIncome_i + \beta_2 ~ PopulationDensity_i + \varepsilon_i \]
model2 <- lm(store_count ~ hhi + popden, county_demo_slim) # Modeling the relationship between household income, pop density, and store count at the county level. Store it as "model2"

multi_out <- modelsummary(
  model2,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "hhi" = "Median Inc",
                  "popden" = "Population Density"),
  gof_map = c("nobs", "r.squared")
)

multi_out # Print regression output
The \(R^2\) increased from 0.046 to 0.054, which means that adding population density improved the fit of our model.
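A convenient way to see this change in fit is to pass both models to modelsummary() as a named list, which places them side by side in one table (the column labels below are our own choice):

# Compare the univariate and multivariate models side by side
modelsummary(list("Univariate" = model1, "Multivariate" = model2),
             stars = TRUE,
             gof_map = c("nobs", "r.squared"))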
Interpret the coefficients. What does the \(\beta_0\) coefficient (Intercept) mean? What do the \(\beta_1\) and \(\beta_2\) coefficients mean?
How credible are the estimates? What are the results of the statistical hypothesis test?
# Output the regression table
modelsummary(model2)
modelsummary(model2, statistic = "conf.int")
Finally, we compare the lines of best fit from the univariate and multivariate models.
# Advanced scatter plot with ggplot (additional formatting)
scatter_plot <- ggplot(county_demo_slim, aes(x = hhi, y = store_count)) +
  geom_point(alpha = .6) + # Add points for scatter plot
  labs(subtitle = "Relationship between Median Household Income and Number of Stores",
       x = "Median Household Income (2022)",
       y = "Store Count (County Level)") +
  theme_bw(base_size = 15) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) + # <- x-axis line at y = 0
  theme(axis.line = element_line(color = "black"), # Draw axis lines
        panel.border = element_blank()) +          # Remove default plot border
  scale_x_continuous(breaks = seq.int(0, 10, 2),
                     expand = c(0, 0))

scatter_plot # Print scatter plot

# Create new data for predictions
pred_data1 <- county_demo_slim %>%
  dplyr::select(hhi) %>%
  distinct() %>%
  arrange(hhi) %>%
  mutate(fitted = predict(model1, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = hhi, y = fitted), color = "blue", linewidth = 1)

# Create new data for predictions
pred_data2 <- county_demo_slim %>%
  dplyr::select(hhi, popden) %>%
  distinct() %>%
  arrange(hhi) %>%
  mutate(fitted = predict(model2, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = hhi, y = fitted), color = "blue", linewidth = 1) +
  geom_line(data = pred_data2, aes(x = hhi, y = fitted), color = "red", linewidth = 1)
Part 2: Total Rainfall and Total Sales
We will answer this question by estimating the following regression:

\[ TotalSales_i = \beta_0 + \beta_1 ~ TotalRainfall_i + \varepsilon_i \]

where \(TotalSales_i\) is the total sales generated by convenience stores in county \(i\), \(TotalRainfall_i\) is the total inches of rain in county \(i\), \(\beta_0\) is the intercept, and \(\beta_1\) is the slope.
What is the conceptual interpretation of \(\beta_0\)?
What is the conceptual interpretation of \(\beta_1\)?
Data Processing
We will continue working with the convenience store data from the previous weeks. You do not need to run the following code; it is included here as a reference.
R script that generates the dataframe store_sales_weather to be used in the regression analysis.
# These are the lab notes for Week 14
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

setwd("Set your working directory")
getwd() # Confirm you are in the correct working directory

# =============== Option 2 =============== #
# ------- Working with Weather Data ------ #
# ========================================= #

# Research Question:
# What is the relationship between **total monthly rainfall** and **total monthly sales** at the county level?

# Part 1: Read in raw data
shopper_info <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/shopper_info.csv.gz")

# Read in the weather data
weather_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/gridmet_county_daily_2023-2023.csv.gz")

# Read in convenience store location dataset
store_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_info.csv.gz")

# Convert store coordinates to a spatial data frame
store_geo <- store_raw %>%
  # First, we subset the variables `store_id` and the coordinates.
  select(store_id, latitude, longitude) %>%
  # Tells R that these are essentially x and y coordinates in a certain projection (represented by the code 4326).
  st_as_sf(coords = c("longitude", "latitude"), crs = st_crs(4326))

# Fetch data for county polygons using the 'tigris' package
us_co_filtered <- tigris::counties(cb = T, class = "sf") %>%
  janitor::clean_names() %>%
  mutate(aland = aland / 2.59e+6) %>%
  st_transform(4326) %>%
  filter(!statefp %in% c("60", "66", "72", "02", "15", "69", "78")) # Keep only counties in the contiguous US

# Join county to store_geo
store_co_geo <- st_join(store_geo, us_co_filtered, join = st_intersects)

# Aggregate daily sales by store from transaction-level data
store_sales <- shopper_info %>%
  mutate(measure_date = as_date(date_time)) %>% # Extract date component from timestamp
  group_by(store_id, measure_date) %>%          # Group by store and date
  summarize(
    sales = sum(unit_price * unit_quantity, na.rm = TRUE) # Compute total sales (price × quantity)
  ) %>%
  ungroup() # Remove grouping to clean up the result

# Add county GEOID to the store-level sales data (so we can link to county-level weather data)
store_sales_co <- store_co_geo %>%
  st_set_geometry(NULL) %>%
  select(store_id, geoid) %>%              # Keep only store_id (for joining) and geoid (for county-level linkage)
  inner_join(store_sales, by = "store_id") # Join to sales data using store_id; keep only stores with known county info

# Reshape weather data from long to wide format (each variable becomes a column)
weather_wide <- weather_raw %>%
  pivot_wider(id_cols = c(county, date), # Keep county and date as identifiers
              names_from = variable,     # Each unique variable becomes a new column
              values_from = value)       # Fill those columns with the corresponding values

# Join daily weather data to the full county list using county identifiers
co_weather <- us_co_filtered %>%
  st_set_geometry(NULL) %>%
  select(geoid) %>%
  right_join(weather_wide, by = c("geoid" = "county"))

length(unique(co_weather$geoid)) # N=3,104

store_sales_weather_co <- co_weather %>%
  left_join(store_sales_co, by = c("date" = "measure_date", "geoid" = "geoid")) %>%
  filter(month(date) == 7) %>%
  arrange(geoid, date, store_id)

length(unique(store_sales_weather_co$geoid)) # N=3,104

write_csv(store_sales_weather_co, "store_sales_weather_co.csv")
Exploratory Data Analysis
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

# Read in data
store_sales_weather_co <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_sales_weather_co.csv")

# Aggregate store sales and weather data to the month
sales_weather_month <- store_sales_weather_co %>%
  mutate(month = month(date)) %>%
  mutate(year = year(date)) %>%
  mutate(day = day(date)) %>%
  group_by(geoid, month) %>%
  summarize(total_rainfall = sum(pr_precipitation_amount, na.rm = TRUE),
            avg_humidity = mean(rmax_relative_humidity, na.rm = TRUE),
            avg_temp = mean(tmmx_air_temperature, na.rm = TRUE),
            total_sales = sum(sales, na.rm = TRUE),
            .groups = 'drop')

length(unique(sales_weather_month$geoid)) # N=3,104

write_csv(sales_weather_month, "sales_weather_month.csv")
Let’s briefly explore the data:
Summary statistics
# Data summary
datasummary_skim(sales_weather_month, type = "numeric")
datasummary_skim(sales_weather_month, type = "categorical")
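As in Part 1, a ggpairs() plot gives a quick look at the pairwise distributions and correlations before zooming in; this mirrors the corresponding step in the full lab script at the end of these notes.

# Plot pairwise distributions and correlations
sales_weather_month %>%
  select(total_rainfall, avg_temp, total_sales) %>%
  ggpairs()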
# Zoom in on scatter plot
sales_weather_month_slim <- sales_weather_month %>%
  select(total_rainfall, avg_temp, total_sales, month)

# Basic scatter plot with ggplot
scatter_plot <- ggplot(sales_weather_month_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15)

scatter_plot # Print scatter plot
Univariate Regression Analysis
Now we can estimate a regression model to test our hypothesis. The lm() function (linear model) estimates a linear regression. lm() requires a formula and a dataset. The formula mirrors how we would write the equation, with ~ standing in for the equals sign, and lm() estimates the coefficients of that model. We specify data = sales_weather_month_slim to indicate which dataset we want it to use to estimate the model coefficients.
model1 <- lm(total_sales ~ total_rainfall, sales_weather_month_slim) # Modeling the relationship between total rainfall and store sales at the county level. Store it as "model1"

uni_out <- modelsummary(
  model1,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall"),
  gof_map = c("nobs", "r.squared")
)

uni_out # Print regression output
The modelsummary() function is used to create clean, customizable regression tables from one or more model objects (like those from lm()). It formats the output in a way that’s clean and supports multiple output types, including MS Word.
Interpret the coefficients. What does the \(\beta_0\) coefficient (Intercept) mean? What does the \(\beta_1\) coefficient on total rainfall mean?
How credible are the estimates? What are the results of the statistical hypothesis test?
You can export your regression results as a nicer-looking table using the modelsummary() function. See the documentation for additional options, such as including p-values or confidence intervals.
# Output the regression table
modelsummary(model1)
modelsummary(model1, statistic = "conf.int")
Multivariate Regression Analysis
Based on our results for the univariate regression, our model fit is low. Let's try adding another explanatory variable, average temperature, and estimate the following regression:

\[ TotalSales_i = \beta_0 + \beta_1 ~ TotalRainfall_i + \beta_2 ~ AvgTemp_i + \varepsilon_i \]
model2 <- lm(total_sales ~ total_rainfall + avg_temp, sales_weather_month_slim) # Modeling the relationship between total rainfall, average temperature, and total sales at the county level. Store it as "model2"

multi_out <- modelsummary(
  model2,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall",
                  "avg_temp" = "Average Temperature"),
  gof_map = c("nobs", "r.squared")
)

multi_out # Print regression output
The \(R^2\) increased from 0.247 to 0.265, which means that adding average air temperature improved the fit of our model.
Interpret the coefficients. What does the \(\beta_0\) coefficient (Intercept) mean? What do the \(\beta_1\) and \(\beta_2\) coefficients mean?
How credible are the estimates? What are the results of the statistical hypothesis test?
# Output the regression table
modelsummary(model2)
modelsummary(model2, statistic = "conf.int")
Finally, we compare the lines of best fit from the univariate and multivariate models.
# Advanced scatter plot with ggplot (additional formatting)
scatter_plot <- ggplot(sales_weather_month_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) + # <- x-axis line at y = 0
  theme(panel.border = element_blank(),              # Remove border
        axis.line.y = element_line(color = "black"), # Only y-axis line
        axis.line.x = element_blank()) +             # Remove x-axis line at y = 0
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) # e.g., 100,000 → 100K

scatter_plot # Print scatter plot

# Create new data for predictions
pred_data1 <- sales_weather_month_slim %>%
  dplyr::select(total_rainfall) %>%
  distinct() %>%
  arrange(total_rainfall) %>%
  mutate(fitted = predict(model1, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1)

# Create new data for predictions
pred_data2 <- sales_weather_month_slim %>%
  dplyr::select(total_rainfall, avg_temp) %>%
  distinct() %>%
  arrange(total_rainfall, avg_temp) %>%
  mutate(fitted = predict(model2, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1) +
  geom_line(data = pred_data2, aes(x = total_rainfall, y = fitted), color = "red", linewidth = 1)
Regression Analysis with Fixed Effects
In this section, we will explore how total rainfall affects store sales, controlling for day-specific effects (i.e., adding fixed effects for each day). This approach accounts for daily shocks, such as special events, holidays, or weather patterns, that could influence store sales independently of rainfall.
The regression equation for this fixed effects model is:
\[ TotalSales_i = \beta_0 + \beta_1 ~ TotalRainfall_i + \gamma_{Day(i)} + \varepsilon_i \] where \(\gamma_{Day(i)}\) controls for unobserved day-specific factors common across all counties.
Data Processing
First, we need to aggregate store_sales_weather_co to the day, rather than the month:
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

# Read in data
store_sales_weather_co <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_sales_weather_co.csv")

# Aggregate store sales and weather data to the day
sales_weather_day <- store_sales_weather_co %>%
  mutate(month = month(date)) %>%
  mutate(year = year(date)) %>%
  mutate(day = day(date)) %>%
  group_by(geoid, day) %>%
  summarize(total_rainfall = sum(pr_precipitation_amount),
            avg_humidity = mean(rmax_relative_humidity),
            avg_temp = mean(tmmx_air_temperature),
            total_sales = sum(sales),
            .groups = 'drop')

# Keep only the variables used in the fixed effects regression
sales_weather_day_slim <- sales_weather_day %>%
  select(total_rainfall, avg_temp, total_sales, day)
Estimating the Model
Once the data are properly aggregated, we can estimate a linear regression model that relates total store sales to total rainfall, while controlling for day fixed effects:
model1_fe <- lm(total_sales ~ total_rainfall + factor(day), sales_weather_day_slim) # Modeling the relationship between total rainfall and store sales at the county level. Store it as "model1_fe"

fe_out <- modelsummary(
  model1_fe,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall"),
  gof_map = c("nobs", "r.squared")
)

fe_out # Print regression output
In this setup:
total_rainfall captures the effect of rainfall on store sales, holding daily factors constant.
factor(day) controls for unobserved daily shocks that are common across counties; the snippet below shows how R expands this factor into dummy variables.
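To see what factor(day) does under the hood, you can inspect the design matrix R builds for the regression: each day except the omitted baseline becomes its own 0/1 indicator column. This is purely an illustrative check and assumes sales_weather_day_slim was created as shown above.

# Peek at the dummy variables that factor(day) creates (the first day is the omitted baseline)
head(model.matrix(~ total_rainfall + factor(day), data = sales_weather_day_slim))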
The results are formatted by modelsummary() and can be exported to a Word document for easier reporting by supplying an output file path, as shown below.
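A minimal sketch of that export, assuming you want the table in a file called fe_results.docx (the file name is arbitrary):

# Write the fixed effects regression table to a Word document
modelsummary(model1_fe,
             stars = TRUE,
             coef_rename = c("(Intercept)" = "Intercept",
                             "total_rainfall" = "Total Rainfall"),
             gof_map = c("nobs", "r.squared"),
             output = "fe_results.docx")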
Lab Script for Week 14: Introduction to Regression in R
# These are the lab notes for Week 14
library(pacman)
p_load(tidyverse, modelsummary, sf, progress, janitor, tigris, mapview, tidycensus, dplyr, GGally, scales)

setwd("Set your working directory")
getwd() # Confirm you are in the correct working directory

# =============== Option 1 =============== #
# ------- Working with Census Data ------- #
# ========================================= #

# Research Question:
# What is the relationship between **median income** and **total store count** at the county level?

# Part 1: Read in Spatial Data

# Read in convenience store location dataset
store_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_info.csv.gz")

# Read in the shopper sales dataset
shopper_info <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/shopper_info.csv.gz")

# Convert data frame to a points simple feature object
store_geo <- store_raw %>%
  # First, we subset the variables `store_id` and the coordinates.
  select(store_id, latitude, longitude) %>%
  # Tells R that these are essentially x and y coordinates in a certain projection (represented by the code 4326).
  st_as_sf(coords = c("longitude", "latitude"), crs = st_crs(4326))

# Fetch data for county polygons using the 'tigris' package
us_co <- tigris::counties(cb = T, class = "sf") %>%
  janitor::clean_names() %>%
  mutate(aland = aland / 2.59e+6) %>%
  st_transform(4326)

# Look at the distinct state list by statefp code
unique_statefp <- us_co %>%
  st_set_geometry(NULL) %>%   # Remove geometry column
  select(stusps, statefp) %>% # Select the columns of interest
  distinct() %>%              # Remove duplicates, keeping only unique rows
  arrange(stusps)             # Arrange alphabetically by stusps

# Filter out counties from American Samoa (60), Guam (66), Saipan Municipality (69), Puerto Rico (72), Virgin Islands (78), Alaska (02), and Hawaii (15)
us_co_filtered <- us_co %>%
  filter(!statefp %in% c("60", "66", "72", "02", "15", "69", "78"))

# Join county to store_geo
store_co_geo <- st_join(store_geo, us_co_filtered, join = st_intersects)

# Aggregate store count by county
store_count_by_county <- store_co_geo %>%
  group_by(geoid) %>%
  summarize(store_count = n(), # Count the number of stores in each county
            .groups = 'drop')  # Drop groups to prevent regrouping

# Join aggregated data back with county geometries for mapping
county_store_map <- st_join(us_co_filtered, store_count_by_county, join = st_intersects)

# Create store sales by county
sales_by_county <- shopper_info %>%
  select(store_id, unit_price, unit_quantity) %>%
  mutate(sales = unit_price * unit_quantity) %>%
  group_by(store_id) %>%
  summarize(total_sales = sum(sales), .groups = 'drop') %>%
  left_join(store_co_geo, by = "store_id") %>%
  group_by(geoid) %>%
  summarize(county_sales = sum(total_sales), .groups = 'drop')

# Part 2: Bring in Census Data

# Load variable metadata for the 2022 ACS 5-year estimates (used to look up variable codes and labels)
census_22 <- load_variables(2022, "acs5", cache = TRUE)

# --- Household income --- #
# Download median household income data from the 2022 ACS 5-year estimates at the county level
census_hhi <- get_acs(
  geography = "county",                    # Get data for all U.S. counties
  survey = "acs5",                         # Use 5-year ACS data (more reliable for small areas)
  variables = c(medincome = "B19013_001"), # B19013_001 = median household income
  state = NULL,                            # NULL = include all states (not just one)
  year = 2022                              # Use the most recent available year
)

# Clean column names and keep only GEOID and the income estimate, renamed as 'hhi'
hhi <- census_hhi %>%
  clean_names() %>%
  select(geoid, hhi = estimate)

# --- Population --- #
# Download population data from the 2022 ACS 5-year estimates at the county level
census_pop <- get_acs(
  geography = "county",              # Get data for all U.S. counties
  survey = "acs5",                   # Use 5-year ACS data (more reliable for small areas)
  variables = c(pop = "B01003_001"), # B01003_001 = total population
  state = NULL,                      # NULL = include all states (not just one)
  year = 2022                        # Use the most recent available year
)

# Clean column names and keep only GEOID and the population estimate, renamed as 'pop'
pop <- census_pop %>%
  clean_names() %>%
  select(geoid, pop = estimate)

# Join total population (pop) to median household income (hhi) using county GEOID
demo <- hhi %>%
  left_join(pop, by = "geoid")

# Join median household income (hhi) and total population (pop) to the county-store map using county GEOID
county_demo <- county_store_map %>%
  select(-geoid.y) %>%
  rename(geoid = geoid.x) %>%
  inner_join(demo, by = "geoid") %>%           # Join on county identifier (GEOID)
  mutate(popden = pop / aland) %>%             # Calculate population density (population over land in square miles)
  left_join(sales_by_county, by = "geoid") %>%
  st_set_geometry(NULL)                        # Drop geometry to simplify data manipulation

length(county_demo$geoid)

write_csv(county_demo, "C:/Users/lachenar/OneDrive - Colostate/Documents/GitProjectsWithR/csu-arec-330.github.io/materials/unit_03/inputs/county_demo.csv")

# Part 3: Plot the Data

# Data summary
datasummary_skim(county_demo, type = "numeric")
datasummary_skim(county_demo, type = "categorical")

# EDA with ggpairs
county_demo %>%
  mutate(store_count = replace_na(store_count, 0)) %>%
  filter(!is.na(hhi)) %>% # Drop counties with missing household income
  select(hhi, popden, store_count) %>%
  ggpairs()

# Zoom in on scatter plot
county_demo_slim <- county_demo %>%
  mutate(store_count = replace_na(store_count, 0)) %>%
  filter(!is.na(hhi)) %>% # Drop counties with missing household income
  select(hhi, popden, store_count)

# Basic scatter plot with ggplot
scatter_plot <- ggplot(county_demo_slim, aes(x = hhi, y = store_count)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Median Household Income and Number of Stores",
       x = "Median Household Income (2022)",
       y = "Store Count (County Level)") +
  theme_bw(base_size = 15)

scatter_plot # Print scatter plot

# Advanced scatter plot with ggplot (additional formatting)
scatter_plot <- ggplot(county_demo_slim, aes(x = hhi, y = store_count)) +
  geom_point(alpha = .6) + # Add points for scatter plot
  labs(subtitle = "Relationship between Median Household Income and Number of Stores",
       x = "Median Household Income (2022)",
       y = "Store Count (County Level)") +
  theme_bw(base_size = 15) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) + # <- x-axis line at y = 0
  theme(axis.line = element_line(color = "black"), # Draw axis lines
        panel.border = element_blank()) +          # Remove default plot border
  scale_x_continuous(breaks = seq.int(0, 10, 2),
                     expand = c(0, 0))

scatter_plot # Print scatter plot

# Part 4: Estimate Regression - univariate
model1 <- lm(store_count ~ hhi, county_demo_slim) # Modeling the relationship between household income and store count at the county level. Store it as "model1"

uni_out <- modelsummary(
  model1,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "hhi" = "Median Inc"),
  gof_map = c("nobs", "r.squared")
)

uni_out # Print regression output

# Create new data for predictions
pred_data1 <- county_demo_slim %>%
  dplyr::select(hhi) %>%
  distinct() %>%
  arrange(hhi) %>%
  mutate(fitted = predict(model1, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = hhi, y = fitted), color = "blue", linewidth = 1)

# Estimate regression - multivariate
model2 <- lm(store_count ~ hhi + popden, county_demo_slim) # Modeling the relationship between household income, pop density, and store count at the county level. Store it as "model2"

multi_out <- modelsummary(
  model2,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "hhi" = "Median Inc",
                  "popden" = "Population Density"),
  gof_map = c("nobs", "r.squared")
)

multi_out # Print regression output

# Create new data for predictions
pred_data2 <- county_demo_slim %>%
  dplyr::select(hhi, popden) %>%
  distinct() %>%
  arrange(hhi) %>%
  mutate(fitted = predict(model2, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = hhi, y = fitted), color = "blue", linewidth = 1) +
  geom_line(data = pred_data2, aes(x = hhi, y = fitted), color = "red", linewidth = 1)

# =============== Option 2 =============== #
# ------- Working with Weather Data ------ #
# ========================================= #

# Research Question:
# What is the relationship between **total monthly rainfall** and **total monthly sales** at the county level?

# Part 1: Read in raw data
shopper_info <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/shopper_info.csv.gz")

# Read in the weather data
weather_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/gridmet_county_daily_2023-2023.csv.gz")

# Read in convenience store location dataset
store_raw <- read_csv("https://csu-arec-330.github.io/materials/unit_03/inputs/store_info.csv.gz")

# Convert store coordinates to a spatial data frame
store_geo <- store_raw %>%
  # First, we subset the variables `store_id` and the coordinates.
  select(store_id, latitude, longitude) %>%
  # Tells R that these are essentially x and y coordinates in a certain projection (represented by the code 4326).
  st_as_sf(coords = c("longitude", "latitude"), crs = st_crs(4326))

# Fetch data for county polygons using the 'tigris' package
us_co_filtered <- tigris::counties(cb = T, class = "sf") %>%
  janitor::clean_names() %>%
  mutate(aland = aland / 2.59e+6) %>%
  st_transform(4326) %>%
  filter(!statefp %in% c("60", "66", "72", "02", "15", "69", "78")) # Keep only counties in the contiguous US

# Join county to store_geo
store_co_geo <- st_join(store_geo, us_co_filtered, join = st_intersects)

# Aggregate daily sales by store from transaction-level data
store_sales <- shopper_info %>%
  mutate(measure_date = as_date(date_time)) %>% # Extract date component from timestamp
  group_by(store_id, measure_date) %>%          # Group by store and date
  summarize(
    sales = sum(unit_price * unit_quantity, na.rm = TRUE) # Compute total sales (price × quantity)
  ) %>%
  ungroup() # Remove grouping to clean up the result

# Add county GEOID to the store-level sales data (so we can link to county-level weather data)
store_sales_co <- store_co_geo %>%
  st_set_geometry(NULL) %>%
  select(store_id, geoid) %>%              # Keep only store_id (for joining) and geoid (for county-level linkage)
  inner_join(store_sales, by = "store_id") # Join to sales data using store_id; keep only stores with known county info

# Reshape weather data from long to wide format (each variable becomes a column)
weather_wide <- weather_raw %>%
  pivot_wider(id_cols = c(county, date), # Keep county and date as identifiers
              names_from = variable,     # Each unique variable becomes a new column
              values_from = value)       # Fill those columns with the corresponding values

# Join daily weather data to the full county list using county identifiers
co_weather <- us_co_filtered %>%
  st_set_geometry(NULL) %>%
  select(geoid) %>%
  right_join(weather_wide, by = c("geoid" = "county"))

length(unique(co_weather$geoid)) # N=3,104

store_sales_weather_co <- co_weather %>%
  left_join(store_sales_co, by = c("date" = "measure_date", "geoid" = "geoid")) %>%
  filter(month(date) == 7) %>%
  arrange(geoid, date, store_id)

length(unique(store_sales_weather_co$geoid)) # N=3,104

write_csv(store_sales_weather_co, "C:/Users/lachenar/OneDrive - Colostate/Documents/GitProjectsWithR/csu-arec-330.github.io/materials/unit_03/inputs/store_sales_weather_co.csv")

# ======================================== #
# ------ Monthly ------ #
# ======================================== #

# Aggregate store sales and weather data to the month
sales_weather_month <- store_sales_weather_co %>%
  mutate(month = month(date)) %>%
  mutate(year = year(date)) %>%
  mutate(day = day(date)) %>%
  group_by(geoid, month) %>%
  summarize(total_rainfall = sum(pr_precipitation_amount, na.rm = TRUE),
            avg_humidity = mean(rmax_relative_humidity, na.rm = TRUE),
            avg_temp = mean(tmmx_air_temperature, na.rm = TRUE),
            total_sales = sum(sales, na.rm = TRUE),
            .groups = 'drop')

length(unique(sales_weather_month$geoid)) # N=3,104

write_csv(sales_weather_month, "sales_weather_month.csv")

# Part 2: Plot the Data

# Pairwise plots with ggpairs
sales_weather_month %>%
  select(total_rainfall, avg_temp, total_sales) %>%
  ggpairs()

# Zoom in on scatter plot
sales_weather_month_slim <- sales_weather_month %>%
  select(total_rainfall, avg_temp, total_sales, month)

# Basic scatter plot with ggplot
scatter_plot <- ggplot(sales_weather_month_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15)

scatter_plot # Print scatter plot

# Advanced scatter plot with ggplot (additional formatting)
scatter_plot <- ggplot(sales_weather_month_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) + # <- x-axis line at y = 0
  theme(panel.border = element_blank(),              # Remove border
        axis.line.y = element_line(color = "black"), # Only y-axis line
        axis.line.x = element_blank()) +             # Remove x-axis line at y = 0
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) # e.g., 100,000 → 100K

scatter_plot # Print scatter plot

# Part 4: Estimate Regression - univariate
model1 <- lm(total_sales ~ total_rainfall, sales_weather_month_slim) # Modeling the relationship between total rainfall and store sales at the county level. Store it as "model1"

uni_out <- modelsummary(
  model1,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall"),
  gof_map = c("nobs", "r.squared")
)

uni_out # Print regression output

# Create new data for predictions
pred_data1 <- sales_weather_month_slim %>%
  dplyr::select(total_rainfall) %>%
  distinct() %>%
  arrange(total_rainfall) %>%
  mutate(fitted = predict(model1, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1)

# Estimate regression - multivariate
model2 <- lm(total_sales ~ total_rainfall + avg_temp, sales_weather_month_slim) # Modeling the relationship between total rainfall, average temperature, and total sales at the county level. Store it as "model2"

multi_out <- modelsummary(
  model2,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall",
                  "avg_temp" = "Average Temperature"),
  gof_map = c("nobs", "r.squared")
)

multi_out # Print regression output

# Create new data for predictions
pred_data2 <- sales_weather_month_slim %>%
  dplyr::select(total_rainfall, avg_temp) %>%
  distinct() %>%
  arrange(total_rainfall, avg_temp) %>%
  mutate(fitted = predict(model2, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1) +
  geom_line(data = pred_data2, aes(x = total_rainfall, y = fitted), color = "red", linewidth = 1)

# ======================================== #
# ------ Daily ------ #
# ======================================== #

sales_weather_day <- store_sales_weather_co %>%
  mutate(month = month(date)) %>%
  mutate(year = year(date)) %>%
  mutate(day = day(date)) %>%
  group_by(geoid, day) %>%
  summarize(total_rainfall = sum(pr_precipitation_amount),
            avg_humidity = mean(rmax_relative_humidity),
            avg_temp = mean(tmmx_air_temperature),
            total_sales = sum(sales),
            .groups = 'drop') %>%
  # Match on county FIPS
  full_join(us_co_filtered %>%
              st_set_geometry(NULL) %>% # Remove the spatial element
              select(geoid),
            by = "geoid") # Join in the county level fips codes

# Part 2: Plot the Data

# Pairwise plots with ggpairs
sales_weather_day %>%
  select(total_rainfall, avg_temp, total_sales) %>%
  ggpairs()

# Zoom in on scatter plot
sales_weather_day_slim <- sales_weather_day %>%
  select(total_rainfall, avg_temp, total_sales, day)

# Basic scatter plot with ggplot
scatter_plot <- ggplot(sales_weather_day_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15)

scatter_plot # Print scatter plot

# Advanced scatter plot with ggplot (additional formatting)
scatter_plot <- ggplot(sales_weather_day_slim, aes(x = total_rainfall, y = total_sales)) +
  geom_point(alpha = .6) +
  labs(subtitle = "Relationship between Total Rainfall and Total Sales",
       x = "Total Inches of Rain",
       y = "Total Sales (Dollars)") +
  theme_bw(base_size = 15) +
  geom_hline(yintercept = 0, color = "black", linewidth = 0.5) + # <- x-axis line at y = 0
  theme(panel.border = element_blank(),              # Remove border
        axis.line.y = element_line(color = "black"), # Only y-axis line
        axis.line.x = element_blank()) +             # Remove x-axis line at y = 0
  scale_x_continuous(expand = c(0, 0)) +
  scale_y_continuous(labels = label_number(scale_cut = cut_short_scale())) # e.g., 100,000 → 100K

scatter_plot # Print scatter plot

# Part 4: Estimate Regression - univariate
model1 <- lm(total_sales ~ total_rainfall, sales_weather_day_slim) # Modeling the relationship between total rainfall and store sales at the county level. Store it as "model1"

uni_out <- modelsummary(
  model1,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall"),
  gof_map = c("nobs", "r.squared")
)

uni_out # Print regression output

# Create new data for predictions
pred_data1 <- sales_weather_day_slim %>%
  dplyr::select(total_rainfall) %>%
  distinct() %>%
  arrange(total_rainfall) %>%
  mutate(fitted = predict(model1, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1)

# What if we control for day?
model1_fe <- lm(total_sales ~ total_rainfall + factor(day), sales_weather_day_slim) # Modeling the relationship between total rainfall and store sales at the county level, controlling for day. Store it as "model1_fe"

fe_out <- modelsummary(
  model1_fe,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall"),
  gof_map = c("nobs", "r.squared")
)

fe_out # Print regression output

# Estimate regression - multivariate
model2_fe <- lm(total_sales ~ total_rainfall + avg_temp + factor(day), sales_weather_day_slim) # Modeling the relationship between total rainfall, average temperature, and total sales at the county level, controlling for day. Store it as "model2_fe"

multi_out_fe <- modelsummary(
  model2_fe,
  fmt = 5, # Format estimate so it shows five decimal places
  stars = TRUE,
  coef_rename = c("(Intercept)" = "Intercept",
                  "total_rainfall" = "Total Rainfall",
                  "avg_temp" = "Average Temperature"),
  gof_map = c("nobs", "r.squared")
)

multi_out_fe # Print regression output

# Create new data for predictions
pred_data2 <- sales_weather_day_slim %>%
  dplyr::select(total_rainfall, avg_temp) %>%
  distinct() %>%
  arrange(total_rainfall, avg_temp) %>%
  mutate(fitted = predict(model2, newdata = .))

# Add to plot
scatter_plot +
  geom_line(data = pred_data1, aes(x = total_rainfall, y = fitted), color = "blue", linewidth = 1) +
  geom_line(data = pred_data2, aes(x = total_rainfall, y = fitted), color = "red", linewidth = 1)