Seasonal Adjustment with R and X-13
An Example with Border Migration Data
This post is a work in progress to highlight the usage of X-13ARIMA-SEATS
in R
for seasonal adjust and forecasting via the seasonal
package. I will provide some skeleton code to seasonally adjust a few series, highlight the most relevant diagnostics practitioners look at for seasonal adjustment and forecast quality control, and provide some simple functions to extract the data from seasonal
package outputs and produce self-contained charts.
Overview
Methodology
For this exercise I will be using monthly encounters at the Southwest Border by U.S. Border Patrol.
# Packages to load
pacman::p_load(tidyverse,
pdftools,
tsbox,
seasonal)
# Assemble the historical CBP encounters ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
cbp_link <- "https://www.cbp.gov/sites/default/files/assets/documents/2021-Aug/U.S.%20Border%20Patrol%20Monthly%20Encounters%20%28FY%202000%20-%20FY%202020%29%20%28508%29.pdf"
if(!file.exists("./data/cbp_data.pdf")) {
download.file(url = cbp_link, destfile = "./data/cbp_data.pdf")
}
# Load the PDF
border_encounter <- pdf_text("data/cbp_data.pdf")
make_cbp_data <- function(i) {
message(i)
# Process the page
fy = border_encounter[i]
fy = strsplit(fy, "\n")[[1]]
fy = trimws(fy)
# Fiscal year
fiscal_year <- 2000 + i - 1
# Column headers
if(as.integer(fiscal_year) < 2019) {
fy_headers <- fy[grep("SECTOR", fy)]
fy_headers <- c(str_split_fixed(fy_headers, " {2,}", 13), "Yearly Total")
} else {
fy_headers <- c("SECTOR", month.name[10:12], month.name[1:9], "Yearly Total")
}
# Start line
start_row <- grep("SECTOR", fy)
# Collect into data frame
fy_data <- str_split_fixed(fy[start_row:length(fy)], " {2,}", 14)
fy_data <- data.frame(fy_data)
# Apply names
names(fy_data) <- fy_headers
# Assemble final data
fy_final <- fy_data %>%
filter(October != "" & `Yearly Total` != "") %>%
mutate(FY = fiscal_year)
if (as.integer(fiscal_year) < 2019) {
# Formerly named areas
formerly <- fy_data[grep("Marfa|McAllen", fy_data$SECTOR) - 1, ]
# Shift over 1 col
for(i in 14:1) {
formerly[i] <- formerly[i-1]
}
# Re-Add formerly named areas
formerly$SECTOR <- c("Big Bend", "Rio Grande Valley")
# Add the missing sectors
fy_final <- bind_rows(fy_final, formerly) %>%
mutate(FY = fiscal_year)
}
return(fy_final)
}
cbp_encounters <- map_dfr(1:length(border_encounter), make_cbp_data) %>%
# Sort by sector and FY
arrange(SECTOR, FY) %>%
# Drop the annual totals
select(-`Yearly Total`) %>%
# Collect into tidy form
pivot_longer(cols = October:September,
names_to = "mnth",
values_to = "encounters",
values_transform = list(encounters = parse_number)) %>%
# Make formatted date columns
mutate(mnth = match(mnth, month.name),
year = ifelse(mnth %in% 10:12,
FY - 1,
FY),
date = as.Date(sprintf("%s-%02.0f-01", year, mnth)))
# Assemble the new CBP encounters data ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
cbp_updt <- "https://www.cbp.gov/sites/default/files/assets/documents/2022-Oct/nationwide-encounters-fy20-fy22.csv"
table(cbp_encounters$SECTOR)
##
## Big Bend Blaine Buffalo Coastal Border
## 252 252 252 252
## Del Rio Detroit El Centro El Paso
## 252 252 252 252
## Grand Forks Havre Houlton Laredo
## 252 252 252 252
## Livermore Livermore* Miami Monthly Total
## 72 156 252 252
## New Orleans Northern Border Ramey Rio Grande Valley
## 252 252 252 252
## San Diego Southwest Border Spokane Swanton
## 252 252 252 252
## Tucson Yuma
## 252 252
cbp_new <- read_csv(cbp_updt) %>%
# Drop the OFO
filter(grepl("U.S. Border Patrol", Component)) %>%
# Grouping vars + renames
group_by(FY = `Fiscal Year`,
mnth = `Month (abbv)`,
border_region = `Land Border Region`,
aor_name = `Area of Responsibility`,
aor_code = `AOR (Abbv)`
) %>%
# Tabulate encounters by month
summarize(encounters = sum(`Encounter Count`), .groups = "drop") %>%
# Make time variables
mutate(mnth = stringi::stri_trans_general(mnth, id = "Title"),
mnth = match(mnth, month.abb),
year = ifelse(mnth %in% 10:12,
FY - 1,
FY),
date = as.Date(sprintf("%s-%02.0f-01", year, mnth)),
SECTOR = trimws(gsub("Sector", "", aor_name))) %>%
arrange(aor_name, date)
# Data validations ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
rows_different <- inner_join(cbp_new, cbp_encounters, by = c("SECTOR", "date")) %>%
filter(encounters.x != encounters.y) %>%
nrow(.)
cat("Rows different:", rows_different)
## Rows different: 0
# Assemble the new CBP encounters data ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::
sector_to_code <- cbp_new %>%
group_by(border_region, aor_code, aor_name) %>%
summarize(.groups = "drop") %>%
mutate(sector_name = trimws(gsub("Sector", "", aor_name)))
df_encounters <- bind_rows(cbp_new, cbp_encounters) %>%
# Subset cols of interest
select(sector_name = SECTOR,
date,
encounters) %>%
# Get sector codes
left_join(sector_to_code, by = "sector_name") %>%
# Drop duplicated entries
distinct(border_region, aor_code, sector_name, date, encounters) %>%
# Sort by sector and date
arrange(sector_name, date)
Seasonal Adjustment and Forecasting
Here I will demonstrate how to examine the border encounters series for seasonality, seasonally adjust them, and forecast them through the end of 2023. Additionally, I will highlight X-13
’s outlier detection abilities.
# Forecast Southwest
ts_encounters <- df_encounters %>%
# Keep the sector of interest
filter(border_region == "Southwest Land Border") %>%
# Subset unnecessary cols
select(aor_code, date, encounters) %>%
# Set as `ts`
ts_tslist()
## [time]: 'date' [value]: 'encounters'
Now that we have the border sector encounters set up as a list of time series, we can apply X-13ARIMA-SEATS
in “batch” mode, which will apply SEATS to each individual sector’s time series. With a number of series to run, batch mode can save a lot of time.
# X-13ARIMA-SEATS in batch
x13_southwest <- seas(x = ts_encounters)
# Here I define a function to collect the objects we want
# - The original time series
# - The seasonally adjusted series
# - The outlier specifications
# - The forecasts and their CI's
get_x13_objects <- function(x13_obj)
{
left_join(
ts_wide(ts_df(x13_obj$data)),
ts_wide(ts_df(x13_obj$x)),
by = "time") %>%
select(time, original = value, final) %>%
left_join(ts_df(outlier(x13_obj)), by = "time") %>%
rename(outlier = value) %>%
mutate(outlier = as.character(outlier)) %>%
bind_rows(ts_wide(ts_df(series(x13_obj, c("forecast.forecasts")))))
}
plot_sector_model <- function(df)
{
ggplot(df, aes(x = time, y = original, color = "Encounters")) +
geom_line(size = 1,
na.rm = T) +
geom_line(aes(y = final, color = "Adjusted"),
size = 1,
na.rm = T) +
geom_line(aes(y = forecast, color = "Adjusted"),
size = 1,
linetype = "dashed",
na.rm = T) +
# Color scheme
scale_color_manual("",
values = c("Encounters" = "darkblue",
"Adjusted" = "darkred")) +
# Axis formatting
scale_y_continuous("Border Encounters",
labels = scales::comma) +
scale_x_date("",
date_breaks = "24 months",
date_labels = "%b-%y") +
# General theme
theme_minimal() +
theme(legend.position = "bottom",
plot.caption = element_text(hjust = 0)) +
# Plot titling
labs(title = sprintf("Monthly Encounters by U.S. Border Patrol: %s Sector", "Rio Grande Valley"),
subtitle = "Forecast encounters in dashed line",
caption = "Source: U.S. Border Patrol\n Notes: Seasonally Adjusted using X-13ARIMA-SEATS")
}