Seasonal Adjustment with R and X-13

An Example with Border Migration Data

This post is a work in progress to highlight the usage of X-13ARIMA-SEATS in R for seasonal adjust and forecasting via the seasonal package. I will provide some skeleton code to seasonally adjust a few series, highlight the most relevant diagnostics practitioners look at for seasonal adjustment and forecast quality control, and provide some simple functions to extract the data from seasonal package outputs and produce self-contained charts.

Overview

Methodology

For this exercise I will be using monthly encounters at the Southwest Border by U.S. Border Patrol.

# Packages to load
pacman::p_load(tidyverse,
               pdftools,
               tsbox,
               seasonal)
# Assemble the historical CBP encounters ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::


cbp_link <- "https://www.cbp.gov/sites/default/files/assets/documents/2021-Aug/U.S.%20Border%20Patrol%20Monthly%20Encounters%20%28FY%202000%20-%20FY%202020%29%20%28508%29.pdf"


if(!file.exists("./data/cbp_data.pdf")) {
  download.file(url = cbp_link, destfile = "./data/cbp_data.pdf")
}

# Load the PDF
border_encounter <- pdf_text("data/cbp_data.pdf")



make_cbp_data <- function(i) {
  
  
  message(i)
  
  # Process the page
  fy = border_encounter[i]
  fy = strsplit(fy, "\n")[[1]]
  fy = trimws(fy)
  
  # Fiscal year
  fiscal_year <- 2000 + i - 1
  
  # Column headers
  if(as.integer(fiscal_year) < 2019) {
    fy_headers <- fy[grep("SECTOR", fy)]
    fy_headers <- c(str_split_fixed(fy_headers, " {2,}", 13), "Yearly Total")
  } else {
    fy_headers <- c("SECTOR", month.name[10:12], month.name[1:9], "Yearly Total")
  }
  
  # Start line
  start_row <- grep("SECTOR", fy)
  
  
  # Collect into data frame
  fy_data <- str_split_fixed(fy[start_row:length(fy)], " {2,}", 14)
  fy_data <- data.frame(fy_data)
  
  # Apply names
  names(fy_data) <- fy_headers
  
  # Assemble final data
  fy_final <- fy_data %>% 
    filter(October != "" & `Yearly Total` != "") %>% 
    mutate(FY = fiscal_year)
  
  if (as.integer(fiscal_year) < 2019) {
    # Formerly named areas
    formerly <- fy_data[grep("Marfa|McAllen", fy_data$SECTOR) - 1, ]
    
    # Shift over 1 col
    for(i in 14:1) {
      formerly[i] <- formerly[i-1]
    }
    
    # Re-Add formerly named areas
    formerly$SECTOR <- c("Big Bend", "Rio Grande Valley")
    
    # Add the missing sectors
    fy_final <- bind_rows(fy_final, formerly) %>% 
      mutate(FY = fiscal_year)
    
  }
  
  return(fy_final)

}



cbp_encounters <- map_dfr(1:length(border_encounter), make_cbp_data) %>% 
  
  # Sort by sector and FY
  arrange(SECTOR, FY) %>% 
  
  # Drop the annual totals
  select(-`Yearly Total`) %>% 
  
  # Collect into tidy form
  pivot_longer(cols = October:September,
               names_to  = "mnth",
               values_to = "encounters",
               values_transform = list(encounters = parse_number)) %>% 
  
  # Make formatted date columns
  mutate(mnth = match(mnth, month.name),
         year = ifelse(mnth %in% 10:12,
                       FY - 1,
                       FY),
         date = as.Date(sprintf("%s-%02.0f-01", year, mnth)))
  

# Assemble the new CBP encounters data ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

cbp_updt <- "https://www.cbp.gov/sites/default/files/assets/documents/2022-Oct/nationwide-encounters-fy20-fy22.csv"


table(cbp_encounters$SECTOR)
## 
##          Big Bend            Blaine           Buffalo    Coastal Border 
##               252               252               252               252 
##           Del Rio           Detroit         El Centro           El Paso 
##               252               252               252               252 
##       Grand Forks             Havre           Houlton            Laredo 
##               252               252               252               252 
##         Livermore        Livermore*             Miami     Monthly Total 
##                72               156               252               252 
##       New Orleans   Northern Border             Ramey Rio Grande Valley 
##               252               252               252               252 
##         San Diego  Southwest Border           Spokane           Swanton 
##               252               252               252               252 
##            Tucson              Yuma 
##               252               252
cbp_new <- read_csv(cbp_updt)  %>% 
  
  # Drop the OFO
  filter(grepl("U.S. Border Patrol", Component)) %>% 
  
  # Grouping vars + renames
  group_by(FY            = `Fiscal Year`, 
           mnth          = `Month (abbv)`, 
           border_region = `Land Border Region`, 
           aor_name      = `Area of Responsibility`, 
           aor_code      = `AOR (Abbv)`
           ) %>% 
  
  # Tabulate encounters by month
  summarize(encounters = sum(`Encounter Count`), .groups = "drop") %>% 
  
  # Make time variables
  mutate(mnth = stringi::stri_trans_general(mnth, id = "Title"),
         mnth = match(mnth, month.abb),
         year = ifelse(mnth %in% 10:12,
                       FY - 1,
                       FY),
         date   = as.Date(sprintf("%s-%02.0f-01", year, mnth)),
         SECTOR = trimws(gsub("Sector", "", aor_name))) %>% 
  
  
  arrange(aor_name, date)


# Data validations ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

rows_different <- inner_join(cbp_new, cbp_encounters, by = c("SECTOR", "date")) %>% 
  filter(encounters.x != encounters.y) %>% 
  nrow(.)

cat("Rows different:", rows_different)
## Rows different: 0
# Assemble the new CBP encounters data ----
# :::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::::

sector_to_code <- cbp_new %>% 
  group_by(border_region, aor_code, aor_name) %>% 
  summarize(.groups = "drop") %>% 
  mutate(sector_name = trimws(gsub("Sector", "", aor_name)))


df_encounters <- bind_rows(cbp_new, cbp_encounters) %>% 
  
  # Subset cols of interest
  select(sector_name = SECTOR, 
         date,
         encounters) %>% 
  
  # Get sector codes 
  left_join(sector_to_code, by = "sector_name") %>% 
  
  # Drop duplicated entries
  distinct(border_region, aor_code, sector_name, date, encounters) %>% 
  
  # Sort by sector and date
  arrange(sector_name, date)

Seasonal Adjustment and Forecasting

Here I will demonstrate how to examine the border encounters series for seasonality, seasonally adjust them, and forecast them through the end of 2023. Additionally, I will highlight X-13’s outlier detection abilities.

# Forecast Southwest
ts_encounters <- df_encounters %>% 
  
  # Keep the sector of interest
  filter(border_region == "Southwest Land Border") %>% 
  
  # Subset unnecessary cols
  select(aor_code, date, encounters) %>% 
  
  # Set as `ts`
  ts_tslist()
## [time]: 'date' [value]: 'encounters'

Now that we have the border sector encounters set up as a list of time series, we can apply X-13ARIMA-SEATS in “batch” mode, which will apply SEATS to each individual sector’s time series. With a number of series to run, batch mode can save a lot of time.

# X-13ARIMA-SEATS in batch
x13_southwest <- seas(x = ts_encounters)


# Here I define a function to collect the objects we want
#   - The original time series
#   - The seasonally adjusted series
#   - The outlier specifications
#   - The forecasts and their CI's
get_x13_objects <- function(x13_obj)
{
  
  left_join(
    ts_wide(ts_df(x13_obj$data)),
    ts_wide(ts_df(x13_obj$x)),
    by = "time") %>% 
  
  select(time, original = value, final) %>%
    
    left_join(ts_df(outlier(x13_obj)), by = "time") %>% 
    
    rename(outlier = value) %>% 
    mutate(outlier = as.character(outlier)) %>% 
    
    bind_rows(ts_wide(ts_df(series(x13_obj, c("forecast.forecasts")))))
    
}

plot_sector_model <- function(df)
{
  
ggplot(df, aes(x = time, y = original, color = "Encounters")) +
  
  geom_line(size = 1,
            na.rm = T) +
  
  geom_line(aes(y = final, color = "Adjusted"),
            size = 1,
            na.rm = T) +
  
  geom_line(aes(y = forecast, color = "Adjusted"),
            size = 1,
            linetype = "dashed",
            na.rm = T) +
  
  # Color scheme
  scale_color_manual("",
                     values = c("Encounters" = "darkblue",
                                "Adjusted"   = "darkred")) +
  
  # Axis formatting
  scale_y_continuous("Border Encounters",
                     labels = scales::comma) +
  scale_x_date("",
               date_breaks = "24 months",
               date_labels = "%b-%y") +
  
  # General theme
  theme_minimal() +
  theme(legend.position = "bottom",
        plot.caption = element_text(hjust = 0)) +
  
  # Plot titling
  labs(title = sprintf("Monthly Encounters by U.S. Border Patrol: %s Sector", "Rio Grande Valley"),
       subtitle = "Forecast encounters in dashed line",
       caption = "Source: U.S. Border Patrol\n Notes: Seasonally Adjusted using X-13ARIMA-SEATS")
  

}
Andrew C. Forrester
Andrew C. Forrester
Economist and Statistician in Washington, DC

Economist and statistician in Washington, DC working on economic statistics, labor and financial economics, time series and seasonal adjustment, and quantitative demography. All views are my own.