1 Environment Setup

1.1 Import Standard Libraries

Show the code

# For Analysis
import numpy as np
import pandas as pd
import geopandas as gpd

# For Visualization
import matplotlib.pyplot as plt
import seaborn as sns

# misc
import os
import requests
from pathlib import Path
import importlib.util

# Census data query libraries & modules
from pygris import blocks, block_groups
from pygris.helpers import validate_state, validate_county
from pygris.data import get_census, get_lodes

from dotenv import load_dotenv

load_dotenv()

True

1.2 Environment Variables

Show the code

CRS_UTM = "EPSG:26912"

1.3 Region Definition

Show the code

# --- Configuration ---
target_state = "UT"
target_counties = ["BOX ELDER", "WEBER", "DAVIS", "SALT LAKE", "UTAH"]

Show the code

# --- 1. Validate FIPS Codes ---
# Validate State
state_fips = validate_state(target_state)
print(f"State FIPS for {target_state}: {state_fips}")

# Validate Counties and get their FIPS codes
county_fips = [validate_county(state_fips, county) for county in target_counties]

print(f"Target County FIPS: {county_fips}")

Using FIPS code '49' for input 'UT'
State FIPS for UT: 49
Using FIPS code '003' for input 'BOX ELDER'
Using FIPS code '057' for input 'WEBER'
Using FIPS code '011' for input 'DAVIS'
Using FIPS code '035' for input 'SALT LAKE'
Using FIPS code '049' for input 'UTAH'
Target County FIPS: ['003', '057', '011', '035', '049']

Show the code

# Converting County Names to FIPS code
fips_codes = [f"{state_fips}{county}" for county in county_fips]
fips_codes

['49003', '49057', '49011', '49035', '49049']

1.4 Helper Functions

Show the code

# Create function to read ArcGIS FeatureLayer or Table
def arc_read(url, where="1=1", outFields="*", outSR=4326, **kwargs):
    """
    Read an ArcGIS FeatureLayer or Table to a GeoDataFrame.

    Parameters:
    url (str): The ArcGIS REST service URL (e.g., ending in /FeatureServer/0)
    where (str): SQL WHERE clause for filtering. Default: "1=1"
    outFields (str): Comma-separated field names. Default: "*"
    outSR (int): Output spatial reference EPSG code. Default: 4326
    **kwargs: Additional query parameters passed to the ArcGIS REST API

    Returns:
    geopandas.GeoDataFrame: Spatial data from the service
    """
    # Ensure URL ends with /query
    if not url.endswith('/query'):
        url = url.rstrip('/') + '/query'

    # Build query parameters
    params = {
        'where': where,
        'outFields': outFields,
        'returnGeometry': 'true',
        'outSR': outSR,
        'f': 'geojson'
    }

    # Add any additional parameters
    params.update(kwargs)

    # Make request
    response = requests.get(url, params=params)
    response.raise_for_status() # Good practice to check for HTTP errors

    # Read as GeoDataFrame
    # We use io.BytesIO to handle the response content safely for read_file
    import io
    return gpd.read_file(io.BytesIO(response.content), engine="pyogrio")

2 Load Data

2.1 Regional Boundary

Show the code

# 1. Configuration
service_url = "https://services1.arcgis.com/taguadKoI1XFwivx/ArcGIS/rest/services/RegionalBoundaryComponents/FeatureServer/0"

output_dir = Path("data")
gdb_path = output_dir / "RegionalBoundary.gpkg"
layer_name = "RegionalBoundaryComponents"

# 2. Check if file exists locally, otherwise download
if not gdb_path.exists():
    print(f"File not found at {gdb_path}. Downloading Regional Boundaries...")
    output_dir.mkdir(parents=True, exist_ok=True)

    try:
        # Use the custom arc_read function
        gdf_download = arc_read(service_url)

        # Export to Geodatabase
        print(f"Saving to {gdb_path}...")
        gdf_download.to_file(gdb_path, layer=layer_name, driver="GPKG")
        print("Download and export complete.")

    except Exception as e:
        print(f"Error during download: {e}")
else:
    print(f"File found locally at {gdb_path}. Skipping download.")

# 3. Read from local GDB with Filter and Dissolve
gdf_region = (
    gpd.read_file(
        gdb_path, layer=layer_name, where="PlanOrg IN ('MAG MPO', 'WFRC MPO')"
    )
    .dissolve()
    .to_crs(CRS_UTM)
)

# Fix the holes and slivers in the regional boundary
gdf_region["geometry"] = gdf_region.buffer(1).buffer(-1)

gdf_region.explore()

File found locally at data\RegionalBoundary.gpkg. Skipping download.

Make this Notebook Trusted to load map: File -> Trust Notebook

2.2 Download IPEDS Access Databases

Downloads any missing IPEDS complete-data Access packages to data/IPEDS_Access/. Each zip contains one .accdb (the full database), one .docx (readme), and one .xlsx (data dictionary). Files are 40–90 MB each; already-downloaded zips are skipped.

Show the code

import zipfile
import io
import warnings

warnings.filterwarnings("ignore")

IPEDS_ACCESS_URLS = {
    2024: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2024-25_Provisional.zip",
    2023: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2023-24_Final.zip",
    2022: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2022-23_Final.zip",
    2021: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2021-22_Final.zip",
    2020: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2020-21_Final.zip",
    2019: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2019-20_Final.zip",
    2018: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2018-19_Final.zip",
    2017: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2017-18_Final.zip",
    2016: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2016-17_Final.zip",
    2015: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2015-16_Final.zip",
    2014: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2014-15_Final.zip",
    2013: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2013-14_Final.zip",
    2012: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2012-13_Final.zip",
    2011: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2011-12_Final.zip",
    2010: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2010-11_Final.zip",
    2009: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2009-10_Final.zip",
    2008: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2008-09_Final.zip",
    2007: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2007-08_Final.zip",
    2006: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2006-07_Final.zip",
    2005: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2005-06_Final.zip",
    2004: "https://nces.ed.gov/ipeds/tablefiles/zipfiles/IPEDS_2004-05_Final.zip",
}

access_dir = Path("data/IPEDS_Access")
access_dir.mkdir(parents=True, exist_ok=True)

for year, url in IPEDS_ACCESS_URLS.items():
    dest = access_dir / url.split("/")[-1]
    if dest.exists():
        print(f"Cached  {dest.name}")
        continue
    r = requests.get(url, timeout=300)
    r.raise_for_status()
    dest.write_bytes(r.content)
    print(f"Downloaded {dest.name}  ({len(r.content)/1e6:.0f} MB)")

print(f"\n{sum(1 for _ in access_dir.glob('*.zip'))} IPEDS Access zips in {access_dir}")

Cached  IPEDS_2024-25_Provisional.zip
Cached  IPEDS_2023-24_Final.zip
Cached  IPEDS_2022-23_Final.zip
Cached  IPEDS_2021-22_Final.zip
Cached  IPEDS_2020-21_Final.zip
Cached  IPEDS_2019-20_Final.zip
Cached  IPEDS_2018-19_Final.zip
Cached  IPEDS_2017-18_Final.zip
Cached  IPEDS_2016-17_Final.zip
Cached  IPEDS_2015-16_Final.zip
Cached  IPEDS_2014-15_Final.zip
Cached  IPEDS_2013-14_Final.zip
Cached  IPEDS_2012-13_Final.zip
Cached  IPEDS_2011-12_Final.zip
Cached  IPEDS_2010-11_Final.zip
Cached  IPEDS_2009-10_Final.zip
Cached  IPEDS_2008-09_Final.zip
Cached  IPEDS_2007-08_Final.zip
Cached  IPEDS_2006-07_Final.zip
Cached  IPEDS_2005-06_Final.zip
Cached  IPEDS_2004-05_Final.zip

21 IPEDS Access zips in data\IPEDS_Access

2.3 Download IPEDS Fall Enrollment CSV Files (All Years — Backup)

Downloads individual IPEDS fall enrollment CSV zip files for all years to data/IPEDS_Raw/ as a backup. For 2012–2023, EF{YEAR}A_DIST.zip provides the distance-education breakdown (EFDENON, EFDESOM); for 2000–2011, EF{YEAR}A.zip provides total enrollment. These CSVs are used as a fallback in notebook 6 if the full Access database packages are unavailable.

Show the code

IPEDS_CSV_URLS = {
    **{year: f"https://nces.ed.gov/ipeds/tablefiles/zipfiles/EF{year}A_DIST.zip" for year in range(2012, 2024)},
    **{year: f"https://nces.ed.gov/ipeds/tablefiles/zipfiles/EF{year}A.zip" for year in range(2000, 2012)},
}

raw_dir = Path("data/IPEDS_Raw")
raw_dir.mkdir(parents=True, exist_ok=True)

for year, url in IPEDS_CSV_URLS.items():
    suffix = "a_dist" if year >= 2012 else "a"
    csv_path = raw_dir / f"ef{year}{suffix}.csv"
    if csv_path.exists():
        print(f"Cached  {csv_path.name}")
        continue
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    with zipfile.ZipFile(io.BytesIO(r.content)) as z:
        csv_name = next(f for f in z.namelist() if f.lower().endswith(".csv"))
        csv_path.write_bytes(z.read(csv_name))
    print(f"Downloaded and extracted {csv_path.name}")

print(f"\n{sum(1 for _ in raw_dir.glob('ef*.csv'))} IPEDS CSV files in {raw_dir}")

Cached  ef2012a_dist.csv
Cached  ef2013a_dist.csv
Cached  ef2014a_dist.csv
Cached  ef2015a_dist.csv
Cached  ef2016a_dist.csv
Cached  ef2017a_dist.csv
Cached  ef2018a_dist.csv
Cached  ef2019a_dist.csv
Cached  ef2020a_dist.csv
Cached  ef2021a_dist.csv
Cached  ef2022a_dist.csv
Cached  ef2023a_dist.csv
Cached  ef2000a.csv
Cached  ef2001a.csv
Cached  ef2002a.csv
Cached  ef2003a.csv
Cached  ef2004a.csv
Cached  ef2005a.csv
Cached  ef2006a.csv
Cached  ef2007a.csv
Cached  ef2008a.csv
Cached  ef2009a.csv
Cached  ef2010a.csv
Cached  ef2011a.csv

82 IPEDS CSV files in data\IPEDS_Raw

2.4 Download EDGE Geocode Files

Downloads any missing EDGE Geocode zip files to data/IPEDS_Geocode/.

Show the code

EDGE_GEOCODE_URLS = {
    2024: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECSCH_2425.zip",
    2023: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECSCH_2324.zip",
    2022: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECSCH_2223.zip",
    2021: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_2122.zip",
    2020: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_2021.zip",
    2019: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSEC_1920.zip",
    2018: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_1819.zip",
    2017: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_1718.zip",
    2016: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_1617.zip",
    2015: "https://nces.ed.gov/programs/edge/data/EDGE_GEOCODE_POSTSECONDARYSCH_1516.zip",
}

geocode_dir = Path("data/IPEDS_Geocode")
geocode_dir.mkdir(parents=True, exist_ok=True)

for year, url in EDGE_GEOCODE_URLS.items():
    dest = geocode_dir / url.split("/")[-1]
    if dest.exists():
        print(f"Cached  {dest.name}")
        continue
    r = requests.get(url, timeout=60)
    r.raise_for_status()
    dest.write_bytes(r.content)
    print(f"Downloaded {dest.name}")

print(f"\n{sum(1 for _ in geocode_dir.glob('*.zip'))} EDGE geocode zips in {geocode_dir}")

Cached  EDGE_GEOCODE_POSTSECSCH_2425.zip
Cached  EDGE_GEOCODE_POSTSECSCH_2324.zip
Cached  EDGE_GEOCODE_POSTSECSCH_2223.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_2122.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_2021.zip
Cached  EDGE_GEOCODE_POSTSEC_1920.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_1819.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_1718.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_1617.zip
Cached  EDGE_GEOCODE_POSTSECONDARYSCH_1516.zip

10 EDGE geocode zips in data\IPEDS_Geocode

2.5 Post-Secondary Institutions

Loads the EDGE Geocode for the year matching the most recent IPEDS enrollment year. The zip contains an XLSX with columns UNITID, NAME, LAT, LON, STFIP, CNTY. Points are converted to a GeoDataFrame, filtered to Utah counties in the WF-TDM region, and clipped to the regional boundary.

Show the code

# Match EDGE geocode year to most recent IPEDS enrollment year
latest_ipeds_year = max(IPEDS_ACCESS_URLS)

edge_url = EDGE_GEOCODE_URLS.get(latest_ipeds_year) or EDGE_GEOCODE_URLS[max(EDGE_GEOCODE_URLS)]
geocode_zip = geocode_dir / edge_url.split("/")[-1]

with zipfile.ZipFile(geocode_zip) as z:
    xlsx_name = next(f for f in z.namelist() if f.lower().endswith(".xlsx"))
    df_geocode = pd.read_excel(io.BytesIO(z.read(xlsx_name)))

gdf_college_locations = (
    gpd.GeoDataFrame(
        df_geocode,
        geometry=gpd.points_from_xy(df_geocode["LON"], df_geocode["LAT"]),
        crs="EPSG:4326",
    )
    .loc[lambda x: x["STFIP"] == int(state_fips)]
    .to_crs(CRS_UTM)
    .clip(gdf_region)
)

print(f"EDGE Geocode year: {latest_ipeds_year}")
print(f"Institutions in WF-TDM region: {len(gdf_college_locations)}")
gdf_college_locations.explore()

EDGE Geocode year: 2024
Institutions in WF-TDM region: 0

Make this Notebook Trusted to load map: File -> Trust Notebook