From 3401654ab966092b65c25cb17190a8b69dfd93fc Mon Sep 17 00:00:00 2001
From: Tudor Sitaru
Date: Thu, 16 Apr 2026 09:18:55 +0100
Subject: [PATCH] fix(pipeline): restore multi-year KS4 data

Two bugs prevented historical secondary school data from loading:

1. stg_ees_ks4.sql filtered breakdown_topic = 'Total' only, but EES releases
   prior to 2023/24 use breakdown_topic = 'All pupils' (matching the KS2
   convention). All older years were silently dropped to zero rows.
   Fix: accept both values with an IN clause.

2. get_all_releases() in tap-uk-ees fetched only the first page of the EES
   releases API. Now follows all pages via the paging.totalPages field so
   no historical release is missed when more than 20 exist. A missing or
   null paging envelope (or totalPages value) is treated as a single page
   instead of raising.

After re-running the annual EES pipeline, secondary school comparison charts
should show data across all available years (2018/19 onwards).

Co-Authored-By: Claude Sonnet 4.6
---
Review note: the line breaks and indentation of this patch were rebuilt
from a whitespace-flattened copy. Hunk line counts and the diffstat were
re-checked and still match, but the SQL indentation and the position of
blank context lines are assumed - confirm against the target files before
applying. The post-image blob id on the tap.py index line predates the
null-safe paging change.

 .../extractors/tap-uk-ees/tap_uk_ees/tap.py   | 39 ++++++++++++++-----
 .../transform/models/staging/stg_ees_ks4.sql  |  6 ++-
 2 files changed, 33 insertions(+), 12 deletions(-)

diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
index d0d9300..b7557e4 100644
--- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
+++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
@@ -40,17 +40,36 @@ def _slug_to_time_period(slug: str) -> str | None:
 
 
 def get_all_releases(publication_slug: str) -> list[dict]:
-    """Return all releases for a publication as dicts with 'id' and 'time_period'."""
-    url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases"
-    resp = requests.get(url, timeout=TIMEOUT)
-    resp.raise_for_status()
-    data = resp.json()
-    # API returns either a plain list or a paginated object with a "results" key
-    releases = data if isinstance(data, list) else data.get("results", [])
+    """Return all releases for a publication as dicts with 'id' and 'time_period'.
+
+    The EES content API paginates with a 'paging' envelope when there are many
+    releases. This function follows all pages so no historical release is missed.
+    """
     result = []
-    for r in releases:
-        time_period = _slug_to_time_period(r.get("slug", ""))
-        result.append({"id": r["id"], "time_period": time_period})
+    page = 1
+    while True:
+        url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases?page={page}&pageSize=20"
+        resp = requests.get(url, timeout=TIMEOUT)
+        resp.raise_for_status()
+        data = resp.json()
+
+        # API returns either a plain list or a paginated object with a "results" key
+        if isinstance(data, list):
+            releases = data
+            total_pages = 1
+        else:
+            releases = data.get("results", [])
+            paging = data.get("paging") or {}  # tolerate a null or absent envelope
+            total_pages = paging.get("totalPages") or 1  # null/0 => single page
+
+        for r in releases:
+            time_period = _slug_to_time_period(r.get("slug", ""))
+            result.append({"id": r["id"], "time_period": time_period})
+
+        if page >= total_pages:
+            break
+        page += 1
+
     return result
 
 
diff --git a/pipeline/transform/models/staging/stg_ees_ks4.sql b/pipeline/transform/models/staging/stg_ees_ks4.sql
index a99c8d9..7cd1d3e 100644
--- a/pipeline/transform/models/staging/stg_ees_ks4.sql
+++ b/pipeline/transform/models/staging/stg_ees_ks4.sql
@@ -3,9 +3,11 @@
 -- Staging model: KS4 attainment data from EES
 -- KS4 performance data is long-format with breakdown dimensions (breakdown_topic,
 -- breakdown, sex). Unlike KS2 which has a subject dimension, KS4 metrics are
--- already in separate columns — we just filter to the 'All pupils' breakdown.
+-- already in separate columns — we just filter to the all-pupils total row.
 -- EES uses 'z' (not applicable) and 'c' (confidential) as suppression codes —
 -- safe_numeric handles both by treating any non-numeric string as NULL.
+-- NOTE: older EES releases (pre-2023/24) use breakdown_topic = 'All pupils';
+-- the 2023/24 release switched to breakdown_topic = 'Total'. Both are included.
 
 with performance as (
     select * from {{ source('raw', 'ees_ks4_performance') }}
@@ -46,7 +48,7 @@ all_pupils as (
         {{ safe_numeric('gcse_91_percent') }} as gcse_grade_91_pct
 
     from performance
-    where breakdown_topic = 'Total'
+    where breakdown_topic in ('Total', 'All pupils')
       and breakdown = 'Total'
       and sex = 'Total'
 ),