diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
index d0d9300..b7557e4 100644
--- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
+++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
@@ -40,17 +40,36 @@ def _slug_to_time_period(slug: str) -> str | None:
 
 
 def get_all_releases(publication_slug: str) -> list[dict]:
-    """Return all releases for a publication as dicts with 'id' and 'time_period'."""
-    url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases"
-    resp = requests.get(url, timeout=TIMEOUT)
-    resp.raise_for_status()
-    data = resp.json()
-    # API returns either a plain list or a paginated object with a "results" key
-    releases = data if isinstance(data, list) else data.get("results", [])
+    """Return all releases for a publication as dicts with 'id' and 'time_period'.
+
+    The EES content API paginates with a 'paging' envelope when there are many
+    releases. This function follows all pages so no historical release is missed.
+    """
     result = []
-    for r in releases:
-        time_period = _slug_to_time_period(r.get("slug", ""))
-        result.append({"id": r["id"], "time_period": time_period})
+    page = 1
+    while True:
+        url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases?page={page}&pageSize=20"
+        resp = requests.get(url, timeout=TIMEOUT)
+        resp.raise_for_status()
+        data = resp.json()
+
+        # API returns either a plain list or a paginated object with a "results" key
+        if isinstance(data, list):
+            releases = data
+            total_pages = 1
+        else:
+            releases = data.get("results", [])
+            paging = data.get("paging", {})
+            total_pages = paging.get("totalPages", 1)
+
+        for r in releases:
+            time_period = _slug_to_time_period(r.get("slug", ""))
+            result.append({"id": r["id"], "time_period": time_period})
+
+        if page >= total_pages:
+            break
+        page += 1
+
     return result
 
 
diff --git a/pipeline/transform/models/staging/stg_ees_ks4.sql b/pipeline/transform/models/staging/stg_ees_ks4.sql
index a99c8d9..7cd1d3e 100644
--- a/pipeline/transform/models/staging/stg_ees_ks4.sql
+++ b/pipeline/transform/models/staging/stg_ees_ks4.sql
@@ -3,9 +3,11 @@
 -- Staging model: KS4 attainment data from EES
 -- KS4 performance data is long-format with breakdown dimensions (breakdown_topic,
 -- breakdown, sex). Unlike KS2 which has a subject dimension, KS4 metrics are
--- already in separate columns — we just filter to the 'All pupils' breakdown.
+-- already in separate columns — we just filter to the all-pupils total row.
 -- EES uses 'z' (not applicable) and 'c' (confidential) as suppression codes —
 -- safe_numeric handles both by treating any non-numeric string as NULL.
+-- NOTE: older EES releases (pre-2023/24) use breakdown_topic = 'All pupils';
+-- the 2023/24 release switched to breakdown_topic = 'Total'. Both are included.
 
 with performance as (
     select * from {{ source('raw', 'ees_ks4_performance') }}
@@ -46,7 +48,7 @@ all_pupils as (
         {{ safe_numeric('gcse_91_percent') }} as gcse_grade_91_pct
 
     from performance
-    where breakdown_topic = 'Total'
+    where breakdown_topic in ('Total', 'All pupils')
       and breakdown = 'Total'
       and sex = 'Total'
 ),