From fba8e74b72f44ba34ae7b1932e1bc55e7d4cd7bf Mon Sep 17 00:00:00 2001 From: Tudor Sitaru Date: Tue, 31 Mar 2026 22:44:11 +0100 Subject: [PATCH] =?UTF-8?q?refactor(legacy-ks2):=20use=20explicit=20year?= =?UTF-8?q?=E2=86=92URL=20mapping=20instead=20of=20base=20URL=20pattern?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit The file hosting uses non-deterministic URLs, so replace legacy_ks2_base_url + legacy_ks2_years with a single legacy_ks2_urls object mapping year codes to download URLs. Configure the 4 pre-COVID years in meltano.yml. Co-Authored-By: Claude Opus 4.6 --- pipeline/meltano.yml | 9 +++++ .../extractors/tap-uk-ees/tap_uk_ees/tap.py | 39 ++++++------------- 2 files changed, 21 insertions(+), 27 deletions(-) diff --git a/pipeline/meltano.yml b/pipeline/meltano.yml index 93e6a64..324a596 100644 --- a/pipeline/meltano.yml +++ b/pipeline/meltano.yml @@ -23,6 +23,15 @@ plugins: - name: datasets kind: array description: List of EES dataset configs to extract + - name: legacy_ks2_urls + kind: object + description: "Year code → URL mapping for legacy KS2 CSVs" + config: + legacy_ks2_urls: + "201516": "http://10.0.1.224:8081/filebrowser/api/public/dl/R9jjXFWa?inline=true" + "201617": "http://10.0.1.224:8081/filebrowser/api/public/dl/tIwJPVQS?inline=true" + "201718": "http://10.0.1.224:8081/filebrowser/api/public/dl/GO7SKE0p?inline=true" + "201819": "http://10.0.1.224:8081/filebrowser/api/public/dl/jchDEHsv?inline=true" - name: tap-uk-ofsted namespace: uk_ofsted diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py index 16517d1..7b81fac 100644 --- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py @@ -501,16 +501,12 @@ _LEGACY_KS2_COLUMN_MAP = { "PTMOBN": "stability_pct", } -# Default years to load (new-curriculum KS2 only — 2014-15 used Level 4+ metrics) -_LEGACY_KS2_DEFAULT_YEARS = ["201516", "201617", "201718", "201819"] - - class LegacyKS2Stream(Stream): """Stream for pre-COVID KS2 data from DfE performance tables CSVs. - Downloads england_ks2final.csv for each configured year from a base URL, - maps old DfE column names to match the stg_ees_ks2 output schema, and - emits one record per school per year. + Downloads CSVs from URLs configured in legacy_ks2_urls (a mapping of + 6-digit year code → download URL), maps old DfE column names to match + the stg_ees_ks2 output schema, and emits one record per school per year. """ name = "legacy_ks2" @@ -558,20 +554,15 @@ class LegacyKS2Stream(Stream): def get_records(self, context): import pandas as pd - base_url = self.config.get("legacy_ks2_base_url", "") - if not base_url: - self.logger.warning("legacy_ks2_base_url not configured, skipping legacy KS2") + url_map = self.config.get("legacy_ks2_urls", {}) + if not url_map: + self.logger.warning("legacy_ks2_urls not configured, skipping legacy KS2") return - years = self.config.get("legacy_ks2_years", _LEGACY_KS2_DEFAULT_YEARS) - self.logger.info("Loading legacy KS2 for years: %s from %s", years, base_url) + self.logger.info("Loading legacy KS2 for %d year(s)", len(url_map)) - for year_code in years: - # Convert 6-digit code to folder name: "201819" → "2018-2019" - folder = f"20{year_code[2:4]}-20{year_code[4:6]}" - url = f"{base_url}/{folder}/england_ks2final.csv" - - self.logger.info("Downloading %s", url) + for year_code, url in url_map.items(): + self.logger.info("Downloading %s for %s", url, year_code) try: resp = requests.get(url, timeout=120) resp.raise_for_status() @@ -619,15 +610,9 @@ class TapUKEES(Tap): default=False, ), th.Property( - "legacy_ks2_base_url", - th.StringType, - description="Base URL for legacy KS2 CSVs (e.g. https://example.com/data). Files expected at {base_url}/{year}/england_ks2final.csv", - ), - th.Property( - "legacy_ks2_years", - th.ArrayType(th.StringType), - description="Legacy KS2 year codes to load (default: 201516-201819)", - default=_LEGACY_KS2_DEFAULT_YEARS, + "legacy_ks2_urls", + th.ObjectType(), + description="Mapping of 6-digit year code to download URL for legacy KS2 CSVs (e.g. {\"201819\": \"https://...\"})", ), ).to_dict()