From ae33bfe04b3262f0f5782caf98f54272c664e4b8 Mon Sep 17 00:00:00 2001 From: Tudor Sitaru Date: Thu, 16 Apr 2026 10:41:01 +0100 Subject: [PATCH] refactor(pipeline): unify KS2 and KS4 legacy sources to same annual ZIPs MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit LegacyKS2Stream now auto-detects ZIP vs bare CSV — if the download is a ZIP it extracts england_ks2final.csv; if it's a plain CSV file it reads directly. This keeps backwards compatibility while allowing both streams to share the same DfE annual archive URLs. legacy_ks2_urls updated to point at the same 4 ZIPs as legacy_ks4_urls so only one set of archives needs to be maintained going forward. Co-Authored-By: Claude Sonnet 4.6 --- pipeline/meltano.yml | 8 +++--- .../extractors/tap-uk-ees/tap_uk_ees/tap.py | 25 ++++++++++++++++++- 2 files changed, 28 insertions(+), 5 deletions(-) diff --git a/pipeline/meltano.yml b/pipeline/meltano.yml index 98762c0..bb9ecec 100644 --- a/pipeline/meltano.yml +++ b/pipeline/meltano.yml @@ -31,10 +31,10 @@ plugins: description: "Year code → URL mapping for legacy KS4 ZIPs (england_ks4final.csv inside)" config: legacy_ks2_urls: - "201516": "http://10.0.1.224:8081/filebrowser/api/public/dl/R9jjXFWa?inline=true" - "201617": "http://10.0.1.224:8081/filebrowser/api/public/dl/tIwJPVQS?inline=true" - "201718": "http://10.0.1.224:8081/filebrowser/api/public/dl/GO7SKE0p?inline=true" - "201819": "http://10.0.1.224:8081/filebrowser/api/public/dl/jchDEHsv?inline=true" + "201516": "http://10.0.1.224:8081/filebrowser/api/public/dl/iaoSkg1v?inline=true" + "201617": "http://10.0.1.224:8081/filebrowser/api/public/dl/bqCMUcIH?inline=true" + "201718": "http://10.0.1.224:8081/filebrowser/api/public/dl/0L61fE_a?inline=true" + "201819": "http://10.0.1.224:8081/filebrowser/api/public/dl/XJGJ5lG1?inline=true" legacy_ks4_urls: "201516": "http://10.0.1.224:8081/filebrowser/api/public/dl/iaoSkg1v?inline=true" "201617": "http://10.0.1.224:8081/filebrowser/api/public/dl/bqCMUcIH?inline=true" diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py index 4cddc9a..6e89676 100644 --- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py @@ -682,8 +682,31 @@ class LegacyKS2Stream(Stream): self.logger.warning("Failed to download %s: %s", url, e) continue + content = resp.content + + # Auto-detect ZIP — the DfE annual archives contain both KS2 and KS4 + # CSVs in one ZIP. If the download is a ZIP, extract england_ks2final.csv; + # otherwise treat the content as a bare CSV (legacy individual-file URLs). + csv_bytes = None + try: + zf = zipfile.ZipFile(io.BytesIO(content)) + target = next( + (n for n in zf.namelist() if "ks2final" in n.lower() and n.endswith(".csv")), + None, + ) + if target: + with zf.open(target) as f: + csv_bytes = f.read() + self.logger.info("Extracted %s from ZIP for %s", target, year_code) + else: + self.logger.warning("No ks2final CSV found in ZIP for %s", year_code) + continue + except zipfile.BadZipFile: + # Not a ZIP — treat as a bare CSV file + csv_bytes = content + df = pd.read_csv( - io.BytesIO(resp.content), + io.BytesIO(csv_bytes), dtype=str, keep_default_na=False, encoding="latin-1",