refactor(legacy-ks2): use explicit year→URL mapping instead of base URL pattern
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
The file host serves each CSV from an opaque, per-file share URL that cannot be derived from a base URL and year folder, so replace legacy_ks2_base_url + legacy_ks2_years with a single legacy_ks2_urls object that maps each 6-digit year code to its download URL. Configure the 4 pre-COVID years (201516–201819) in meltano.yml. Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -23,6 +23,15 @@ plugins:
|
|||||||
- name: datasets
|
- name: datasets
|
||||||
kind: array
|
kind: array
|
||||||
description: List of EES dataset configs to extract
|
description: List of EES dataset configs to extract
|
||||||
|
- name: legacy_ks2_urls
|
||||||
|
kind: object
|
||||||
|
description: "Year code → URL mapping for legacy KS2 CSVs"
|
||||||
|
config:
|
||||||
|
legacy_ks2_urls:
|
||||||
|
"201516": "http://10.0.1.224:8081/filebrowser/api/public/dl/R9jjXFWa?inline=true"
|
||||||
|
"201617": "http://10.0.1.224:8081/filebrowser/api/public/dl/tIwJPVQS?inline=true"
|
||||||
|
"201718": "http://10.0.1.224:8081/filebrowser/api/public/dl/GO7SKE0p?inline=true"
|
||||||
|
"201819": "http://10.0.1.224:8081/filebrowser/api/public/dl/jchDEHsv?inline=true"
|
||||||
|
|
||||||
- name: tap-uk-ofsted
|
- name: tap-uk-ofsted
|
||||||
namespace: uk_ofsted
|
namespace: uk_ofsted
|
||||||
|
|||||||
@@ -501,16 +501,12 @@ _LEGACY_KS2_COLUMN_MAP = {
|
|||||||
"PTMOBN": "stability_pct",
|
"PTMOBN": "stability_pct",
|
||||||
}
|
}
|
||||||
|
|
||||||
# Default years to load (new-curriculum KS2 only — 2014-15 used Level 4+ metrics)
|
|
||||||
_LEGACY_KS2_DEFAULT_YEARS = ["201516", "201617", "201718", "201819"]
|
|
||||||
|
|
||||||
|
|
||||||
class LegacyKS2Stream(Stream):
|
class LegacyKS2Stream(Stream):
|
||||||
"""Stream for pre-COVID KS2 data from DfE performance tables CSVs.
|
"""Stream for pre-COVID KS2 data from DfE performance tables CSVs.
|
||||||
|
|
||||||
Downloads england_ks2final.csv for each configured year from a base URL,
|
Downloads CSVs from URLs configured in legacy_ks2_urls (a mapping of
|
||||||
maps old DfE column names to match the stg_ees_ks2 output schema, and
|
6-digit year code → download URL), maps old DfE column names to match
|
||||||
emits one record per school per year.
|
the stg_ees_ks2 output schema, and emits one record per school per year.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
name = "legacy_ks2"
|
name = "legacy_ks2"
|
||||||
@@ -558,20 +554,15 @@ class LegacyKS2Stream(Stream):
|
|||||||
def get_records(self, context):
|
def get_records(self, context):
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
base_url = self.config.get("legacy_ks2_base_url", "")
|
url_map = self.config.get("legacy_ks2_urls", {})
|
||||||
if not base_url:
|
if not url_map:
|
||||||
self.logger.warning("legacy_ks2_base_url not configured, skipping legacy KS2")
|
self.logger.warning("legacy_ks2_urls not configured, skipping legacy KS2")
|
||||||
return
|
return
|
||||||
|
|
||||||
years = self.config.get("legacy_ks2_years", _LEGACY_KS2_DEFAULT_YEARS)
|
self.logger.info("Loading legacy KS2 for %d year(s)", len(url_map))
|
||||||
self.logger.info("Loading legacy KS2 for years: %s from %s", years, base_url)
|
|
||||||
|
|
||||||
for year_code in years:
|
for year_code, url in url_map.items():
|
||||||
# Convert 6-digit code to folder name: "201819" → "2018-2019"
|
self.logger.info("Downloading %s for %s", url, year_code)
|
||||||
folder = f"20{year_code[2:4]}-20{year_code[4:6]}"
|
|
||||||
url = f"{base_url}/{folder}/england_ks2final.csv"
|
|
||||||
|
|
||||||
self.logger.info("Downloading %s", url)
|
|
||||||
try:
|
try:
|
||||||
resp = requests.get(url, timeout=120)
|
resp = requests.get(url, timeout=120)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
@@ -619,15 +610,9 @@ class TapUKEES(Tap):
|
|||||||
default=False,
|
default=False,
|
||||||
),
|
),
|
||||||
th.Property(
|
th.Property(
|
||||||
"legacy_ks2_base_url",
|
"legacy_ks2_urls",
|
||||||
th.StringType,
|
th.ObjectType(),
|
||||||
description="Base URL for legacy KS2 CSVs (e.g. https://example.com/data). Files expected at {base_url}/{year}/england_ks2final.csv",
|
description="Mapping of 6-digit year code to download URL for legacy KS2 CSVs (e.g. {\"201819\": \"https://...\"})",
|
||||||
),
|
|
||||||
th.Property(
|
|
||||||
"legacy_ks2_years",
|
|
||||||
th.ArrayType(th.StringType),
|
|
||||||
description="Legacy KS2 year codes to load (default: 201516-201819)",
|
|
||||||
default=_LEGACY_KS2_DEFAULT_YEARS,
|
|
||||||
),
|
),
|
||||||
).to_dict()
|
).to_dict()
|
||||||
|
|
||||||
|
|||||||
Reference in New Issue
Block a user