feat: ingest official DfE KS2 national averages from EES data catalogue

Replaces computed means from our school dataset with the published DfE national headline figures for the KS2 chart reference line.

- tap-uk-ees: new EESKs2NationalStream fetches the stable EES data-catalogue CSV (one row per year, England national total, AllSchools filter)
- dbt staging: stg_ees_ks2_national normalises columns, casts to float, filters to years >= 201617
- dbt mart: fact_ks2_national_averages — one row per year, official figures
- backend/models: Ks2NationalAverage SQLAlchemy model
- backend/app: /api/national-averages queries the mart for KS2 by_year; secondary by_year stays computed (no DfE KS4 national dataset yet)
- DAG: extract_ks2_national task added to school_data_annual_ees, runs in parallel with the main EES extract

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-09 14:40:33 +01:00
parent a3cfffa4d0
commit dc66e22d4d
8 changed files with 236 additions and 12 deletions
@@ -452,6 +452,99 @@ class EESAdmissionsStream(EESDatasetStream):
 # on EES. Only national and LA-level files are published.


+# ── KS2 National Headlines (national level only — one row per year) ───────────
+# Dataset: "Key stage 2 attainment: national headlines"
+# URL: https://explore-education-statistics.service.gov.uk/data-catalogue/data-set/
+#      58bb4b03-c6df-447f-bb7e-b82970c4d974/csv
+# This is a stable data-catalogue CSV endpoint (not a versioned release ZIP).
+# Covers 2015/16 → latest; COVID years (2019/20, 2020/21) are suppressed ('x').
+
+_KS2_NATIONAL_CSV_URL = (  # data-catalogue CSV download for this data set
+    "https://explore-education-statistics.service.gov.uk"
+    "/data-catalogue/data-set/58bb4b03-c6df-447f-bb7e-b82970c4d974/csv"
+)
+
+_KS2_NATIONAL_COL_MAP = {  # lower-cased CSV column -> emitted record field
+    "pt_rwm_exp": "rwm_expected_pct",
+    "pt_rwm_high": "rwm_high_pct",
+    "pt_read_exp": "reading_expected_pct",
+    "pt_read_high": "reading_high_pct",
+    "pt_mat_exp": "maths_expected_pct",
+    "pt_mat_high": "maths_high_pct",
+    "pt_writta_exp": "writing_expected_pct",
+    "pt_writta_gd": "writing_gd_pct",
+    "pt_gps_exp": "gps_expected_pct",
+    "pt_gps_high": "gps_high_pct",
+    "pt_scita_exp": "science_expected_pct",
+    "avg_readscore": "reading_avg_score",
+    "avg_gpsscore": "gps_avg_score",
+    "avg_matscore": "maths_avg_score",
+}
+
+
+class EESKs2NationalStream(Stream):
+    """National KS2 headline averages, keyed by ``time_period``.
+
+    Downloads the CSV at ``_KS2_NATIONAL_CSV_URL`` each time records are
+    requested, keeps rows whose ``geographic_level`` is 'national' and whose
+    ``school_type`` is 'allschools' (compared case-insensitively), and emits
+    ``time_period`` plus the columns renamed via ``_KS2_NATIONAL_COL_MAP``.
+    Every value is emitted as a stripped string exactly as published, so
+    suppression markers such as 'x' are NOT converted to null here; that is
+    left to downstream consumers.
+    """
+
+    name = "ees_ks2_national"
+    primary_keys = ["time_period"]
+    replication_key = None
+
+    # Derived from the column map so schema and emitted fields cannot drift.
+    schema = th.PropertiesList(
+        th.Property("time_period", th.StringType, required=True),
+        *(th.Property(f, th.StringType) for f in _KS2_NATIONAL_COL_MAP.values()),
+    ).to_dict()
+
+    def get_records(self, context):
+        """Yield one dict of string values per filtered CSV row.
+
+        Raises ``requests.HTTPError`` on an error HTTP status. Columns missing
+        from the CSV are not fatal: a warning is logged, a missing filter
+        column skips that filter and a missing value column is emitted as "".
+        """
+        import pandas as pd
+
+        self.logger.info("Downloading KS2 national headlines: %s", _KS2_NATIONAL_CSV_URL)
+        resp = requests.get(_KS2_NATIONAL_CSV_URL, timeout=60)
+        resp.raise_for_status()
+
+        # dtype=str + keep_default_na=False: pandas does no numeric/NaN coercion.
+        df = pd.read_csv(io.BytesIO(resp.content), dtype=str, keep_default_na=False)
+        df.columns = [c.strip().lower() for c in df.columns]
+
+        # Keep only the England national headline row per year.
+        for col, wanted in (("geographic_level", "national"), ("school_type", "allschools")):
+            if col in df.columns:
+                df = df[df[col].str.strip().str.lower() == wanted]
+            else:
+                # Unfiltered data may hold several rows per time_period.
+                self.logger.warning("KS2 national CSV has no %r column; filter skipped", col)
+
+        # Surface upstream layout changes instead of silently emitting blanks.
+        absent = sorted({"time_period", *_KS2_NATIONAL_COL_MAP} - set(df.columns))
+        if absent:
+            self.logger.warning("KS2 national CSV lacks columns %s; emitting ''", absent)
+        if "time_period" in df.columns and df["time_period"].duplicated().any():
+            self.logger.warning("KS2 national CSV repeats time_period; primary key not unique")
+
+        self.logger.info("Emitting %d national KS2 rows", len(df))
+
+        for _, row in df.iterrows():
+            record = {"time_period": row.get("time_period", "").strip()}
+            for csv_col, field in _KS2_NATIONAL_COL_MAP.items():
+                record[field] = row.get(csv_col, "").strip()
+            yield record
+
+
 # ── Legacy KS2 (pre-COVID wide format from DfE performance tables) ────────────
 # The DfE "Compare School Performance" site published school-level KS2 CSVs
 # in a wide format (one row per school, ~300 columns).  EES only has school-level
@@ -629,6 +722,7 @@ class TapUKEES(Tap):
            EESCensusStream(self),
            EESAdmissionsStream(self),
            LegacyKS2Stream(self),
+            EESKs2NationalStream(self),
        ]