diff --git a/backend/app.py b/backend/app.py
index 1d86a6f..74175f9 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -708,21 +708,58 @@ async def get_national_averages(request: Request):
     # Secondary: schools where KS4 data is non-null
     secondary_df = df_latest[df_latest["attainment_8_score"].notna()]
 
-    # Per-year averages for every year in the dataset (used by chart reference lines)
+    latest_primary = _means(primary_df, ks2_metrics)
+    latest_secondary = _means(secondary_df, ks4_metrics)
+
+    # Per-year KS2 primary averages: use official DfE figures from the mart table.
+    # Per-year KS4 secondary averages: computed from our dataset (no DfE dataset yet).
+    from .database import SessionLocal
+    from .models import Ks2NationalAverage
+
     by_year = []
-    for yr in sorted(df["year"].dropna().unique()):
-        yr = int(yr)
-        df_yr = df[df["year"] == yr]
-        by_year.append({
-            "year": yr,
-            "primary": _means(df_yr[df_yr["rwm_expected_pct"].notna()], ks2_metrics),
-            "secondary": _means(df_yr[df_yr["attainment_8_score"].notna()], ks4_metrics),
-        })
+    # Open the session before the try block so that the finally clause never
+    # touches an unbound name when session creation itself fails.
+    db = SessionLocal()
+    try:
+        nat_rows = db.query(Ks2NationalAverage).order_by(Ks2NationalAverage.year).all()
+        # Computed KS4 secondary averages per dataset year (the only KS4 source for now)
+        secondary_by_year = {}
+        for yr in sorted(df["year"].dropna().unique()):
+            yr = int(yr)
+            df_yr = df[df["year"] == yr]
+            secondary_by_year[yr] = _means(
+                df_yr[df_yr["attainment_8_score"].notna()], ks4_metrics
+            )
+        # Merge: official KS2 figures + computed KS4 figures per year
+        ks2_years = {r.year for r in nat_rows}
+        all_years = sorted(ks2_years | set(secondary_by_year.keys()))
+        nat_lookup = {r.year: r for r in nat_rows}
+        for yr in all_years:
+            primary_yr: dict = {}
+            if yr in nat_lookup:
+                r = nat_lookup[yr]
+                for col in ks2_metrics:
+                    val = getattr(r, col, None)
+                    if val is not None:
+                        primary_yr[col] = val
+            by_year.append({
+                "year": yr,
+                "primary": primary_yr,
+                "secondary": secondary_by_year.get(yr, {}),
+            })
+    finally:
+        db.close()
+
+    # Prefer the official DfE figures for latest_year itself. Another year's
+    # figures are never substituted: the response labels them with latest_year.
+    latest_official = next((e["primary"] for e in by_year if e["year"] == latest_year), None)
+    if latest_official:
+        latest_primary = latest_official
 
     return {
         "year": latest_year,
-        "primary": _means(primary_df, ks2_metrics),
-        "secondary": _means(secondary_df, ks4_metrics),
+        "primary": latest_primary,
+        "secondary": latest_secondary,
         "by_year": by_year,
     }
 
diff --git a/backend/models.py b/backend/models.py
index c92999d..13748cb 100644
--- a/backend/models.py
+++ b/backend/models.py
@@ -215,3 +215,25 @@ class FactFinance(Base):
     teacher_cost_pct = Column(Float)
     support_staff_cost_pct = Column(Float)
     premises_cost_pct = Column(Float)
+
+
+class Ks2NationalAverage(Base):
+    """Official DfE KS2 national headline averages — one row per academic year."""
+    __tablename__ = "fact_ks2_national_averages"
+    __table_args__ = MARTS
+
+    year = Column(Integer, primary_key=True)
+    rwm_expected_pct = Column(Float)
+    rwm_high_pct = Column(Float)
+    reading_expected_pct = Column(Float)
+    reading_high_pct = Column(Float)
+    reading_avg_score = Column(Float)
+    writing_expected_pct = Column(Float)
+    writing_gd_pct = Column(Float)
+    maths_expected_pct = Column(Float)
+    maths_high_pct = Column(Float)
+    maths_avg_score = Column(Float)
+    gps_expected_pct = Column(Float)
+    gps_high_pct = Column(Float)
+    gps_avg_score = Column(Float)
+    science_expected_pct = Column(Float)
diff --git a/pipeline/dags/school_data_pipeline.py b/pipeline/dags/school_data_pipeline.py
index e7457d8..a6c5c08 100644
--- a/pipeline/dags/school_data_pipeline.py
+++ b/pipeline/dags/school_data_pipeline.py
@@ -137,10 +137,16 @@ with DAG(
         task_id="extract_ees",
         bash_command=f"cd {PIPELINE_DIR} && {MELTANO_BIN} run tap-uk-ees target-postgres",
     )
+    # KS2 national headlines run in parallel — small single-CSV download.
+    # Uses `elt` because stream selection (--select) is not an option of `meltano run`.
+    extract_ks2_national = BashOperator(
+        task_id="extract_ks2_national",
+        bash_command=f"cd {PIPELINE_DIR} && {MELTANO_BIN} elt tap-uk-ees target-postgres --select ees_ks2_national",
+    )
 
     dbt_build_ees = BashOperator(
         task_id="dbt_build",
-        bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_legacy_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+",
+        bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_legacy_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+ stg_ees_ks2_national+",
     )
 
     sync_typesense_ees = BashOperator(
diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
index ae2957b..d0d9300 100644
--- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
+++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
@@ -452,6 +452,99 @@ class EESAdmissionsStream(EESDatasetStream):
 # on EES. Only national and LA-level files are published.
 
 
+# ── KS2 National Headlines (national level only — one row per year) ───────────
+# Dataset: "Key stage 2 attainment: national headlines"
+# URL: https://explore-education-statistics.service.gov.uk/data-catalogue/data-set/
+#   58bb4b03-c6df-447f-bb7e-b82970c4d974/csv
+# This is a stable data-catalogue CSV endpoint (not a versioned release ZIP).
+# Covers 2015/16 → latest; COVID years (2019/20, 2020/21) are suppressed ('x').
+
+_KS2_NATIONAL_CSV_URL = (
+    "https://explore-education-statistics.service.gov.uk/data-catalogue/"
+    "data-set/58bb4b03-c6df-447f-bb7e-b82970c4d974/csv"
+)
+
+_KS2_NATIONAL_COL_MAP = {
+    "pt_rwm_exp": "rwm_expected_pct",
+    "pt_rwm_high": "rwm_high_pct",
+    "pt_read_exp": "reading_expected_pct",
+    "pt_read_high": "reading_high_pct",
+    "pt_mat_exp": "maths_expected_pct",
+    "pt_mat_high": "maths_high_pct",
+    "pt_writta_exp": "writing_expected_pct",
+    "pt_writta_gd": "writing_gd_pct",
+    "pt_gps_exp": "gps_expected_pct",
+    "pt_gps_high": "gps_high_pct",
+    "pt_scita_exp": "science_expected_pct",
+    "avg_readscore": "reading_avg_score",
+    "avg_gpsscore": "gps_avg_score",
+    "avg_matscore": "maths_avg_score",
+}
+
+
+class EESKs2NationalStream(Stream):
+    """National KS2 headline averages — one row per academic year.
+
+    Fetches the DfE EES data-catalogue CSV directly (stable URL, not versioned
+    release ZIP). Filters to geographic_level == 'National' and
+    school_type == 'AllSchools' so only the England-wide headline row per year
+    is emitted. Values are emitted as raw strings; suppressed cells ('x') pass
+    through unchanged and are coerced to NULL downstream in dbt staging.
+    """
+
+    name = "ees_ks2_national"
+    primary_keys = ["time_period"]
+    replication_key = None
+
+    schema = th.PropertiesList(
+        th.Property("time_period", th.StringType, required=True),
+        th.Property("rwm_expected_pct", th.StringType),
+        th.Property("rwm_high_pct", th.StringType),
+        th.Property("reading_expected_pct", th.StringType),
+        th.Property("reading_high_pct", th.StringType),
+        th.Property("maths_expected_pct", th.StringType),
+        th.Property("maths_high_pct", th.StringType),
+        th.Property("writing_expected_pct", th.StringType),
+        th.Property("writing_gd_pct", th.StringType),
+        th.Property("gps_expected_pct", th.StringType),
+        th.Property("gps_high_pct", th.StringType),
+        th.Property("science_expected_pct", th.StringType),
+        th.Property("reading_avg_score", th.StringType),
+        th.Property("gps_avg_score", th.StringType),
+        th.Property("maths_avg_score", th.StringType),
+    ).to_dict()
+
+    def get_records(self, context):
+        import pandas as pd
+
+        self.logger.info("Downloading KS2 national headlines: %s", _KS2_NATIONAL_CSV_URL)
+        resp = requests.get(_KS2_NATIONAL_CSV_URL, timeout=60)
+        resp.raise_for_status()
+
+        df = pd.read_csv(
+            io.BytesIO(resp.content),
+            dtype=str,
+            keep_default_na=False,
+        )
+
+        # Normalise column names to lowercase
+        df.columns = [c.strip().lower() for c in df.columns]
+
+        # Keep only the England national headline row per year
+        if "geographic_level" in df.columns:
+            df = df[df["geographic_level"].str.strip().str.lower() == "national"]
+        if "school_type" in df.columns:
+            df = df[df["school_type"].str.strip().str.lower() == "allschools"]
+
+        self.logger.info("Emitting %d national KS2 rows", len(df))
+
+        for _, row in df.iterrows():
+            record = {"time_period": row.get("time_period", "").strip()}
+            for csv_col, field in _KS2_NATIONAL_COL_MAP.items():
+                record[field] = row.get(csv_col, "").strip()
+            yield record
+
+
 # ── Legacy KS2 (pre-COVID wide format from DfE performance tables) ────────────
 # The DfE "Compare School Performance" site published school-level KS2 CSVs
 # in a wide format (one row per school, ~300 columns). EES only has school-level
@@ -629,6 +722,7 @@ class TapUKEES(Tap):
             EESCensusStream(self),
             EESAdmissionsStream(self),
             LegacyKS2Stream(self),
+            EESKs2NationalStream(self),
         ]
 
 
diff --git a/pipeline/transform/models/marts/_marts_schema.yml b/pipeline/transform/models/marts/_marts_schema.yml
index 4291cfb..803874a 100644
--- a/pipeline/transform/models/marts/_marts_schema.yml
+++ b/pipeline/transform/models/marts/_marts_schema.yml
@@ -111,6 +111,12 @@ models:
       - name: urn
         tests: [not_null]
 
+  - name: fact_ks2_national_averages
+    description: Official DfE KS2 national headline averages — one row per academic year
+    columns:
+      - name: year
+        tests: [not_null, unique]
+
   - name: fact_deprivation
     description: IDACI deprivation index — one row per URN
     columns:
diff --git a/pipeline/transform/models/marts/fact_ks2_national_averages.sql b/pipeline/transform/models/marts/fact_ks2_national_averages.sql
new file mode 100644
index 0000000..1465afd
--- /dev/null
+++ b/pipeline/transform/models/marts/fact_ks2_national_averages.sql
@@ -0,0 +1,25 @@
+{{ config(materialized='table') }}
+
+-- Mart: Official DfE KS2 national headline averages — one row per academic year.
+-- These are the published England-wide figures, not computed means from our school dataset.
+-- Used by the /api/national-averages endpoint to provide accurate per-year reference lines
+-- on the school history chart and for hero stat comparisons.
+
+select
+    year,
+    rwm_expected_pct,
+    rwm_high_pct,
+    reading_expected_pct,
+    reading_high_pct,
+    reading_avg_score,
+    writing_expected_pct,
+    writing_gd_pct,
+    maths_expected_pct,
+    maths_high_pct,
+    maths_avg_score,
+    gps_expected_pct,
+    gps_high_pct,
+    gps_avg_score,
+    science_expected_pct
+from {{ ref('stg_ees_ks2_national') }}
+order by year
diff --git a/pipeline/transform/models/staging/_stg_sources.yml b/pipeline/transform/models/staging/_stg_sources.yml
index f3976fd..b4b852e 100644
--- a/pipeline/transform/models/staging/_stg_sources.yml
+++ b/pipeline/transform/models/staging/_stg_sources.yml
@@ -45,6 +45,9 @@ sources:
       - name: ees_admissions
         description: Primary and secondary school admissions data
 
+      - name: ees_ks2_national
+        description: KS2 national headline averages from DfE EES data catalogue — one row per academic year
+
       # Phonics: no school-level data on EES (only national/LA level)
 
       - name: parent_view
diff --git a/pipeline/transform/models/staging/stg_ees_ks2_national.sql b/pipeline/transform/models/staging/stg_ees_ks2_national.sql
new file mode 100644
index 0000000..7335ce6
--- /dev/null
+++ b/pipeline/transform/models/staging/stg_ees_ks2_national.sql
@@ -0,0 +1,35 @@
+{{ config(materialized='table') }}
+
+-- Staging model: DfE KS2 national headline averages
+-- Source: EES data catalogue CSV (one row per academic year, England national total)
+-- COVID years 2019/20 and 2020/21 are naturally absent — DfE did not publish figures
+-- because national assessments were cancelled. Those years produce no rows here.
+-- 'x' (not applicable) and suppressed values are coerced to NULL by safe_numeric.
+
+select
+    cast(trim(time_period) as integer) as year,
+
+    {{ safe_numeric('rwm_expected_pct') }} as rwm_expected_pct,
+    {{ safe_numeric('rwm_high_pct') }} as rwm_high_pct,
+
+    {{ safe_numeric('reading_expected_pct') }} as reading_expected_pct,
+    {{ safe_numeric('reading_high_pct') }} as reading_high_pct,
+    {{ safe_numeric('reading_avg_score') }} as reading_avg_score,
+
+    {{ safe_numeric('writing_expected_pct') }} as writing_expected_pct,
+    {{ safe_numeric('writing_gd_pct') }} as writing_gd_pct,
+
+    {{ safe_numeric('maths_expected_pct') }} as maths_expected_pct,
+    {{ safe_numeric('maths_high_pct') }} as maths_high_pct,
+    {{ safe_numeric('maths_avg_score') }} as maths_avg_score,
+
+    {{ safe_numeric('gps_expected_pct') }} as gps_expected_pct,
+    {{ safe_numeric('gps_high_pct') }} as gps_high_pct,
+    {{ safe_numeric('gps_avg_score') }} as gps_avg_score,
+
+    {{ safe_numeric('science_expected_pct') }} as science_expected_pct
+
+from {{ source('raw', 'ees_ks2_national') }}
+-- CASE forces the digit check to run before the cast; a plain AND does not guarantee order.
+where case when trim(time_period) ~ '^[0-9]+$'
+           then cast(trim(time_period) as integer) >= 201617 else false end