feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.
Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
ofsted_parent_view, school_census, admissions, sen_detail, phonics,
school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date
Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry
Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
148
integrator/scripts/sources/census.py
Normal file
148
integrator/scripts/sources/census.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
School Census (SPC) downloader and loader.
|
||||
|
||||
Source: EES publication "schools-pupils-and-their-characteristics"
|
||||
Update: Annual (June)
|
||||
Adds: class_size_avg, ethnicity breakdown by school
|
||||
"""
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from config import SUPPLEMENTARY_DIR
|
||||
from db import get_session
|
||||
from sources.ees import get_latest_csv_url, download_csv
|
||||
|
||||
# Default download destination for census CSVs (overridable via --data-dir).
DEST_DIR = SUPPLEMENTARY_DIR / "census"
# EES publication slug this source scrapes for the latest release.
PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"

# DfE suppression / placeholder codes treated as missing values when parsing
# numeric cells (see _parse_pct).
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

# Maps raw CSV headers onto the canonical column names used by the
# school_census table. Both lowercase (newer EES extracts) and uppercase
# (legacy DfE extracts) header variants are mapped, since the publication's
# header style has varied across years.
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    # Class size
    "average_class_size": "class_size_avg",
    "AVCLAS": "class_size_avg",
    "avg_class_size": "class_size_avg",
    # Ethnicity — DfE uses ethnicity major group percentages
    "perc_white": "ethnicity_white_pct",
    "perc_asian": "ethnicity_asian_pct",
    "perc_black": "ethnicity_black_pct",
    "perc_mixed": "ethnicity_mixed_pct",
    "perc_other_ethnic": "ethnicity_other_pct",
    "PTWHITE": "ethnicity_white_pct",
    "PTASIAN": "ethnicity_asian_pct",
    "PTBLACK": "ethnicity_black_pct",
    "PTMIXED": "ethnicity_mixed_pct",
    "PTOTHER": "ethnicity_other_pct",
}
|
||||
|
||||
|
||||
def download(data_dir: Path | None = None) -> Path:
    """Download the latest School Census CSV from the EES publication.

    Args:
        data_dir: Optional base data directory. When given, the file is
            saved under ``<data_dir>/supplementary/census``; otherwise the
            module-level ``DEST_DIR`` is used.

    Returns:
        Path to the downloaded CSV file.

    Raises:
        RuntimeError: If no CSV URL could be discovered for the publication.
    """
    dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        # Fixed message: the original used an f-string with no placeholders.
        raise RuntimeError("Could not find CSV URL for census publication")

    # Take the last URL path segment, drop any query string; fall back to a
    # fixed name if the URL ends with a slash.
    filename = url.split("/")[-1].split("?")[0] or "census_latest.csv"
    return download_csv(url, dest / filename)
|
||||
|
||||
|
||||
def _parse_pct(val) -> float | None:
    """Coerce a raw census cell to ``float``, or ``None`` when missing.

    Strips a trailing "%" sign, treats DfE suppression codes (``NULL_VALUES``)
    as missing, and swallows unparseable text rather than raising.
    """
    if pd.isna(val):
        return None
    cleaned = str(val).strip().upper().replace("%", "")
    if cleaned in NULL_VALUES:
        return None
    try:
        parsed = float(cleaned)
    except ValueError:
        return None
    return parsed
|
||||
|
||||
|
||||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a School Census CSV into the ``school_census`` table.

    Args:
        path: Explicit CSV to load. When ``None``, the newest ``*.csv``
            (by sorted filename) in the census directory is used.
        data_dir: Optional base data directory used to locate the census
            directory when ``path`` is not given.

    Returns:
        Summary dict with ``inserted``/``updated``/``skipped`` counts.
        NOTE: ``inserted`` counts every upsert executed; updates are not
        distinguished from inserts (``updated`` is always 0).

    Raises:
        FileNotFoundError: If no CSV is found in the census directory.
        ValueError: If no URN column is present after header renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
        # Sorted by filename — assumes release filenames sort chronologically,
        # so the last entry is the most recent download. TODO confirm.
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No census CSV found in {dest}")
        path = files[-1]

    print(f" Census: loading {path} ...")
    # latin-1 decoding: DfE extracts are not reliably UTF-8.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Drop rows without a numeric URN (aggregate/LA-level rows, blanks).
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Fallback academic year inferred from the filename (e.g. "...2024...")
    # when the CSV has no per-row year column.
    year = None
    m = re.search(r"20(\d{2})", path.stem)
    if m:
        year = int("20" + m.group(1))

    inserted = 0
    with get_session() as session:
        from sqlalchemy import text
        for _, row in df.iterrows():
            urn = int(row["urn"])
            # Prefer the row's own year; fall back to the filename-derived one.
            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
            if not row_year:
                # No usable year at all — cannot satisfy the (urn, year) key.
                continue

            # Upsert keyed on (urn, year) so re-running a load is idempotent.
            session.execute(
                text("""
                    INSERT INTO school_census
                        (urn, year, class_size_avg,
                         ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,
                         ethnicity_mixed_pct, ethnicity_other_pct)
                    VALUES (:urn, :year, :class_size_avg,
                            :white, :asian, :black, :mixed, :other)
                    ON CONFLICT (urn, year) DO UPDATE SET
                        class_size_avg = EXCLUDED.class_size_avg,
                        ethnicity_white_pct = EXCLUDED.ethnicity_white_pct,
                        ethnicity_asian_pct = EXCLUDED.ethnicity_asian_pct,
                        ethnicity_black_pct = EXCLUDED.ethnicity_black_pct,
                        ethnicity_mixed_pct = EXCLUDED.ethnicity_mixed_pct,
                        ethnicity_other_pct = EXCLUDED.ethnicity_other_pct
                """),
                {
                    "urn": urn,
                    "year": row_year,
                    # _parse_pct doubles as a general numeric cleaner here:
                    # class_size_avg is not a percentage but shares the same
                    # suppression codes.
                    "class_size_avg": _parse_pct(row.get("class_size_avg")),
                    "white": _parse_pct(row.get("ethnicity_white_pct")),
                    "asian": _parse_pct(row.get("ethnicity_asian_pct")),
                    "black": _parse_pct(row.get("ethnicity_black_pct")),
                    "mixed": _parse_pct(row.get("ethnicity_mixed_pct")),
                    "other": _parse_pct(row.get("ethnicity_other_pct")),
                },
            )
            inserted += 1
            # Periodic flush keeps the pending-statement buffer bounded on
            # large files; the commit itself is handled by get_session().
            if inserted % 5000 == 0:
                session.flush()

    print(f" Census: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: run the download step, the load step, or both.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    arg_parser.add_argument("--data-dir", type=Path, default=None)
    opts = arg_parser.parse_args()

    # choices restricts action to {download, load, all}, so these negated
    # checks are equivalent to membership in ("download", "all") etc.
    if opts.action != "load":
        download(opts.data_dir)
    if opts.action != "download":
        load(data_dir=opts.data_dir)
|
||||
Reference in New Issue
Block a user