feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.
Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
ofsted_parent_view, school_census, admissions, sen_detail, phonics,
school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date
Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry
Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
159
integrator/scripts/sources/gias.py
Normal file
159
integrator/scripts/sources/gias.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
GIAS (Get Information About Schools) bulk CSV downloader and loader.
|
||||
|
||||
Source: https://get-information-schools.service.gov.uk/Downloads
|
||||
Update: Daily; we refresh weekly.
|
||||
Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from config import SUPPLEMENTARY_DIR
|
||||
from db import get_session
|
||||
|
||||
# Where downloaded GIAS CSVs land by default (under the shared supplementary data root).
DEST_DIR = SUPPLEMENTARY_DIR / "gias"
||||
# GIAS bulk download URL — date is injected at runtime
GIAS_URL_TEMPLATE = (
    "https://ea-edubase-api-prod.azurewebsites.net"
    "/edubase/downloads/public/edubasealldata{date}.csv"
)

# Maps the raw GIAS CSV headers to our snake_case column names.
# Head* parts are later combined into a single headteacher_name;
# nursery_provision_raw is later collapsed to a boolean.
COLUMN_MAP = {
    "URN": "urn",                                       # join key against schools
    "SchoolWebsite": "website",
    "SchoolCapacity": "capacity",
    "TrustName": "trust_name",
    "TrustUID": "trust_uid",
    "Gender (name)": "gender",
    "NurseryProvision (name)": "nursery_provision_raw",
    "HeadTitle": "head_title",
    "HeadFirstName": "head_first",
    "HeadLastName": "head_last",
}
|
||||
|
||||
|
||||
def download(data_dir: Path | None = None) -> Path:
    """Download the GIAS bulk CSV for today, falling back to yesterday.

    GIAS publishes one file per day; today's file may not be up yet when we
    run, so a 404 on today's URL triggers one retry with yesterday's date.
    Re-runs are idempotent: an already-downloaded file is reused.

    Args:
        data_dir: Optional root data directory; when given, files go under
            ``<data_dir>/supplementary/gias``, otherwise under ``DEST_DIR``.

    Returns:
        Path to the downloaded (or already-present) CSV file.

    Raises:
        requests.HTTPError: if neither today's nor yesterday's file can be fetched.
    """
    from datetime import timedelta

    dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    candidates = [
        date.today().strftime("%Y%m%d"),
        (date.today() - timedelta(days=1)).strftime("%Y%m%d"),
    ]

    resp = None
    dest_file = None
    for stamp in candidates:
        url = GIAS_URL_TEMPLATE.format(date=stamp)
        dest_file = dest / f"gias_{stamp}.csv"

        if dest_file.exists():
            print(f" GIAS: {dest_file.name} already exists, skipping download.")
            return dest_file

        print(f" GIAS: downloading {url} ...")
        resp = requests.get(url, timeout=300, stream=True)
        if resp.status_code != 404:
            break
        # Release the 404 connection before retrying with the previous day.
        resp.close()

    resp.raise_for_status()
    with open(dest_file, "wb") as f:
        for chunk in resp.iter_content(chunk_size=65536):
            f.write(chunk)

    print(f" GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file
|
||||
|
||||
|
||||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a GIAS CSV and update matching rows in the ``schools`` table.

    GIAS only *enriches* existing school rows (keyed on URN); it never
    inserts new ones.

    Args:
        path: Explicit CSV to load; when None, the newest ``gias_*.csv``
            in the destination directory is used.
        data_dir: Optional root data directory (same meaning as in ``download``).

    Returns:
        Counts dict ``{"inserted": 0, "updated": N, "skipped": 0}``.

    Raises:
        FileNotFoundError: if ``path`` is None and no GIAS CSV is present.
        ValueError: if the CSV has no URN column after renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
        # gias_YYYYMMDD.csv: lexicographic sort == chronological order.
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        path = files[-1]

    print(f" GIAS: loading {path} ...")
    # GIAS ships latin-1; low_memory=False avoids chunked dtype guessing.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # URN is the join key against schools; drop rows without a usable one.
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Capacity may arrive as "850", as 850.0 (pandas float-ifies numeric
    # columns containing blanks), or empty. The previous str.isdigit()
    # check silently rejected float-formatted values; coerce numerically.
    if "capacity" in df.columns:
        df["capacity"] = pd.to_numeric(df["capacity"], errors="coerce")
    else:
        df["capacity"] = None

    # Build headteacher_name from parts
    def build_name(row):
        parts = [
            str(row.get("head_title", "") or "").strip(),
            str(row.get("head_first", "") or "").strip(),
            str(row.get("head_last", "") or "").strip(),
        ]
        return " ".join(p for p in parts if p) or None

    df["headteacher_name"] = df.apply(build_name, axis=1)

    # "Has ..." -> True, other non-null values -> False, missing -> None.
    if "nursery_provision_raw" in df.columns:
        df["nursery_provision"] = df["nursery_provision_raw"].apply(
            lambda v: True if str(v).strip().lower().startswith("has") else False if pd.notna(v) else None
        )
    else:
        df["nursery_provision"] = None

    def clean_str(val):
        # Normalise pandas NaN / "nan" / "none" / empty strings to SQL NULL.
        s = str(val).strip() if pd.notna(val) else None
        return s if s and s.lower() not in ("nan", "none", "") else None

    updated = 0
    with get_session() as session:
        from sqlalchemy import text

        # Statement is loop-invariant: build it once, reuse per row.
        stmt = text("""
            UPDATE schools SET
                website = :website,
                headteacher_name = :headteacher_name,
                capacity = :capacity,
                trust_name = :trust_name,
                trust_uid = :trust_uid,
                gender = :gender,
                nursery_provision = :nursery_provision
            WHERE urn = :urn
        """)

        for _, row in df.iterrows():
            session.execute(
                stmt,
                {
                    "urn": int(row["urn"]),
                    "website": clean_str(row.get("website")),
                    "headteacher_name": row.get("headteacher_name"),
                    "capacity": int(row["capacity"]) if pd.notna(row["capacity"]) else None,
                    "trust_name": clean_str(row.get("trust_name")),
                    "trust_uid": clean_str(row.get("trust_uid")),
                    "gender": clean_str(row.get("gender")),
                    "nursery_provision": row.get("nursery_provision"),
                },
            )
            updated += 1
            if updated % 5000 == 0:
                session.flush()
                print(f" Updated {updated} schools...")

    print(f" GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": 0}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: run the download step, the load step, or both.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    wants_download = args.action in ("download", "all")
    wants_load = args.action in ("load", "all")

    if wants_download:
        path = download(args.data_dir)
    if wants_load:
        load(data_dir=args.data_dir)
|
||||
Reference in New Issue
Block a user