feat(data): integrate 9 UK government data sources via Kestra

Adds a full data integration pipeline for enriching school profiles with supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections, ofsted_parent_view, school_census, admissions, sen_detail, phonics, school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions, sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00
parent c49593d4d6
commit dd49ef28b2
36 changed files with 2849 additions and 8 deletions
--- a/integrator/scripts/sources/finance.py
+++ b/integrator/scripts/sources/finance.py
@@ -0,0 +1,143 @@
+"""
+FBIT (Financial Benchmarking and Insights Tool) financial data loader.
+
+Source: https://schools-financial-benchmarking.service.gov.uk/api/
+Update: Annual (December — data for the prior financial year)
+"""
+import argparse
+import sys
+import time
+from pathlib import Path
+
+import pandas as pd
+import requests
+
+# Put the directory above this file's directory (scripts/) at the front of
+# sys.path so the bare `config` and `db` imports below can be resolved when
+# this module is executed directly — presumably that is where they live.
+sys.path.insert(0, str(Path(__file__).parent.parent))
+from config import SUPPLEMENTARY_DIR
+from db import get_session
+
+# Default directory for the fbit_<year>.csv files; used by download() and
+# load() whenever no explicit data_dir is passed.
+DEST_DIR = SUPPLEMENTARY_DIR / "finance"
+# Base URL of the FBIT API; download() appends the per-school endpoint to it.
+API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
+# download() sleeps for this long after every per-URN request.
+RATE_LIMIT_DELAY = 0.1   # seconds between requests
+
+
+def download(data_dir: Path | None = None) -> Path:
+    """
+    Fetch per-URN financial data from FBIT API and save as CSV.
+    Batches all school URNs from the database.
+    """
+    dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
+    dest.mkdir(parents=True, exist_ok=True)
+
+    # Determine year from API (use current year minus 1 for completed financials)
+    from datetime import date
+    year = date.today().year - 1
+    dest_file = dest / f"fbit_{year}.csv"
+
+    if dest_file.exists():
+        print(f"  Finance: {dest_file.name} already exists, skipping download.")
+        return dest_file
+
+    # Get all URNs from the database
+    with get_session() as session:
+        from sqlalchemy import text
+        rows = session.execute(text("SELECT urn FROM schools")).fetchall()
+    urns = [r[0] for r in rows]
+    print(f"  Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")
+
+    records = []
+    errors = 0
+    for i, urn in enumerate(urns):
+        if i % 500 == 0:
+            print(f"    {i}/{len(urns)} ...")
+        try:
+            resp = requests.get(
+                f"{API_BASE}/schoolFinancialDataObject/{urn}",
+                timeout=10,
+            )
+            if resp.status_code == 200:
+                data = resp.json()
+                if data:
+                    records.append({
+                        "urn": urn,
+                        "year": year,
+                        "per_pupil_spend": data.get("totalExpenditure") and
+                                           data.get("numberOfPupils") and
+                                           round(data["totalExpenditure"] / data["numberOfPupils"], 2),
+                        "staff_cost_pct": data.get("staffCostPercent"),
+                        "teacher_cost_pct": data.get("teachingStaffCostPercent"),
+                        "support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
+                        "premises_cost_pct": data.get("premisesStaffCostPercent"),
+                    })
+            elif resp.status_code not in (404, 400):
+                errors += 1
+        except Exception:
+            errors += 1
+
+        time.sleep(RATE_LIMIT_DELAY)
+
+    df = pd.DataFrame(records)
+    df.to_csv(dest_file, index=False)
+    print(f"  Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
+    return dest_file
+
+
+def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
+    if path is None:
+        dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
+        files = sorted(dest.glob("fbit_*.csv"))
+        if not files:
+            raise FileNotFoundError(f"No finance CSV found in {dest}")
+        path = files[-1]
+
+    print(f"  Finance: loading {path} ...")
+    df = pd.read_csv(path)
+
+    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
+    df = df.dropna(subset=["urn"])
+    df["urn"] = df["urn"].astype(int)
+
+    inserted = 0
+    with get_session() as session:
+        from sqlalchemy import text
+        for _, row in df.iterrows():
+            session.execute(
+                text("""
+                    INSERT INTO school_finance
+                        (urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
+                         support_staff_cost_pct, premises_cost_pct)
+                    VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
+                    ON CONFLICT (urn, year) DO UPDATE SET
+                        per_pupil_spend        = EXCLUDED.per_pupil_spend,
+                        staff_cost_pct         = EXCLUDED.staff_cost_pct,
+                        teacher_cost_pct       = EXCLUDED.teacher_cost_pct,
+                        support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
+                        premises_cost_pct      = EXCLUDED.premises_cost_pct
+                """),
+                {
+                    "urn": int(row["urn"]),
+                    "year": int(row["year"]),
+                    "per_pupil": float(row["per_pupil_spend"]) if pd.notna(row.get("per_pupil_spend")) else None,
+                    "staff": float(row["staff_cost_pct"]) if pd.notna(row.get("staff_cost_pct")) else None,
+                    "teacher": float(row["teacher_cost_pct"]) if pd.notna(row.get("teacher_cost_pct")) else None,
+                    "support": float(row["support_staff_cost_pct"]) if pd.notna(row.get("support_staff_cost_pct")) else None,
+                    "premises": float(row["premises_cost_pct"]) if pd.notna(row.get("premises_cost_pct")) else None,
+                },
+            )
+            inserted += 1
+            if inserted % 2000 == 0:
+                session.flush()
+
+    print(f"  Finance: upserted {inserted} records")
+    return {"inserted": inserted, "updated": 0, "skipped": 0}
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
+    parser.add_argument("--data-dir", type=Path, default=None)
+    args = parser.parse_args()
+    if args.action in ("download", "all"):
+        download(args.data_dir)
+    if args.action in ("load", "all"):
+        load(data_dir=args.data_dir)