feat(data): integrate 9 UK government data sources via Kestra
Some checks failed
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled

Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 11:44:04 +00:00
parent c49593d4d6
commit dd49ef28b2
36 changed files with 2849 additions and 8 deletions

View File

@@ -0,0 +1,143 @@
"""
FBIT (Financial Benchmarking and Insights Tool) financial data loader.
Source: https://schools-financial-benchmarking.service.gov.uk/api/
Update: Annual (December — data for the prior financial year)
"""
import argparse
import sys
import time
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# Default output/input directory for finance CSVs; used by download() and
# load() whenever no explicit data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "finance"
# Root of the FBIT HTTP API; per-school endpoints are built on top of this.
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
# Pause applied after every per-URN request in download().
RATE_LIMIT_DELAY = 0.1 # seconds between requests
def download(data_dir: Path | None = None) -> Path:
"""
Fetch per-URN financial data from FBIT API and save as CSV.
Batches all school URNs from the database.
"""
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
# Determine year from API (use current year minus 1 for completed financials)
from datetime import date
year = date.today().year - 1
dest_file = dest / f"fbit_{year}.csv"
if dest_file.exists():
print(f" Finance: {dest_file.name} already exists, skipping download.")
return dest_file
# Get all URNs from the database
with get_session() as session:
from sqlalchemy import text
rows = session.execute(text("SELECT urn FROM schools")).fetchall()
urns = [r[0] for r in rows]
print(f" Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")
records = []
errors = 0
for i, urn in enumerate(urns):
if i % 500 == 0:
print(f" {i}/{len(urns)} ...")
try:
resp = requests.get(
f"{API_BASE}/schoolFinancialDataObject/{urn}",
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data:
records.append({
"urn": urn,
"year": year,
"per_pupil_spend": data.get("totalExpenditure") and
data.get("numberOfPupils") and
round(data["totalExpenditure"] / data["numberOfPupils"], 2),
"staff_cost_pct": data.get("staffCostPercent"),
"teacher_cost_pct": data.get("teachingStaffCostPercent"),
"support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
"premises_cost_pct": data.get("premisesStaffCostPercent"),
})
elif resp.status_code not in (404, 400):
errors += 1
except Exception:
errors += 1
time.sleep(RATE_LIMIT_DELAY)
df = pd.DataFrame(records)
df.to_csv(dest_file, index=False)
print(f" Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
return dest_file
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
files = sorted(dest.glob("fbit_*.csv"))
if not files:
raise FileNotFoundError(f"No finance CSV found in {dest}")
path = files[-1]
print(f" Finance: loading {path} ...")
df = pd.read_csv(path)
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
session.execute(
text("""
INSERT INTO school_finance
(urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
support_staff_cost_pct, premises_cost_pct)
VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
ON CONFLICT (urn, year) DO UPDATE SET
per_pupil_spend = EXCLUDED.per_pupil_spend,
staff_cost_pct = EXCLUDED.staff_cost_pct,
teacher_cost_pct = EXCLUDED.teacher_cost_pct,
support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
premises_cost_pct = EXCLUDED.premises_cost_pct
"""),
{
"urn": int(row["urn"]),
"year": int(row["year"]),
"per_pupil": float(row["per_pupil_spend"]) if pd.notna(row.get("per_pupil_spend")) else None,
"staff": float(row["staff_cost_pct"]) if pd.notna(row.get("staff_cost_pct")) else None,
"teacher": float(row["teacher_cost_pct"]) if pd.notna(row.get("teacher_cost_pct")) else None,
"support": float(row["support_staff_cost_pct"]) if pd.notna(row.get("support_staff_cost_pct")) else None,
"premises": float(row["premises_cost_pct"]) if pd.notna(row.get("premises_cost_pct")) else None,
},
)
inserted += 1
if inserted % 2000 == 0:
session.flush()
print(f" Finance: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
    # Command-line entry point: run the download step, the load step, or both.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # "all" expands to both steps; otherwise only the named step runs.
    steps = {"download", "load"} if opts.action == "all" else {opts.action}
    if "download" in steps:
        download(opts.data_dir)
    if "load" in steps:
        load(data_dir=opts.data_dir)