"""
FBIT (Financial Benchmarking and Insights Tool) financial data loader.

Source: https://schools-financial-benchmarking.service.gov.uk/api/
Update: Annual (December — data for the prior financial year)
"""
import argparse
import sys
import time
from datetime import date
from pathlib import Path

import pandas as pd
import requests
from sqlalchemy import text

sys.path.insert(0, str(Path(__file__).parent.parent))

from config import SUPPLEMENTARY_DIR
from db import get_session

DEST_DIR = SUPPLEMENTARY_DIR / "finance"
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
RATE_LIMIT_DELAY = 0.1  # seconds between requests

# Column order of the CSV written by download() and read back by load().
_CSV_COLUMNS = [
    "urn",
    "year",
    "per_pupil_spend",
    "staff_cost_pct",
    "teacher_cost_pct",
    "support_staff_cost_pct",
    "premises_cost_pct",
]

# SQL bind-parameter name -> CSV column, for the nullable float columns.
_OPTIONAL_FLOAT_PARAMS = {
    "per_pupil": "per_pupil_spend",
    "staff": "staff_cost_pct",
    "teacher": "teacher_cost_pct",
    "support": "support_staff_cost_pct",
    "premises": "premises_cost_pct",
}

# Only the first few per-URN failures are printed so that a systematic
# outage does not flood the console with thousands of identical lines.
_MAX_REPORTED_ERRORS = 5

_UPSERT_SQL = text("""
    INSERT INTO school_finance
        (urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
         support_staff_cost_pct, premises_cost_pct)
    VALUES
        (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
    ON CONFLICT (urn, year) DO UPDATE SET
        per_pupil_spend = EXCLUDED.per_pupil_spend,
        staff_cost_pct = EXCLUDED.staff_cost_pct,
        teacher_cost_pct = EXCLUDED.teacher_cost_pct,
        support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
        premises_cost_pct = EXCLUDED.premises_cost_pct
""")


def _finance_dir(data_dir: Path | None) -> Path:
    """Return the finance CSV directory under *data_dir*, or DEST_DIR if None."""
    if data_dir is None:
        return DEST_DIR
    return data_dir / "supplementary" / "finance"


def _per_pupil_spend(total_expenditure, pupil_count) -> float | None:
    """Return expenditure per pupil rounded to 2 dp, or None if not computable.

    None is returned when the expenditure is missing or the pupil count is
    missing or zero, so that "unknown" is never stored as a spend of 0.
    """
    if total_expenditure is None or not pupil_count:
        return None
    return round(total_expenditure / pupil_count, 2)


def _optional_float(value) -> float | None:
    """Convert a CSV cell to float, mapping missing/NaN values to None."""
    return float(value) if pd.notna(value) else None


def _fetch_record(http: requests.Session, urn, year: int) -> dict | None:
    """Fetch one school's financial data and shape it into a CSV row.

    Returns None when the API answers 400/404 or sends an empty payload.

    Raises:
        requests.RequestException: on network failure or an unexpected
            HTTP status.
        ValueError: if the body is not valid JSON or not a JSON object.
        TypeError: if the expenditure/pupil fields are not numeric.
    """
    resp = http.get(f"{API_BASE}/schoolFinancialDataObject/{urn}", timeout=10)
    if resp.status_code in (400, 404):
        return None
    if resp.status_code != 200:
        raise requests.HTTPError(
            f"unexpected status {resp.status_code}", response=resp
        )

    data = resp.json()
    if not data:
        return None
    if not isinstance(data, dict):
        raise ValueError(f"unexpected payload type {type(data).__name__}")

    return {
        "urn": urn,
        "year": year,
        "per_pupil_spend": _per_pupil_spend(
            data.get("totalExpenditure"), data.get("numberOfPupils")
        ),
        "staff_cost_pct": data.get("staffCostPercent"),
        "teacher_cost_pct": data.get("teachingStaffCostPercent"),
        "support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
        # NOTE(review): the key says "premisesStaff..." while the column is
        # premises cost — confirm against the API schema.
        "premises_cost_pct": data.get("premisesStaffCostPercent"),
    }


def download(data_dir: Path | None = None) -> Path:
    """
    Fetch per-URN financial data from FBIT API and save as CSV.

    Queries the API once for every URN in the ``schools`` table, pausing
    RATE_LIMIT_DELAY seconds between requests. If the CSV for the target
    year already exists, nothing is fetched.

    Args:
        data_dir: Optional data root; the CSV goes to
            ``<data_dir>/supplementary/finance``. Defaults to DEST_DIR.

    Returns:
        Path of the (new or pre-existing) ``fbit_<year>.csv`` file.
    """
    dest = _finance_dir(data_dir)
    dest.mkdir(parents=True, exist_ok=True)

    # The API is not asked which year it serves; the previous calendar year
    # is assumed to be the most recent completed financial year.
    year = date.today().year - 1
    dest_file = dest / f"fbit_{year}.csv"

    if dest_file.exists():
        print(f" Finance: {dest_file.name} already exists, skipping download.")
        return dest_file

    # Get all URNs from the database
    with get_session() as session:
        rows = session.execute(text("SELECT urn FROM schools")).fetchall()
    urns = [r[0] for r in rows]

    print(f" Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")

    records = []
    errors = 0
    # A shared Session reuses the HTTPS connection across requests.
    with requests.Session() as http:
        for i, urn in enumerate(urns):
            if i % 500 == 0:
                print(f" {i}/{len(urns)} ...")
            try:
                record = _fetch_record(http, urn, year)
            except (requests.RequestException, ValueError, TypeError) as exc:
                # Best effort: one bad school must not abort the whole run.
                errors += 1
                if errors <= _MAX_REPORTED_ERRORS:
                    print(f" Finance: URN {urn} failed: {exc!r}", file=sys.stderr)
            else:
                if record is not None:
                    records.append(record)
            time.sleep(RATE_LIMIT_DELAY)

    # Explicit columns guarantee a header row even with zero records, so the
    # file stays readable by load().
    df = pd.DataFrame(records, columns=_CSV_COLUMNS)
    df.to_csv(dest_file, index=False)
    print(f" Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
    return dest_file


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """
    Upsert a finance CSV into the ``school_finance`` table.

    Args:
        path: CSV to load. If None, the last ``fbit_*.csv`` (sorted by name)
            in the finance directory is used.
        data_dir: Optional data root used to locate the finance directory
            when *path* is None. Defaults to DEST_DIR.

    Returns:
        Dict with ``inserted``, ``updated`` and ``skipped`` counts. Every
        upserted row is counted under ``inserted``; ``updated`` and
        ``skipped`` are always 0 because the upsert does not report which
        rows already existed.

    Raises:
        FileNotFoundError: if *path* is None and no finance CSV exists.
    """
    if path is None:
        dest = _finance_dir(data_dir)
        files = sorted(dest.glob("fbit_*.csv"))
        if not files:
            raise FileNotFoundError(f"No finance CSV found in {dest}")
        path = files[-1]

    print(f" Finance: loading {path} ...")
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A header-less file is what an older download with zero records
        # leaves behind; treat it as "nothing to load".
        df = pd.DataFrame(columns=_CSV_COLUMNS)

    # Drop rows whose URN is not numeric before casting to int.
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    inserted = 0
    with get_session() as session:
        for row in df.to_dict("records"):
            params = {"urn": int(row["urn"]), "year": int(row["year"])}
            for param, column in _OPTIONAL_FLOAT_PARAMS.items():
                params[param] = _optional_float(row.get(column))
            session.execute(_UPSERT_SQL, params)
            inserted += 1
            if inserted % 2000 == 0:
                session.flush()

    print(f" Finance: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}


def main() -> None:
    """Command-line entry point: download and/or load FBIT finance data."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    if args.action in ("download", "all"):
        download(args.data_dir)
    if args.action in ("load", "all"):
        load(data_dir=args.data_dir)


if __name__ == "__main__":
    main()