# integrator/scripts/sources/finance.py

"""
FBIT (Financial Benchmarking and Insights Tool) financial data loader.

Source: https://schools-financial-benchmarking.service.gov.uk/api/
Update: Annual (December — data for the prior financial year)
"""
import argparse
import sys
import time
from pathlib import Path

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

# Default directory for the fbit_<year>.csv files: download() writes here and
# load() reads from here when no explicit data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "finance"
# URL prefix for the per-school requests issued by download().
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
# Pause applied after every API request in download().
RATE_LIMIT_DELAY = 0.1   # seconds between requests


# Column order of the CSV written by download(); passing it to DataFrame
# guarantees a header row even when no records were collected.
_FBIT_CSV_COLUMNS = [
    "urn",
    "year",
    "per_pupil_spend",
    "staff_cost_pct",
    "teacher_cost_pct",
    "support_staff_cost_pct",
    "premises_cost_pct",
]


def _per_pupil_spend(payload: dict) -> float | None:
    """Return totalExpenditure / numberOfPupils rounded to 2 decimal places.

    Returns None when either figure is missing, the pupil count is zero, or
    the values cannot be divided (e.g. the API returned strings).
    """
    expenditure = payload.get("totalExpenditure")
    pupils = payload.get("numberOfPupils")
    if expenditure is None or not pupils:
        return None
    try:
        return round(expenditure / pupils, 2)
    except (TypeError, ZeroDivisionError):
        return None


def _fbit_record(urn, year: int, payload: dict) -> dict:
    """Map one FBIT API payload onto the CSV row layout."""
    return {
        "urn": urn,
        "year": year,
        "per_pupil_spend": _per_pupil_spend(payload),
        "staff_cost_pct": payload.get("staffCostPercent"),
        "teacher_cost_pct": payload.get("teachingStaffCostPercent"),
        "support_staff_cost_pct": payload.get("educationSupportStaffCostPercent"),
        "premises_cost_pct": payload.get("premisesStaffCostPercent"),
    }


def download(data_dir: Path | None = None) -> Path:
    """
    Fetch per-URN financial data from the FBIT API and save it as a CSV.

    Every URN in the ``schools`` table is requested individually from
    ``{API_BASE}/schoolFinancialDataObject/{urn}``, with a pause of
    RATE_LIMIT_DELAY seconds after each request.

    Args:
        data_dir: Optional data root; the CSV goes to
            ``data_dir / "supplementary" / "finance"``. Falls back to
            DEST_DIR when omitted.

    Returns:
        Path of ``fbit_<year>.csv``. If that file already exists it is
        returned as-is and nothing is fetched.

    Raises:
        RuntimeError: if no record could be fetched and at least one request
            failed. No file is written in that case, so a later run retries
            instead of finding an empty file and skipping the download.
    """
    from datetime import date

    dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # The year label is derived from the system clock (previous calendar
    # year), not from anything the API returns.
    year = date.today().year - 1
    dest_file = dest / f"fbit_{year}.csv"

    if dest_file.exists():
        print(f"  Finance: {dest_file.name} already exists, skipping download.")
        return dest_file

    # Get all URNs from the database
    with get_session() as session:
        from sqlalchemy import text
        rows = session.execute(text("SELECT urn FROM schools")).fetchall()
    urns = [r[0] for r in rows]
    print(f"  Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")

    records = []
    errors = 0
    # One Session for the whole run so the HTTP connection is reused.
    with requests.Session() as http:
        for i, urn in enumerate(urns):
            if i % 500 == 0:
                print(f"    {i}/{len(urns)} ...")
            try:
                resp = http.get(
                    f"{API_BASE}/schoolFinancialDataObject/{urn}",
                    timeout=10,
                )
                if resp.status_code == 200:
                    data = resp.json()
                    if isinstance(data, dict):
                        if data:
                            records.append(_fbit_record(urn, year, data))
                    elif data:
                        # Non-empty payload that is not a JSON object.
                        errors += 1
                elif resp.status_code not in (404, 400):
                    # 404/400 are skipped without being counted as errors.
                    errors += 1
            except (requests.RequestException, ValueError):
                # Network/timeout failures and undecodable JSON bodies.
                errors += 1

            time.sleep(RATE_LIMIT_DELAY)

    if not records and errors:
        raise RuntimeError(
            f"Finance: all FBIT requests failed ({errors} errors); "
            f"{dest_file.name} was not written"
        )

    df = pd.DataFrame(records, columns=_FBIT_CSV_COLUMNS)
    # Write to a temporary name and rename, so an interrupted write never
    # leaves a partial file that the exists() check above would accept.
    tmp_file = dest_file.with_name(dest_file.name + ".tmp")
    df.to_csv(tmp_file, index=False)
    tmp_file.replace(dest_file)
    print(f"  Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
    return dest_file


def _optional_float(value) -> float | None:
    """Convert *value* to float, mapping missing values (None/NaN) to None."""
    return float(value) if pd.notna(value) else None


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """
    Upsert a finance CSV into the ``school_finance`` table.

    Args:
        path: CSV file to load. When omitted, the last ``fbit_*.csv`` (in
            sorted filename order) from the finance directory is used.
        data_dir: Optional data root used to locate the finance directory
            (``data_dir / "supplementary" / "finance"``) when ``path`` is not
            given. Falls back to DEST_DIR.

    Returns:
        Dict with ``inserted`` (rows sent to the upsert), ``updated``
        (always 0; inserts and updates are not distinguished) and ``skipped``
        (rows dropped because ``urn`` or ``year`` was missing or non-numeric).

    Raises:
        FileNotFoundError: if ``path`` is omitted and no finance CSV exists.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
        files = sorted(dest.glob("fbit_*.csv"))
        if not files:
            raise FileNotFoundError(f"No finance CSV found in {dest}")
        path = files[-1]

    print(f"  Finance: loading {path} ...")
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A CSV with no header and no rows: nothing to load.
        print("  Finance: upserted 0 records")
        return {"inserted": 0, "updated": 0, "skipped": 0}

    # Both conflict-key columns must be usable integers; rows where either is
    # missing or non-numeric are skipped rather than aborting the whole load.
    total_rows = len(df)
    for key_column in ("urn", "year"):
        df[key_column] = pd.to_numeric(df[key_column], errors="coerce")
    df = df.dropna(subset=["urn", "year"])
    skipped = total_rows - len(df)

    params = [
        {
            "urn": int(rec["urn"]),
            "year": int(rec["year"]),
            "per_pupil": _optional_float(rec.get("per_pupil_spend")),
            "staff": _optional_float(rec.get("staff_cost_pct")),
            "teacher": _optional_float(rec.get("teacher_cost_pct")),
            "support": _optional_float(rec.get("support_staff_cost_pct")),
            "premises": _optional_float(rec.get("premises_cost_pct")),
        }
        for rec in df.to_dict("records")
    ]

    from sqlalchemy import text

    # Built once; the statement is identical for every row.
    upsert = text("""
        INSERT INTO school_finance
            (urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
             support_staff_cost_pct, premises_cost_pct)
        VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
        ON CONFLICT (urn, year) DO UPDATE SET
            per_pupil_spend        = EXCLUDED.per_pupil_spend,
            staff_cost_pct         = EXCLUDED.staff_cost_pct,
            teacher_cost_pct       = EXCLUDED.teacher_cost_pct,
            support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
            premises_cost_pct      = EXCLUDED.premises_cost_pct
    """)

    batch_size = 2000
    inserted = 0
    with get_session() as session:
        # Passing a list of parameter dicts runs the statement in
        # executemany mode, one batch at a time.
        for start in range(0, len(params), batch_size):
            batch = params[start:start + batch_size]
            session.execute(upsert, batch)
            inserted += len(batch)

    print(f"  Finance: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    # Command-line entry point: --action selects which of the two steps to
    # run ("all" runs download followed by load); --data-dir is forwarded to
    # both steps.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # argparse restricts --action to the three choices, so each step runs
    # unless the other step was requested on its own.
    wants_download = opts.action != "load"
    wants_load = opts.action != "download"

    if wants_download:
        download(opts.data_dir)
    if wants_load:
        load(data_dir=opts.data_dir)
feat(data): integrate 9 UK government data sources via Kestra Adds a full data integration pipeline for enriching school profiles with supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT. Backend: - Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections, ofsted_parent_view, school_census, admissions, sen_detail, phonics, school_deprivation, school_finance) plus GIAS columns on schools - Expose all supplementary data via GET /api/schools/{urn} - Enrich school list responses with ofsted_grade + ofsted_date Integrator (new service): - FastAPI HTTP microservice; Kestra calls POST /run/{source} - 9 source modules: ofsted, gias, parent_view, census, admissions, sen_detail, phonics, idaci, finance - 9 Kestra flow YAMLs with scheduled triggers and 3× retry Frontend: - SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate) - SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation Context, Finances - types.ts: 8 new interfaces + extended School/SchoolDetailsResponse Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-24 11:44:04 +00:00			`"""`
			`FBIT (Financial Benchmarking and Insights Tool) financial data loader.`

			`Source: https://schools-financial-benchmarking.service.gov.uk/api/`
			`Update: Annual (December — data for the prior financial year)`
			`"""`
			`import argparse`
			`import sys`
			`import time`
			`from pathlib import Path`

			`import pandas as pd`
			`import requests`

			`sys.path.insert(0, str(Path(__file__).parent.parent))`
			`from config import SUPPLEMENTARY_DIR`
			`from db import get_session`

			`DEST_DIR = SUPPLEMENTARY_DIR / "finance"`
			`API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"`
			`RATE_LIMIT_DELAY = 0.1 # seconds between requests`


			`def download(data_dir: Path \| None = None) -> Path:`
			`"""`
			`Fetch per-URN financial data from FBIT API and save as CSV.`
			`Batches all school URNs from the database.`
			`"""`
			`dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR`
			`dest.mkdir(parents=True, exist_ok=True)`

			`# Determine year from API (use current year minus 1 for completed financials)`
			`from datetime import date`
			`year = date.today().year - 1`
			`dest_file = dest / f"fbit_{year}.csv"`

			`if dest_file.exists():`
			`print(f" Finance: {dest_file.name} already exists, skipping download.")`
			`return dest_file`

			`# Get all URNs from the database`
			`with get_session() as session:`
			`from sqlalchemy import text`
			`rows = session.execute(text("SELECT urn FROM schools")).fetchall()`
			`urns = [r[0] for r in rows]`
			`print(f" Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")`

			`records = []`
			`errors = 0`
			`for i, urn in enumerate(urns):`
			`if i % 500 == 0:`
			`print(f" {i}/{len(urns)} ...")`
			`try:`
			`resp = requests.get(`
			`f"{API_BASE}/schoolFinancialDataObject/{urn}",`
			`timeout=10,`
			`)`
			`if resp.status_code == 200:`
			`data = resp.json()`
			`if data:`
			`records.append({`
			`"urn": urn,`
			`"year": year,`
			`"per_pupil_spend": data.get("totalExpenditure") and`
			`data.get("numberOfPupils") and`
			`round(data["totalExpenditure"] / data["numberOfPupils"], 2),`
			`"staff_cost_pct": data.get("staffCostPercent"),`
			`"teacher_cost_pct": data.get("teachingStaffCostPercent"),`
			`"support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),`
			`"premises_cost_pct": data.get("premisesStaffCostPercent"),`
			`})`
			`elif resp.status_code not in (404, 400):`
			`errors += 1`
			`except Exception:`
			`errors += 1`

			`time.sleep(RATE_LIMIT_DELAY)`

			`df = pd.DataFrame(records)`
			`df.to_csv(dest_file, index=False)`
			`print(f" Finance: saved {len(records)} records to {dest_file} ({errors} errors)")`
			`return dest_file`


			`def load(path: Path \| None = None, data_dir: Path \| None = None) -> dict:`
			`if path is None:`
			`dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR`
			`files = sorted(dest.glob("fbit_*.csv"))`
			`if not files:`
			`raise FileNotFoundError(f"No finance CSV found in {dest}")`
			`path = files[-1]`

			`print(f" Finance: loading {path} ...")`
			`df = pd.read_csv(path)`

			`df["urn"] = pd.to_numeric(df["urn"], errors="coerce")`
			`df = df.dropna(subset=["urn"])`
			`df["urn"] = df["urn"].astype(int)`

			`inserted = 0`
			`with get_session() as session:`
			`from sqlalchemy import text`
			`for _, row in df.iterrows():`
			`session.execute(`
			`text("""`
			`INSERT INTO school_finance`
			`(urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,`
			`support_staff_cost_pct, premises_cost_pct)`
			`VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)`
			`ON CONFLICT (urn, year) DO UPDATE SET`
			`per_pupil_spend = EXCLUDED.per_pupil_spend,`
			`staff_cost_pct = EXCLUDED.staff_cost_pct,`
			`teacher_cost_pct = EXCLUDED.teacher_cost_pct,`
			`support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,`
			`premises_cost_pct = EXCLUDED.premises_cost_pct`
			`"""),`
			`{`
			`"urn": int(row["urn"]),`
			`"year": int(row["year"]),`
			`"per_pupil": float(row["per_pupil_spend"]) if pd.notna(row.get("per_pupil_spend")) else None,`
			`"staff": float(row["staff_cost_pct"]) if pd.notna(row.get("staff_cost_pct")) else None,`
			`"teacher": float(row["teacher_cost_pct"]) if pd.notna(row.get("teacher_cost_pct")) else None,`
			`"support": float(row["support_staff_cost_pct"]) if pd.notna(row.get("support_staff_cost_pct")) else None,`
			`"premises": float(row["premises_cost_pct"]) if pd.notna(row.get("premises_cost_pct")) else None,`
			`},`
			`)`
			`inserted += 1`
			`if inserted % 2000 == 0:`
			`session.flush()`

			`print(f" Finance: upserted {inserted} records")`
			`return {"inserted": inserted, "updated": 0, "skipped": 0}`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--action", choices=["download", "load", "all"], default="all")`
			`parser.add_argument("--data-dir", type=Path, default=None)`
			`args = parser.parse_args()`
			`if args.action in ("download", "all"):`
			`download(args.data_dir)`
			`if args.action in ("load", "all"):`
			`load(data_dir=args.data_dir)`