144 lines
5.7 KiB
Python
144 lines
5.7 KiB
Python
|
|
"""
|
||
|
|
FBIT (Financial Benchmarking and Insights Tool) financial data loader.
|
||
|
|
|
||
|
|
Source: https://schools-financial-benchmarking.service.gov.uk/api/
|
||
|
|
Update: Annual (December — data for the prior financial year)
|
||
|
|
"""
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
import time
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pandas as pd
|
||
|
|
import requests
|
||
|
|
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
|
from config import SUPPLEMENTARY_DIR
|
||
|
|
from db import get_session
|
||
|
|
|
||
|
|
# Pause inserted after every API request, in seconds.
RATE_LIMIT_DELAY = 0.1

# Base URL that the per-school endpoint paths are appended to.
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"

# Default folder for finance CSVs when the caller supplies no data_dir.
DEST_DIR = SUPPLEMENTARY_DIR / "finance"
|
||
|
|
|
||
|
|
|
||
|
|
def download(data_dir: Path | None = None) -> Path:
|
||
|
|
"""
|
||
|
|
Fetch per-URN financial data from FBIT API and save as CSV.
|
||
|
|
Batches all school URNs from the database.
|
||
|
|
"""
|
||
|
|
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
|
||
|
|
dest.mkdir(parents=True, exist_ok=True)
|
||
|
|
|
||
|
|
# Determine year from API (use current year minus 1 for completed financials)
|
||
|
|
from datetime import date
|
||
|
|
year = date.today().year - 1
|
||
|
|
dest_file = dest / f"fbit_{year}.csv"
|
||
|
|
|
||
|
|
if dest_file.exists():
|
||
|
|
print(f" Finance: {dest_file.name} already exists, skipping download.")
|
||
|
|
return dest_file
|
||
|
|
|
||
|
|
# Get all URNs from the database
|
||
|
|
with get_session() as session:
|
||
|
|
from sqlalchemy import text
|
||
|
|
rows = session.execute(text("SELECT urn FROM schools")).fetchall()
|
||
|
|
urns = [r[0] for r in rows]
|
||
|
|
print(f" Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")
|
||
|
|
|
||
|
|
records = []
|
||
|
|
errors = 0
|
||
|
|
for i, urn in enumerate(urns):
|
||
|
|
if i % 500 == 0:
|
||
|
|
print(f" {i}/{len(urns)} ...")
|
||
|
|
try:
|
||
|
|
resp = requests.get(
|
||
|
|
f"{API_BASE}/schoolFinancialDataObject/{urn}",
|
||
|
|
timeout=10,
|
||
|
|
)
|
||
|
|
if resp.status_code == 200:
|
||
|
|
data = resp.json()
|
||
|
|
if data:
|
||
|
|
records.append({
|
||
|
|
"urn": urn,
|
||
|
|
"year": year,
|
||
|
|
"per_pupil_spend": data.get("totalExpenditure") and
|
||
|
|
data.get("numberOfPupils") and
|
||
|
|
round(data["totalExpenditure"] / data["numberOfPupils"], 2),
|
||
|
|
"staff_cost_pct": data.get("staffCostPercent"),
|
||
|
|
"teacher_cost_pct": data.get("teachingStaffCostPercent"),
|
||
|
|
"support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
|
||
|
|
"premises_cost_pct": data.get("premisesStaffCostPercent"),
|
||
|
|
})
|
||
|
|
elif resp.status_code not in (404, 400):
|
||
|
|
errors += 1
|
||
|
|
except Exception:
|
||
|
|
errors += 1
|
||
|
|
|
||
|
|
time.sleep(RATE_LIMIT_DELAY)
|
||
|
|
|
||
|
|
df = pd.DataFrame(records)
|
||
|
|
df.to_csv(dest_file, index=False)
|
||
|
|
print(f" Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
|
||
|
|
return dest_file
|
||
|
|
|
||
|
|
|
||
|
|
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
||
|
|
if path is None:
|
||
|
|
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
|
||
|
|
files = sorted(dest.glob("fbit_*.csv"))
|
||
|
|
if not files:
|
||
|
|
raise FileNotFoundError(f"No finance CSV found in {dest}")
|
||
|
|
path = files[-1]
|
||
|
|
|
||
|
|
print(f" Finance: loading {path} ...")
|
||
|
|
df = pd.read_csv(path)
|
||
|
|
|
||
|
|
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
|
||
|
|
df = df.dropna(subset=["urn"])
|
||
|
|
df["urn"] = df["urn"].astype(int)
|
||
|
|
|
||
|
|
inserted = 0
|
||
|
|
with get_session() as session:
|
||
|
|
from sqlalchemy import text
|
||
|
|
for _, row in df.iterrows():
|
||
|
|
session.execute(
|
||
|
|
text("""
|
||
|
|
INSERT INTO school_finance
|
||
|
|
(urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
|
||
|
|
support_staff_cost_pct, premises_cost_pct)
|
||
|
|
VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
|
||
|
|
ON CONFLICT (urn, year) DO UPDATE SET
|
||
|
|
per_pupil_spend = EXCLUDED.per_pupil_spend,
|
||
|
|
staff_cost_pct = EXCLUDED.staff_cost_pct,
|
||
|
|
teacher_cost_pct = EXCLUDED.teacher_cost_pct,
|
||
|
|
support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
|
||
|
|
premises_cost_pct = EXCLUDED.premises_cost_pct
|
||
|
|
"""),
|
||
|
|
{
|
||
|
|
"urn": int(row["urn"]),
|
||
|
|
"year": int(row["year"]),
|
||
|
|
"per_pupil": float(row["per_pupil_spend"]) if pd.notna(row.get("per_pupil_spend")) else None,
|
||
|
|
"staff": float(row["staff_cost_pct"]) if pd.notna(row.get("staff_cost_pct")) else None,
|
||
|
|
"teacher": float(row["teacher_cost_pct"]) if pd.notna(row.get("teacher_cost_pct")) else None,
|
||
|
|
"support": float(row["support_staff_cost_pct"]) if pd.notna(row.get("support_staff_cost_pct")) else None,
|
||
|
|
"premises": float(row["premises_cost_pct"]) if pd.notna(row.get("premises_cost_pct")) else None,
|
||
|
|
},
|
||
|
|
)
|
||
|
|
inserted += 1
|
||
|
|
if inserted % 2000 == 0:
|
||
|
|
session.flush()
|
||
|
|
|
||
|
|
print(f" Finance: upserted {inserted} records")
|
||
|
|
return {"inserted": inserted, "updated": 0, "skipped": 0}
|
||
|
|
|
||
|
|
|
||
|
|
if __name__ == "__main__":
    # Command-line entry point: pick which stage(s) of the loader to run.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # "all" expands to both stages; download always runs before load.
    stages = {"download", "load"} if opts.action == "all" else {opts.action}
    if "download" in stages:
        download(opts.data_dir)
    if "load" in stages:
        load(data_dir=opts.data_dir)
|