feat(data): integrate 9 UK government data sources via Kestra
Some checks failed
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled

Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 11:44:04 +00:00
parent c49593d4d6
commit dd49ef28b2
36 changed files with 2849 additions and 8 deletions

View File

@@ -0,0 +1,159 @@
"""
GIAS (Get Information About Schools) bulk CSV downloader and loader.
Source: https://get-information-schools.service.gov.uk/Downloads
Update: Daily; we refresh weekly.
Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
"""
import argparse
import sys
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# On-disk destination for downloaded GIAS CSV snapshots.
DEST_DIR = SUPPLEMENTARY_DIR / "gias"
# GIAS bulk download URL — date is injected at runtime
GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv"
# Maps GIAS CSV headers to our schools-table column names. The head_* parts
# are intermediate fields merged into headteacher_name during load();
# nursery_provision_raw is normalised to a boolean there as well.
COLUMN_MAP = {
    "URN": "urn",
    "SchoolWebsite": "website",
    "SchoolCapacity": "capacity",
    "TrustName": "trust_name",
    "TrustUID": "trust_uid",
    "Gender (name)": "gender",
    "NurseryProvision (name)": "nursery_provision_raw",
    "HeadTitle": "head_title",
    "HeadFirstName": "head_first",
    "HeadLastName": "head_last",
}
def download(data_dir: Path | None = None) -> Path:
    """Download the latest GIAS bulk CSV into the supplementary data dir.

    GIAS publishes a dated file daily, but today's file may not exist yet;
    on a 404 for today we fall back to yesterday's file. A file already on
    disk is never re-downloaded.

    Args:
        data_dir: Optional data-root override; defaults to DEST_DIR.

    Returns:
        Path to the downloaded (or already-present) CSV.

    Raises:
        requests.HTTPError: if neither today's nor yesterday's file is
            available for download.
    """
    dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Try today first, then yesterday (GIAS may not have published today yet).
    for offset in (0, 1):
        stamp = (date.today() - timedelta(days=offset)).strftime("%Y%m%d")
        dest_file = dest / f"gias_{stamp}.csv"
        if dest_file.exists():
            print(f" GIAS: {dest_file.name} already exists, skipping download.")
            return dest_file
        url = GIAS_URL_TEMPLATE.format(date=stamp)
        print(f" GIAS: downloading {url} ...")
        resp = requests.get(url, timeout=300, stream=True)
        if resp.status_code != 404:
            break
        # Release the streamed connection before retrying with the fallback date.
        resp.close()

    resp.raise_for_status()
    with open(dest_file, "wb") as f:
        for chunk in resp.iter_content(chunk_size=65536):
            f.write(chunk)
    print(f" GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a GIAS CSV and update matching rows in the schools table.

    GIAS data only enriches existing school rows (matched on URN); it never
    inserts new schools, so "inserted" is always 0 in the returned counts.

    Args:
        path: Explicit CSV to load. When None, the newest gias_*.csv in the
            destination directory is used.
        data_dir: Optional data-root override (only consulted when path is None).

    Returns:
        dict with "inserted", "updated" and "skipped" counts.

    Raises:
        FileNotFoundError: if no GIAS CSV is present to load.
        ValueError: if the CSV has no URN column after renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        # Filenames embed YYYYMMDD, so the lexicographic max is the newest file.
        path = files[-1]
    print(f" GIAS: loading {path} ...")
    # GIAS ships latin-1 CSVs; low_memory=False avoids chunked mixed-dtype parsing.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)
    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
    # Drop rows whose URN is missing/non-numeric, then normalise to int.
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    def build_name(row):
        # Join title/first/last into one display name; None when all parts empty.
        parts = [
            str(row.get("head_title", "") or "").strip(),
            str(row.get("head_first", "") or "").strip(),
            str(row.get("head_last", "") or "").strip(),
        ]
        return " ".join(p for p in parts if p) or None

    df["headteacher_name"] = df.apply(build_name, axis=1)

    def parse_nursery(v):
        # GIAS values read like "Has Nursery Classes" / "No Nursery Classes".
        if pd.isna(v):
            return None
        return str(v).strip().lower().startswith("has")

    # The column may be absent from some extracts. Defaulting via an empty
    # Series (as before) would align to NaN for every row and feed NaN —
    # not None — to the DB driver; use an explicit None column instead.
    if "nursery_provision_raw" in df.columns:
        df["nursery_provision"] = df["nursery_provision_raw"].apply(parse_nursery)
    else:
        df["nursery_provision"] = None

    def clean_str(val):
        s = str(val).strip() if pd.notna(val) else None
        return s if s and s.lower() not in ("nan", "none", "") else None

    def clean_capacity(val):
        # Capacity can arrive as "1230", 1230 or 1230.0 (pandas promotes the
        # column to float when NaNs are present); the previous isdigit() check
        # rejected the float form and silently dropped every such capacity.
        if pd.isna(val):
            return None
        try:
            return int(float(val))
        except (TypeError, ValueError):
            return None

    updated = 0
    with get_session() as session:
        from sqlalchemy import text

        # Build the statement once; only the bound parameters vary per row.
        stmt = text("""
            UPDATE schools SET
                website = :website,
                headteacher_name = :headteacher_name,
                capacity = :capacity,
                trust_name = :trust_name,
                trust_uid = :trust_uid,
                gender = :gender,
                nursery_provision = :nursery_provision
            WHERE urn = :urn
        """)
        for _, row in df.iterrows():
            session.execute(
                stmt,
                {
                    "urn": int(row["urn"]),
                    "website": clean_str(row.get("website")),
                    "headteacher_name": row.get("headteacher_name"),
                    "capacity": clean_capacity(row.get("capacity")),
                    "trust_name": clean_str(row.get("trust_name")),
                    "trust_uid": clean_str(row.get("trust_uid")),
                    "gender": clean_str(row.get("gender")),
                    "nursery_provision": row.get("nursery_provision"),
                },
            )
            updated += 1
            if updated % 5000 == 0:
                session.flush()
                print(f" Updated {updated} schools...")
    print(f" GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": 0}
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()
    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # For "all", load exactly the file we just downloaded (the previous
        # code discarded the returned path and re-globbed for the newest CSV);
        # for "load" alone, downloaded is None and load() globs as before.
        load(path=downloaded, data_dir=args.data_dir)