Files
school_compare/integrator/scripts/sources/sen_detail.py
Tudor dd49ef28b2
Some checks failed
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled
feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00

151 lines
5.7 KiB
Python

"""
SEN (Special Educational Needs) primary need type breakdown.
Source: EES publication "special-educational-needs-in-england"
Update: Annual (September)
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
# Default directory for this source's CSV files; download() and load() use it
# whenever no explicit data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
# Publication slug handed to get_latest_csv_url() to locate the CSV to fetch.
PUBLICATION_SLUG = "special-educational-needs-in-england"
# Cell contents (compared after upper-casing) that _parse_pct treats as
# "no value" instead of trying to convert them to a number.
NULL_VALUES = {
    "",
    "LOW",
    "NA",
    "NE",
    "NEW",
    "NP",
    "SUPP",
    "X",
}

# School identifier / year headers in their known spellings.
_KEY_HEADERS = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
}

# Primary need types — DfE abbreviated codes
_DFE_CODE_HEADERS = {
    "PT_SPEECH": "primary_need_speech_pct",      # SLCN
    "PT_ASD": "primary_need_autism_pct",         # ASD
    "PT_MLD": "primary_need_mld_pct",            # Moderate learning difficulty
    "PT_SPLD": "primary_need_spld_pct",          # Specific learning difficulty
    "PT_SEMH": "primary_need_semh_pct",          # Social, emotional, mental health
    "PT_PHYSICAL": "primary_need_physical_pct",  # Physical/sensory
    "PT_OTHER": "primary_need_other_pct",
}

# Alternative naming
_ALTERNATIVE_HEADERS = {
    "SLCN_PCT": "primary_need_speech_pct",
    "ASD_PCT": "primary_need_autism_pct",
    "MLD_PCT": "primary_need_mld_pct",
    "SPLD_PCT": "primary_need_spld_pct",
    "SEMH_PCT": "primary_need_semh_pct",
    "PHYSICAL_PCT": "primary_need_physical_pct",
    "OTHER_PCT": "primary_need_other_pct",
}

# Source CSV header -> sen_detail column name, applied with DataFrame.rename.
COLUMN_MAP = {
    **_KEY_HEADERS,
    **_DFE_CODE_HEADERS,
    **_ALTERNATIVE_HEADERS,
}
def download(data_dir: Path | None = None) -> Path:
    """Download the latest SEN publication CSV into the local data directory.

    Args:
        data_dir: Optional data root. When given, the file is stored under
            ``data_dir / "supplementary" / "sen_detail"``; otherwise under
            ``DEST_DIR``. The directory is created if it does not exist.

    Returns:
        Whatever ``download_csv`` returns for the target location.

    Raises:
        RuntimeError: If no CSV URL can be resolved for the publication.
    """
    if data_dir:
        target_dir = data_dir / "supplementary" / "sen_detail"
    else:
        target_dir = DEST_DIR
    target_dir.mkdir(parents=True, exist_ok=True)

    # Look for a CSV matching the "school" keyword first; only fall back to an
    # unfiltered lookup when that yields nothing.
    csv_url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school") or get_latest_csv_url(
        PUBLICATION_SLUG
    )
    if not csv_url:
        raise RuntimeError("Could not find CSV URL for SEN publication")

    # File name = last path segment of the URL with any query string removed.
    last_segment = csv_url.rsplit("/", 1)[-1]
    basename = last_segment.partition("?")[0]
    if not basename:
        basename = "sen_latest.csv"
    return download_csv(csv_url, target_dir / basename)
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert per-school SEN primary-need percentages from a CSV into ``sen_detail``.

    Args:
        path: CSV file to load. When omitted, the last ``*.csv`` (sorted by
            path) in the SEN data directory is used.
        data_dir: Optional data root used to locate that directory
            (``data_dir / "supplementary" / "sen_detail"``); ``DEST_DIR`` is
            used when it is not given. Ignored when ``path`` is supplied.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}``. ``n`` counts every
        row sent to the upsert statement (fresh inserts and conflict updates
        are not told apart, so ``"updated"`` stays 0). ``m`` counts rows left
        out because the URN was not numeric or no year could be determined.

    Raises:
        FileNotFoundError: If no path is given and the directory has no CSV.
        ValueError: If the file has no URN column after header renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No SEN CSV found in {dest}")
        path = files[-1]

    print(f" SEN Detail: loading {path} ...")
    # Decode as UTF-8 first ("utf-8-sig" also drops a leading BOM, which
    # latin-1 would glue onto the first header name and hide the URN column).
    # Fall back to latin-1 only for files that are not valid UTF-8.
    try:
        df = pd.read_csv(path, encoding="utf-8-sig", low_memory=False)
    except UnicodeDecodeError:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    df.rename(columns=COLUMN_MAP, inplace=True)
    # COLUMN_MAP maps several source headers onto the same target name; if a
    # file carries more than one of them, keep the first so that df["urn"]
    # and friends stay single columns.
    duplicate_mask = df.columns.duplicated()
    if duplicate_mask.any():
        df = df.loc[:, ~duplicate_mask].copy()

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    rows_read = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)
    skipped = rows_read - len(df)  # rows without a usable URN

    # Year inferred from the file name; used for rows with no year of their own.
    year = None
    m = re.search(r"20(\d{2})", path.stem)
    if m:
        year = int("20" + m.group(1))

    # Loop invariants: column presence, parameter -> column mapping, statement.
    has_year_column = "year" in df.columns
    need_columns = {
        "speech": "primary_need_speech_pct",
        "autism": "primary_need_autism_pct",
        "mld": "primary_need_mld_pct",
        "spld": "primary_need_spld_pct",
        "semh": "primary_need_semh_pct",
        "physical": "primary_need_physical_pct",
        "other": "primary_need_other_pct",
    }

    from sqlalchemy import text

    upsert = text("""
        INSERT INTO sen_detail
        (urn, year, primary_need_speech_pct, primary_need_autism_pct,
        primary_need_mld_pct, primary_need_spld_pct, primary_need_semh_pct,
        primary_need_physical_pct, primary_need_other_pct)
        VALUES (:urn, :year, :speech, :autism, :mld, :spld, :semh, :physical, :other)
        ON CONFLICT (urn, year) DO UPDATE SET
        primary_need_speech_pct = EXCLUDED.primary_need_speech_pct,
        primary_need_autism_pct = EXCLUDED.primary_need_autism_pct,
        primary_need_mld_pct = EXCLUDED.primary_need_mld_pct,
        primary_need_spld_pct = EXCLUDED.primary_need_spld_pct,
        primary_need_semh_pct = EXCLUDED.primary_need_semh_pct,
        primary_need_physical_pct = EXCLUDED.primary_need_physical_pct,
        primary_need_other_pct = EXCLUDED.primary_need_other_pct
    """)

    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            if has_year_column and pd.notna(row["year"]):
                row_year = int(row["year"])
            else:
                row_year = year
            if not row_year:
                # Neither the row nor the file name supplies a year.
                skipped += 1
                continue

            params = {"urn": int(row["urn"]), "year": row_year}
            for param, column in need_columns.items():
                # Columns absent from this file come back as None.
                params[param] = _parse_pct(row.get(column))
            session.execute(upsert, params)

            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" SEN Detail: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}
if __name__ == "__main__":
    # CLI entry point: --action selects download, load, or both (default);
    # --data-dir optionally overrides the data root passed to each step.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    options = cli.parse_args()

    # "all" runs both steps, download first.
    wants_download = options.action in ("download", "all")
    wants_load = options.action in ("load", "all")

    if wants_download:
        download(options.data_dir)
    if wants_load:
        load(data_dir=options.data_dir)