"""
SEN (Special Educational Needs) primary need type breakdown.

Source: EES publication "special-educational-needs-in-england"
Update: Annual (September)
"""
import argparse
import math
import re
import sys
from pathlib import Path
from urllib.parse import urlsplit

import pandas as pd

# The project-local imports below live one directory up; make them importable
# when this file is executed directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))

from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv

DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
PUBLICATION_SLUG = "special-educational-needs-in-england"

# Cell values that _parse_pct treats as "no value" (compared after
# stripping, upper-casing and removing any "%").
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

# Source CSV header -> sen_detail column name.  Several source spellings map
# to the same target so that differently-labelled files load the same way.
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    # Primary need types — DfE abbreviated codes
    "PT_SPEECH": "primary_need_speech_pct",  # SLCN
    "PT_ASD": "primary_need_autism_pct",  # ASD
    "PT_MLD": "primary_need_mld_pct",  # Moderate learning difficulty
    "PT_SPLD": "primary_need_spld_pct",  # Specific learning difficulty
    "PT_SEMH": "primary_need_semh_pct",  # Social, emotional, mental health
    "PT_PHYSICAL": "primary_need_physical_pct",  # Physical/sensory
    "PT_OTHER": "primary_need_other_pct",
    # Alternative naming
    "SLCN_PCT": "primary_need_speech_pct",
    "ASD_PCT": "primary_need_autism_pct",
    "MLD_PCT": "primary_need_mld_pct",
    "SPLD_PCT": "primary_need_spld_pct",
    "SEMH_PCT": "primary_need_semh_pct",
    "PHYSICAL_PCT": "primary_need_physical_pct",
    "OTHER_PCT": "primary_need_other_pct",
}

# SQL bind-parameter name -> dataframe column that feeds it.  The order here
# drives both the column selection and the parameter dict built in load().
_PCT_PARAMS = {
    "speech": "primary_need_speech_pct",
    "autism": "primary_need_autism_pct",
    "mld": "primary_need_mld_pct",
    "spld": "primary_need_spld_pct",
    "semh": "primary_need_semh_pct",
    "physical": "primary_need_physical_pct",
    "other": "primary_need_other_pct",
}

_UPSERT_SQL = """
    INSERT INTO sen_detail
        (urn, year,
         primary_need_speech_pct, primary_need_autism_pct, primary_need_mld_pct,
         primary_need_spld_pct, primary_need_semh_pct, primary_need_physical_pct,
         primary_need_other_pct)
    VALUES
        (:urn, :year, :speech, :autism, :mld, :spld, :semh, :physical, :other)
    ON CONFLICT (urn, year) DO UPDATE SET
        primary_need_speech_pct = EXCLUDED.primary_need_speech_pct,
        primary_need_autism_pct = EXCLUDED.primary_need_autism_pct,
        primary_need_mld_pct = EXCLUDED.primary_need_mld_pct,
        primary_need_spld_pct = EXCLUDED.primary_need_spld_pct,
        primary_need_semh_pct = EXCLUDED.primary_need_semh_pct,
        primary_need_physical_pct = EXCLUDED.primary_need_physical_pct,
        primary_need_other_pct = EXCLUDED.primary_need_other_pct
"""


def _dest_dir(data_dir: Path | None) -> Path:
    """Return the directory SEN CSVs are stored in for the given data root."""
    return (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR


def download(data_dir: Path | None = None) -> Path:
    """Download the latest SEN CSV and return whatever ``download_csv`` returns.

    The URL is looked up with ``keyword="school"`` first and, if that yields
    nothing, again without a keyword.

    Args:
        data_dir: Optional data root; when omitted ``DEST_DIR`` is used.

    Raises:
        RuntimeError: If neither lookup produces a URL.
    """
    dest = _dest_dir(data_dir)
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        url = get_latest_csv_url(PUBLICATION_SLUG)
    if not url:
        raise RuntimeError("Could not find CSV URL for SEN publication")

    # Take the name from the URL *path* only, so a "/" inside the query
    # string or fragment cannot be mistaken for a path separator.
    filename = urlsplit(url).path.rsplit("/", 1)[-1] or "sen_latest.csv"
    # load() discovers files with "*.csv"; an extension-less name (e.g. an
    # opaque file id) would otherwise be downloaded but never found.
    if not Path(filename).suffix:
        filename += ".csv"
    return download_csv(url, dest / filename)


def _parse_pct(val) -> float | None:
    """Convert a raw percentage cell to a finite float, or None.

    Returns None for missing values, any marker in ``NULL_VALUES``, text
    that is not a number, and non-finite results (``nan`` / ``inf``), which
    ``float()`` would otherwise accept from tokens such as "NAN" or "INF".
    """
    if pd.isna(val):
        return None
    cleaned = str(val).strip().upper().replace("%", "")
    if cleaned in NULL_VALUES:
        return None
    try:
        number = float(cleaned)
    except ValueError:
        return None
    return number if math.isfinite(number) else None


def _resolve_year(raw, fallback: int | None) -> int | None:
    """Return the row's year as an int, or ``fallback`` when unusable.

    A missing cell and a cell that cannot be converted with ``int()`` are
    treated the same way, so one malformed value no longer aborts the load.
    """
    if pd.isna(raw):
        return fallback
    try:
        return int(raw)
    except (TypeError, ValueError, OverflowError):
        return fallback


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert SEN primary-need percentages from a CSV into ``sen_detail``.

    Args:
        path: CSV to load.  When omitted, the last ``*.csv`` (sorted by
            name) in the destination directory is used.
        data_dir: Optional data root used to locate that directory.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}`` where ``n`` is the
        number of upsert statements executed and ``m`` counts rows dropped
        for a non-numeric URN plus rows with no usable year.  ``updated``
        stays 0 because the upsert does not report which rows pre-existed.

    Raises:
        FileNotFoundError: No CSV is present and ``path`` was not given.
        ValueError: The file has no URN column after renaming.
    """
    from sqlalchemy import text

    if path is None:
        dest = _dest_dir(data_dir)
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No SEN CSV found in {dest}")
        path = files[-1]
    path = Path(path)  # tolerate a plain string; .stem is needed below

    print(f" SEN Detail: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df = df.rename(columns=COLUMN_MAP)
    # COLUMN_MAP maps several source headers to one target; if a file carries
    # more than one of them, keep the first so each column name is unique.
    df = df.loc[:, ~df.columns.duplicated()].copy()

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    rows_read = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = rows_read - len(df)

    # Year embedded in the file name, used when a row has no usable year.
    stem_match = re.search(r"20(\d{2})", path.stem)
    fallback_year = int("20" + stem_match.group(1)) if stem_match else None

    # reindex() yields exactly the columns needed, in a fixed order, filling
    # any that the file lacks with NaN (which _parse_pct/_resolve_year treat
    # as missing).
    wanted = ["urn", "year", *_PCT_PARAMS.values()]
    records = df.reindex(columns=wanted)

    upsert = text(_UPSERT_SQL)  # built once rather than once per row
    inserted = 0
    # NOTE(review): whether the work is committed on exit depends on
    # get_session(), which is defined outside this file.
    with get_session() as session:
        for urn, raw_year, *raw_pcts in records.itertuples(index=False, name=None):
            row_year = _resolve_year(raw_year, fallback_year)
            if not row_year:
                skipped += 1
                continue

            params = {"urn": int(urn), "year": row_year}
            for bind_name, raw in zip(_PCT_PARAMS, raw_pcts):
                params[bind_name] = _parse_pct(raw)
            session.execute(upsert, params)

            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" SEN Detail: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


def main() -> None:
    """Command-line entry point: download, load, or both (the default)."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    if args.action in ("download", "all"):
        download(args.data_dir)
    if args.action in ("load", "all"):
        load(data_dir=args.data_dir)


if __name__ == "__main__":
    main()