# school_compare/integrator/scripts/sources/sen_detail.py

"""
SEN (Special Educational Needs) primary need type breakdown.

Source: EES publication "special-educational-needs-in-england"
Update: Annual (September)
"""
import argparse
import math
import re
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv

# Default directory for SEN CSVs; download() and load() use it when no
# data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
# Publication identifier passed to get_latest_csv_url() by download().
PUBLICATION_SLUG = "special-educational-needs-in-england"

# Cell tokens that _parse_pct() converts to None. Cells are compared after
# upper-casing and "%" removal, so entries here must be upper-case; "" covers
# blank cells.
# NOTE(review): presumably DfE suppression / not-available markers — confirm.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

# One row per primary need type:
#   (sen_detail column, DfE abbreviated code, alternative header naming)
_NEED_COLUMNS = (
    ("primary_need_speech_pct", "PT_SPEECH", "SLCN_PCT"),        # SLCN
    ("primary_need_autism_pct", "PT_ASD", "ASD_PCT"),            # ASD
    ("primary_need_mld_pct", "PT_MLD", "MLD_PCT"),               # Moderate learning difficulty
    ("primary_need_spld_pct", "PT_SPLD", "SPLD_PCT"),            # Specific learning difficulty
    ("primary_need_semh_pct", "PT_SEMH", "SEMH_PCT"),            # Social, emotional, mental health
    ("primary_need_physical_pct", "PT_PHYSICAL", "PHYSICAL_PCT"),  # Physical/sensory
    ("primary_need_other_pct", "PT_OTHER", "OTHER_PCT"),
)

# Source header -> sen_detail column name, applied with DataFrame.rename()
# in load(). Key order: identifiers, then abbreviated codes, then the
# alternative naming.
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    **{code: column for column, code, _alt in _NEED_COLUMNS},
    **{alt: column for column, _code, alt in _NEED_COLUMNS},
}


def download(data_dir: Path | None = None) -> Path:
    """Download the latest SEN CSV into the SEN download directory.

    The CSV URL is looked up via ``get_latest_csv_url`` with
    ``keyword="school"`` first; if that yields nothing, the lookup is
    retried without a keyword.

    Args:
        data_dir: Optional data root. When given, files are stored under
            ``data_dir / "supplementary" / "sen_detail"``; otherwise
            ``DEST_DIR`` is used. The directory is created if missing.

    Returns:
        Whatever ``download_csv`` returns for the target path (annotated as
        the path of the downloaded file).

    Raises:
        RuntimeError: neither lookup produced a URL.
    """
    dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = (
        get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
        or get_latest_csv_url(PUBLICATION_SLUG)
    )
    if not url:
        raise RuntimeError("Could not find CSV URL for SEN publication")

    # Last path segment without the query string; empty when the URL ends in "/".
    filename = url.split("/")[-1].split("?")[0] or "sen_latest.csv"
    # load() discovers files with glob("*.csv"), so a download whose URL does
    # not end in ".csv" (e.g. an API endpoint ending in ".../csv") would
    # otherwise never be picked up.
    if not filename.lower().endswith(".csv"):
        filename += ".csv"
    return download_csv(url, dest / filename)


def _parse_pct(val) -> float | None:
    """Convert a raw CSV cell into a percentage value, or ``None``.

    Args:
        val: Cell value as read by pandas (number, string or missing).

    Returns:
        The numeric value as a float, or ``None`` when the cell is missing,
        matches one of ``NULL_VALUES`` (case-insensitively, after removing
        any ``%``), cannot be parsed as a number, or parses to a non-finite
        value.
    """
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace("%", "")
    if s in NULL_VALUES:
        return None
    try:
        parsed = float(s)
    except ValueError:
        return None
    # float() also accepts spellings such as "NAN", "INF" and "INFINITY";
    # none of those is a usable percentage, so treat them as missing.
    return parsed if math.isfinite(parsed) else None


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert per-school SEN primary-need percentages into ``sen_detail``.

    Each row's year comes from its ``year`` column when present and
    non-null; otherwise from the first ``20xx`` match in the file name.
    Rows for which neither source yields a year are skipped.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``*.csv`` in the SEN download directory is used.
        data_dir: Optional data root; the download directory is then
            ``data_dir / "supplementary" / "sen_detail"`` instead of
            ``DEST_DIR``. Only consulted when ``path`` is omitted.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}``. ``n`` counts the
        upsert statements executed (fresh inserts and conflict updates are
        not distinguished). ``m`` counts rows left out because they had no
        numeric URN or no resolvable year.

    Raises:
        FileNotFoundError: ``path`` was omitted and no CSV exists in the
            download directory.
        ValueError: the file has no URN column after header mapping.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No SEN CSV found in {dest}")
        path = files[-1]
    # Accept plain strings too; ``path.stem`` is needed further down.
    path = Path(path)

    print(f"  SEN Detail: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # A UTF-8 byte-order mark read as latin-1 becomes three stray characters
    # glued to the first header, which would stop e.g. "URN" from matching.
    bom_as_latin1 = "\xef\xbb\xbf"
    df.columns = [
        col.removeprefix(bom_as_latin1) if isinstance(col, str) else col
        for col in df.columns
    ]
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    rows_in_file = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # .copy() so the integer cast below writes to an independent frame
    # rather than to a slice of the original.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = rows_in_file - len(df)

    # Fallback year taken from the file name, for rows without their own.
    year = None
    m = re.search(r"20(\d{2})", path.stem)
    if m:
        year = int("20" + m.group(1))

    # Bind-parameter name -> DataFrame column holding that percentage.
    pct_columns = {
        "speech": "primary_need_speech_pct",
        "autism": "primary_need_autism_pct",
        "mld": "primary_need_mld_pct",
        "spld": "primary_need_spld_pct",
        "semh": "primary_need_semh_pct",
        "physical": "primary_need_physical_pct",
        "other": "primary_need_other_pct",
    }
    # Only materialise the columns that are actually read below; columns the
    # file lacks are simply absent and resolve to None via dict.get().
    wanted = [c for c in ("urn", "year", *pct_columns.values()) if c in df.columns]
    records = df[wanted].to_dict("records")

    from sqlalchemy import text

    # Built once; the statement is identical for every row.
    upsert = text("""
                    INSERT INTO sen_detail
                        (urn, year, primary_need_speech_pct, primary_need_autism_pct,
                         primary_need_mld_pct, primary_need_spld_pct, primary_need_semh_pct,
                         primary_need_physical_pct, primary_need_other_pct)
                    VALUES (:urn, :year, :speech, :autism, :mld, :spld, :semh, :physical, :other)
                    ON CONFLICT (urn, year) DO UPDATE SET
                        primary_need_speech_pct   = EXCLUDED.primary_need_speech_pct,
                        primary_need_autism_pct   = EXCLUDED.primary_need_autism_pct,
                        primary_need_mld_pct      = EXCLUDED.primary_need_mld_pct,
                        primary_need_spld_pct     = EXCLUDED.primary_need_spld_pct,
                        primary_need_semh_pct     = EXCLUDED.primary_need_semh_pct,
                        primary_need_physical_pct = EXCLUDED.primary_need_physical_pct,
                        primary_need_other_pct    = EXCLUDED.primary_need_other_pct
                """)

    inserted = 0
    with get_session() as session:
        for record in records:
            raw_year = record.get("year")
            row_year = int(raw_year) if pd.notna(raw_year) else year
            if not row_year:
                skipped += 1
                continue

            params = {"urn": int(record["urn"]), "year": row_year}
            for bind_name, column in pct_columns.items():
                params[bind_name] = _parse_pct(record.get(column))

            session.execute(upsert, params)
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f"  SEN Detail: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    # CLI entry point: --action selects download, load, or both (default);
    # --data-dir overrides the default data root.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    # Hand the freshly downloaded file straight to load(). Without an
    # explicit path, load() picks the lexicographically last CSV in the
    # directory, which is not necessarily the one just fetched.
    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        load(
            path=Path(downloaded) if downloaded else None,
            data_dir=args.data_dir,
        )