integrator/scripts/sources/census.py

"""
School Census (SPC) downloader and loader.

Source: EES publication "schools-pupils-and-their-characteristics"
Update: Annual (June)
Adds: class_size_avg, ethnicity breakdown by school
"""
import argparse
import math
import re
import sys
from pathlib import Path
from urllib.parse import urlsplit

import pandas as pd

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv

# Default directory for census CSVs; download() and load() use it when no
# data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "census"
# Publication slug passed to get_latest_csv_url() to locate the CSV.
PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"

# Cell values that _parse_pct() maps to None. They are compared after
# strip() + upper(), so every entry is upper-case.
# NOTE(review): presumably DfE suppression / not-applicable markers — confirm
# against the publication's data guidance.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

# Source CSV header -> column name read by load(). Several header spellings
# map to the same target; load() applies this mapping with DataFrame.rename.
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    # Class size
    "average_class_size": "class_size_avg",
    "AVCLAS": "class_size_avg",
    "avg_class_size": "class_size_avg",
    # Ethnicity — DfE uses ethnicity major group percentages
    "perc_white": "ethnicity_white_pct",
    "perc_asian": "ethnicity_asian_pct",
    "perc_black": "ethnicity_black_pct",
    "perc_mixed": "ethnicity_mixed_pct",
    "perc_other_ethnic": "ethnicity_other_pct",
    "PTWHITE": "ethnicity_white_pct",
    "PTASIAN": "ethnicity_asian_pct",
    "PTBLACK": "ethnicity_black_pct",
    "PTMIXED": "ethnicity_mixed_pct",
    "PTOTHER": "ethnicity_other_pct",
}


def download(data_dir: Path | None = None) -> Path:
    """Download the latest school-level census CSV.

    Args:
        data_dir: Optional data root. When given, the file is saved under
            ``data_dir / "supplementary" / "census"``; otherwise under
            ``DEST_DIR``. The directory is created if missing.

    Returns:
        The value returned by ``download_csv`` for the destination path.

    Raises:
        RuntimeError: ``get_latest_csv_url`` returned no URL for the
            publication.
    """
    dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        raise RuntimeError(
            f"Could not find CSV URL for census publication {PUBLICATION_SLUG!r}"
        )

    # Take the name from the URL *path* only, so a "/" inside the query string
    # or fragment cannot be mistaken for the file name.
    filename = urlsplit(url).path.rsplit("/", 1)[-1] or "census_latest.csv"
    # load() discovers files with a "*.csv" glob; a name without that suffix
    # would be downloaded but never picked up.
    if not filename.endswith(".csv"):
        filename += ".csv"
    return download_csv(url, dest / filename)


def _parse_pct(val) -> float | None:
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace("%", "")
    if s in NULL_VALUES:
        return None
    try:
        return float(s)
    except ValueError:
        return None


def _resolve_year(value, fallback: int | None) -> int | None:
    """Return *value* as an integer year, or *fallback* when it is unusable.

    A missing (None/NaN) or non-numeric value yields *fallback* instead of
    raising, so one malformed cell cannot abort the whole load.
    """
    if pd.isna(value):
        return fallback
    try:
        return int(float(value))
    except (TypeError, ValueError, OverflowError):
        return fallback


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert school-level census rows from a CSV into ``school_census``.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``*.csv`` in the census directory is used.
        data_dir: Optional data root; the census directory is then
            ``data_dir / "supplementary" / "census"`` instead of ``DEST_DIR``.
            Only consulted when ``path`` is omitted.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}``. ``n`` is the number
        of upsert statements executed (inserts and updates are not
        distinguished, so ``updated`` is always 0). ``m`` counts rows that
        were not loaded because the URN was not numeric or no year could be
        determined from the row or the file name.

    Raises:
        FileNotFoundError: ``path`` was omitted and the census directory
            holds no CSV.
        ValueError: the CSV has no URN column after header renaming.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No census CSV found in {dest}")
        path = files[-1]

    print(f"  Census: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    total_rows = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)
    # Rows without a numeric URN cannot be keyed, so they are never loaded.
    skipped = total_rows - len(df)

    # Year taken from the file name; used for rows with no usable year cell.
    year_match = re.search(r"20\d{2}", path.stem)
    fallback_year = int(year_match.group(0)) if year_match else None

    # Bind parameter name -> DataFrame column holding its raw value.
    value_columns = {
        "class_size_avg": "class_size_avg",
        "white": "ethnicity_white_pct",
        "asian": "ethnicity_asian_pct",
        "black": "ethnicity_black_pct",
        "mixed": "ethnicity_mixed_pct",
        "other": "ethnicity_other_pct",
    }
    # Iterate plain dicts of only the columns we read: far cheaper than
    # building a full-width Series per row with iterrows().
    wanted = ["urn", "year", *value_columns.values()]
    records = df[[col for col in wanted if col in df.columns]].to_dict("records")

    upsert = text("""
                    INSERT INTO school_census
                        (urn, year, class_size_avg,
                         ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,
                         ethnicity_mixed_pct, ethnicity_other_pct)
                    VALUES (:urn, :year, :class_size_avg,
                            :white, :asian, :black, :mixed, :other)
                    ON CONFLICT (urn, year) DO UPDATE SET
                        class_size_avg       = EXCLUDED.class_size_avg,
                        ethnicity_white_pct  = EXCLUDED.ethnicity_white_pct,
                        ethnicity_asian_pct  = EXCLUDED.ethnicity_asian_pct,
                        ethnicity_black_pct  = EXCLUDED.ethnicity_black_pct,
                        ethnicity_mixed_pct  = EXCLUDED.ethnicity_mixed_pct,
                        ethnicity_other_pct  = EXCLUDED.ethnicity_other_pct
                """)

    inserted = 0
    with get_session() as session:
        for record in records:
            row_year = _resolve_year(record.get("year"), fallback_year)
            if not row_year:
                skipped += 1
                continue

            params = {"urn": int(record["urn"]), "year": row_year}
            for bind_name, column in value_columns.items():
                # Columns absent from this CSV come back as None -> NULL.
                params[bind_name] = _parse_pct(record.get(column))

            session.execute(upsert, params)
            inserted += 1
            # NOTE(review): periodic flush kept from the original; with plain
            # text() statements it may be a no-op — confirm against the
            # session type returned by get_session().
            if inserted % 5000 == 0:
                session.flush()

    print(f"  Census: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    # Command-line entry point: download the census CSV, load it, or both.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # argparse restricts --action to the three choices above, so "not load"
    # means download/all and "not download" means load/all.
    wants_download = opts.action != "load"
    wants_load = opts.action != "download"

    if wants_download:
        download(opts.data_dir)
    if wants_load:
        load(data_dir=opts.data_dir)
feat(data): integrate 9 UK government data sources via Kestra Adds a full data integration pipeline for enriching school profiles with supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT. Backend: - Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections, ofsted_parent_view, school_census, admissions, sen_detail, phonics, school_deprivation, school_finance) plus GIAS columns on schools - Expose all supplementary data via GET /api/schools/{urn} - Enrich school list responses with ofsted_grade + ofsted_date Integrator (new service): - FastAPI HTTP microservice; Kestra calls POST /run/{source} - 9 source modules: ofsted, gias, parent_view, census, admissions, sen_detail, phonics, idaci, finance - 9 Kestra flow YAMLs with scheduled triggers and 3× retry Frontend: - SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate) - SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation Context, Finances - types.ts: 8 new interfaces + extended School/SchoolDetailsResponse Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com> 2026-03-24 11:44:04 +00:00			`"""`
			`School Census (SPC) downloader and loader.`

			`Source: EES publication "schools-pupils-and-their-characteristics"`
			`Update: Annual (June)`
			`Adds: class_size_avg, ethnicity breakdown by school`
			`"""`
			`import argparse`
			`import re`
			`import sys`
			`from pathlib import Path`

			`import pandas as pd`

			`sys.path.insert(0, str(Path(__file__).parent.parent))`
			`from config import SUPPLEMENTARY_DIR`
			`from db import get_session`
			`from sources.ees import get_latest_csv_url, download_csv`

			`DEST_DIR = SUPPLEMENTARY_DIR / "census"`
			`PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"`

			`NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}`

			`COLUMN_MAP = {`
			`"URN": "urn",`
			`"urn": "urn",`
			`"YEAR": "year",`
			`"Year": "year",`
			`# Class size`
			`"average_class_size": "class_size_avg",`
			`"AVCLAS": "class_size_avg",`
			`"avg_class_size": "class_size_avg",`
			`# Ethnicity — DfE uses ethnicity major group percentages`
			`"perc_white": "ethnicity_white_pct",`
			`"perc_asian": "ethnicity_asian_pct",`
			`"perc_black": "ethnicity_black_pct",`
			`"perc_mixed": "ethnicity_mixed_pct",`
			`"perc_other_ethnic": "ethnicity_other_pct",`
			`"PTWHITE": "ethnicity_white_pct",`
			`"PTASIAN": "ethnicity_asian_pct",`
			`"PTBLACK": "ethnicity_black_pct",`
			`"PTMIXED": "ethnicity_mixed_pct",`
			`"PTOTHER": "ethnicity_other_pct",`
			`}`


			`def download(data_dir: Path \| None = None) -> Path:`
			`dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR`
			`dest.mkdir(parents=True, exist_ok=True)`

			`url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")`
			`if not url:`
			`raise RuntimeError(f"Could not find CSV URL for census publication")`

			`filename = url.split("/")[-1].split("?")[0] or "census_latest.csv"`
			`return download_csv(url, dest / filename)`


			`def _parse_pct(val) -> float \| None:`
			`if pd.isna(val):`
			`return None`
			`s = str(val).strip().upper().replace("%", "")`
			`if s in NULL_VALUES:`
			`return None`
			`try:`
			`return float(s)`
			`except ValueError:`
			`return None`


			`def load(path: Path \| None = None, data_dir: Path \| None = None) -> dict:`
			`if path is None:`
			`dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR`
			`files = sorted(dest.glob("*.csv"))`
			`if not files:`
			`raise FileNotFoundError(f"No census CSV found in {dest}")`
			`path = files[-1]`

			`print(f" Census: loading {path} ...")`
			`df = pd.read_csv(path, encoding="latin-1", low_memory=False)`
			`df.rename(columns=COLUMN_MAP, inplace=True)`

			`if "urn" not in df.columns:`
			`raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")`

			`df["urn"] = pd.to_numeric(df["urn"], errors="coerce")`
			`df = df.dropna(subset=["urn"])`
			`df["urn"] = df["urn"].astype(int)`

			`year = None`
			`m = re.search(r"20(\d{2})", path.stem)`
			`if m:`
			`year = int("20" + m.group(1))`

			`inserted = 0`
			`with get_session() as session:`
			`from sqlalchemy import text`
			`for _, row in df.iterrows():`
			`urn = int(row["urn"])`
			`row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year`
			`if not row_year:`
			`continue`

			`session.execute(`
			`text("""`
			`INSERT INTO school_census`
			`(urn, year, class_size_avg,`
			`ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,`
			`ethnicity_mixed_pct, ethnicity_other_pct)`
			`VALUES (:urn, :year, :class_size_avg,`
			`:white, :asian, :black, :mixed, :other)`
			`ON CONFLICT (urn, year) DO UPDATE SET`
			`class_size_avg = EXCLUDED.class_size_avg,`
			`ethnicity_white_pct = EXCLUDED.ethnicity_white_pct,`
			`ethnicity_asian_pct = EXCLUDED.ethnicity_asian_pct,`
			`ethnicity_black_pct = EXCLUDED.ethnicity_black_pct,`
			`ethnicity_mixed_pct = EXCLUDED.ethnicity_mixed_pct,`
			`ethnicity_other_pct = EXCLUDED.ethnicity_other_pct`
			`"""),`
			`{`
			`"urn": urn,`
			`"year": row_year,`
			`"class_size_avg": _parse_pct(row.get("class_size_avg")),`
			`"white": _parse_pct(row.get("ethnicity_white_pct")),`
			`"asian": _parse_pct(row.get("ethnicity_asian_pct")),`
			`"black": _parse_pct(row.get("ethnicity_black_pct")),`
			`"mixed": _parse_pct(row.get("ethnicity_mixed_pct")),`
			`"other": _parse_pct(row.get("ethnicity_other_pct")),`
			`},`
			`)`
			`inserted += 1`
			`if inserted % 5000 == 0:`
			`session.flush()`

			`print(f" Census: upserted {inserted} records")`
			`return {"inserted": inserted, "updated": 0, "skipped": 0}`


			`if __name__ == "__main__":`
			`parser = argparse.ArgumentParser()`
			`parser.add_argument("--action", choices=["download", "load", "all"], default="all")`
			`parser.add_argument("--data-dir", type=Path, default=None)`
			`args = parser.parse_args()`
			`if args.action in ("download", "all"):`
			`download(args.data_dir)`
			`if args.action in ("load", "all"):`
			`load(data_dir=args.data_dir)`