feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.
Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
ofsted_parent_view, school_census, admissions, sen_detail, phonics,
school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date
Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry
Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
148
integrator/scripts/sources/census.py
Normal file
148
integrator/scripts/sources/census.py
Normal file
@@ -0,0 +1,148 @@
|
||||
"""
|
||||
School Census (SPC) downloader and loader.
|
||||
|
||||
Source: EES publication "schools-pupils-and-their-characteristics"
|
||||
Update: Annual (June)
|
||||
Adds: class_size_avg, ethnicity breakdown by school
|
||||
"""
|
||||
import argparse
|
||||
import re
|
||||
import sys
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from config import SUPPLEMENTARY_DIR
|
||||
from db import get_session
|
||||
from sources.ees import get_latest_csv_url, download_csv
|
||||
|
||||
# Default download destination for census CSVs (overridable via --data-dir).
DEST_DIR = SUPPLEMENTARY_DIR / "census"
# EES publication slug this source scrapes for the latest release.
PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"

# DfE suppression / placeholder codes treated as missing values when parsing
# numeric cells (see _parse_pct).
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

# Maps raw CSV headers onto the canonical column names used by the
# school_census table. Both lowercase (newer EES extracts) and uppercase
# (legacy DfE extracts) header variants are mapped, since the publication's
# header style has varied across years.
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    # Class size
    "average_class_size": "class_size_avg",
    "AVCLAS": "class_size_avg",
    "avg_class_size": "class_size_avg",
    # Ethnicity — DfE uses ethnicity major group percentages
    "perc_white": "ethnicity_white_pct",
    "perc_asian": "ethnicity_asian_pct",
    "perc_black": "ethnicity_black_pct",
    "perc_mixed": "ethnicity_mixed_pct",
    "perc_other_ethnic": "ethnicity_other_pct",
    "PTWHITE": "ethnicity_white_pct",
    "PTASIAN": "ethnicity_asian_pct",
    "PTBLACK": "ethnicity_black_pct",
    "PTMIXED": "ethnicity_mixed_pct",
    "PTOTHER": "ethnicity_other_pct",
}
|
||||
|
||||
|
||||
def download(data_dir: Path | None = None) -> Path:
    """Download the latest School Census CSV from the EES publication.

    Args:
        data_dir: Optional base data directory. When given, the file is
            saved under ``<data_dir>/supplementary/census``; otherwise the
            module-level ``DEST_DIR`` is used.

    Returns:
        Path to the downloaded CSV file.

    Raises:
        RuntimeError: If no CSV URL could be discovered for the publication.
    """
    dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        # Fixed message: the original used an f-string with no placeholders.
        raise RuntimeError("Could not find CSV URL for census publication")

    # Take the last URL path segment, drop any query string; fall back to a
    # fixed name if the URL ends with a slash.
    filename = url.split("/")[-1].split("?")[0] or "census_latest.csv"
    return download_csv(url, dest / filename)
|
||||
|
||||
|
||||
def _parse_pct(val) -> float | None:
    """Coerce a raw census cell to ``float``, or ``None`` when missing.

    Strips a trailing "%" sign, treats DfE suppression codes (``NULL_VALUES``)
    as missing, and swallows unparseable text rather than raising.
    """
    if pd.isna(val):
        return None
    cleaned = str(val).strip().upper().replace("%", "")
    if cleaned in NULL_VALUES:
        return None
    try:
        parsed = float(cleaned)
    except ValueError:
        return None
    return parsed
|
||||
|
||||
|
||||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a School Census CSV into the ``school_census`` table.

    Args:
        path: Explicit CSV to load. When ``None``, the newest ``*.csv``
            (by sorted filename) in the census directory is used.
        data_dir: Optional base data directory used to locate the census
            directory when ``path`` is not given.

    Returns:
        Summary dict with ``inserted``/``updated``/``skipped`` counts.
        NOTE: ``inserted`` counts every upsert executed; updates are not
        distinguished from inserts (``updated`` is always 0).

    Raises:
        FileNotFoundError: If no CSV is found in the census directory.
        ValueError: If no URN column is present after header renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
        # Sorted by filename — assumes release filenames sort chronologically,
        # so the last entry is the most recent download. TODO confirm.
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No census CSV found in {dest}")
        path = files[-1]

    print(f" Census: loading {path} ...")
    # latin-1 decoding: DfE extracts are not reliably UTF-8.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Drop rows without a numeric URN (aggregate/LA-level rows, blanks).
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Fallback academic year inferred from the filename (e.g. "...2024...")
    # when the CSV has no per-row year column.
    year = None
    m = re.search(r"20(\d{2})", path.stem)
    if m:
        year = int("20" + m.group(1))

    inserted = 0
    with get_session() as session:
        from sqlalchemy import text
        for _, row in df.iterrows():
            urn = int(row["urn"])
            # Prefer the row's own year; fall back to the filename-derived one.
            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
            if not row_year:
                # No usable year at all — cannot satisfy the (urn, year) key.
                continue

            # Upsert keyed on (urn, year) so re-running a load is idempotent.
            session.execute(
                text("""
                    INSERT INTO school_census
                        (urn, year, class_size_avg,
                         ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,
                         ethnicity_mixed_pct, ethnicity_other_pct)
                    VALUES (:urn, :year, :class_size_avg,
                            :white, :asian, :black, :mixed, :other)
                    ON CONFLICT (urn, year) DO UPDATE SET
                        class_size_avg = EXCLUDED.class_size_avg,
                        ethnicity_white_pct = EXCLUDED.ethnicity_white_pct,
                        ethnicity_asian_pct = EXCLUDED.ethnicity_asian_pct,
                        ethnicity_black_pct = EXCLUDED.ethnicity_black_pct,
                        ethnicity_mixed_pct = EXCLUDED.ethnicity_mixed_pct,
                        ethnicity_other_pct = EXCLUDED.ethnicity_other_pct
                """),
                {
                    "urn": urn,
                    "year": row_year,
                    # _parse_pct doubles as a general numeric cleaner here:
                    # class_size_avg is not a percentage but shares the same
                    # suppression codes.
                    "class_size_avg": _parse_pct(row.get("class_size_avg")),
                    "white": _parse_pct(row.get("ethnicity_white_pct")),
                    "asian": _parse_pct(row.get("ethnicity_asian_pct")),
                    "black": _parse_pct(row.get("ethnicity_black_pct")),
                    "mixed": _parse_pct(row.get("ethnicity_mixed_pct")),
                    "other": _parse_pct(row.get("ethnicity_other_pct")),
                },
            )
            inserted += 1
            # Periodic flush keeps the pending-statement buffer bounded on
            # large files; the commit itself is handled by get_session().
            if inserted % 5000 == 0:
                session.flush()

    print(f" Census: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: run the download step, the load step, or both.
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    arg_parser.add_argument("--data-dir", type=Path, default=None)
    opts = arg_parser.parse_args()

    # choices restricts action to {download, load, all}, so these negated
    # checks are equivalent to membership in ("download", "all") etc.
    if opts.action != "load":
        download(opts.data_dir)
    if opts.action != "download":
        load(data_dir=opts.data_dir)
|
||||
Reference in New Issue
Block a user