Files
school_compare/integrator/scripts/sources/idaci.py
Tudor dd49ef28b2
Some checks failed
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled
feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00

177 lines
6.7 KiB
Python

"""
IDACI (Income Deprivation Affecting Children Index) loader.
Source: English Indices of Deprivation 2019
https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019
This is a one-time download (5-yearly release). We join school postcodes to LSOAs
via postcodes.io, then look up IDACI scores from the IoD2019 file.
Update: ~5-yearly (next release expected 2025/26)
"""
import argparse
import sys
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# Default directory where download() stores the workbook and load() looks for it
# (used when no explicit data_dir is passed).
DEST_DIR = SUPPLEMENTARY_DIR / "idaci"
# IoD 2019 supplementary data — "Income Deprivation Affecting Children Index (IDACI)"
# NOTE(review): the file name in this URL is "File_1 ... IMD2019_Index_of_Multiple_Deprivation",
# not an IDACI-specific file — confirm this workbook really carries the IDACI score
# column that load() searches for; if it does not, load() raises ValueError.
IOD_2019_URL = (
"https://assets.publishing.service.gov.uk/government/uploads/system/uploads/"
"attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx"
)
# Endpoint that _postcode_to_lsoa() POSTs batches of postcodes to.
POSTCODES_IO_BATCH = "https://api.postcodes.io/postcodes"
# Number of postcodes sent per request by _postcode_to_lsoa().
# presumably chosen to match the postcodes.io bulk-lookup limit — TODO confirm
BATCH_SIZE = 100
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
filename = "iod2019_idaci.xlsx"
dest_file = dest / filename
if dest_file.exists():
print(f" IDACI: {filename} already exists, skipping download.")
return dest_file
print(f" IDACI: downloading IoD2019 file ...")
resp = requests.get(IOD_2019_URL, timeout=300, stream=True)
resp.raise_for_status()
with open(dest_file, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" IDACI: saved {dest_file}")
return dest_file
def _postcode_to_lsoa(postcodes: list[str]) -> dict[str, str]:
"""Batch-resolve postcodes to LSOA codes via postcodes.io."""
result = {}
valid = [p.strip().upper() for p in postcodes if p and len(str(p).strip()) >= 5]
valid = list(set(valid))
for i in range(0, len(valid), BATCH_SIZE):
batch = valid[i:i + BATCH_SIZE]
try:
resp = requests.post(POSTCODES_IO_BATCH, json={"postcodes": batch}, timeout=30)
if resp.status_code == 200:
for item in resp.json().get("result", []):
if item and item.get("result"):
lsoa = item["result"].get("lsoa")
if lsoa:
result[item["query"].upper()] = lsoa
except Exception as e:
print(f" Warning: postcodes.io batch failed: {e}")
return result
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
if path is None:
files = sorted(dest.glob("*.xlsx"))
if not files:
raise FileNotFoundError(f"No IDACI file found in {dest}")
path = files[-1]
print(f" IDACI: loading IoD2019 from {path} ...")
# IoD2019 File 1 — sheet "IoD2019 IDACI" or similar
try:
iod_df = pd.read_excel(path, sheet_name=None)
# Find sheet with IDACI data
idaci_sheet = None
for name, df in iod_df.items():
if "IDACI" in name.upper() or "IDACI" in str(df.columns.tolist()).upper():
idaci_sheet = name
break
if idaci_sheet is None:
idaci_sheet = list(iod_df.keys())[0]
df_iod = iod_df[idaci_sheet]
except Exception as e:
raise RuntimeError(f"Could not read IoD2019 file: {e}")
# Normalise column names — IoD2019 uses specific headers
col_lsoa = next((c for c in df_iod.columns if "LSOA" in str(c).upper() and "code" in str(c).lower()), None)
col_score = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "score" in str(c).lower()), None)
col_rank = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "rank" in str(c).lower()), None)
if not col_lsoa or not col_score:
print(f" IDACI columns available: {list(df_iod.columns)[:20]}")
raise ValueError("Could not find LSOA code or IDACI score columns")
df_iod = df_iod[[col_lsoa, col_score]].copy()
df_iod.columns = ["lsoa_code", "idaci_score"]
df_iod = df_iod.dropna()
# Compute decile from rank (or from score distribution)
total = len(df_iod)
df_iod = df_iod.sort_values("idaci_score", ascending=False)
df_iod["idaci_decile"] = (pd.qcut(df_iod["idaci_score"], 10, labels=False) + 1).astype(int)
# Decile 1 = most deprived (highest IDACI score)
df_iod["idaci_decile"] = 11 - df_iod["idaci_decile"]
lsoa_lookup = df_iod.set_index("lsoa_code")[["idaci_score", "idaci_decile"]].to_dict("index")
print(f" IDACI: loaded {len(lsoa_lookup)} LSOA records")
# Fetch all school postcodes from the database
with get_session() as session:
from sqlalchemy import text
rows = session.execute(text("SELECT urn, postcode FROM schools WHERE postcode IS NOT NULL")).fetchall()
postcodes = [r[1] for r in rows]
print(f" IDACI: resolving {len(postcodes)} postcodes via postcodes.io ...")
pc_to_lsoa = _postcode_to_lsoa(postcodes)
print(f" IDACI: resolved {len(pc_to_lsoa)} postcodes to LSOAs")
inserted = skipped = 0
with get_session() as session:
from sqlalchemy import text
for urn, postcode in rows:
lsoa = pc_to_lsoa.get(str(postcode).strip().upper())
if not lsoa:
skipped += 1
continue
iod = lsoa_lookup.get(lsoa)
if not iod:
skipped += 1
continue
session.execute(
text("""
INSERT INTO school_deprivation (urn, lsoa_code, idaci_score, idaci_decile)
VALUES (:urn, :lsoa, :score, :decile)
ON CONFLICT (urn) DO UPDATE SET
lsoa_code = EXCLUDED.lsoa_code,
idaci_score = EXCLUDED.idaci_score,
idaci_decile = EXCLUDED.idaci_decile
"""),
{"urn": urn, "lsoa": lsoa, "score": float(iod["idaci_score"]), "decile": int(iod["idaci_decile"])},
)
inserted += 1
if inserted % 2000 == 0:
session.flush()
print(f" IDACI: upserted {inserted}, skipped {skipped}")
return {"inserted": inserted, "updated": 0, "skipped": skipped}
if __name__ == "__main__":
    # Command-line entry point: --action picks which stage(s) to run and
    # --data-dir optionally overrides the base data directory.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # argparse restricts --action to the three choices, so "not load" means
    # download/all and "not download" means load/all.
    wants_download = opts.action != "load"
    wants_load = opts.action != "download"

    if wants_download:
        download(opts.data_dir)
    if wants_load:
        load(data_dir=opts.data_dir)