feat(data): integrate 9 UK government data sources via Kestra

Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00
parent c49593d4d6
commit dd49ef28b2
36 changed files with 2849 additions and 8 deletions
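
The integrator surface described in the message above might look like this minimal sketch — the nine module names come from the commit message, while the FastAPI wiring itself is an assumption rather than the committed implementation:

import importlib

from fastapi import FastAPI, HTTPException

app = FastAPI()

SOURCES = {"ofsted", "gias", "parent_view", "census", "admissions",
           "sen_detail", "phonics", "idaci", "finance"}

@app.post("/run/{source}")
def run_source(source: str) -> dict:
    """Kestra POSTs /run/{source}; dispatch to that module's download() + load()."""
    if source not in SOURCES:
        raise HTTPException(status_code=404, detail=f"unknown source: {source}")
    module = importlib.import_module(f"sources.{source}")
    module.download()
    return module.load()  # {"inserted": ..., "updated": ..., "skipped": ...}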

sources/admissions.py

@@ -0,0 +1,158 @@
"""
School Admissions data downloader and loader.
Source: EES publication "secondary-and-primary-school-applications-and-offers"
Update: Annual (June/July post-offer round)
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
PUBLICATION_SLUG = "secondary-and-primary-school-applications-and-offers"
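# DfE sentinel codes for suppressed/unavailable values (e.g. "SUPP" = suppressed small counts)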
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
COLUMN_MAP = {
"URN": "urn",
"urn": "urn",
"YEAR": "year",
"Year": "year",
# PAN
"PAN": "pan",
"published_admission_number": "pan",
"admissions_number": "pan",
# Applications
"total_applications": "total_applications",
"TAPP": "total_applications",
"applications_received": "total_applications",
# 1st preference offers
"first_preference_offers_pct": "first_preference_offers_pct",
"pct_1st_preference": "first_preference_offers_pct",
"PT1PREF": "first_preference_offers_pct",
# Oversubscription
"oversubscribed": "oversubscribed",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
url = get_latest_csv_url(PUBLICATION_SLUG, keyword="primary")
if not url:
url = get_latest_csv_url(PUBLICATION_SLUG)
if not url:
raise RuntimeError("Could not find CSV URL for admissions publication")
filename = url.split("/")[-1].split("?")[0] or "admissions_latest.csv"
return download_csv(url, dest / filename)
def _parse_int(val) -> int | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace(",", "")
if s in NULL_VALUES:
return None
try:
return int(float(s))
except ValueError:
return None
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
files = sorted(dest.glob("*.csv"))
if not files:
raise FileNotFoundError(f"No admissions CSV found in {dest}")
path = files[-1]
print(f" Admissions: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
year = None
m = re.search(r"20(\d{2})", path.stem)
if m:
year = int("20" + m.group(1))
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
if not row_year:
continue
pan = _parse_int(row.get("pan"))
total_apps = _parse_int(row.get("total_applications"))
pct_1st = _parse_pct(row.get("first_preference_offers_pct"))
            raw_over = row.get("oversubscribed")
            if pd.notna(raw_over):
                # bool("FALSE") would be True — parse the flag explicitly
                oversubscribed = str(raw_over).strip().lower() in ("true", "t", "1", "yes", "y")
            else:
                oversubscribed = (total_apps > pan) if (pan and total_apps) else None
session.execute(
text("""
INSERT INTO school_admissions
(urn, year, published_admission_number, total_applications,
first_preference_offers_pct, oversubscribed)
VALUES (:urn, :year, :pan, :total_apps, :pct_1st, :oversubscribed)
ON CONFLICT (urn, year) DO UPDATE SET
published_admission_number = EXCLUDED.published_admission_number,
total_applications = EXCLUDED.total_applications,
first_preference_offers_pct = EXCLUDED.first_preference_offers_pct,
oversubscribed = EXCLUDED.oversubscribed
"""),
{
"urn": urn, "year": row_year, "pan": pan,
"total_apps": total_apps, "pct_1st": pct_1st,
"oversubscribed": oversubscribed,
},
)
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" Admissions: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/census.py

@@ -0,0 +1,148 @@
"""
School Census (SPC) downloader and loader.
Source: EES publication "schools-pupils-and-their-characteristics"
Update: Annual (June)
Adds: class_size_avg, ethnicity breakdown by school
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
DEST_DIR = SUPPLEMENTARY_DIR / "census"
PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
COLUMN_MAP = {
"URN": "urn",
"urn": "urn",
"YEAR": "year",
"Year": "year",
# Class size
"average_class_size": "class_size_avg",
"AVCLAS": "class_size_avg",
"avg_class_size": "class_size_avg",
# Ethnicity — DfE uses ethnicity major group percentages
"perc_white": "ethnicity_white_pct",
"perc_asian": "ethnicity_asian_pct",
"perc_black": "ethnicity_black_pct",
"perc_mixed": "ethnicity_mixed_pct",
"perc_other_ethnic": "ethnicity_other_pct",
"PTWHITE": "ethnicity_white_pct",
"PTASIAN": "ethnicity_asian_pct",
"PTBLACK": "ethnicity_black_pct",
"PTMIXED": "ethnicity_mixed_pct",
"PTOTHER": "ethnicity_other_pct",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
if not url:
raise RuntimeError(f"Could not find CSV URL for census publication")
filename = url.split("/")[-1].split("?")[0] or "census_latest.csv"
return download_csv(url, dest / filename)
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
files = sorted(dest.glob("*.csv"))
if not files:
raise FileNotFoundError(f"No census CSV found in {dest}")
path = files[-1]
print(f" Census: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
year = None
m = re.search(r"20(\d{2})", path.stem)
if m:
year = int("20" + m.group(1))
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
if not row_year:
continue
session.execute(
text("""
INSERT INTO school_census
(urn, year, class_size_avg,
ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,
ethnicity_mixed_pct, ethnicity_other_pct)
VALUES (:urn, :year, :class_size_avg,
:white, :asian, :black, :mixed, :other)
ON CONFLICT (urn, year) DO UPDATE SET
class_size_avg = EXCLUDED.class_size_avg,
ethnicity_white_pct = EXCLUDED.ethnicity_white_pct,
ethnicity_asian_pct = EXCLUDED.ethnicity_asian_pct,
ethnicity_black_pct = EXCLUDED.ethnicity_black_pct,
ethnicity_mixed_pct = EXCLUDED.ethnicity_mixed_pct,
ethnicity_other_pct = EXCLUDED.ethnicity_other_pct
"""),
{
"urn": urn,
"year": row_year,
"class_size_avg": _parse_pct(row.get("class_size_avg")),
"white": _parse_pct(row.get("ethnicity_white_pct")),
"asian": _parse_pct(row.get("ethnicity_asian_pct")),
"black": _parse_pct(row.get("ethnicity_black_pct")),
"mixed": _parse_pct(row.get("ethnicity_mixed_pct")),
"other": _parse_pct(row.get("ethnicity_other_pct")),
},
)
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" Census: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/ees.py

@@ -0,0 +1,53 @@
"""
Shared EES (Explore Education Statistics) API client.
Base URL: https://api.education.gov.uk/statistics/v1
"""
from pathlib import Path
from typing import Optional
import requests
API_BASE = "https://api.education.gov.uk/statistics/v1"
TIMEOUT = 60
def get_publication_files(publication_slug: str) -> list[dict]:
"""Return list of data-set file descriptors for a publication."""
url = f"{API_BASE}/publications/{publication_slug}/data-set-files"
resp = requests.get(url, timeout=TIMEOUT)
resp.raise_for_status()
return resp.json().get("results", [])
def get_latest_csv_url(publication_slug: str, keyword: str = "") -> Optional[str]:
"""
Find the most recent CSV download URL for a publication.
Optionally filter by a keyword in the file name.
"""
files = get_publication_files(publication_slug)
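    # Assumes the API lists the newest data-set files first; we take the first match.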
for entry in files:
name = entry.get("name", "").lower()
if keyword and keyword.lower() not in name:
continue
csv_url = entry.get("csvDownloadUrl") or entry.get("file", {}).get("url")
if csv_url:
return csv_url
return None
def download_csv(url: str, dest_path: Path) -> Path:
"""Download a CSV from EES to dest_path."""
if dest_path.exists():
print(f" EES: {dest_path.name} already exists, skipping.")
return dest_path
print(f" EES: downloading {url} ...")
resp = requests.get(url, timeout=300, stream=True)
resp.raise_for_status()
dest_path.parent.mkdir(parents=True, exist_ok=True)
with open(dest_path, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
return dest_path
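
A minimal usage sketch for this client — the slug mirrors the admissions module above; the destination path is illustrative:

from pathlib import Path
from sources.ees import get_latest_csv_url, download_csv

url = get_latest_csv_url("secondary-and-primary-school-applications-and-offers", keyword="primary")
if url:
    name = url.split("/")[-1].split("?")[0] or "admissions_latest.csv"
    download_csv(url, Path("data/supplementary/admissions") / name)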

sources/finance.py

@@ -0,0 +1,143 @@
"""
FBIT (Financial Benchmarking and Insights Tool) financial data loader.
Source: https://schools-financial-benchmarking.service.gov.uk/api/
Update: Annual (December — data for the prior financial year)
"""
import argparse
import sys
import time
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
DEST_DIR = SUPPLEMENTARY_DIR / "finance"
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
RATE_LIMIT_DELAY = 0.1 # seconds between requests
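# at 0.1 s per request, every 10k URNs adds roughly 17 minutes to a full pass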
def download(data_dir: Path | None = None) -> Path:
"""
Fetch per-URN financial data from FBIT API and save as CSV.
Batches all school URNs from the database.
"""
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
    # Use the previous calendar year — the most recent completed financial year
from datetime import date
year = date.today().year - 1
dest_file = dest / f"fbit_{year}.csv"
if dest_file.exists():
print(f" Finance: {dest_file.name} already exists, skipping download.")
return dest_file
# Get all URNs from the database
with get_session() as session:
from sqlalchemy import text
rows = session.execute(text("SELECT urn FROM schools")).fetchall()
urns = [r[0] for r in rows]
print(f" Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")
records = []
errors = 0
for i, urn in enumerate(urns):
if i % 500 == 0:
print(f" {i}/{len(urns)} ...")
try:
resp = requests.get(
f"{API_BASE}/schoolFinancialDataObject/{urn}",
timeout=10,
)
if resp.status_code == 200:
data = resp.json()
if data:
                    total_exp = data.get("totalExpenditure")
                    pupils = data.get("numberOfPupils")
                    records.append({
                        "urn": urn,
                        "year": year,
                        # explicit None instead of the falsy result of an and-chain
                        "per_pupil_spend": round(total_exp / pupils, 2) if total_exp and pupils else None,
                        "staff_cost_pct": data.get("staffCostPercent"),
                        "teacher_cost_pct": data.get("teachingStaffCostPercent"),
                        "support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
                        "premises_cost_pct": data.get("premisesStaffCostPercent"),
                    })
elif resp.status_code not in (404, 400):
errors += 1
except Exception:
errors += 1
time.sleep(RATE_LIMIT_DELAY)
df = pd.DataFrame(records)
df.to_csv(dest_file, index=False)
print(f" Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
return dest_file
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
files = sorted(dest.glob("fbit_*.csv"))
if not files:
raise FileNotFoundError(f"No finance CSV found in {dest}")
path = files[-1]
print(f" Finance: loading {path} ...")
df = pd.read_csv(path)
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
session.execute(
text("""
INSERT INTO school_finance
(urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
support_staff_cost_pct, premises_cost_pct)
VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
ON CONFLICT (urn, year) DO UPDATE SET
per_pupil_spend = EXCLUDED.per_pupil_spend,
staff_cost_pct = EXCLUDED.staff_cost_pct,
teacher_cost_pct = EXCLUDED.teacher_cost_pct,
support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
premises_cost_pct = EXCLUDED.premises_cost_pct
"""),
{
"urn": int(row["urn"]),
"year": int(row["year"]),
"per_pupil": float(row["per_pupil_spend"]) if pd.notna(row.get("per_pupil_spend")) else None,
"staff": float(row["staff_cost_pct"]) if pd.notna(row.get("staff_cost_pct")) else None,
"teacher": float(row["teacher_cost_pct"]) if pd.notna(row.get("teacher_cost_pct")) else None,
"support": float(row["support_staff_cost_pct"]) if pd.notna(row.get("support_staff_cost_pct")) else None,
"premises": float(row["premises_cost_pct"]) if pd.notna(row.get("premises_cost_pct")) else None,
},
)
inserted += 1
if inserted % 2000 == 0:
session.flush()
print(f" Finance: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/gias.py

@@ -0,0 +1,159 @@
"""
GIAS (Get Information About Schools) bulk CSV downloader and loader.
Source: https://get-information-schools.service.gov.uk/Downloads
Update: Daily; we refresh weekly.
Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
"""
import argparse
import sys
from datetime import date
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
DEST_DIR = SUPPLEMENTARY_DIR / "gias"
# GIAS bulk download URL — date is injected at runtime
GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv"
COLUMN_MAP = {
"URN": "urn",
"SchoolWebsite": "website",
"SchoolCapacity": "capacity",
"TrustName": "trust_name",
"TrustUID": "trust_uid",
"Gender (name)": "gender",
"NurseryProvision (name)": "nursery_provision_raw",
"HeadTitle": "head_title",
"HeadFirstName": "head_first",
"HeadLastName": "head_last",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
today = date.today().strftime("%Y%m%d")
url = GIAS_URL_TEMPLATE.format(date=today)
filename = f"gias_{today}.csv"
dest_file = dest / filename
if dest_file.exists():
print(f" GIAS: {filename} already exists, skipping download.")
return dest_file
print(f" GIAS: downloading {url} ...")
resp = requests.get(url, timeout=300, stream=True)
# GIAS may not have today's file yet — fall back to yesterday
if resp.status_code == 404:
from datetime import timedelta
yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d")
url = GIAS_URL_TEMPLATE.format(date=yesterday)
filename = f"gias_{yesterday}.csv"
dest_file = dest / filename
if dest_file.exists():
print(f" GIAS: {filename} already exists, skipping download.")
return dest_file
resp = requests.get(url, timeout=300, stream=True)
resp.raise_for_status()
with open(dest_file, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
return dest_file
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
files = sorted(dest.glob("gias_*.csv"))
if not files:
raise FileNotFoundError(f"No GIAS CSV found in {dest}")
path = files[-1]
print(f" GIAS: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
# Build headteacher_name from parts
def build_name(row):
parts = [
str(row.get("head_title", "") or "").strip(),
str(row.get("head_first", "") or "").strip(),
str(row.get("head_last", "") or "").strip(),
]
return " ".join(p for p in parts if p) or None
df["headteacher_name"] = df.apply(build_name, axis=1)
df["nursery_provision"] = df.get("nursery_provision_raw", pd.Series()).apply(
lambda v: True if str(v).strip().lower().startswith("has") else False if pd.notna(v) else None
)
def clean_str(val):
s = str(val).strip() if pd.notna(val) else None
return s if s and s.lower() not in ("nan", "none", "") else None
updated = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
session.execute(
text("""
UPDATE schools SET
website = :website,
headteacher_name = :headteacher_name,
capacity = :capacity,
trust_name = :trust_name,
trust_uid = :trust_uid,
gender = :gender,
nursery_provision = :nursery_provision
WHERE urn = :urn
"""),
{
"urn": urn,
"website": clean_str(row.get("website")),
"headteacher_name": row.get("headteacher_name"),
"capacity": int(row["capacity"]) if pd.notna(row.get("capacity")) and str(row.get("capacity")).strip().isdigit() else None,
"trust_name": clean_str(row.get("trust_name")),
"trust_uid": clean_str(row.get("trust_uid")),
"gender": clean_str(row.get("gender")),
"nursery_provision": row.get("nursery_provision"),
},
)
updated += 1
if updated % 5000 == 0:
session.flush()
print(f" Updated {updated} schools...")
print(f" GIAS: updated {updated} school records")
return {"inserted": 0, "updated": updated, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
path = download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/idaci.py

@@ -0,0 +1,176 @@
"""
IDACI (Income Deprivation Affecting Children Index) loader.
Source: English Indices of Deprivation 2019
https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019
This is a one-time download (5-yearly release). We join school postcodes to LSOAs
via postcodes.io, then look up IDACI scores from the IoD2019 file.
Update: ~5-yearly (next release expected 2025/26)
"""
import argparse
import sys
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
DEST_DIR = SUPPLEMENTARY_DIR / "idaci"
# IoD2019 download — load() below searches the workbook for an IDACI sheet;
# point this at the IDACI-specific IoD2019 file if File 1 lacks IDACI scores
IOD_2019_URL = (
"https://assets.publishing.service.gov.uk/government/uploads/system/uploads/"
"attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx"
)
POSTCODES_IO_BATCH = "https://api.postcodes.io/postcodes"
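# postcodes.io bulk lookups accept at most 100 postcodes per request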
BATCH_SIZE = 100
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
filename = "iod2019_idaci.xlsx"
dest_file = dest / filename
if dest_file.exists():
print(f" IDACI: {filename} already exists, skipping download.")
return dest_file
print(f" IDACI: downloading IoD2019 file ...")
resp = requests.get(IOD_2019_URL, timeout=300, stream=True)
resp.raise_for_status()
with open(dest_file, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" IDACI: saved {dest_file}")
return dest_file
def _postcode_to_lsoa(postcodes: list[str]) -> dict[str, str]:
"""Batch-resolve postcodes to LSOA codes via postcodes.io."""
result = {}
valid = [p.strip().upper() for p in postcodes if p and len(str(p).strip()) >= 5]
valid = list(set(valid))
for i in range(0, len(valid), BATCH_SIZE):
batch = valid[i:i + BATCH_SIZE]
try:
resp = requests.post(POSTCODES_IO_BATCH, json={"postcodes": batch}, timeout=30)
if resp.status_code == 200:
                for item in resp.json().get("result", []):
                    if item and item.get("result"):
                        # postcodes.io returns the LSOA *name* at the top level;
                        # the GSS code (E01...) used by IoD2019 lives under "codes"
                        lsoa = item["result"].get("codes", {}).get("lsoa")
                        if lsoa:
                            result[item["query"].upper()] = lsoa
except Exception as e:
print(f" Warning: postcodes.io batch failed: {e}")
return result
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
if path is None:
files = sorted(dest.glob("*.xlsx"))
if not files:
raise FileNotFoundError(f"No IDACI file found in {dest}")
path = files[-1]
print(f" IDACI: loading IoD2019 from {path} ...")
# IoD2019 File 1 — sheet "IoD2019 IDACI" or similar
try:
iod_df = pd.read_excel(path, sheet_name=None)
# Find sheet with IDACI data
idaci_sheet = None
for name, df in iod_df.items():
if "IDACI" in name.upper() or "IDACI" in str(df.columns.tolist()).upper():
idaci_sheet = name
break
if idaci_sheet is None:
idaci_sheet = list(iod_df.keys())[0]
df_iod = iod_df[idaci_sheet]
except Exception as e:
raise RuntimeError(f"Could not read IoD2019 file: {e}")
# Normalise column names — IoD2019 uses specific headers
    col_lsoa = next((c for c in df_iod.columns if "LSOA" in str(c).upper() and "code" in str(c).lower()), None)
    col_score = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "score" in str(c).lower()), None)
    if not col_lsoa or not col_score:
        print(f" IDACI columns available: {list(df_iod.columns)[:20]}")
        raise ValueError("Could not find LSOA code or IDACI score columns")
df_iod = df_iod[[col_lsoa, col_score]].copy()
df_iod.columns = ["lsoa_code", "idaci_score"]
df_iod = df_iod.dropna()
    # Deciles from the score distribution (qcut's 10% bands); IoD convention is
    # decile 1 = most deprived, so flip qcut's ascending-by-score labels
    df_iod["idaci_decile"] = (pd.qcut(df_iod["idaci_score"], 10, labels=False) + 1).astype(int)
    df_iod["idaci_decile"] = 11 - df_iod["idaci_decile"]
lsoa_lookup = df_iod.set_index("lsoa_code")[["idaci_score", "idaci_decile"]].to_dict("index")
print(f" IDACI: loaded {len(lsoa_lookup)} LSOA records")
# Fetch all school postcodes from the database
with get_session() as session:
from sqlalchemy import text
rows = session.execute(text("SELECT urn, postcode FROM schools WHERE postcode IS NOT NULL")).fetchall()
postcodes = [r[1] for r in rows]
print(f" IDACI: resolving {len(postcodes)} postcodes via postcodes.io ...")
pc_to_lsoa = _postcode_to_lsoa(postcodes)
print(f" IDACI: resolved {len(pc_to_lsoa)} postcodes to LSOAs")
inserted = skipped = 0
with get_session() as session:
from sqlalchemy import text
for urn, postcode in rows:
lsoa = pc_to_lsoa.get(str(postcode).strip().upper())
if not lsoa:
skipped += 1
continue
iod = lsoa_lookup.get(lsoa)
if not iod:
skipped += 1
continue
session.execute(
text("""
INSERT INTO school_deprivation (urn, lsoa_code, idaci_score, idaci_decile)
VALUES (:urn, :lsoa, :score, :decile)
ON CONFLICT (urn) DO UPDATE SET
lsoa_code = EXCLUDED.lsoa_code,
idaci_score = EXCLUDED.idaci_score,
idaci_decile = EXCLUDED.idaci_decile
"""),
{"urn": urn, "lsoa": lsoa, "score": float(iod["idaci_score"]), "decile": int(iod["idaci_decile"])},
)
inserted += 1
if inserted % 2000 == 0:
session.flush()
print(f" IDACI: upserted {inserted}, skipped {skipped}")
return {"inserted": inserted, "updated": 0, "skipped": skipped}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/ofsted.py

@@ -0,0 +1,226 @@
"""
Ofsted Monthly Management Information CSV downloader and loader.
Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
Update: Monthly (released ~2 weeks into each month)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# Ofsted MI download URL — discovered by scraping the GOV.UK page below.
# If scraping breaks, set MANUAL_URL to the current asset URL.
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
MANUAL_URL: str | None = None
COLUMN_MAP = {
"URN": "urn",
"Inspection date": "inspection_date",
"Publication date": "publication_date",
"Inspection type": "inspection_type",
"Overall effectiveness": "overall_effectiveness",
"Quality of education": "quality_of_education",
"Behaviour and attitudes": "behaviour_attitudes",
"Personal development": "personal_development",
"Leadership and management": "leadership_management",
"Early years provision": "early_years_provision",
# Some CSVs use shortened names
"Urn": "urn",
"InspectionDate": "inspection_date",
"PublicationDate": "publication_date",
"InspectionType": "inspection_type",
"OverallEffectiveness": "overall_effectiveness",
"QualityOfEducation": "quality_of_education",
"BehaviourAndAttitudes": "behaviour_attitudes",
"PersonalDevelopment": "personal_development",
"LeadershipAndManagement": "leadership_management",
"EarlyYearsProvision": "early_years_provision",
}
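# Grade keys cover both the label and numeric encodings seen across monthly MI files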
GRADE_MAP = {
"Outstanding": 1, "1": 1, 1: 1,
"Good": 2, "2": 2, 2: 2,
"Requires improvement": 3, "3": 3, 3: 3,
"Requires Improvement": 3,
"Inadequate": 4, "4": 4, 4: 4,
}
DEST_DIR = SUPPLEMENTARY_DIR / "ofsted"
def _discover_csv_url() -> str | None:
"""Scrape the GOV.UK page for the most recent CSV/ZIP link."""
try:
resp = requests.get(GOV_UK_PAGE, timeout=30)
resp.raise_for_status()
# Look for links to assets.publishing.service.gov.uk CSV or ZIP files
pattern = r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"'
urls = re.findall(pattern, resp.text, re.IGNORECASE)
if urls:
return urls[0]
except Exception as e:
print(f" Warning: could not scrape GOV.UK page: {e}")
return None
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
    url = MANUAL_URL or _discover_csv_url()
if not url:
raise RuntimeError(
"Could not discover Ofsted MI download URL. "
"Visit https://www.gov.uk/government/statistical-data-sets/"
"monthly-management-information-ofsteds-school-inspections-outcomes "
"to get the latest URL and update MANUAL_URL in ofsted.py"
)
filename = url.split("/")[-1]
dest_file = dest / filename
if dest_file.exists():
print(f" Ofsted: {filename} already exists, skipping download.")
return dest_file
print(f" Ofsted: downloading {url} ...")
resp = requests.get(url, timeout=120, stream=True)
resp.raise_for_status()
with open(dest_file, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
return dest_file
def _parse_grade(val) -> int | None:
if pd.isna(val):
return None
key = str(val).strip()
return GRADE_MAP.get(key)
def _parse_date(val) -> date | None:
if pd.isna(val):
return None
for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y"):
try:
return datetime.strptime(str(val).strip(), fmt).date()
except ValueError:
pass
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
files = sorted(dest.glob("*.csv")) + sorted(dest.glob("*.zip"))
if not files:
raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
path = files[-1]
print(f" Ofsted: loading {path} ...")
if str(path).endswith(".zip"):
import zipfile, io
with zipfile.ZipFile(path) as z:
csv_names = [n for n in z.namelist() if n.endswith(".csv")]
if not csv_names:
raise ValueError("No CSV found inside Ofsted ZIP")
with z.open(csv_names[0]) as f:
df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False)
else:
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
# Normalise column names
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
# Only keep rows with a valid URN
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
inserted = updated = skipped = 0
    with get_session() as session:
        from sqlalchemy import text
        # Keep only the most recent inspection per URN; drop_duplicates keeps whole
        # rows, whereas groupby().first() would mix values across inspections
        if "inspection_date" in df.columns:
            df["_date_parsed"] = df["inspection_date"].apply(_parse_date)
            df = df.sort_values("_date_parsed", ascending=False).drop_duplicates(subset="urn", keep="first")
for _, row in df.iterrows():
urn = int(row["urn"])
record = {
"urn": urn,
"inspection_date": _parse_date(row.get("inspection_date")),
"publication_date": _parse_date(row.get("publication_date")),
"inspection_type": str(row.get("inspection_type", "")).strip() or None,
"overall_effectiveness": _parse_grade(row.get("overall_effectiveness")),
"quality_of_education": _parse_grade(row.get("quality_of_education")),
"behaviour_attitudes": _parse_grade(row.get("behaviour_attitudes")),
"personal_development": _parse_grade(row.get("personal_development")),
"leadership_management": _parse_grade(row.get("leadership_management")),
"early_years_provision": _parse_grade(row.get("early_years_provision")),
"previous_overall": None,
}
session.execute(
text("""
INSERT INTO ofsted_inspections
(urn, inspection_date, publication_date, inspection_type,
overall_effectiveness, quality_of_education, behaviour_attitudes,
personal_development, leadership_management, early_years_provision,
previous_overall)
VALUES
(:urn, :inspection_date, :publication_date, :inspection_type,
:overall_effectiveness, :quality_of_education, :behaviour_attitudes,
:personal_development, :leadership_management, :early_years_provision,
:previous_overall)
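                    -- stash the grade being replaced as previous_overall before applying the new inspection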
ON CONFLICT (urn) DO UPDATE SET
previous_overall = ofsted_inspections.overall_effectiveness,
inspection_date = EXCLUDED.inspection_date,
publication_date = EXCLUDED.publication_date,
inspection_type = EXCLUDED.inspection_type,
overall_effectiveness = EXCLUDED.overall_effectiveness,
quality_of_education = EXCLUDED.quality_of_education,
behaviour_attitudes = EXCLUDED.behaviour_attitudes,
personal_development = EXCLUDED.personal_development,
leadership_management = EXCLUDED.leadership_management,
early_years_provision = EXCLUDED.early_years_provision
"""),
record,
)
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" Processed {inserted} records...")
print(f" Ofsted: upserted {inserted} records")
return {"inserted": inserted, "updated": updated, "skipped": skipped}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
path = download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/parent_view.py

@@ -0,0 +1,229 @@
"""
Ofsted Parent View open data downloader and loader.
Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
import argparse
import re
import sys
from datetime import date
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
# Question column mapping — Parent View open data uses descriptive column headers.
# Map any variant to our internal field names. Retained for reference: load()
# below reads the wide-format "Qn - ... %" columns directly.
QUESTION_MAP = {
# Q1 — happiness
"My child is happy at this school": "q_happy_pct",
"Happy": "q_happy_pct",
# Q2 — safety
"My child feels safe at this school": "q_safe_pct",
"Safe": "q_safe_pct",
    # Q3 — behaviour
    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
    "Well Behaved": "q_behaviour_pct",
# Q4 — bullying dealt with (sometimes separate)
"My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
"Bullying": "q_bullying_pct",
# Q5 — curriculum info
"The school makes me aware of what my child will learn during the year": "q_communication_pct",
"Aware of learning": "q_communication_pct",
# Q6 — concerns dealt with
"When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
# Q7 — child does well
"My child does well at this school": "q_progress_pct",
"Does well": "q_progress_pct",
# Q8 — teaching
"The teaching is good at this school": "q_teaching_pct",
"Good teaching": "q_teaching_pct",
# Q9 — progress info
"I receive valuable information from the school about my child's progress": "q_information_pct",
"Progress information": "q_information_pct",
# Q10 — curriculum breadth
"My child is taught a broad range of subjects": "q_curriculum_pct",
"Broad subjects": "q_curriculum_pct",
# Q11 — prepares for future
"The school prepares my child well for the future": "q_future_pct",
"Prepared for future": "q_future_pct",
# Q12 — leadership
"The school is led and managed effectively": "q_leadership_pct",
"Led well": "q_leadership_pct",
# Q13 — wellbeing
"The school supports my child's wider personal development": "q_wellbeing_pct",
"Personal development": "q_wellbeing_pct",
# Q14 — recommendation
"I would recommend this school to another parent": "q_recommend_pct",
"Recommend": "q_recommend_pct",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
# Scrape the open data page for the download link
try:
resp = requests.get(OPEN_DATA_PAGE, timeout=30)
resp.raise_for_status()
pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"'
urls = re.findall(pattern, resp.text, re.IGNORECASE)
if not urls:
raise RuntimeError("No download link found on Parent View open data page")
url = urls[0] if urls[0].startswith("http") else "https://parentview.ofsted.gov.uk" + urls[0]
except Exception as e:
raise RuntimeError(f"Could not discover Parent View download URL: {e}")
filename = url.split("/")[-1].split("?")[0]
dest_file = dest / filename
if dest_file.exists():
print(f" ParentView: {filename} already exists, skipping download.")
return dest_file
print(f" ParentView: downloading {url} ...")
resp = requests.get(url, timeout=120, stream=True)
resp.raise_for_status()
with open(dest_file, "wb") as f:
for chunk in resp.iter_content(chunk_size=65536):
f.write(chunk)
print(f" ParentView: saved {dest_file}")
return dest_file
def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
"""Sum 'Strongly agree' + 'Agree' percentages for a question."""
# Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
agree = row.get(f"{q_col_base} - Agree %")
    try:
        vals = [float(v) for v in (strongly, agree) if pd.notna(v)]
    except (TypeError, ValueError):
        return None
    # distinguish "no data" (None) from a genuine 0% positive response
    return round(sum(vals), 1) if vals else None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv"))
if not files:
raise FileNotFoundError(f"No Parent View file found in {dest}")
path = files[-1]
print(f" ParentView: loading {path} ...")
if str(path).endswith(".xlsx"):
df = pd.read_excel(path)
else:
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
# Normalise URN column
urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None)
if not urn_col:
raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
df.rename(columns={urn_col: "urn"}, inplace=True)
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
# Try to find total responses column
resp_col = next((c for c in df.columns if "total" in c.lower() and "respon" in c.lower()), None)
inserted = 0
    today = date.today()  # stamped as survey_date below; the export's survey window isn't parsed here
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
total = int(row[resp_col]) if resp_col and pd.notna(row.get(resp_col)) else None
            # Try to extract % positive per question from wide-format columns;
            # Parent View numbers questions Q1–Q12 (or Q1–Q14 depending on year)
record = {
"urn": urn,
"survey_date": today,
"total_responses": total,
"q_happy_pct": _positive_pct(row, "Q1"),
"q_safe_pct": _positive_pct(row, "Q2"),
"q_behaviour_pct": _positive_pct(row, "Q3"),
"q_bullying_pct": _positive_pct(row, "Q4"),
"q_communication_pct": _positive_pct(row, "Q5"),
"q_progress_pct": _positive_pct(row, "Q7"),
"q_teaching_pct": _positive_pct(row, "Q8"),
"q_information_pct": _positive_pct(row, "Q9"),
"q_curriculum_pct": _positive_pct(row, "Q10"),
"q_future_pct": _positive_pct(row, "Q11"),
"q_leadership_pct": _positive_pct(row, "Q12"),
"q_wellbeing_pct": _positive_pct(row, "Q13"),
"q_recommend_pct": _positive_pct(row, "Q14"),
"q_sen_pct": None,
}
session.execute(
text("""
INSERT INTO ofsted_parent_view
(urn, survey_date, total_responses,
q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
q_communication_pct, q_progress_pct, q_teaching_pct,
q_information_pct, q_curriculum_pct, q_future_pct,
q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
VALUES
(:urn, :survey_date, :total_responses,
:q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
:q_communication_pct, :q_progress_pct, :q_teaching_pct,
:q_information_pct, :q_curriculum_pct, :q_future_pct,
:q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
ON CONFLICT (urn) DO UPDATE SET
survey_date = EXCLUDED.survey_date,
total_responses = EXCLUDED.total_responses,
q_happy_pct = EXCLUDED.q_happy_pct,
q_safe_pct = EXCLUDED.q_safe_pct,
q_behaviour_pct = EXCLUDED.q_behaviour_pct,
q_bullying_pct = EXCLUDED.q_bullying_pct,
q_communication_pct = EXCLUDED.q_communication_pct,
q_progress_pct = EXCLUDED.q_progress_pct,
q_teaching_pct = EXCLUDED.q_teaching_pct,
q_information_pct = EXCLUDED.q_information_pct,
q_curriculum_pct = EXCLUDED.q_curriculum_pct,
q_future_pct = EXCLUDED.q_future_pct,
q_leadership_pct = EXCLUDED.q_leadership_pct,
q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
q_recommend_pct = EXCLUDED.q_recommend_pct,
q_sen_pct = EXCLUDED.q_sen_pct
"""),
record,
)
inserted += 1
if inserted % 2000 == 0:
session.flush()
print(f" ParentView: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/phonics.py

@@ -0,0 +1,132 @@
"""
Phonics Screening Check downloader and loader.
Source: EES publication "phonics-screening-check-and-key-stage-1-assessments-england"
Update: Annual (September/October)
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
DEST_DIR = SUPPLEMENTARY_DIR / "phonics"
PUBLICATION_SLUG = "phonics-screening-check-and-key-stage-1-assessments-england"
# Known column names in the phonics CSV (vary by year)
COLUMN_MAP = {
"URN": "urn",
"urn": "urn",
# Year 1 pass rate
"PPTA1": "year1_phonics_pct", # % meeting expected standard Y1
"PPTA1B": "year1_phonics_pct",
"PT_MET_PHON_Y1": "year1_phonics_pct",
"Y1_MET_EXPECTED_PCT": "year1_phonics_pct",
# Year 2 (re-takers)
"PPTA2": "year2_phonics_pct",
"PT_MET_PHON_Y2": "year2_phonics_pct",
"Y2_MET_EXPECTED_PCT": "year2_phonics_pct",
# Year label
"YEAR": "year",
"Year": "year",
}
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
if not url:
raise RuntimeError(f"Could not find CSV URL for phonics publication")
filename = url.split("/")[-1].split("?")[0] or "phonics_latest.csv"
return download_csv(url, dest / filename)
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
files = sorted(dest.glob("*.csv"))
if not files:
raise FileNotFoundError(f"No phonics CSV found in {dest}")
path = files[-1]
print(f" Phonics: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
    # Infer year from filename if not in data
    year = None
m = re.search(r"20(\d{2})", path.stem)
if m:
year = int("20" + m.group(1))
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
if not row_year:
continue
session.execute(
text("""
INSERT INTO phonics (urn, year, year1_phonics_pct, year2_phonics_pct)
VALUES (:urn, :year, :y1, :y2)
ON CONFLICT (urn, year) DO UPDATE SET
year1_phonics_pct = EXCLUDED.year1_phonics_pct,
year2_phonics_pct = EXCLUDED.year2_phonics_pct
"""),
{
"urn": urn,
"year": row_year,
"y1": _parse_pct(row.get("year1_phonics_pct")),
"y2": _parse_pct(row.get("year2_phonics_pct")),
},
)
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" Phonics: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)

sources/sen_detail.py

@@ -0,0 +1,150 @@
"""
SEN (Special Educational Needs) primary need type breakdown.
Source: EES publication "special-educational-needs-in-england"
Update: Annual (September)
"""
import argparse
import re
import sys
from pathlib import Path
import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
PUBLICATION_SLUG = "special-educational-needs-in-england"
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
COLUMN_MAP = {
"URN": "urn",
"urn": "urn",
"YEAR": "year",
"Year": "year",
# Primary need types — DfE abbreviated codes
"PT_SPEECH": "primary_need_speech_pct", # SLCN
"PT_ASD": "primary_need_autism_pct", # ASD
"PT_MLD": "primary_need_mld_pct", # Moderate learning difficulty
"PT_SPLD": "primary_need_spld_pct", # Specific learning difficulty
"PT_SEMH": "primary_need_semh_pct", # Social, emotional, mental health
"PT_PHYSICAL": "primary_need_physical_pct", # Physical/sensory
"PT_OTHER": "primary_need_other_pct",
# Alternative naming
"SLCN_PCT": "primary_need_speech_pct",
"ASD_PCT": "primary_need_autism_pct",
"MLD_PCT": "primary_need_mld_pct",
"SPLD_PCT": "primary_need_spld_pct",
"SEMH_PCT": "primary_need_semh_pct",
"PHYSICAL_PCT": "primary_need_physical_pct",
"OTHER_PCT": "primary_need_other_pct",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
if not url:
url = get_latest_csv_url(PUBLICATION_SLUG)
if not url:
raise RuntimeError("Could not find CSV URL for SEN publication")
filename = url.split("/")[-1].split("?")[0] or "sen_latest.csv"
return download_csv(url, dest / filename)
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
if path is None:
dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
files = sorted(dest.glob("*.csv"))
if not files:
raise FileNotFoundError(f"No SEN CSV found in {dest}")
path = files[-1]
print(f" SEN Detail: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
year = None
m = re.search(r"20(\d{2})", path.stem)
if m:
year = int("20" + m.group(1))
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
if not row_year:
continue
session.execute(
text("""
INSERT INTO sen_detail
(urn, year, primary_need_speech_pct, primary_need_autism_pct,
primary_need_mld_pct, primary_need_spld_pct, primary_need_semh_pct,
primary_need_physical_pct, primary_need_other_pct)
VALUES (:urn, :year, :speech, :autism, :mld, :spld, :semh, :physical, :other)
ON CONFLICT (urn, year) DO UPDATE SET
primary_need_speech_pct = EXCLUDED.primary_need_speech_pct,
primary_need_autism_pct = EXCLUDED.primary_need_autism_pct,
primary_need_mld_pct = EXCLUDED.primary_need_mld_pct,
primary_need_spld_pct = EXCLUDED.primary_need_spld_pct,
primary_need_semh_pct = EXCLUDED.primary_need_semh_pct,
primary_need_physical_pct = EXCLUDED.primary_need_physical_pct,
primary_need_other_pct = EXCLUDED.primary_need_other_pct
"""),
{
"urn": urn, "year": row_year,
"speech": _parse_pct(row.get("primary_need_speech_pct")),
"autism": _parse_pct(row.get("primary_need_autism_pct")),
"mld": _parse_pct(row.get("primary_need_mld_pct")),
"spld": _parse_pct(row.get("primary_need_spld_pct")),
"semh": _parse_pct(row.get("primary_need_semh_pct")),
"physical": _parse_pct(row.get("primary_need_physical_pct")),
"other": _parse_pct(row.get("primary_need_other_pct")),
},
)
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" SEN Detail: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument("--action", choices=["download", "load", "all"], default="all")
parser.add_argument("--data-dir", type=Path, default=None)
args = parser.parse_args()
if args.action in ("download", "all"):
download(args.data_dir)
if args.action in ("load", "all"):
load(data_dir=args.data_dir)