Files
school_compare/integrator/scripts/sources/ofsted.py
Tudor d81f03cfcf
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 33s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
fix(ofsted): per-row framework detection instead of per-file
The MI CSV contains both OEIF and RC column sets simultaneously — OEIF columns
are populated for older inspections, RC columns for post-Nov-2025 inspections.
File-level detection wrongly classified all schools based on column presence alone.

Replace _detect_framework(df) with _framework_for_row(row):
- ReportCard: any rc_* column has a value
- OEIF: overall_effectiveness or quality_of_education has a value
- None: neither has data (no graded inspection on record)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-25 15:08:42 +00:00

419 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Ofsted Monthly Management Information CSV downloader and loader.
Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
Update: Monthly (released ~2 weeks into each month)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# GOV.UK landing page for the Ofsted monthly management information release.
# This is NOT the data file itself: _discover_csv_url() fetches this page and
# extracts the first CSV/ZIP asset link from its HTML, so no per-release URL
# needs to be maintained here.
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
# OEIF columns: internal field name -> candidate CSV headers.
# Candidates are tried top to bottom and the first header present in the file
# is used; the later entries are fallbacks for older file formats.
COLUMN_PRIORITY = {
    "urn": ["URN", "Urn", "urn"],
    "inspection_date": [
        "Inspection start date of latest OEIF graded inspection",
        "Inspection start date", "Inspection date", "InspectionDate",
    ],
    "publication_date": [
        "Publication date of latest OEIF graded inspection",
        "Publication date", "PublicationDate",
    ],
    "inspection_type": [
        "Inspection type of latest OEIF graded inspection",
        "Inspection type", "InspectionType",
    ],
    "overall_effectiveness": [
        "Latest OEIF overall effectiveness",
        "Overall effectiveness", "OverallEffectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education", "QualityOfEducation",
    ],
    "behaviour_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes", "BehaviourAndAttitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development", "PersonalDevelopment",
    ],
    "leadership_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Leadership and management", "LeadershipAndManagement",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision (where applicable)",
        "Early years provision", "EarlyYearsProvision",
    ],
}
# OEIF judgement -> numeric grade (1 = Outstanding ... 4 = Inadequate).
# Each grade is reachable by its text label, by its digit as a string, and by
# its integer value; "Requires improvement" is listed in both capitalisations.
GRADE_MAP = {
    "Outstanding": 1,
    "1": 1,
    1: 1,
    "Good": 2,
    "2": 2,
    2: 2,
    "Requires improvement": 3,
    "3": 3,
    3: 3,
    "Requires Improvement": 3,
    "Inadequate": 4,
    "4": 4,
    4: 4,
}
# Report Card grade label (lower-cased) -> numeric grade, from
# 1 = Exceptional down to 5 = Urgent improvement. The short forms "strong" and
# "expected" map to the same grades as their "... standard" long forms.
RC_GRADE_MAP = {
    "exceptional": 1,
    "strong standard": 2, "strong": 2,
    "expected standard": 3, "expected": 3,
    "needs attention": 4,
    "urgent improvement": 5,
}
# Report Card columns: internal field name -> candidate CSV headers, tried in
# order. The header names are best guesses and Ofsted may vary them.
RC_COLUMN_PRIORITY = {
    "rc_safeguarding": ["Safeguarding", "safeguarding", "Safeguarding standards"],
    "rc_inclusion": ["Inclusion", "inclusion"],
    "rc_curriculum_teaching": [
        "Curriculum and teaching", "curriculum_and_teaching", "Curriculum & teaching",
    ],
    "rc_achievement": ["Achievement", "achievement"],
    "rc_attendance_behaviour": [
        "Attendance and behaviour", "attendance_and_behaviour", "Attendance & behaviour",
    ],
    "rc_personal_development": [
        "Personal development and well-being",
        "Personal development and wellbeing",
        "personal_development_and_wellbeing",
        "Personal development & well-being",
    ],
    "rc_leadership_governance": [
        "Leadership and governance", "leadership_and_governance", "Leadership & governance",
    ],
    "rc_early_years": ["Early years", "early_years", "Early years provision"],
    "rc_sixth_form": ["Sixth form", "sixth_form", "Sixth form in schools"],
}
# Default directory for Ofsted files; download() and load() use it whenever
# they are called without a data_dir.
DEST_DIR = SUPPLEMENTARY_DIR / "ofsted"
def _discover_csv_url() -> str | None:
    """Return the first CSV/ZIP asset link found on the GOV.UK page.

    The page at GOV_UK_PAGE is fetched and searched for the first link to a
    .csv or .zip file hosted on assets.publishing.service.gov.uk (presumably
    the most recent release, since it is listed first - verify if the page
    layout changes).

    Returns:
        The asset URL, or None when the page has no such link or could not be
        fetched (in which case a warning is printed).
    """
    asset_link = r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"'
    try:
        page = requests.get(GOV_UK_PAGE, timeout=30)
        page.raise_for_status()
        first_hit = re.search(asset_link, page.text, re.IGNORECASE)
    except Exception as e:
        # Best effort: any failure is reported and treated as "nothing found".
        print(f" Warning: could not scrape GOV.UK page: {e}")
        return None
    return first_hit.group(1) if first_hit else None
def download(data_dir: Path | None = None) -> Path:
    """Download the latest Ofsted MI file unless it is already on disk.

    Args:
        data_dir: Optional data root. When given, the file is stored under
            ``<data_dir>/supplementary/ofsted``; otherwise DEST_DIR is used.

    Returns:
        Path of the downloaded file, or of the existing file when a file with
        the same name is already present.

    Raises:
        RuntimeError: If no download link could be discovered on GOV.UK.
        requests.HTTPError: If the download request returns an error status.
    """
    dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = _discover_csv_url()
    if not url:
        # load() picks up any *.csv / *.zip placed in this directory, so a
        # manual download is a complete workaround.
        raise RuntimeError(
            "Could not discover Ofsted MI download URL. "
            f"Visit {GOV_UK_PAGE} "
            f"to download the latest file manually and place it in {dest}"
        )

    filename = url.split("/")[-1]
    dest_file = dest / filename
    if dest_file.exists():
        print(f" Ofsted: {filename} already exists, skipping download.")
        return dest_file

    print(f" Ofsted: downloading {url} ...")
    # Stream into a ".part" sibling and rename only once complete. Otherwise an
    # interrupted download would leave a truncated file that the "already
    # exists" check above accepts on every later run. The ".part" suffix also
    # keeps the partial file out of load()'s *.csv / *.zip globs.
    partial = dest_file.with_name(dest_file.name + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        partial.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)

    print(f" Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file
def _parse_grade(val) -> int | None:
    """Convert an OEIF judgement cell to its numeric grade via GRADE_MAP.

    Accepts text labels (matched case-insensitively), digit strings, ints and
    whole-number floats. Missing values and anything not in GRADE_MAP give
    None.
    """
    if pd.isna(val):
        return None
    key = str(val).strip()
    if key in GRADE_MAP:
        return GRADE_MAP[key]

    # pandas reads a numeric column that contains blanks as float64, so a
    # grade of 2 arrives as 2.0 and stringifies to "2.0", which is not a key.
    try:
        number = float(key)
    except ValueError:
        pass
    else:
        return GRADE_MAP.get(int(number)) if number.is_integer() else None

    # Fall back to a case-insensitive label match (e.g. "GOOD", "inadequate").
    folded = key.casefold()
    for label, grade in GRADE_MAP.items():
        if isinstance(label, str) and label.casefold() == folded:
            return grade
    return None
def _parse_rc_grade(val) -> int | None:
"""Parse a Report Card grade text to integer 15."""
if pd.isna(val):
return None
key = str(val).strip().lower()
return RC_GRADE_MAP.get(key)
def _parse_safeguarding(val) -> bool | None:
"""Parse safeguarding 'Met'/'Not met' to boolean."""
if pd.isna(val):
return None
s = str(val).strip().lower()
if s == "met":
return True
if s in ("not met", "not_met"):
return False
return None
def _parse_date(val) -> date | None:
if pd.isna(val):
return None
for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y"):
try:
return datetime.strptime(str(val).strip(), fmt).date()
except ValueError:
pass
return None
def _framework_for_row(row) -> str | None:
"""Determine inspection framework for a single school row.
Check RC columns first — if any have a value, it's a Report Card inspection.
Fall back to OEIF columns. If neither has data, the school has no graded
inspection on record (return None).
"""
rc_check_cols = [
"rc_inclusion", "rc_curriculum_teaching", "rc_achievement",
"rc_attendance_behaviour", "rc_personal_development",
"rc_leadership_governance", "rc_safeguarding",
]
for col in rc_check_cols:
val = row.get(col)
if val is not None and not (isinstance(val, float) and pd.isna(val)):
return "ReportCard"
oeif_check_cols = ["overall_effectiveness", "quality_of_education"]
for col in oeif_check_cols:
val = row.get(col)
if val is not None and not (isinstance(val, float) and pd.isna(val)):
return "OEIF"
return None
# Upsert keyed on urn. On conflict the stored overall_effectiveness is copied
# into previous_overall before every column is overwritten with the new row.
# NOTE(review): because that copy happens on every conflict, re-loading an
# unchanged file sets previous_overall equal to the current grade. Consider
# only rotating it when inspection_date actually changes - confirm the target
# SQL dialect before editing the statement.
_OFSTED_UPSERT_SQL = """
    INSERT INTO ofsted_inspections
    (urn, framework, inspection_date, publication_date, inspection_type,
    overall_effectiveness, quality_of_education, behaviour_attitudes,
    personal_development, leadership_management, early_years_provision,
    previous_overall,
    rc_safeguarding_met, rc_inclusion, rc_curriculum_teaching,
    rc_achievement, rc_attendance_behaviour, rc_personal_development,
    rc_leadership_governance, rc_early_years, rc_sixth_form)
    VALUES
    (:urn, :framework, :inspection_date, :publication_date, :inspection_type,
    :overall_effectiveness, :quality_of_education, :behaviour_attitudes,
    :personal_development, :leadership_management, :early_years_provision,
    :previous_overall,
    :rc_safeguarding_met, :rc_inclusion, :rc_curriculum_teaching,
    :rc_achievement, :rc_attendance_behaviour, :rc_personal_development,
    :rc_leadership_governance, :rc_early_years, :rc_sixth_form)
    ON CONFLICT (urn) DO UPDATE SET
    previous_overall = ofsted_inspections.overall_effectiveness,
    framework = EXCLUDED.framework,
    inspection_date = EXCLUDED.inspection_date,
    publication_date = EXCLUDED.publication_date,
    inspection_type = EXCLUDED.inspection_type,
    overall_effectiveness = EXCLUDED.overall_effectiveness,
    quality_of_education = EXCLUDED.quality_of_education,
    behaviour_attitudes = EXCLUDED.behaviour_attitudes,
    personal_development = EXCLUDED.personal_development,
    leadership_management = EXCLUDED.leadership_management,
    early_years_provision = EXCLUDED.early_years_provision,
    rc_safeguarding_met = EXCLUDED.rc_safeguarding_met,
    rc_inclusion = EXCLUDED.rc_inclusion,
    rc_curriculum_teaching = EXCLUDED.rc_curriculum_teaching,
    rc_achievement = EXCLUDED.rc_achievement,
    rc_attendance_behaviour = EXCLUDED.rc_attendance_behaviour,
    rc_personal_development = EXCLUDED.rc_personal_development,
    rc_leadership_governance = EXCLUDED.rc_leadership_governance,
    rc_early_years = EXCLUDED.rc_early_years,
    rc_sixth_form = EXCLUDED.rc_sixth_form
"""


def _find_header_row(filepath, encoding="latin-1") -> int:
    """Return the index of the first of the top 10 rows containing a URN header.

    Falls back to 0 when no such row is found.
    """
    for i in range(10):
        try:
            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # The file has fewer than i + 1 rows; nothing further to scan.
            break
        if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
            return i
    return 0


def _read_mi_csv(path) -> pd.DataFrame:
    """Read the MI table from a CSV file, or from the first CSV inside a ZIP."""
    if not str(path).lower().endswith(".zip"):
        hdr = _find_header_row(path)
        return pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)

    import os
    import tempfile
    import zipfile

    with zipfile.ZipFile(path) as z:
        csv_names = [n for n in z.namelist() if n.lower().endswith(".csv")]
        if not csv_names:
            raise ValueError("No CSV found inside Ofsted ZIP")
        payload = z.read(csv_names[0])

    # Extract to a temp file so the header scan can re-read it several times.
    tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
    try:
        with tmp:
            tmp.write(payload)
        hdr = _find_header_row(tmp.name)
        return pd.read_csv(tmp.name, encoding="latin-1", low_memory=False, header=hdr)
    finally:
        os.unlink(tmp.name)


def _apply_column_priority(df: pd.DataFrame, priority: dict) -> None:
    """Rename, in place, the first available source header of each target field."""
    available = set(df.columns)
    for target, sources in priority.items():
        for src in sources:
            if src in available:
                df.rename(columns={src: target}, inplace=True)
                break


def _clean_text(val) -> str | None:
    """Return the stripped text of a cell, or None when it is missing or blank."""
    if val is None or pd.isna(val):
        return None
    return str(val).strip() or None


def _build_record(row) -> dict:
    """Build the bind-parameter dict for one school row of the upsert."""
    return {
        "urn": int(row["urn"]),
        "framework": _framework_for_row(row),
        "inspection_date": _parse_date(row.get("inspection_date")),
        "publication_date": _parse_date(row.get("publication_date")),
        # A missing cell is NaN; str(NaN) would otherwise be stored as "nan".
        "inspection_type": _clean_text(row.get("inspection_type")),
        # OEIF fields
        "overall_effectiveness": _parse_grade(row.get("overall_effectiveness")),
        "quality_of_education": _parse_grade(row.get("quality_of_education")),
        "behaviour_attitudes": _parse_grade(row.get("behaviour_attitudes")),
        "personal_development": _parse_grade(row.get("personal_development")),
        "leadership_management": _parse_grade(row.get("leadership_management")),
        "early_years_provision": _parse_grade(row.get("early_years_provision")),
        "previous_overall": None,
        # Report Card fields
        "rc_safeguarding_met": _parse_safeguarding(row.get("rc_safeguarding")),
        "rc_inclusion": _parse_rc_grade(row.get("rc_inclusion")),
        "rc_curriculum_teaching": _parse_rc_grade(row.get("rc_curriculum_teaching")),
        "rc_achievement": _parse_rc_grade(row.get("rc_achievement")),
        "rc_attendance_behaviour": _parse_rc_grade(row.get("rc_attendance_behaviour")),
        "rc_personal_development": _parse_rc_grade(row.get("rc_personal_development")),
        "rc_leadership_governance": _parse_rc_grade(row.get("rc_leadership_governance")),
        "rc_early_years": _parse_rc_grade(row.get("rc_early_years")),
        "rc_sixth_form": _parse_rc_grade(row.get("rc_sixth_form")),
    }


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load an Ofsted MI file into the ofsted_inspections table.

    Args:
        path: CSV or ZIP file to load. When omitted, the most recently
            modified *.csv / *.zip in the Ofsted directory is used.
        data_dir: Optional data root; the Ofsted directory is then
            ``<data_dir>/supplementary/ofsted`` instead of DEST_DIR.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": 0}`` where n is the number
        of rows upserted. Inserts and updates are not told apart, so
        "updated" and "skipped" are always 0.

    Raises:
        FileNotFoundError: If no path is given and no file is found.
        ValueError: If a ZIP holds no CSV, or the table has no URN column.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
        files = [*dest.glob("*.csv"), *dest.glob("*.zip")]
        if not files:
            raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
        # Release filenames do not sort chronologically, and a name-based
        # choice would prefer any ZIP over any CSV; use modification time,
        # with the name only as a tie-breaker.
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))
    print(f" Ofsted: loading {path} ...")

    df = _read_mi_csv(path)
    # The header scan strips whitespace from headers; do the same here so the
    # priority lookups below see the names the scan matched on.
    df.columns = [str(c).strip() for c in df.columns]

    # OEIF names first, then Report Card names, each picking the first match.
    _apply_column_priority(df, COLUMN_PRIORITY)
    _apply_column_priority(df, RC_COLUMN_PRIORITY)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Only keep rows with a valid URN
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Keep only the most recent inspection per URN. drop_duplicates keeps whole
    # rows; groupby().first() would take the first non-null value per column
    # and could combine fields from different inspections.
    if "inspection_date" in df.columns:
        df["_date_parsed"] = df["inspection_date"].apply(_parse_date)
        df = df.sort_values("_date_parsed", ascending=False).drop_duplicates(
            subset="urn", keep="first"
        )

    from sqlalchemy import text

    statement = text(_OFSTED_UPSERT_SQL)
    inserted = updated = skipped = 0
    # NOTE(review): presumably get_session() commits on a clean exit - confirm
    # in db.py; nothing here commits explicitly.
    with get_session() as session:
        for _, row in df.iterrows():
            session.execute(statement, _build_record(row))
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()
                print(f" Processed {inserted} records...")

    print(f" Ofsted: upserted {inserted} records")
    return {"inserted": inserted, "updated": updated, "skipped": skipped}
if __name__ == "__main__":
    # CLI entry point: --action selects download, load, or both (the default);
    # --data-dir overrides the default data root.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # For "all", load exactly the file that was just fetched rather than
        # letting load() re-discover one, which may pick a different file.
        # For "load" alone, downloaded is None and load() discovers the file.
        load(path=downloaded, data_dir=args.data_dir)