Files
school_compare/integrator/scripts/sources/ofsted.py
Tudor 5720e18358
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 33s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
fix(ofsted): tighten framework detection to avoid false ReportCard classification
The old OEIF CSV contains columns whose names include substrings like
'inclusion' and 'achievement', causing _detect_framework() to wrongly return
'ReportCard' for pre-Nov-2025 inspections.

Fix: check for OEIF-specific phrases first ('overall effectiveness', 'quality
of education', 'behaviour and attitudes'). Only if none are found, look for
multi-word RC-specific phrases. Default to OEIF as a safe fallback.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-25 14:55:10 +00:00

429 lines
16 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Ofsted Monthly Management Information CSV downloader and loader.
Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
Update: Monthly (released ~2 weeks into each month)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# GOV.UK landing page for the Ofsted Monthly Management Information release.
# This is the page that _discover_csv_url() fetches and scans for a link to the
# actual CSV/ZIP asset; it is not itself the data-file URL.
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
# Internal field name -> candidate CSV column headers, in priority order.
# load() renames the first candidate that is present in the file to the
# internal field name; later entries are fallbacks for older file formats.
# Matching is exact (case- and whitespace-sensitive).
COLUMN_PRIORITY = {
    # School identifier (Unique Reference Number).
    "urn": ["URN", "Urn", "urn"],
    # Inspection metadata.
    "inspection_date": [
        "Inspection start date of latest OEIF graded inspection",
        "Inspection start date",
        "Inspection date",
        "InspectionDate",
    ],
    "publication_date": [
        "Publication date of latest OEIF graded inspection",
        "Publication date",
        "PublicationDate",
    ],
    "inspection_type": [
        "Inspection type of latest OEIF graded inspection",
        "Inspection type",
        "InspectionType",
    ],
    # OEIF judgement columns; load() parses these with _parse_grade().
    "overall_effectiveness": [
        "Latest OEIF overall effectiveness",
        "Overall effectiveness",
        "OverallEffectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education",
        "QualityOfEducation",
    ],
    "behaviour_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes",
        "BehaviourAndAttitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development",
        "PersonalDevelopment",
    ],
    "leadership_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Leadership and management",
        "LeadershipAndManagement",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision (where applicable)",
        "Early years provision",
        "EarlyYearsProvision",
    ],
}
# OEIF grade value as found in the CSV -> integer grade
# (1=Outstanding, 2=Good, 3=Requires improvement, 4=Inadequate).
# Used by _parse_grade(), which looks values up by their stripped string form;
# anything not listed here is parsed as None.
GRADE_MAP = {
    "Outstanding": 1, "1": 1, 1: 1,
    "Good": 2, "2": 2, 2: 2,
    "Requires improvement": 3, "3": 3, 3: 3,
    # Capitalisation variant of the same label.
    "Requires Improvement": 3,
    "Inadequate": 4, "4": 4, 4: 4,
}
# Report Card grade text -> integer (1=Exceptional ... 5=Urgent improvement).
# Keys are lower-case because _parse_rc_grade() strips and lower-cases the cell
# value before the lookup; short forms ("strong", "expected") map to the same
# integer as their "... standard" long forms.
RC_GRADE_MAP = {
    "exceptional": 1,
    "strong standard": 2,
    "strong": 2,
    "expected standard": 3,
    "expected": 3,
    "needs attention": 4,
    "urgent improvement": 5,
}
# Column name priority for Report Card fields (best-guess names; Ofsted may vary).
# Same shape as COLUMN_PRIORITY: internal field -> candidate CSV headers, first
# present header wins. load() applies this table after COLUMN_PRIORITY.
RC_COLUMN_PRIORITY = {
    # Parsed to a boolean by _parse_safeguarding() (stored as rc_safeguarding_met).
    "rc_safeguarding": [
        "Safeguarding",
        "safeguarding",
        "Safeguarding standards",
    ],
    # The remaining fields are parsed with _parse_rc_grade().
    "rc_inclusion": [
        "Inclusion",
        "inclusion",
    ],
    "rc_curriculum_teaching": [
        "Curriculum and teaching",
        "curriculum_and_teaching",
        "Curriculum & teaching",
    ],
    "rc_achievement": [
        "Achievement",
        "achievement",
    ],
    "rc_attendance_behaviour": [
        "Attendance and behaviour",
        "attendance_and_behaviour",
        "Attendance & behaviour",
    ],
    "rc_personal_development": [
        "Personal development and well-being",
        "Personal development and wellbeing",
        "personal_development_and_wellbeing",
        "Personal development & well-being",
    ],
    "rc_leadership_governance": [
        "Leadership and governance",
        "leadership_and_governance",
        "Leadership & governance",
    ],
    "rc_early_years": [
        "Early years",
        "early_years",
        # NOTE(review): "Early years provision" is also a COLUMN_PRIORITY source
        # for early_years_provision, and load() applies COLUMN_PRIORITY first, so
        # a column with this exact header is renamed before this table is
        # consulted and this fallback never matches.
        "Early years provision",
    ],
    "rc_sixth_form": [
        "Sixth form",
        "sixth_form",
        "Sixth form in schools",
    ],
}
# Default directory for downloaded Ofsted files; download() and load() use it
# when no data_dir argument is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "ofsted"
def _discover_csv_url() -> str | None:
    """Scrape the GOV.UK page for a link to the Ofsted MI CSV/ZIP asset.

    Returns:
        The first matching ``assets.publishing.service.gov.uk`` URL ending in
        ``.csv`` or ``.zip`` found in the page HTML, or ``None`` when the page
        cannot be fetched or contains no such link.
    """
    try:
        resp = requests.get(GOV_UK_PAGE, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        # Best-effort discovery: network/HTTP problems are reported, not raised.
        # Only request failures are swallowed so genuine programming errors
        # still surface instead of being hidden behind a warning.
        print(f" Warning: could not scrape GOV.UK page: {e}")
        return None

    # Look for links to assets.publishing.service.gov.uk CSV or ZIP files
    pattern = r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"'
    urls = re.findall(pattern, resp.text, re.IGNORECASE)
    # The first link in document order is taken as the current release.
    return urls[0] if urls else None
def download(data_dir: Path | None = None) -> Path:
    """Download the current Ofsted MI file unless it is already on disk.

    Args:
        data_dir: Optional data root; the file is stored under
            ``data_dir / "supplementary" / "ofsted"``. Defaults to ``DEST_DIR``.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        RuntimeError: If no download URL could be discovered on the GOV.UK page.
        requests.RequestException: If the download request itself fails.
    """
    dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)
    url = _discover_csv_url()
    if not url:
        # NOTE(review): the message refers to MANUAL_URL, which is not defined
        # in the visible source; text kept unchanged for callers/log matching.
        raise RuntimeError(
            "Could not discover Ofsted MI download URL. "
            "Visit https://www.gov.uk/government/statistical-data-sets/"
            "monthly-management-information-ofsteds-school-inspections-outcomes "
            "to get the latest URL and update MANUAL_URL in ofsted.py"
        )
    filename = url.split("/")[-1]
    dest_file = dest / filename
    if dest_file.exists():
        print(f" Ofsted: {filename} already exists, skipping download.")
        return dest_file
    print(f" Ofsted: downloading {url} ...")

    # Stream into a sibling ".part" file and rename only on success. Otherwise
    # an interrupted transfer would leave a truncated file that the
    # "already exists" check above would accept on every later run.
    partial = dest_file.with_name(filename + ".part")
    try:
        # Context manager releases the streamed connection even on error.
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        partial.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        partial.unlink(missing_ok=True)

    print(f" Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file
def _parse_grade(val) -> int | None:
    """Convert an OEIF grade cell to its integer grade via ``GRADE_MAP``.

    Args:
        val: Raw cell value (grade label, digit string, number, or missing).

    Returns:
        The mapped integer grade, or ``None`` for missing or unrecognised
        values.
    """
    if pd.isna(val):
        return None
    key = str(val).strip()
    grade = GRADE_MAP.get(key)
    if grade is not None:
        return grade
    # pandas reads an integer column that contains blanks as float64, so a
    # grade of 2 arrives as 2.0 and stringifies to "2.0". Collapse whole-number
    # values back to their digit form before giving up.
    try:
        number = float(key)
    except ValueError:
        return None
    if number.is_integer():
        return GRADE_MAP.get(str(int(number)))
    return None
def _parse_rc_grade(val) -> int | None:
    """Translate a Report Card grade label into its integer code (1-5).

    The label is stripped and lower-cased, then looked up in ``RC_GRADE_MAP``.
    Missing values and labels that are not in the map yield ``None``.
    """
    return None if pd.isna(val) else RC_GRADE_MAP.get(str(val).strip().lower())
def _parse_safeguarding(val) -> bool | None:
"""Parse safeguarding 'Met'/'Not met' to boolean."""
if pd.isna(val):
return None
s = str(val).strip().lower()
if s == "met":
return True
if s in ("not met", "not_met"):
return False
return None
def _parse_date(val) -> date | None:
if pd.isna(val):
return None
for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y"):
try:
return datetime.strptime(str(val).strip(), fmt).date()
except ValueError:
pass
return None
def _detect_framework(df: pd.DataFrame) -> str:
    """Classify a loaded CSV as ``'OEIF'`` or ``'ReportCard'`` from its headers.

    Headers are compared lower-cased, by substring. Any old-format (OEIF)
    phrase decides the result immediately; the Report Card phrases are only
    consulted when no OEIF phrase is present. When neither set matches the
    answer is ``'OEIF'``, so an unrecognised layout falls back to the old
    format.
    """
    headers = [name.lower() for name in df.columns]

    def _any_header_mentions(phrases) -> bool:
        return any(phrase in header for phrase in phrases for header in headers)

    # Phrases that only the old OEIF CSV uses.
    legacy_markers = (
        "overall effectiveness",
        "quality of education",
        "behaviour and attitudes",
    )
    if _any_header_mentions(legacy_markers):
        return "OEIF"

    # Multi-word phrases specific to the new Report Card CSV.
    report_card_markers = (
        "curriculum and teaching",
        "leadership and governance",
        "attendance and behaviour",
    )
    return "ReportCard" if _any_header_mentions(report_card_markers) else "OEIF"
# Upsert keyed on URN. previous_overall is only advanced when the stored
# inspection date differs from the incoming one, so re-loading the same
# inspection does not overwrite the genuinely previous grade with the
# current one. A NULL date on either side falls through to ELSE.
_UPSERT_SQL = """
    INSERT INTO ofsted_inspections
    (urn, framework, inspection_date, publication_date, inspection_type,
    overall_effectiveness, quality_of_education, behaviour_attitudes,
    personal_development, leadership_management, early_years_provision,
    previous_overall,
    rc_safeguarding_met, rc_inclusion, rc_curriculum_teaching,
    rc_achievement, rc_attendance_behaviour, rc_personal_development,
    rc_leadership_governance, rc_early_years, rc_sixth_form)
    VALUES
    (:urn, :framework, :inspection_date, :publication_date, :inspection_type,
    :overall_effectiveness, :quality_of_education, :behaviour_attitudes,
    :personal_development, :leadership_management, :early_years_provision,
    :previous_overall,
    :rc_safeguarding_met, :rc_inclusion, :rc_curriculum_teaching,
    :rc_achievement, :rc_attendance_behaviour, :rc_personal_development,
    :rc_leadership_governance, :rc_early_years, :rc_sixth_form)
    ON CONFLICT (urn) DO UPDATE SET
    previous_overall = CASE
        WHEN ofsted_inspections.inspection_date = EXCLUDED.inspection_date
        THEN ofsted_inspections.previous_overall
        ELSE ofsted_inspections.overall_effectiveness
    END,
    framework = EXCLUDED.framework,
    inspection_date = EXCLUDED.inspection_date,
    publication_date = EXCLUDED.publication_date,
    inspection_type = EXCLUDED.inspection_type,
    overall_effectiveness = EXCLUDED.overall_effectiveness,
    quality_of_education = EXCLUDED.quality_of_education,
    behaviour_attitudes = EXCLUDED.behaviour_attitudes,
    personal_development = EXCLUDED.personal_development,
    leadership_management = EXCLUDED.leadership_management,
    early_years_provision = EXCLUDED.early_years_provision,
    rc_safeguarding_met = EXCLUDED.rc_safeguarding_met,
    rc_inclusion = EXCLUDED.rc_inclusion,
    rc_curriculum_teaching = EXCLUDED.rc_curriculum_teaching,
    rc_achievement = EXCLUDED.rc_achievement,
    rc_attendance_behaviour = EXCLUDED.rc_attendance_behaviour,
    rc_personal_development = EXCLUDED.rc_personal_development,
    rc_leadership_governance = EXCLUDED.rc_leadership_governance,
    rc_early_years = EXCLUDED.rc_early_years,
    rc_sixth_form = EXCLUDED.rc_sixth_form
"""

# Record fields parsed with _parse_grade (OEIF) and _parse_rc_grade (Report Card).
_OEIF_GRADE_FIELDS = (
    "overall_effectiveness",
    "quality_of_education",
    "behaviour_attitudes",
    "personal_development",
    "leadership_management",
    "early_years_provision",
)
_RC_GRADE_FIELDS = (
    "rc_inclusion",
    "rc_curriculum_teaching",
    "rc_achievement",
    "rc_attendance_behaviour",
    "rc_personal_development",
    "rc_leadership_governance",
    "rc_early_years",
    "rc_sixth_form",
)


def _find_header_row(filepath, encoding="latin-1"):
    """Return the index of the first of the top 10 rows holding a URN column (0 if none)."""
    for i in range(10):
        try:
            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # Ran past the end of a very short file: stop scanning.
            break
        if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
            return i
    return 0


def _read_ofsted_csv(path) -> pd.DataFrame:
    """Read an Ofsted MI CSV, or the first CSV inside a ZIP, into a DataFrame."""
    if not str(path).lower().endswith(".zip"):
        hdr = _find_header_row(path)
        return pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)

    import os
    import tempfile
    import zipfile

    with zipfile.ZipFile(path) as archive:
        csv_names = [n for n in archive.namelist() if n.lower().endswith(".csv")]
        if not csv_names:
            raise ValueError("No CSV found inside Ofsted ZIP")
        payload = archive.read(csv_names[0])

    # Extract to a temp file so the header-row scan can re-read it by path.
    tmp = tempfile.NamedTemporaryFile(suffix=".csv", delete=False)
    try:
        with tmp:
            tmp.write(payload)
        hdr = _find_header_row(tmp.name)
        return pd.read_csv(tmp.name, encoding="latin-1", low_memory=False, header=hdr)
    finally:
        # Runs even if the write fails, so the temp file never leaks.
        os.unlink(tmp.name)


def _rename_first_match(df: pd.DataFrame, priority: dict) -> pd.DataFrame:
    """Rename, for each target field, the first listed source column present in df."""
    available = set(df.columns)
    mapping = {}
    for target, sources in priority.items():
        for src in sources:
            if src in available:
                mapping[src] = target
                break
    return df.rename(columns=mapping)


def _clean_text(val) -> str | None:
    """Return the stripped string form of val, or None when missing or blank."""
    if val is None or pd.isna(val):
        return None
    return str(val).strip() or None


def _build_record(row, framework: str) -> dict:
    """Build the bind-parameter dict for one upsert from a normalised row."""
    record = {
        "urn": int(row["urn"]),
        "framework": framework,
        "inspection_date": _parse_date(row.get("inspection_date")),
        "publication_date": _parse_date(row.get("publication_date")),
        # A missing value must become NULL, not the literal string "nan".
        "inspection_type": _clean_text(row.get("inspection_type")),
        "previous_overall": None,
        "rc_safeguarding_met": _parse_safeguarding(row.get("rc_safeguarding")),
    }
    record.update({f: _parse_grade(row.get(f)) for f in _OEIF_GRADE_FIELDS})
    record.update({f: _parse_rc_grade(row.get(f)) for f in _RC_GRADE_FIELDS})
    return record


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load an Ofsted MI file and upsert one row per URN into ofsted_inspections.

    Args:
        path: CSV or ZIP file to load. When omitted, the most recently
            modified ``*.csv`` / ``*.zip`` in the Ofsted directory is used.
        data_dir: Optional data root; the Ofsted directory is
            ``data_dir / "supplementary" / "ofsted"``. Defaults to ``DEST_DIR``.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` counts all
        upserted rows (inserts and updates are not distinguished).

    Raises:
        FileNotFoundError: If no path is given and no file is found.
        ValueError: If a ZIP holds no CSV, or no URN column is present.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
        files = [*dest.glob("*.csv"), *dest.glob("*.zip")]
        if not files:
            raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
        # Names embed month words ("Feb" sorts before "Jan"), so alphabetical
        # order does not track recency; use modification time instead.
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))
    print(f" Ofsted: loading {path} ...")

    df = _read_ofsted_csv(path)

    # Detect which framework the CSV represents BEFORE any renaming
    framework = _detect_framework(df)
    print(f" Ofsted: detected framework '{framework}'")

    # Normalise OEIF column names first, then Report Card names (if present).
    df = _rename_first_match(df, COLUMN_PRIORITY)
    df = _rename_first_match(df, RC_COLUMN_PRIORITY)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Only keep rows with a valid URN
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Keep only the most recent inspection per URN. drop_duplicates keeps whole
    # rows; groupby().first() would take the first non-null value per column
    # and could blend fields from different inspections.
    if "inspection_date" in df.columns:
        df["_date_parsed"] = df["inspection_date"].apply(_parse_date)
        df = df.sort_values(
            "_date_parsed", ascending=False, na_position="last", kind="mergesort"
        ).drop_duplicates(subset="urn", keep="first")

    from sqlalchemy import text

    upsert = text(_UPSERT_SQL)
    inserted = updated = skipped = 0
    with get_session() as session:
        for _, row in df.iterrows():
            session.execute(upsert, _build_record(row, framework))
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()
                print(f" Processed {inserted} records...")
    print(f" Ofsted: upserted {inserted} records")
    return {"inserted": inserted, "updated": updated, "skipped": skipped}
# Command-line entry point: download and/or load the Ofsted MI file.
if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # Load exactly the file that was just downloaded; with --action load
        # nothing was downloaded, so load() picks a file from the directory.
        load(path=downloaded, data_dir=args.data_dir)