""" Ofsted Monthly Management Information CSV downloader and loader. Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes Update: Monthly (released ~2 weeks into each month) """ import argparse import re import sys from datetime import date, datetime from pathlib import Path import pandas as pd import requests sys.path.insert(0, str(Path(__file__).parent.parent)) from config import SUPPLEMENTARY_DIR from db import get_session # Current Ofsted MI download URL — update this when Ofsted releases a new file. # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page. GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes" # Column name → internal field, listed in priority order per field. # First matching column wins; later entries are fallbacks for older file formats. COLUMN_PRIORITY = { "urn": ["URN", "Urn", "urn"], "inspection_date": [ "Inspection start date of latest OEIF graded inspection", "Inspection start date", "Inspection date", "InspectionDate", ], "publication_date": [ "Publication date of latest OEIF graded inspection", "Publication date", "PublicationDate", ], "inspection_type": [ "Inspection type of latest OEIF graded inspection", "Inspection type", "InspectionType", ], "overall_effectiveness": [ "Latest OEIF overall effectiveness", "Overall effectiveness", "OverallEffectiveness", ], "quality_of_education": [ "Latest OEIF quality of education", "Quality of education", "QualityOfEducation", ], "behaviour_attitudes": [ "Latest OEIF behaviour and attitudes", "Behaviour and attitudes", "BehaviourAndAttitudes", ], "personal_development": [ "Latest OEIF personal development", "Personal development", "PersonalDevelopment", ], "leadership_management": [ "Latest OEIF effectiveness of leadership and management", "Leadership and management", "LeadershipAndManagement", ], "early_years_provision": [ "Latest OEIF early years provision (where applicable)", "Early years provision", "EarlyYearsProvision", ], } GRADE_MAP = { "Outstanding": 1, "1": 1, 1: 1, "Good": 2, "2": 2, 2: 2, "Requires improvement": 3, "3": 3, 3: 3, "Requires Improvement": 3, "Inadequate": 4, "4": 4, 4: 4, } # Report Card grade text → integer (1=Exceptional … 5=Urgent improvement) RC_GRADE_MAP = { "exceptional": 1, "strong standard": 2, "strong": 2, "expected standard": 3, "expected": 3, "needs attention": 4, "urgent improvement": 5, } # Column name priority for Report Card fields (best-guess names; Ofsted may vary) RC_COLUMN_PRIORITY = { "rc_safeguarding": [ "Safeguarding", "safeguarding", "Safeguarding standards", ], "rc_inclusion": [ "Inclusion", "inclusion", ], "rc_curriculum_teaching": [ "Curriculum and teaching", "curriculum_and_teaching", "Curriculum & teaching", ], "rc_achievement": [ "Achievement", "achievement", ], "rc_attendance_behaviour": [ "Attendance and behaviour", "attendance_and_behaviour", "Attendance & behaviour", ], "rc_personal_development": [ "Personal development and well-being", "Personal development and wellbeing", "personal_development_and_wellbeing", "Personal development & well-being", ], "rc_leadership_governance": [ "Leadership and governance", "leadership_and_governance", "Leadership & governance", ], "rc_early_years": [ "Early years", "early_years", "Early years provision", ], "rc_sixth_form": [ "Sixth form", "sixth_form", "Sixth form in schools", ], } DEST_DIR = SUPPLEMENTARY_DIR / "ofsted" def _discover_csv_url() -> str | None: """Scrape the GOV.UK page for the most recent CSV/ZIP link.""" try: resp = requests.get(GOV_UK_PAGE, timeout=30) resp.raise_for_status() # Look for links to assets.publishing.service.gov.uk CSV or ZIP files pattern = r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"' urls = re.findall(pattern, resp.text, re.IGNORECASE) if urls: return urls[0] except Exception as e: print(f" Warning: could not scrape GOV.UK page: {e}") return None def download(data_dir: Path | None = None) -> Path: dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR dest.mkdir(parents=True, exist_ok=True) url = _discover_csv_url() if not url: raise RuntimeError( "Could not discover Ofsted MI download URL. " "Visit https://www.gov.uk/government/statistical-data-sets/" "monthly-management-information-ofsteds-school-inspections-outcomes " "to get the latest URL and update MANUAL_URL in ofsted.py" ) filename = url.split("/")[-1] dest_file = dest / filename if dest_file.exists(): print(f" Ofsted: {filename} already exists, skipping download.") return dest_file print(f" Ofsted: downloading {url} ...") resp = requests.get(url, timeout=120, stream=True) resp.raise_for_status() with open(dest_file, "wb") as f: for chunk in resp.iter_content(chunk_size=65536): f.write(chunk) print(f" Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)") return dest_file def _parse_grade(val) -> int | None: if pd.isna(val): return None key = str(val).strip() return GRADE_MAP.get(key) def _parse_rc_grade(val) -> int | None: """Parse a Report Card grade text to integer 1–5.""" if pd.isna(val): return None key = str(val).strip().lower() return RC_GRADE_MAP.get(key) def _parse_safeguarding(val) -> bool | None: """Parse safeguarding 'Met'/'Not met' to boolean.""" if pd.isna(val): return None s = str(val).strip().lower() if s == "met": return True if s in ("not met", "not_met"): return False return None def _parse_date(val) -> date | None: if pd.isna(val): return None for fmt in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y"): try: return datetime.strptime(str(val).strip(), fmt).date() except ValueError: pass return None def _framework_for_row(row) -> str | None: """Determine inspection framework for a single school row. Check RC columns first — if any have a value, it's a Report Card inspection. Fall back to OEIF columns. If neither has data, the school has no graded inspection on record (return None). """ rc_check_cols = [ "rc_inclusion", "rc_curriculum_teaching", "rc_achievement", "rc_attendance_behaviour", "rc_personal_development", "rc_leadership_governance", "rc_safeguarding", ] for col in rc_check_cols: val = row.get(col) if val is not None and not (isinstance(val, float) and pd.isna(val)): return "ReportCard" oeif_check_cols = ["overall_effectiveness", "quality_of_education"] for col in oeif_check_cols: val = row.get(col) if val is not None and not (isinstance(val, float) and pd.isna(val)): return "OEIF" return None def load(path: Path | None = None, data_dir: Path | None = None) -> dict: if path is None: dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR files = sorted(dest.glob("*.csv")) + sorted(dest.glob("*.zip")) if not files: raise FileNotFoundError(f"No Ofsted MI file found in {dest}") path = files[-1] print(f" Ofsted: loading {path} ...") def _find_header_row(filepath, encoding="latin-1"): """Scan up to 10 rows to find the one containing a URN column.""" for i in range(10): peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0) if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns): return i return 0 if str(path).endswith(".zip"): import zipfile, io with zipfile.ZipFile(path) as z: csv_names = [n for n in z.namelist() if n.endswith(".csv")] if not csv_names: raise ValueError("No CSV found inside Ofsted ZIP") # Extract to a temp file so we can scan for the header row import tempfile, os with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp: tmp.write(z.read(csv_names[0])) tmp_path = tmp.name try: hdr = _find_header_row(tmp_path) df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr) finally: os.unlink(tmp_path) else: hdr = _find_header_row(path) df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr) # Normalise OEIF column names: for each target field pick the first source column present available = set(df.columns) for target, sources in COLUMN_PRIORITY.items(): for src in sources: if src in available: df.rename(columns={src: target}, inplace=True) break # Normalise Report Card column names (if present) available = set(df.columns) for target, sources in RC_COLUMN_PRIORITY.items(): for src in sources: if src in available: df.rename(columns={src: target}, inplace=True) break if "urn" not in df.columns: raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}") # Only keep rows with a valid URN df["urn"] = pd.to_numeric(df["urn"], errors="coerce") df = df.dropna(subset=["urn"]) df["urn"] = df["urn"].astype(int) inserted = updated = skipped = 0 with get_session() as session: # Keep only the most recent inspection per URN if "inspection_date" in df.columns: df["_date_parsed"] = df["inspection_date"].apply(_parse_date) df = df.sort_values("_date_parsed", ascending=False).groupby("urn").first().reset_index() from sqlalchemy import text for _, row in df.iterrows(): urn = int(row["urn"]) record = { "urn": urn, "framework": _framework_for_row(row), "inspection_date": _parse_date(row.get("inspection_date")), "publication_date": _parse_date(row.get("publication_date")), "inspection_type": str(row.get("inspection_type", "")).strip() or None, # OEIF fields "overall_effectiveness": _parse_grade(row.get("overall_effectiveness")), "quality_of_education": _parse_grade(row.get("quality_of_education")), "behaviour_attitudes": _parse_grade(row.get("behaviour_attitudes")), "personal_development": _parse_grade(row.get("personal_development")), "leadership_management": _parse_grade(row.get("leadership_management")), "early_years_provision": _parse_grade(row.get("early_years_provision")), "previous_overall": None, # Report Card fields "rc_safeguarding_met": _parse_safeguarding(row.get("rc_safeguarding")), "rc_inclusion": _parse_rc_grade(row.get("rc_inclusion")), "rc_curriculum_teaching": _parse_rc_grade(row.get("rc_curriculum_teaching")), "rc_achievement": _parse_rc_grade(row.get("rc_achievement")), "rc_attendance_behaviour": _parse_rc_grade(row.get("rc_attendance_behaviour")), "rc_personal_development": _parse_rc_grade(row.get("rc_personal_development")), "rc_leadership_governance": _parse_rc_grade(row.get("rc_leadership_governance")), "rc_early_years": _parse_rc_grade(row.get("rc_early_years")), "rc_sixth_form": _parse_rc_grade(row.get("rc_sixth_form")), } session.execute( text(""" INSERT INTO ofsted_inspections (urn, framework, inspection_date, publication_date, inspection_type, overall_effectiveness, quality_of_education, behaviour_attitudes, personal_development, leadership_management, early_years_provision, previous_overall, rc_safeguarding_met, rc_inclusion, rc_curriculum_teaching, rc_achievement, rc_attendance_behaviour, rc_personal_development, rc_leadership_governance, rc_early_years, rc_sixth_form) VALUES (:urn, :framework, :inspection_date, :publication_date, :inspection_type, :overall_effectiveness, :quality_of_education, :behaviour_attitudes, :personal_development, :leadership_management, :early_years_provision, :previous_overall, :rc_safeguarding_met, :rc_inclusion, :rc_curriculum_teaching, :rc_achievement, :rc_attendance_behaviour, :rc_personal_development, :rc_leadership_governance, :rc_early_years, :rc_sixth_form) ON CONFLICT (urn) DO UPDATE SET previous_overall = ofsted_inspections.overall_effectiveness, framework = EXCLUDED.framework, inspection_date = EXCLUDED.inspection_date, publication_date = EXCLUDED.publication_date, inspection_type = EXCLUDED.inspection_type, overall_effectiveness = EXCLUDED.overall_effectiveness, quality_of_education = EXCLUDED.quality_of_education, behaviour_attitudes = EXCLUDED.behaviour_attitudes, personal_development = EXCLUDED.personal_development, leadership_management = EXCLUDED.leadership_management, early_years_provision = EXCLUDED.early_years_provision, rc_safeguarding_met = EXCLUDED.rc_safeguarding_met, rc_inclusion = EXCLUDED.rc_inclusion, rc_curriculum_teaching = EXCLUDED.rc_curriculum_teaching, rc_achievement = EXCLUDED.rc_achievement, rc_attendance_behaviour = EXCLUDED.rc_attendance_behaviour, rc_personal_development = EXCLUDED.rc_personal_development, rc_leadership_governance = EXCLUDED.rc_leadership_governance, rc_early_years = EXCLUDED.rc_early_years, rc_sixth_form = EXCLUDED.rc_sixth_form """), record, ) inserted += 1 if inserted % 5000 == 0: session.flush() print(f" Processed {inserted} records...") print(f" Ofsted: upserted {inserted} records") return {"inserted": inserted, "updated": updated, "skipped": skipped} if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--action", choices=["download", "load", "all"], default="all") parser.add_argument("--data-dir", type=Path, default=None) args = parser.parse_args() if args.action in ("download", "all"): path = download(args.data_dir) if args.action in ("load", "all"): load(data_dir=args.data_dir)