"""
Ofsted Monthly Management Information CSV downloader and loader.

Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
Update: Monthly (released ~2 weeks into each month)
"""
import argparse
import io
import re
import sys
import zipfile
from datetime import date, datetime
from pathlib import Path
from urllib.parse import urlsplit

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

# GOV.UK landing page for the Ofsted MI release. The actual file URL changes
# with every release, so we try to discover it by scraping this page.
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"

# Fallback download URL, used only when scraping GOV_UK_PAGE fails.
# Set this to the current file URL if discovery stops working.
MANUAL_URL: str | None = None

# Matches links to CSV/ZIP assets hosted on the GOV.UK asset domain.
_ASSET_LINK_RE = re.compile(
    r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"',
    re.IGNORECASE,
)

COLUMN_MAP = {
    "URN": "urn",
    "Inspection date": "inspection_date",
    "Publication date": "publication_date",
    "Inspection type": "inspection_type",
    "Overall effectiveness": "overall_effectiveness",
    "Quality of education": "quality_of_education",
    "Behaviour and attitudes": "behaviour_attitudes",
    "Personal development": "personal_development",
    "Leadership and management": "leadership_management",
    "Early years provision": "early_years_provision",
    # Some CSVs use shortened names
    "Urn": "urn",
    "InspectionDate": "inspection_date",
    "PublicationDate": "publication_date",
    "InspectionType": "inspection_type",
    "OverallEffectiveness": "overall_effectiveness",
    "QualityOfEducation": "quality_of_education",
    "BehaviourAndAttitudes": "behaviour_attitudes",
    "PersonalDevelopment": "personal_development",
    "LeadershipAndManagement": "leadership_management",
    "EarlyYearsProvision": "early_years_provision",
}

GRADE_MAP = {
    "Outstanding": 1, "1": 1, 1: 1,
    "Good": 2, "2": 2, 2: 2,
    "Requires improvement": 3, "3": 3, 3: 3,
    "Requires Improvement": 3,
    "Inadequate": 4, "4": 4, 4: 4,
}

# Case-insensitive view of GRADE_MAP, keyed by the casefolded string form.
_GRADE_LOOKUP = {str(key).casefold(): grade for key, grade in GRADE_MAP.items()}

# Date layouts tried, in order, when parsing inspection/publication dates.
_DATE_FORMATS = ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y")

# Normalised column names that hold Ofsted grades.
_GRADE_FIELDS = (
    "overall_effectiveness",
    "quality_of_education",
    "behaviour_attitudes",
    "personal_development",
    "leadership_management",
    "early_years_provision",
)

# Upsert keyed on URN. `previous_overall` only rolls over when the stored
# inspection date differs from the incoming one; otherwise re-loading the same
# file would overwrite the real previous grade with the current grade.
# If either date is NULL the comparison is not true, so the ELSE branch runs.
_UPSERT_SQL = """
    INSERT INTO ofsted_inspections
        (urn, inspection_date, publication_date, inspection_type,
         overall_effectiveness, quality_of_education, behaviour_attitudes,
         personal_development, leadership_management, early_years_provision,
         previous_overall)
    VALUES
        (:urn, :inspection_date, :publication_date, :inspection_type,
         :overall_effectiveness, :quality_of_education, :behaviour_attitudes,
         :personal_development, :leadership_management, :early_years_provision,
         :previous_overall)
    ON CONFLICT (urn) DO UPDATE SET
        previous_overall = CASE
            WHEN ofsted_inspections.inspection_date = EXCLUDED.inspection_date
                THEN ofsted_inspections.previous_overall
            ELSE ofsted_inspections.overall_effectiveness
        END,
        inspection_date = EXCLUDED.inspection_date,
        publication_date = EXCLUDED.publication_date,
        inspection_type = EXCLUDED.inspection_type,
        overall_effectiveness = EXCLUDED.overall_effectiveness,
        quality_of_education = EXCLUDED.quality_of_education,
        behaviour_attitudes = EXCLUDED.behaviour_attitudes,
        personal_development = EXCLUDED.personal_development,
        leadership_management = EXCLUDED.leadership_management,
        early_years_provision = EXCLUDED.early_years_provision
"""

DEST_DIR = SUPPLEMENTARY_DIR / "ofsted"


def _discover_csv_url() -> str | None:
    """Scrape the GOV.UK page for a CSV/ZIP asset link.

    Returns the first matching link on the page (presumably the latest
    release — verify against the page layout), or ``None`` when the page
    cannot be fetched or contains no matching link. Network failures are
    reported as a warning rather than raised, because discovery is
    best-effort.
    """
    try:
        resp = requests.get(GOV_UK_PAGE, timeout=30)
        resp.raise_for_status()
    except requests.RequestException as e:
        print(f" Warning: could not scrape GOV.UK page: {e}")
        return None
    urls = _ASSET_LINK_RE.findall(resp.text)
    return urls[0] if urls else None


def download(data_dir: Path | None = None) -> Path:
    """Download the Ofsted MI file unless it is already present.

    Args:
        data_dir: Optional data root; the file goes to
            ``data_dir / "supplementary" / "ofsted"``. Defaults to DEST_DIR.

    Returns:
        Path of the downloaded (or already existing) file.

    Raises:
        RuntimeError: If no URL could be discovered and MANUAL_URL is unset.
        requests.RequestException: If the download itself fails.
    """
    dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = _discover_csv_url() or MANUAL_URL
    if not url:
        raise RuntimeError(
            "Could not discover Ofsted MI download URL. "
            "Visit https://www.gov.uk/government/statistical-data-sets/"
            "monthly-management-information-ofsteds-school-inspections-outcomes "
            "to get the latest URL and update MANUAL_URL in ofsted.py"
        )

    # Use only the URL path so a query string never ends up in the filename.
    filename = Path(urlsplit(url).path).name
    dest_file = dest / filename

    if dest_file.exists():
        print(f" Ofsted: {filename} already exists, skipping download.")
        return dest_file

    print(f" Ofsted: downloading {url} ...")
    # Stream into a temporary file and rename on success: an interrupted
    # download must not leave a truncated file that later runs would skip.
    tmp_file = dest_file.with_name(dest_file.name + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp_file, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        tmp_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the partial file on error.
        tmp_file.unlink(missing_ok=True)

    print(f" Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file


def _parse_grade(val) -> int | None:
    """Convert a grade cell to its numeric grade (1-4), or ``None``.

    Accepts the textual labels in GRADE_MAP (case-insensitively) as well as
    numeric values in string, int or float form.
    """
    if pd.isna(val):
        return None
    key = str(val).strip()
    grade = _GRADE_LOOKUP.get(key.casefold())
    if grade is not None:
        return grade
    # pandas reads a numeric column containing blanks as float, so a grade
    # of 2 arrives as 2.0 and its string form "2.0" misses the lookup above.
    try:
        number = float(key)
    except ValueError:
        return None
    if number.is_integer():
        return GRADE_MAP.get(int(number))
    return None


def _parse_date(val) -> date | None:
    """Convert a date cell to ``datetime.date``, or ``None`` if unparseable."""
    if pd.isna(val):
        return None
    # Already-parsed values (datetime, pandas Timestamp, date) pass through.
    if isinstance(val, datetime):
        return val.date()
    if isinstance(val, date):
        return val
    raw = str(val).strip()
    for fmt in _DATE_FORMATS:
        try:
            return datetime.strptime(raw, fmt).date()
        except ValueError:
            continue
    return None


def _clean_text(val) -> str | None:
    """Return the stripped string form of a cell, or ``None`` if blank/missing."""
    if val is None or pd.isna(val):
        return None
    return str(val).strip() or None


def _find_latest_file(dest: Path) -> Path:
    """Return the most recently modified CSV/ZIP in *dest*.

    Modification time is used because release filenames embed month names and
    therefore do not sort chronologically.
    """
    files = [*dest.glob("*.csv"), *dest.glob("*.zip")]
    if not files:
        raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
    return max(files, key=lambda p: (p.stat().st_mtime, p.name))


def _read_csv(path: Path) -> pd.DataFrame:
    """Read the MI data from a CSV file, or from the first CSV inside a ZIP."""
    if path.suffix.lower() == ".zip":
        with zipfile.ZipFile(path) as z:
            csv_names = [n for n in z.namelist() if n.lower().endswith(".csv")]
            if not csv_names:
                raise ValueError("No CSV found inside Ofsted ZIP")
            with z.open(csv_names[0]) as f:
                return pd.read_csv(
                    io.TextIOWrapper(f, encoding="latin-1"), low_memory=False
                )
    return pd.read_csv(path, encoding="latin-1", low_memory=False)


def _latest_per_urn(df: pd.DataFrame) -> pd.DataFrame:
    """Keep only the row with the most recent inspection date for each URN.

    Returns *df* unchanged when it has no ``inspection_date`` column.
    """
    if "inspection_date" not in df.columns:
        return df
    parsed = pd.to_datetime(df["inspection_date"].map(_parse_date), errors="coerce")
    # drop_duplicates keeps whole rows. GroupBy.first() would instead take the
    # first non-null value per column, mixing fields of different inspections.
    return (
        df.assign(_date_parsed=parsed)
        .sort_values("_date_parsed", ascending=False, na_position="last", kind="stable")
        .drop_duplicates(subset="urn", keep="first")
        .drop(columns="_date_parsed")
        .reset_index(drop=True)
    )


def _row_to_record(row: pd.Series) -> dict:
    """Build the bind-parameter dict for the upsert from one normalised row."""
    record = {
        "urn": int(row["urn"]),
        "inspection_date": _parse_date(row.get("inspection_date")),
        "publication_date": _parse_date(row.get("publication_date")),
        # A missing cell is NaN; it must become NULL, not the string "nan".
        "inspection_type": _clean_text(row.get("inspection_type")),
    }
    for field in _GRADE_FIELDS:
        record[field] = _parse_grade(row.get(field))
    record["previous_overall"] = None
    return record


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load an Ofsted MI file into the ``ofsted_inspections`` table.

    Args:
        path: CSV or ZIP file to load. When omitted, the most recently
            modified CSV/ZIP in the Ofsted data directory is used.
        data_dir: Optional data root used to locate the Ofsted directory
            when *path* is omitted. Defaults to DEST_DIR.

    Returns:
        A dict with keys ``inserted``, ``updated`` and ``skipped``. Every
        upserted row is counted under ``inserted``; the other two are always
        0 and are kept for interface compatibility.

    Raises:
        FileNotFoundError: If no file is given and none can be found.
        ValueError: If a ZIP contains no CSV or the data has no URN column.
    """
    # Imported here (as in the original) so the module can be imported
    # without SQLAlchemy; done once rather than once per row.
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
        path = _find_latest_file(dest)
    path = Path(path)

    print(f" Ofsted: loading {path} ...")
    df = _read_csv(path)

    # Normalise column names (tolerating stray whitespace in headers)
    df = df.rename(columns=lambda col: COLUMN_MAP.get(str(col).strip(), col))

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Only keep rows with a valid URN
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    df = _latest_per_urn(df)

    upsert = text(_UPSERT_SQL)
    inserted = updated = skipped = 0

    with get_session() as session:
        for _, row in df.iterrows():
            session.execute(upsert, _row_to_record(row))
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()
                print(f" Processed {inserted} records...")

    print(f" Ofsted: upserted {inserted} records")
    return {"inserted": inserted, "updated": updated, "skipped": skipped}


def main() -> None:
    """Command-line entry point: download and/or load the Ofsted MI file."""
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    path = None
    if args.action in ("download", "all"):
        path = download(args.data_dir)
    if args.action in ("load", "all"):
        # Load exactly the file just downloaded, if any, instead of
        # re-guessing which file in the directory is the latest.
        load(path=path, data_dir=args.data_dir)


if __name__ == "__main__":
    main()