fix(ofsted): map OEIF column names from current CSV format

Ofsted renamed all columns in the OEIF framework:
- grades are now 'Latest OEIF overall effectiveness', etc.
- dates are now 'Inspection start date of latest OEIF graded inspection'

Replace the flat COLUMN_MAP with a per-field priority list (COLUMN_PRIORITY), so that both the current OEIF column names and the legacy column names work without duplicate-column conflicts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 22:02:02 +00:00
parent d00dc699cc
commit a478068d5a
1 changed files with 57 additions and 24 deletions
@@ -21,28 +21,56 @@ from db import get_session
 # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
 GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
-COLUMN_MAP = {
+# Column name → internal field, listed in priority order per field.
-    "URN": "urn",
+# First matching column wins; later entries are fallbacks for older file formats.
-    "Inspection date": "inspection_date",
+COLUMN_PRIORITY = {
-    "Publication date": "publication_date",
+    "urn": ["URN", "Urn", "urn"],
-    "Inspection type": "inspection_type",
+    "inspection_date": [
-    "Overall effectiveness": "overall_effectiveness",
+        "Inspection start date of latest OEIF graded inspection",
-    "Quality of education": "quality_of_education",
+        "Inspection start date",
-    "Behaviour and attitudes": "behaviour_attitudes",
+        "Inspection date",
-    "Personal development": "personal_development",
+        "InspectionDate",
-    "Leadership and management": "leadership_management",
+    ],
-    "Early years provision": "early_years_provision",
+    "publication_date": [
-    # Some CSVs use shortened names
+        "Publication date of latest OEIF graded inspection",
-    "Urn": "urn",
+        "Publication date",
-    "InspectionDate": "inspection_date",
+        "PublicationDate",
-    "PublicationDate": "publication_date",
+    ],
-    "InspectionType": "inspection_type",
+    "inspection_type": [
-    "OverallEffectiveness": "overall_effectiveness",
+        "Inspection type of latest OEIF graded inspection",
-    "QualityOfEducation": "quality_of_education",
+        "Inspection type",
-    "BehaviourAndAttitudes": "behaviour_attitudes",
+        "InspectionType",
-    "PersonalDevelopment": "personal_development",
+    ],
-    "LeadershipAndManagement": "leadership_management",
+    "overall_effectiveness": [
-    "EarlyYearsProvision": "early_years_provision",
+        "Latest OEIF overall effectiveness",
        "Overall effectiveness",
        "OverallEffectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education",
        "QualityOfEducation",
    ],
    "behaviour_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes",
        "BehaviourAndAttitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development",
        "PersonalDevelopment",
    ],
    "leadership_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Leadership and management",
        "LeadershipAndManagement",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision (where applicable)",
        "Early years provision",
        "EarlyYearsProvision",
    ],
 }
 GRADE_MAP = {
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
        hdr = _find_header_row(path)
        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
-    # Normalise column names
+    # Normalise column names: for each target field pick the first source column present
-    df.rename(columns=COLUMN_MAP, inplace=True)
+    available = set(df.columns)
    for target, sources in COLUMN_PRIORITY.items():
        for src in sources:
            if src in available:
                df.rename(columns={src: target}, inplace=True)
                break
    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")