fix(ofsted): map OEIF column names from current CSV format

Ofsted renamed all columns in the OEIF framework:
- grades are now 'Latest OEIF overall effectiveness' etc.
- dates are 'Inspection start date of latest OEIF graded inspection'

Replace the flat COLUMN_MAP with a priority list per field so that both current OEIF and legacy column names work without duplicate-column conflicts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 22:02:02 +00:00
parent d00dc699cc
commit a478068d5a
1 changed files with 57 additions and 24 deletions
@@ -21,28 +21,56 @@ from db import get_session
 # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
 GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"

-COLUMN_MAP = {
-    "URN": "urn",
-    "Inspection date": "inspection_date",
-    "Publication date": "publication_date",
-    "Inspection type": "inspection_type",
-    "Overall effectiveness": "overall_effectiveness",
-    "Quality of education": "quality_of_education",
-    "Behaviour and attitudes": "behaviour_attitudes",
-    "Personal development": "personal_development",
-    "Leadership and management": "leadership_management",
-    "Early years provision": "early_years_provision",
-    # Some CSVs use shortened names
-    "Urn": "urn",
-    "InspectionDate": "inspection_date",
-    "PublicationDate": "publication_date",
-    "InspectionType": "inspection_type",
-    "OverallEffectiveness": "overall_effectiveness",
-    "QualityOfEducation": "quality_of_education",
-    "BehaviourAndAttitudes": "behaviour_attitudes",
-    "PersonalDevelopment": "personal_development",
-    "LeadershipAndManagement": "leadership_management",
-    "EarlyYearsProvision": "early_years_provision",
+# Internal field → candidate source column names, listed in priority order per field.
+# The first candidate present in the CSV wins; later entries are fallbacks for older file formats.
+COLUMN_PRIORITY = {
+    "urn": ["URN", "Urn", "urn"],
+    "inspection_date": [
+        "Inspection start date of latest OEIF graded inspection",
+        "Inspection start date",
+        "Inspection date",
+        "InspectionDate",
+    ],
+    "publication_date": [
+        "Publication date of latest OEIF graded inspection",
+        "Publication date",
+        "PublicationDate",
+    ],
+    "inspection_type": [
+        "Inspection type of latest OEIF graded inspection",
+        "Inspection type",
+        "InspectionType",
+    ],
+    "overall_effectiveness": [
+        "Latest OEIF overall effectiveness",
+        "Overall effectiveness",
+        "OverallEffectiveness",
+    ],
+    "quality_of_education": [
+        "Latest OEIF quality of education",
+        "Quality of education",
+        "QualityOfEducation",
+    ],
+    "behaviour_attitudes": [
+        "Latest OEIF behaviour and attitudes",
+        "Behaviour and attitudes",
+        "BehaviourAndAttitudes",
+    ],
+    "personal_development": [
+        "Latest OEIF personal development",
+        "Personal development",
+        "PersonalDevelopment",
+    ],
+    "leadership_management": [
+        "Latest OEIF effectiveness of leadership and management",
+        "Leadership and management",
+        "LeadershipAndManagement",
+    ],
+    "early_years_provision": [
+        "Latest OEIF early years provision (where applicable)",
+        "Early years provision",
+        "EarlyYearsProvision",
+    ],
 }

 GRADE_MAP = {
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
        hdr = _find_header_row(path)
        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)

-    # Normalise column names
-    df.rename(columns=COLUMN_MAP, inplace=True)
+    # Normalise column names: for each target field pick the first source column present
+    available = set(df.columns)
+    for target, sources in COLUMN_PRIORITY.items():
+        for src in sources:
+            if src in available:
+                df.rename(columns={src: target}, inplace=True)
+                break

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")