From a478068d5a1770887ae727a48806938af4a48586 Mon Sep 17 00:00:00 2001
From: Tudor <tudor@sitaru.org>
Date: Tue, 24 Mar 2026 22:02:02 +0000
Subject: [PATCH] fix(ofsted): map OEIF column names from current CSV format

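Ofsted renamed the grade, date and inspection-type columns in the
current CSV for the OEIF framework:
- grades are now 'Latest OEIF overall effectiveness' etc.
- dates and inspection type now end in 'of latest OEIF graded
  inspection', e.g. 'Inspection start date of latest OEIF graded
  inspection'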
Replace flat COLUMN_MAP with a priority list per field so both current
OEIF and legacy column names work without duplicate-column conflicts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 integrator/scripts/sources/ofsted.py | 81 +++++++++++++++++++---------
 1 file changed, 57 insertions(+), 24 deletions(-)

diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py
index e52ec6b..10b0531 100644
--- a/integrator/scripts/sources/ofsted.py
+++ b/integrator/scripts/sources/ofsted.py
@@ -21,28 +21,56 @@ from db import get_session
 # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
 GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
 
-COLUMN_MAP = {
-    "URN": "urn",
-    "Inspection date": "inspection_date",
-    "Publication date": "publication_date",
-    "Inspection type": "inspection_type",
-    "Overall effectiveness": "overall_effectiveness",
-    "Quality of education": "quality_of_education",
-    "Behaviour and attitudes": "behaviour_attitudes",
-    "Personal development": "personal_development",
-    "Leadership and management": "leadership_management",
-    "Early years provision": "early_years_provision",
-    # Some CSVs use shortened names
-    "Urn": "urn",
-    "InspectionDate": "inspection_date",
-    "PublicationDate": "publication_date",
-    "InspectionType": "inspection_type",
-    "OverallEffectiveness": "overall_effectiveness",
-    "QualityOfEducation": "quality_of_education",
-    "BehaviourAndAttitudes": "behaviour_attitudes",
-    "PersonalDevelopment": "personal_development",
-    "LeadershipAndManagement": "leadership_management",
-    "EarlyYearsProvision": "early_years_provision",
+# Internal field → candidate source column names, in priority order.
+# First matching column wins; later entries are fallbacks for older file formats.
+COLUMN_PRIORITY = {
+    "urn": ["URN", "Urn", "urn"],
+    "inspection_date": [
+        "Inspection start date of latest OEIF graded inspection",
+        "Inspection start date",
+        "Inspection date",
+        "InspectionDate",
+    ],
+    "publication_date": [
+        "Publication date of latest OEIF graded inspection",
+        "Publication date",
+        "PublicationDate",
+    ],
+    "inspection_type": [
+        "Inspection type of latest OEIF graded inspection",
+        "Inspection type",
+        "InspectionType",
+    ],
+    "overall_effectiveness": [
+        "Latest OEIF overall effectiveness",
+        "Overall effectiveness",
+        "OverallEffectiveness",
+    ],
+    "quality_of_education": [
+        "Latest OEIF quality of education",
+        "Quality of education",
+        "QualityOfEducation",
+    ],
+    "behaviour_attitudes": [
+        "Latest OEIF behaviour and attitudes",
+        "Behaviour and attitudes",
+        "BehaviourAndAttitudes",
+    ],
+    "personal_development": [
+        "Latest OEIF personal development",
+        "Personal development",
+        "PersonalDevelopment",
+    ],
+    "leadership_management": [
+        "Latest OEIF effectiveness of leadership and management",
+        "Leadership and management",
+        "LeadershipAndManagement",
+    ],
+    "early_years_provision": [
+        "Latest OEIF early years provision (where applicable)",
+        "Early years provision",
+        "EarlyYearsProvision",
+    ],
 }
 
 GRADE_MAP = {
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
         hdr = _find_header_row(path)
         df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
 
-    # Normalise column names
-    df.rename(columns=COLUMN_MAP, inplace=True)
+    # Normalise column names: for each target field pick the first source column present
+    available = set(df.columns)
+    for target, sources in COLUMN_PRIORITY.items():
+        for src in sources:
+            if src in available:
+                df.rename(columns={src: target}, inplace=True)
+                break
 
     if "urn" not in df.columns:
         raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")