From a478068d5a1770887ae727a48806938af4a48586 Mon Sep 17 00:00:00 2001 From: Tudor Date: Tue, 24 Mar 2026 22:02:02 +0000 Subject: [PATCH] fix(ofsted): map OEIF column names from current CSV format Ofsted renamed all columns in the OEIF framework: - grades are now 'Latest OEIF overall effectiveness' etc. - dates are 'Inspection start date of latest OEIF graded inspection' Replace flat COLUMN_MAP with a priority list per field so both current OEIF and legacy column names work without duplicate-column conflicts. Co-Authored-By: Claude Sonnet 4.6 --- integrator/scripts/sources/ofsted.py | 81 +++++++++++++++++++--------- 1 file changed, 57 insertions(+), 24 deletions(-) diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py index e52ec6b..10b0531 100644 --- a/integrator/scripts/sources/ofsted.py +++ b/integrator/scripts/sources/ofsted.py @@ -21,28 +21,56 @@ from db import get_session # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page. 
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes" -COLUMN_MAP = { - "URN": "urn", - "Inspection date": "inspection_date", - "Publication date": "publication_date", - "Inspection type": "inspection_type", - "Overall effectiveness": "overall_effectiveness", - "Quality of education": "quality_of_education", - "Behaviour and attitudes": "behaviour_attitudes", - "Personal development": "personal_development", - "Leadership and management": "leadership_management", - "Early years provision": "early_years_provision", - # Some CSVs use shortened names - "Urn": "urn", - "InspectionDate": "inspection_date", - "PublicationDate": "publication_date", - "InspectionType": "inspection_type", - "OverallEffectiveness": "overall_effectiveness", - "QualityOfEducation": "quality_of_education", - "BehaviourAndAttitudes": "behaviour_attitudes", - "PersonalDevelopment": "personal_development", - "LeadershipAndManagement": "leadership_management", - "EarlyYearsProvision": "early_years_provision", +# Internal field → candidate source column names, listed in priority order. +# First matching column wins; later entries are fallbacks for older file formats.
+COLUMN_PRIORITY = { + "urn": ["URN", "Urn", "urn"], + "inspection_date": [ + "Inspection start date of latest OEIF graded inspection", + "Inspection start date", + "Inspection date", + "InspectionDate", + ], + "publication_date": [ + "Publication date of latest OEIF graded inspection", + "Publication date", + "PublicationDate", + ], + "inspection_type": [ + "Inspection type of latest OEIF graded inspection", + "Inspection type", + "InspectionType", + ], + "overall_effectiveness": [ + "Latest OEIF overall effectiveness", + "Overall effectiveness", + "OverallEffectiveness", + ], + "quality_of_education": [ + "Latest OEIF quality of education", + "Quality of education", + "QualityOfEducation", + ], + "behaviour_attitudes": [ + "Latest OEIF behaviour and attitudes", + "Behaviour and attitudes", + "BehaviourAndAttitudes", + ], + "personal_development": [ + "Latest OEIF personal development", + "Personal development", + "PersonalDevelopment", + ], + "leadership_management": [ + "Latest OEIF effectiveness of leadership and management", + "Leadership and management", + "LeadershipAndManagement", + ], + "early_years_provision": [ + "Latest OEIF early years provision (where applicable)", + "Early years provision", + "EarlyYearsProvision", + ], } GRADE_MAP = { @@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict: hdr = _find_header_row(path) df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr) - # Normalise column names - df.rename(columns=COLUMN_MAP, inplace=True) + # Normalise column names: for each target field pick the first source column present + available = set(df.columns) + for target, sources in COLUMN_PRIORITY.items(): + for src in sources: + if src in available: + df.rename(columns={src: target}, inplace=True) + break if "urn" not in df.columns: raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")