def _detect_framework(df: pd.DataFrame) -> str:
    """Return 'ReportCard' if new-format columns are present, else 'OEIF'.

    Detection is a case-insensitive substring match against the column
    labels of ``df``; cell values are never read.

    Strategy:
      1. If any label contains an OEIF-specific phrase, return 'OEIF'.
         This check runs first because the old OEIF CSV also contains
         columns whose names include generic words such as 'inclusion' and
         'achievement', which used to cause a false 'ReportCard' result.
      2. Otherwise, if any label contains a multi-word Report Card phrase,
         return 'ReportCard'.
      3. Otherwise default to 'OEIF', so a failed detection falls back to
         the old format.

    Labels are coerced with ``str()`` before matching, so a frame whose
    columns are not strings (e.g. integers) is classified instead of
    raising ``AttributeError``.
    """
    # Phrases unique to the old OEIF CSV — if any is present, it's OEIF.
    oeif_indicators = (
        "overall effectiveness",
        "quality of education",
        "behaviour and attitudes",
    )
    # Phrases unique to the new Report Card CSV — multi-word on purpose, so
    # single generic words in OEIF headers cannot match.
    rc_indicators = (
        "curriculum and teaching",
        "leadership and governance",
        "attendance and behaviour",
    )

    # str() guards against non-string column labels.
    cols_lower = {str(col).lower() for col in df.columns}

    def _has_any(indicators: tuple[str, ...]) -> bool:
        """Return True if any indicator is a substring of any column label."""
        return any(
            indicator in col for indicator in indicators for col in cols_lower
        )

    # Positive evidence of the old format wins over any RC-looking column.
    if _has_any(oeif_indicators):
        return "OEIF"
    if _has_any(rc_indicators):
        return "ReportCard"
    return "OEIF"