diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py index 561b9d4..eee65ef 100644 --- a/integrator/scripts/sources/ofsted.py +++ b/integrator/scripts/sources/ofsted.py @@ -228,15 +228,34 @@ def _parse_date(val) -> date | None: def _detect_framework(df: pd.DataFrame) -> str: - """Return 'ReportCard' if new-format columns are present, else 'OEIF'.""" - rc_indicators = [ - "inclusion", "curriculum and teaching", "achievement", - "attendance and behaviour", "safeguarding standards", "safeguarding", - ] + """Return 'ReportCard' if new-format columns are present, else 'OEIF'. + + Strategy: check for OEIF-specific phrases first (positive evidence of the + old format). Only if none are found, look for RC-specific phrases. + Defaults to 'OEIF' so misdetection is always a safe fallback. + """ cols_lower = {c.lower() for c in df.columns} + + # Phrases unique to the old OEIF CSV — if any present, it's OEIF. + oeif_indicators = [ + "overall effectiveness", + "quality of education", + "behaviour and attitudes", + ] + for indicator in oeif_indicators: + if any(indicator in c for c in cols_lower): + return "OEIF" + + # Phrases unique to the new Report Card CSV — multi-word, RC-specific. + rc_indicators = [ + "curriculum and teaching", + "leadership and governance", + "attendance and behaviour", + ] for indicator in rc_indicators: if any(indicator in c for c in cols_lower): return "ReportCard" + return "OEIF"