fix(ofsted): per-row framework detection instead of per-file

The MI CSV contains both OEIF and RC column sets simultaneously — OEIF columns are populated for older inspections, RC columns for post-Nov-2025 inspections. File-level detection wrongly classified all schools in the file based on column presence alone.

Replace _detect_framework(df) with _framework_for_row(row), which inspects each row's values (RC columns are checked first):
- ReportCard: any rc_* column has a value
- OEIF: no rc_* column has a value, but overall_effectiveness or quality_of_education does
- None: neither has data (no graded inspection on record)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-25 15:08:42 +00:00
parent 5720e18358
commit d81f03cfcf
1 changed files with 20 additions and 30 deletions
--- a/integrator/scripts/sources/ofsted.py
+++ b/integrator/scripts/sources/ofsted.py
@@ -227,36 +227,30 @@ def _parse_date(val) -> date | None:
    return None


-def _detect_framework(df: pd.DataFrame) -> str:
-    """Return 'ReportCard' if new-format columns are present, else 'OEIF'.
+def _framework_for_row(row) -> str | None:
+    """Determine inspection framework for a single school row.

-    Strategy: check for OEIF-specific phrases first (positive evidence of the
-    old format). Only if none are found, look for RC-specific phrases.
-    Defaults to 'OEIF' so misdetection is always a safe fallback.
+    Check RC columns first — if any have a value, it's a Report Card inspection.
+    Fall back to OEIF columns. If neither has data, the school has no graded
+    inspection on record (return None).
    """
-    cols_lower = {c.lower() for c in df.columns}
-
-    # Phrases unique to the old OEIF CSV — if any present, it's OEIF.
-    oeif_indicators = [
-        "overall effectiveness",
-        "quality of education",
-        "behaviour and attitudes",
+    rc_check_cols = [
+        "rc_inclusion", "rc_curriculum_teaching", "rc_achievement",
+        "rc_attendance_behaviour", "rc_personal_development",
+        "rc_leadership_governance", "rc_safeguarding",
    ]
-    for indicator in oeif_indicators:
-        if any(indicator in c for c in cols_lower):
-            return "OEIF"
-
-    # Phrases unique to the new Report Card CSV — multi-word, RC-specific.
-    rc_indicators = [
-        "curriculum and teaching",
-        "leadership and governance",
-        "attendance and behaviour",
-    ]
-    for indicator in rc_indicators:
-        if any(indicator in c for c in cols_lower):
+    for col in rc_check_cols:
+        val = row.get(col)
+        if val is not None and not (isinstance(val, float) and pd.isna(val)):
            return "ReportCard"

-    return "OEIF"
+    oeif_check_cols = ["overall_effectiveness", "quality_of_education"]
+    for col in oeif_check_cols:
+        val = row.get(col)
+        if val is not None and not (isinstance(val, float) and pd.isna(val)):
+            return "OEIF"
+
+    return None


 def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
@@ -297,10 +291,6 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
        hdr = _find_header_row(path)
        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)

-    # Detect which framework the CSV represents BEFORE any renaming
-    framework = _detect_framework(df)
-    print(f"  Ofsted: detected framework '{framework}'")
-
    # Normalise OEIF column names: for each target field pick the first source column present
    available = set(df.columns)
    for target, sources in COLUMN_PRIORITY.items():
@@ -340,7 +330,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:

            record = {
                "urn": urn,
-                "framework": framework,
+                "framework": _framework_for_row(row),
                "inspection_date": _parse_date(row.get("inspection_date")),
                "publication_date": _parse_date(row.get("publication_date")),
                "inspection_type": str(row.get("inspection_type", "")).strip() or None,