fix(ofsted): per-row framework detection instead of per-file
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 33s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s

The MI CSV contains both OEIF and RC column sets simultaneously — OEIF columns
are populated for older inspections, RC columns for post-Nov-2025 inspections.
File-level detection therefore assigned a single framework to every school based
on column presence alone, misclassifying rows inspected under the other framework.

Replace _detect_framework(df) with _framework_for_row(row):
- ReportCard: any of the seven checked rc_* columns has a value (checked first, so it takes precedence over OEIF)
- OEIF: overall_effectiveness or quality_of_education has a value
- None: neither has data (no graded inspection on record)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-25 15:08:42 +00:00
parent 5720e18358
commit d81f03cfcf

View File

@@ -227,36 +227,30 @@ def _parse_date(val) -> date | None:
return None
def _detect_framework(df: pd.DataFrame) -> str:
"""Return 'ReportCard' if new-format columns are present, else 'OEIF'.
def _framework_for_row(row) -> str | None:
"""Determine inspection framework for a single school row.
Strategy: check for OEIF-specific phrases first (positive evidence of the
old format). Only if none are found, look for RC-specific phrases.
Defaults to 'OEIF' so misdetection is always a safe fallback.
Check RC columns first — if any have a value, it's a Report Card inspection.
Fall back to OEIF columns. If neither has data, the school has no graded
inspection on record (return None).
"""
cols_lower = {c.lower() for c in df.columns}
# Phrases unique to the old OEIF CSV — if any present, it's OEIF.
oeif_indicators = [
"overall effectiveness",
"quality of education",
"behaviour and attitudes",
rc_check_cols = [
"rc_inclusion", "rc_curriculum_teaching", "rc_achievement",
"rc_attendance_behaviour", "rc_personal_development",
"rc_leadership_governance", "rc_safeguarding",
]
for indicator in oeif_indicators:
if any(indicator in c for c in cols_lower):
return "OEIF"
# Phrases unique to the new Report Card CSV — multi-word, RC-specific.
rc_indicators = [
"curriculum and teaching",
"leadership and governance",
"attendance and behaviour",
]
for indicator in rc_indicators:
if any(indicator in c for c in cols_lower):
for col in rc_check_cols:
val = row.get(col)
if val is not None and not (isinstance(val, float) and pd.isna(val)):
return "ReportCard"
return "OEIF"
oeif_check_cols = ["overall_effectiveness", "quality_of_education"]
for col in oeif_check_cols:
val = row.get(col)
if val is not None and not (isinstance(val, float) and pd.isna(val)):
return "OEIF"
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
@@ -297,10 +291,6 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
hdr = _find_header_row(path)
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
# Detect which framework the CSV represents BEFORE any renaming
framework = _detect_framework(df)
print(f" Ofsted: detected framework '{framework}'")
# Normalise OEIF column names: for each target field pick the first source column present
available = set(df.columns)
for target, sources in COLUMN_PRIORITY.items():
@@ -340,7 +330,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
record = {
"urn": urn,
"framework": framework,
"framework": _framework_for_row(row),
"inspection_date": _parse_date(row.get("inspection_date")),
"publication_date": _parse_date(row.get("publication_date")),
"inspection_type": str(row.get("inspection_type", "")).strip() or None,