fix(ofsted): per-row framework detection instead of per-file
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 33s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 33s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
The MI CSV contains both OEIF and RC column sets simultaneously — OEIF columns are populated for older inspections, RC columns for post-Nov-2025 inspections. File-level detection wrongly classified all schools based on column presence alone.

Replace _detect_framework(df) with _framework_for_row(row):
- ReportCard: any rc_* column has a value
- OEIF: overall_effectiveness or quality_of_education has a value
- None: neither has data (no graded inspection on record)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -227,36 +227,30 @@ def _parse_date(val) -> date | None:
|
||||
return None
|
||||
|
||||
|
||||
def _detect_framework(df: pd.DataFrame) -> str:
|
||||
"""Return 'ReportCard' if new-format columns are present, else 'OEIF'.
|
||||
def _framework_for_row(row) -> str | None:
|
||||
"""Determine inspection framework for a single school row.
|
||||
|
||||
Strategy: check for OEIF-specific phrases first (positive evidence of the
|
||||
old format). Only if none are found, look for RC-specific phrases.
|
||||
Defaults to 'OEIF' so misdetection is always a safe fallback.
|
||||
Check RC columns first — if any have a value, it's a Report Card inspection.
|
||||
Fall back to OEIF columns. If neither has data, the school has no graded
|
||||
inspection on record (return None).
|
||||
"""
|
||||
cols_lower = {c.lower() for c in df.columns}
|
||||
|
||||
# Phrases unique to the old OEIF CSV — if any present, it's OEIF.
|
||||
oeif_indicators = [
|
||||
"overall effectiveness",
|
||||
"quality of education",
|
||||
"behaviour and attitudes",
|
||||
rc_check_cols = [
|
||||
"rc_inclusion", "rc_curriculum_teaching", "rc_achievement",
|
||||
"rc_attendance_behaviour", "rc_personal_development",
|
||||
"rc_leadership_governance", "rc_safeguarding",
|
||||
]
|
||||
for indicator in oeif_indicators:
|
||||
if any(indicator in c for c in cols_lower):
|
||||
return "OEIF"
|
||||
|
||||
# Phrases unique to the new Report Card CSV — multi-word, RC-specific.
|
||||
rc_indicators = [
|
||||
"curriculum and teaching",
|
||||
"leadership and governance",
|
||||
"attendance and behaviour",
|
||||
]
|
||||
for indicator in rc_indicators:
|
||||
if any(indicator in c for c in cols_lower):
|
||||
for col in rc_check_cols:
|
||||
val = row.get(col)
|
||||
if val is not None and not (isinstance(val, float) and pd.isna(val)):
|
||||
return "ReportCard"
|
||||
|
||||
return "OEIF"
|
||||
oeif_check_cols = ["overall_effectiveness", "quality_of_education"]
|
||||
for col in oeif_check_cols:
|
||||
val = row.get(col)
|
||||
if val is not None and not (isinstance(val, float) and pd.isna(val)):
|
||||
return "OEIF"
|
||||
|
||||
return None
|
||||
|
||||
|
||||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
||||
@@ -297,10 +291,6 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
||||
hdr = _find_header_row(path)
|
||||
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
|
||||
|
||||
# Detect which framework the CSV represents BEFORE any renaming
|
||||
framework = _detect_framework(df)
|
||||
print(f" Ofsted: detected framework '{framework}'")
|
||||
|
||||
# Normalise OEIF column names: for each target field pick the first source column present
|
||||
available = set(df.columns)
|
||||
for target, sources in COLUMN_PRIORITY.items():
|
||||
@@ -340,7 +330,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
||||
|
||||
record = {
|
||||
"urn": urn,
|
||||
"framework": framework,
|
||||
"framework": _framework_for_row(row),
|
||||
"inspection_date": _parse_date(row.get("inspection_date")),
|
||||
"publication_date": _parse_date(row.get("publication_date")),
|
||||
"inspection_type": str(row.get("inspection_type", "")).strip() or None,
|
||||
|
||||
Reference in New Issue
Block a user