diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py index eee65ef..a04c68a 100644 --- a/integrator/scripts/sources/ofsted.py +++ b/integrator/scripts/sources/ofsted.py @@ -227,36 +227,30 @@ def _parse_date(val) -> date | None: return None -def _detect_framework(df: pd.DataFrame) -> str: - """Return 'ReportCard' if new-format columns are present, else 'OEIF'. +def _framework_for_row(row) -> str | None: + """Determine inspection framework for a single school row. - Strategy: check for OEIF-specific phrases first (positive evidence of the - old format). Only if none are found, look for RC-specific phrases. - Defaults to 'OEIF' so misdetection is always a safe fallback. + Check RC columns first — if any have a value, it's a Report Card inspection. + Fall back to OEIF columns. If neither has data, the school has no graded + inspection on record (return None). """ - cols_lower = {c.lower() for c in df.columns} - - # Phrases unique to the old OEIF CSV — if any present, it's OEIF. - oeif_indicators = [ - "overall effectiveness", - "quality of education", - "behaviour and attitudes", + rc_check_cols = [ + "rc_inclusion", "rc_curriculum_teaching", "rc_achievement", + "rc_attendance_behaviour", "rc_personal_development", + "rc_leadership_governance", "rc_safeguarding", ] - for indicator in oeif_indicators: - if any(indicator in c for c in cols_lower): - return "OEIF" - - # Phrases unique to the new Report Card CSV — multi-word, RC-specific. - rc_indicators = [ - "curriculum and teaching", - "leadership and governance", - "attendance and behaviour", - ] - for indicator in rc_indicators: - if any(indicator in c for c in cols_lower): + for col in rc_check_cols: + val = row.get(col) + if val is not None and not (isinstance(val, float) and pd.isna(val)): return "ReportCard" - return "OEIF" + oeif_check_cols = ["overall_effectiveness", "quality_of_education"] + for col in oeif_check_cols: + val = row.get(col) + if val is not None and not (isinstance(val, float) and pd.isna(val)): + return "OEIF" + + return None def load(path: Path | None = None, data_dir: Path | None = None) -> dict: @@ -297,10 +291,6 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict: hdr = _find_header_row(path) df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr) - # Detect which framework the CSV represents BEFORE any renaming - framework = _detect_framework(df) - print(f" Ofsted: detected framework '{framework}'") - # Normalise OEIF column names: for each target field pick the first source column present available = set(df.columns) for target, sources in COLUMN_PRIORITY.items(): @@ -340,7 +330,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict: record = { "urn": urn, - "framework": framework, + "framework": _framework_for_row(row), "inspection_date": _parse_date(row.get("inspection_date")), "publication_date": _parse_date(row.get("publication_date")), "inspection_type": str(row.get("inspection_type", "")).strip() or None,