fix(ofsted): map OEIF column names from current CSV format
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s

Ofsted renamed all columns for the OEIF framework, for example:
- grades are now 'Latest OEIF overall effectiveness' etc.
- dates are now 'Inspection start date of latest OEIF graded inspection' etc.
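Replace the flat COLUMN_MAP with a per-field priority list of source column
names. For each internal field, only the first listed name present in the CSV
is renamed, so both current OEIF and legacy column names work without
duplicate-column conflicts. (The flat map renamed every matching column, so a
file containing two names for the same field ended up with two identically
named columns.)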

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 22:02:02 +00:00
parent d00dc699cc
commit a478068d5a

View File

@@ -21,28 +21,56 @@ from db import get_session
# The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page. # The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes" GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
COLUMN_MAP = { # Column name → internal field, listed in priority order per field.
"URN": "urn", # First matching column wins; later entries are fallbacks for older file formats.
"Inspection date": "inspection_date", COLUMN_PRIORITY = {
"Publication date": "publication_date", "urn": ["URN", "Urn", "urn"],
"Inspection type": "inspection_type", "inspection_date": [
"Overall effectiveness": "overall_effectiveness", "Inspection start date of latest OEIF graded inspection",
"Quality of education": "quality_of_education", "Inspection start date",
"Behaviour and attitudes": "behaviour_attitudes", "Inspection date",
"Personal development": "personal_development", "InspectionDate",
"Leadership and management": "leadership_management", ],
"Early years provision": "early_years_provision", "publication_date": [
# Some CSVs use shortened names "Publication date of latest OEIF graded inspection",
"Urn": "urn", "Publication date",
"InspectionDate": "inspection_date", "PublicationDate",
"PublicationDate": "publication_date", ],
"InspectionType": "inspection_type", "inspection_type": [
"OverallEffectiveness": "overall_effectiveness", "Inspection type of latest OEIF graded inspection",
"QualityOfEducation": "quality_of_education", "Inspection type",
"BehaviourAndAttitudes": "behaviour_attitudes", "InspectionType",
"PersonalDevelopment": "personal_development", ],
"LeadershipAndManagement": "leadership_management", "overall_effectiveness": [
"EarlyYearsProvision": "early_years_provision", "Latest OEIF overall effectiveness",
"Overall effectiveness",
"OverallEffectiveness",
],
"quality_of_education": [
"Latest OEIF quality of education",
"Quality of education",
"QualityOfEducation",
],
"behaviour_attitudes": [
"Latest OEIF behaviour and attitudes",
"Behaviour and attitudes",
"BehaviourAndAttitudes",
],
"personal_development": [
"Latest OEIF personal development",
"Personal development",
"PersonalDevelopment",
],
"leadership_management": [
"Latest OEIF effectiveness of leadership and management",
"Leadership and management",
"LeadershipAndManagement",
],
"early_years_provision": [
"Latest OEIF early years provision (where applicable)",
"Early years provision",
"EarlyYearsProvision",
],
} }
GRADE_MAP = { GRADE_MAP = {
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
hdr = _find_header_row(path) hdr = _find_header_row(path)
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr) df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
# Normalise column names # Normalise column names: for each target field pick the first source column present
df.rename(columns=COLUMN_MAP, inplace=True) available = set(df.columns)
for target, sources in COLUMN_PRIORITY.items():
for src in sources:
if src in available:
df.rename(columns={src: target}, inplace=True)
break
if "urn" not in df.columns: if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}") raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")