fix(ofsted): map OEIF column names from current CSV format
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
Ofsted renamed all columns in the OEIF framework:
- grades are now 'Latest OEIF overall effectiveness' etc.
- dates are now 'Inspection start date of latest OEIF graded inspection'

Replace the flat COLUMN_MAP with a priority list per field so that both the current OEIF and the legacy column names work without duplicate-column conflicts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,28 +21,56 @@ from db import get_session
|
||||
# The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
|
||||
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
|
||||
|
||||
COLUMN_MAP = {
|
||||
"URN": "urn",
|
||||
"Inspection date": "inspection_date",
|
||||
"Publication date": "publication_date",
|
||||
"Inspection type": "inspection_type",
|
||||
"Overall effectiveness": "overall_effectiveness",
|
||||
"Quality of education": "quality_of_education",
|
||||
"Behaviour and attitudes": "behaviour_attitudes",
|
||||
"Personal development": "personal_development",
|
||||
"Leadership and management": "leadership_management",
|
||||
"Early years provision": "early_years_provision",
|
||||
# Some CSVs use shortened names
|
||||
"Urn": "urn",
|
||||
"InspectionDate": "inspection_date",
|
||||
"PublicationDate": "publication_date",
|
||||
"InspectionType": "inspection_type",
|
||||
"OverallEffectiveness": "overall_effectiveness",
|
||||
"QualityOfEducation": "quality_of_education",
|
||||
"BehaviourAndAttitudes": "behaviour_attitudes",
|
||||
"PersonalDevelopment": "personal_development",
|
||||
"LeadershipAndManagement": "leadership_management",
|
||||
"EarlyYearsProvision": "early_years_provision",
|
||||
# Internal field → candidate source column names, listed in priority order per field.
|
||||
# First matching column wins; later entries are fallbacks for older file formats.
|
||||
COLUMN_PRIORITY = {
|
||||
"urn": ["URN", "Urn", "urn"],
|
||||
"inspection_date": [
|
||||
"Inspection start date of latest OEIF graded inspection",
|
||||
"Inspection start date",
|
||||
"Inspection date",
|
||||
"InspectionDate",
|
||||
],
|
||||
"publication_date": [
|
||||
"Publication date of latest OEIF graded inspection",
|
||||
"Publication date",
|
||||
"PublicationDate",
|
||||
],
|
||||
"inspection_type": [
|
||||
"Inspection type of latest OEIF graded inspection",
|
||||
"Inspection type",
|
||||
"InspectionType",
|
||||
],
|
||||
"overall_effectiveness": [
|
||||
"Latest OEIF overall effectiveness",
|
||||
"Overall effectiveness",
|
||||
"OverallEffectiveness",
|
||||
],
|
||||
"quality_of_education": [
|
||||
"Latest OEIF quality of education",
|
||||
"Quality of education",
|
||||
"QualityOfEducation",
|
||||
],
|
||||
"behaviour_attitudes": [
|
||||
"Latest OEIF behaviour and attitudes",
|
||||
"Behaviour and attitudes",
|
||||
"BehaviourAndAttitudes",
|
||||
],
|
||||
"personal_development": [
|
||||
"Latest OEIF personal development",
|
||||
"Personal development",
|
||||
"PersonalDevelopment",
|
||||
],
|
||||
"leadership_management": [
|
||||
"Latest OEIF effectiveness of leadership and management",
|
||||
"Leadership and management",
|
||||
"LeadershipAndManagement",
|
||||
],
|
||||
"early_years_provision": [
|
||||
"Latest OEIF early years provision (where applicable)",
|
||||
"Early years provision",
|
||||
"EarlyYearsProvision",
|
||||
],
|
||||
}
|
||||
|
||||
GRADE_MAP = {
|
||||
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
||||
hdr = _find_header_row(path)
|
||||
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
|
||||
|
||||
# Normalise column names
|
||||
df.rename(columns=COLUMN_MAP, inplace=True)
|
||||
# Normalise column names: for each target field pick the first source column present
|
||||
available = set(df.columns)
|
||||
for target, sources in COLUMN_PRIORITY.items():
|
||||
for src in sources:
|
||||
if src in available:
|
||||
df.rename(columns={src: target}, inplace=True)
|
||||
break
|
||||
|
||||
if "urn" not in df.columns:
|
||||
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
|
||||
|
||||
Reference in New Issue
Block a user