fix(ofsted): map OEIF column names from current CSV format
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m3s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
Ofsted renamed all columns in the OEIF framework:
- grades are now 'Latest OEIF overall effectiveness' etc.
- dates are 'Inspection start date of latest OEIF graded inspection'

Replace flat COLUMN_MAP with a priority list per field so both current OEIF and legacy column names work without duplicate-column conflicts.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -21,28 +21,56 @@ from db import get_session
|
|||||||
# The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
|
# The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page.
|
||||||
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
|
GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
|
||||||
|
|
||||||
COLUMN_MAP = {
|
# Column name → internal field, listed in priority order per field.
|
||||||
"URN": "urn",
|
# First matching column wins; later entries are fallbacks for older file formats.
|
||||||
"Inspection date": "inspection_date",
|
COLUMN_PRIORITY = {
|
||||||
"Publication date": "publication_date",
|
"urn": ["URN", "Urn", "urn"],
|
||||||
"Inspection type": "inspection_type",
|
"inspection_date": [
|
||||||
"Overall effectiveness": "overall_effectiveness",
|
"Inspection start date of latest OEIF graded inspection",
|
||||||
"Quality of education": "quality_of_education",
|
"Inspection start date",
|
||||||
"Behaviour and attitudes": "behaviour_attitudes",
|
"Inspection date",
|
||||||
"Personal development": "personal_development",
|
"InspectionDate",
|
||||||
"Leadership and management": "leadership_management",
|
],
|
||||||
"Early years provision": "early_years_provision",
|
"publication_date": [
|
||||||
# Some CSVs use shortened names
|
"Publication date of latest OEIF graded inspection",
|
||||||
"Urn": "urn",
|
"Publication date",
|
||||||
"InspectionDate": "inspection_date",
|
"PublicationDate",
|
||||||
"PublicationDate": "publication_date",
|
],
|
||||||
"InspectionType": "inspection_type",
|
"inspection_type": [
|
||||||
"OverallEffectiveness": "overall_effectiveness",
|
"Inspection type of latest OEIF graded inspection",
|
||||||
"QualityOfEducation": "quality_of_education",
|
"Inspection type",
|
||||||
"BehaviourAndAttitudes": "behaviour_attitudes",
|
"InspectionType",
|
||||||
"PersonalDevelopment": "personal_development",
|
],
|
||||||
"LeadershipAndManagement": "leadership_management",
|
"overall_effectiveness": [
|
||||||
"EarlyYearsProvision": "early_years_provision",
|
"Latest OEIF overall effectiveness",
|
||||||
|
"Overall effectiveness",
|
||||||
|
"OverallEffectiveness",
|
||||||
|
],
|
||||||
|
"quality_of_education": [
|
||||||
|
"Latest OEIF quality of education",
|
||||||
|
"Quality of education",
|
||||||
|
"QualityOfEducation",
|
||||||
|
],
|
||||||
|
"behaviour_attitudes": [
|
||||||
|
"Latest OEIF behaviour and attitudes",
|
||||||
|
"Behaviour and attitudes",
|
||||||
|
"BehaviourAndAttitudes",
|
||||||
|
],
|
||||||
|
"personal_development": [
|
||||||
|
"Latest OEIF personal development",
|
||||||
|
"Personal development",
|
||||||
|
"PersonalDevelopment",
|
||||||
|
],
|
||||||
|
"leadership_management": [
|
||||||
|
"Latest OEIF effectiveness of leadership and management",
|
||||||
|
"Leadership and management",
|
||||||
|
"LeadershipAndManagement",
|
||||||
|
],
|
||||||
|
"early_years_provision": [
|
||||||
|
"Latest OEIF early years provision (where applicable)",
|
||||||
|
"Early years provision",
|
||||||
|
"EarlyYearsProvision",
|
||||||
|
],
|
||||||
}
|
}
|
||||||
|
|
||||||
GRADE_MAP = {
|
GRADE_MAP = {
|
||||||
@@ -158,8 +186,13 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
|||||||
hdr = _find_header_row(path)
|
hdr = _find_header_row(path)
|
||||||
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
|
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
|
||||||
|
|
||||||
# Normalise column names
|
# Normalise column names: for each target field pick the first source column present
|
||||||
df.rename(columns=COLUMN_MAP, inplace=True)
|
available = set(df.columns)
|
||||||
|
for target, sources in COLUMN_PRIORITY.items():
|
||||||
|
for src in sources:
|
||||||
|
if src in available:
|
||||||
|
df.rename(columns={src: target}, inplace=True)
|
||||||
|
break
|
||||||
|
|
||||||
if "urn" not in df.columns:
|
if "urn" not in df.columns:
|
||||||
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
|
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
|
||||||
|
|||||||
Reference in New Issue
Block a user