fix(ofsted): detect header row dynamically instead of hardcoding offset
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 36s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m15s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Ofsted CSV has a variable number of preamble rows (title, filter warning,
etc.) before the real column headers. Scan up to 10 rows to find the one
containing a URN column rather than assuming a fixed offset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-24 21:15:03 +00:00
parent 0e5b71d4a0
commit 7f9c61d587

View File

@@ -130,16 +130,33 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
print(f" Ofsted: loading {path} ...") print(f" Ofsted: loading {path} ...")
def _find_header_row(filepath, encoding="latin-1", max_rows=10):
    """Return the ``header=`` row index of the first row with a URN column.

    Each candidate index ``i`` in ``0 .. max_rows - 1`` is probed with
    ``pd.read_csv(filepath, header=i, nrows=0)``, so the result can be passed
    straight back to ``pd.read_csv`` as ``header=``.

    Args:
        filepath: Path of the CSV file; it is reopened for every probe.
        encoding: Text encoding used to decode the file.
        max_rows: How many leading rows to try as the header row.

    Returns:
        The first index whose column labels include "URN" (compared
        case-insensitively after stripping whitespace). Returns 0 when no
        such row is found within ``max_rows`` or before the file ends.
    """
    for i in range(max_rows):
        try:
            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # pandas raises once ``header=i`` points past the last line (or
            # the file is empty); treat that as "not found" so short files
            # reach the fallback below instead of crashing here.
            break
        if any(str(col).strip().casefold() == "urn" for col in peek.columns):
            return i
    return 0
if str(path).endswith(".zip"): if str(path).endswith(".zip"):
import zipfile, io import zipfile, io
with zipfile.ZipFile(path) as z: with zipfile.ZipFile(path) as z:
csv_names = [n for n in z.namelist() if n.endswith(".csv")] csv_names = [n for n in z.namelist() if n.endswith(".csv")]
if not csv_names: if not csv_names:
raise ValueError("No CSV found inside Ofsted ZIP") raise ValueError("No CSV found inside Ofsted ZIP")
with z.open(csv_names[0]) as f: # Extract to a temp file so we can scan for the header row
df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False, header=1) import tempfile, os
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
tmp.write(z.read(csv_names[0]))
tmp_path = tmp.name
try:
hdr = _find_header_row(tmp_path)
df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr)
finally:
os.unlink(tmp_path)
else: else:
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=1) hdr = _find_header_row(path)
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
# Normalise column names # Normalise column names
df.rename(columns=COLUMN_MAP, inplace=True) df.rename(columns=COLUMN_MAP, inplace=True)