fix(ofsted): detect header row dynamically instead of hardcoding offset
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 36s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m15s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 36s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m15s
Build and Push Docker Images / Build Integrator (push) Successful in 59s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
The Ofsted CSV has a variable number of preamble rows (title, filter warning, etc.) before the real column headers. Scan up to 10 rows to find the one containing a URN column, rather than assuming a fixed offset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -130,16 +130,33 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
|
|||||||
|
|
||||||
print(f" Ofsted: loading {path} ...")
|
print(f" Ofsted: loading {path} ...")
|
||||||
|
|
||||||
|
def _find_header_row(filepath, encoding: str = "latin-1", max_rows: int = 10) -> int:
    """Return the index of the CSV row that holds the real column headers.

    The file may start with a variable number of preamble rows before the
    header. Each of the first ``max_rows`` rows is tried in turn as the
    header, and the first one that has a column named ``URN`` (compared
    case-insensitively, ignoring surrounding whitespace) wins.

    The returned index uses pandas' own ``header=`` numbering, so it can be
    passed straight to ``pd.read_csv(..., header=...)``.

    Args:
        filepath: Path of the CSV file to inspect.
        encoding: Text encoding used to read the file.
        max_rows: Maximum number of leading rows to try as the header.

    Returns:
        The zero-based header row index, or ``0`` when no URN column is
        found within the scanned rows (including when the file is empty or
        shorter than ``max_rows``).
    """
    for row in range(max_rows):
        try:
            # nrows=0 parses only the candidate header line, never the data.
            peek = pd.read_csv(filepath, encoding=encoding, header=row, nrows=0)
        except (pd.errors.EmptyDataError, pd.errors.ParserError):
            # Asking for a header row beyond the end of the file raises;
            # stop probing so the documented fallback below is reached
            # instead of crashing on short or empty files.
            break
        if any(str(col).strip().casefold() == "urn" for col in peek.columns):
            return row
    # Best-effort fallback: treat the first row as the header.
    return 0
|
||||||
|
|
||||||
if str(path).endswith(".zip"):
|
if str(path).endswith(".zip"):
|
||||||
import zipfile, io
|
import zipfile, io
|
||||||
with zipfile.ZipFile(path) as z:
|
with zipfile.ZipFile(path) as z:
|
||||||
csv_names = [n for n in z.namelist() if n.endswith(".csv")]
|
csv_names = [n for n in z.namelist() if n.endswith(".csv")]
|
||||||
if not csv_names:
|
if not csv_names:
|
||||||
raise ValueError("No CSV found inside Ofsted ZIP")
|
raise ValueError("No CSV found inside Ofsted ZIP")
|
||||||
with z.open(csv_names[0]) as f:
|
# Extract to a temp file so we can scan for the header row
|
||||||
df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False, header=1)
|
import tempfile, os
|
||||||
|
with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
|
||||||
|
tmp.write(z.read(csv_names[0]))
|
||||||
|
tmp_path = tmp.name
|
||||||
|
try:
|
||||||
|
hdr = _find_header_row(tmp_path)
|
||||||
|
df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr)
|
||||||
|
finally:
|
||||||
|
os.unlink(tmp_path)
|
||||||
else:
|
else:
|
||||||
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=1)
|
hdr = _find_header_row(path)
|
||||||
|
df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
|
||||||
|
|
||||||
# Normalise column names
|
# Normalise column names
|
||||||
df.rename(columns=COLUMN_MAP, inplace=True)
|
df.rename(columns=COLUMN_MAP, inplace=True)
|
||||||
|
|||||||
Reference in New Issue
Block a user