fix(ofsted): detect header row dynamically instead of hardcoding offset

The Ofsted CSV has a variable number of preamble rows (title, filter warning, etc.) before the real column headers. Scan up to 10 rows to find the one containing a URN column rather than assuming a fixed offset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 21:15:03 +00:00
parent 0e5b71d4a0
commit 7f9c61d587
1 changed files with 20 additions and 3 deletions
@@ -130,16 +130,33 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    print(f"  Ofsted: loading {path} ...")
    def _find_header_row(filepath, encoding="latin-1"):
        """Scan up to 10 rows to find the one containing a URN column."""
        for i in range(10):
            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
            if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
                return i
        return 0
    if str(path).endswith(".zip"):
        import zipfile, io
        with zipfile.ZipFile(path) as z:
            csv_names = [n for n in z.namelist() if n.endswith(".csv")]
            if not csv_names:
                raise ValueError("No CSV found inside Ofsted ZIP")
-            with z.open(csv_names[0]) as f:
+            # Extract to a temp file so we can scan for the header row
-                df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False, header=1)
+            import tempfile, os
            with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
                tmp.write(z.read(csv_names[0]))
                tmp_path = tmp.name
            try:
                hdr = _find_header_row(tmp_path)
                df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr)
            finally:
                os.unlink(tmp_path)
    else:
-        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=1)
+        hdr = _find_header_row(path)
        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
    # Normalise column names
    df.rename(columns=COLUMN_MAP, inplace=True)