fix(ofsted): detect header row dynamically instead of hardcoding offset

The Ofsted CSV has a variable number of preamble rows (title, filter warning, etc.) before the real column headers. Scan up to 10 rows to find the one containing a URN column rather than assuming a fixed offset.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 21:15:03 +00:00
parent 0e5b71d4a0
commit 7f9c61d587
1 changed files with 20 additions and 3 deletions
@@ -130,16 +130,33 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:

    print(f"  Ofsted: loading {path} ...")

+    def _find_header_row(filepath, encoding="latin-1"):
+        """Scan up to 10 rows to find the one containing a URN column."""
+        for i in range(10):
+            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
+            if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
+                return i
+        return 0
+
    if str(path).endswith(".zip"):
        import zipfile, io
        with zipfile.ZipFile(path) as z:
            csv_names = [n for n in z.namelist() if n.endswith(".csv")]
            if not csv_names:
                raise ValueError("No CSV found inside Ofsted ZIP")
-            with z.open(csv_names[0]) as f:
-                df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False, header=1)
+            # Extract to a temp file so we can scan for the header row
+            import tempfile, os
+            with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
+                tmp.write(z.read(csv_names[0]))
+                tmp_path = tmp.name
+            try:
+                hdr = _find_header_row(tmp_path)
+                df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr)
+            finally:
+                os.unlink(tmp_path)
    else:
-        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=1)
+        hdr = _find_header_row(path)
+        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)

    # Normalise column names
    df.rename(columns=COLUMN_MAP, inplace=True)