diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py
index 800ad81..e52ec6b 100644
--- a/integrator/scripts/sources/ofsted.py
+++ b/integrator/scripts/sources/ofsted.py
@@ -130,16 +130,41 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
 
     print(f" Ofsted: loading {path} ...")
 
+    def _find_header_row(source, encoding="latin-1", max_rows=10):
+        """Return the 0-based index of the header row that has a URN column.
+
+        ``source`` is either a filesystem path or the raw CSV bytes. Each of
+        the first ``max_rows`` rows is tried as the header in turn; if none
+        exposes a URN column the result falls back to 0.
+        """
+        import io
+
+        for i in range(max_rows):
+            # Fresh buffer per attempt: every probe must start at byte 0.
+            src = io.BytesIO(source) if isinstance(source, bytes) else source
+            try:
+                peek = pd.read_csv(src, encoding=encoding, header=i, nrows=0)
+            except (pd.errors.ParserError, pd.errors.EmptyDataError):
+                # The file has fewer than i + 1 rows: nothing left to probe.
+                break
+            if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
+                return i
+        return 0
+
     if str(path).endswith(".zip"):
         import zipfile, io
         with zipfile.ZipFile(path) as z:
             csv_names = [n for n in z.namelist() if n.endswith(".csv")]
             if not csv_names:
                 raise ValueError("No CSV found inside Ofsted ZIP")
-            with z.open(csv_names[0]) as f:
-                df = pd.read_csv(io.TextIOWrapper(f, encoding="latin-1"), low_memory=False, header=1)
+            # Hold the member in memory so it can be parsed more than once
+            # (header scan, then the real load) without a temp file on disk.
+            raw = z.read(csv_names[0])
+        hdr = _find_header_row(raw)
+        df = pd.read_csv(io.BytesIO(raw), encoding="latin-1", low_memory=False, header=hdr)
     else:
-        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=1)
+        hdr = _find_header_row(path)
+        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
 
     # Normalise column names
     df.rename(columns=COLUMN_MAP, inplace=True)