From 26aa3c2d70c8392301b80ebe46f5225cb14d789c Mon Sep 17 00:00:00 2001 From: Tudor Date: Fri, 27 Mar 2026 17:05:03 +0000 Subject: [PATCH] fix(tap-uk-ofsted): fix header row detection matching 'urn' inside 'turn' The preamble row in Ofsted CSVs contains 'turn off all filters' which matched 'urn' in line.lower(), so header_idx was set to 0 instead of the real header row. Use a regex that matches URN only as a CSV field. Co-Authored-By: Claude Sonnet 4.6 --- .../plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py | 4 +++- 1 file changed, 3 insertions(+), 1 deletion(-) diff --git a/pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py b/pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py index f1ba1f9..764bff5 100644 --- a/pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py @@ -137,7 +137,9 @@ class OfstedInspectionsStream(Stream): lines = text.split("\n") header_idx = 0 for i, line in enumerate(lines[:20]): - if "URN" in line or "urn" in line.lower(): + # Match lines where URN appears as a CSV field (start or after comma), + # not as a substring of words like "turn" or "return". + if re.search(r'(?:^|,)\s*URN\s*(?:,|$)', line): header_idx = i break df = pd.read_csv(