From c7357336e35410b53479a5dd708f07764d760a37 Mon Sep 17 00:00:00 2001 From: Tudor Date: Fri, 27 Mar 2026 10:03:17 +0000 Subject: [PATCH] fix(ees-tap): fix BOM handling for admissions CSV MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Admissions file is UTF-8 with BOM, not Latin-1. Reading as latin-1 decoded the BOM bytes as 'ï»¿' which wasn't stripped. Change admissions encoding to utf-8-sig (strips BOM automatically). Also update the manual BOM strip fallback to handle the latin-1 decoded form. Co-Authored-By: Claude Sonnet 4.6 --- .../plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py | 11 ++++++++--- 1 file changed, 8 insertions(+), 3 deletions(-) diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py index fa6031f..6e07728 100644 --- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py @@ -83,8 +83,13 @@ class EESDatasetStream(Stream): with zf.open(target) as f: df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding) - # Strip UTF-8 BOM from column names (some DfE files have a BOM on the first column) - df.columns = df.columns.str.lstrip("\ufeff") + # Strip BOM from first column name — handles both: + # - UTF-8 BOM decoded as Unicode (\ufeff) when read with utf-8/utf-8-sig + # - UTF-8 BOM bytes decoded as Latin-1 (ï»¿) when read with latin-1 + cols = list(df.columns) + if cols: + cols[0] = cols[0].lstrip("\ufeff").lstrip("ï»¿") + df.columns = cols # Filter to school-level data if the column exists if "geographic_level" in df.columns: @@ -292,7 +297,7 @@ class EESAdmissionsStream(EESDatasetStream): primary_keys = ["school_urn", "time_period"] _publication_slug = "primary-and-secondary-school-applications-and-offers" _target_filename = "SchoolLevel" - _encoding = "latin-1" + _encoding = "utf-8-sig" # UTF-8 with BOM — sig variant strips the BOM automatically schema =
th.PropertiesList( th.Property("time_period", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True),