From b8ecc5c58bf0cb93419b71622f4e74b4c4ead380 Mon Sep 17 00:00:00 2001
From: Tudor <tudor@sitaru.org>
Date: Fri, 27 Mar 2026 09:54:15 +0000
Subject: [PATCH] fix(ees-tap): strip UTF-8 BOM from CSV column names

Some DfE supporting-files CSVs begin with a UTF-8 BOM, which ends up
prefixed to the first column name: '\ufefftime_period' instead of
'time_period'. This trips Singer schema validation ('time_period' is a
required property). Strip any leading BOM from all column names after
read_csv.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py | 3 +++
 1 file changed, 3 insertions(+)

diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
index 9f550c2..fa6031f 100644
--- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
+++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
@@ -83,6 +83,9 @@ class EESDatasetStream(Stream):
         with zf.open(target) as f:
             df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding)
 
+        # Strip UTF-8 BOM from column names (some DfE files have a BOM on the first column)
+        df.columns = df.columns.str.lstrip("\ufeff")
+
         # Filter to school-level data if the column exists
         if "geographic_level" in df.columns:
             df = df[df["geographic_level"] == "School"]