fix(ees-tap): fix BOM handling for admissions CSV

The admissions file is UTF-8 with a BOM, not Latin-1. Reading it as latin-1 decoded the BOM bytes (EF BB BF) as 'ï»¿', which the existing "\ufeff" strip did not remove. Change the admissions encoding to utf-8-sig (which strips the BOM automatically). Also update the manual BOM-strip fallback to handle the latin-1 decoded form. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 10:03:17 +00:00
parent b8ecc5c58b
commit c7357336e3
1 changed files with 8 additions and 3 deletions
@@ -83,8 +83,13 @@ class EESDatasetStream(Stream):
        with zf.open(target) as f:
            df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding)

-        # Strip UTF-8 BOM from column names (some DfE files have a BOM on the first column)
-        df.columns = df.columns.str.lstrip("\ufeff")
+        # Strip BOM from first column name — handles both:
+        # - UTF-8 BOM decoded as Unicode (\ufeff) when read with utf-8/utf-8-sig
+        # - UTF-8 BOM bytes decoded as Latin-1 (ï»¿) when read with latin-1
+        cols = list(df.columns)
+        if cols:
+            cols[0] = cols[0].lstrip("\ufeff").lstrip("ï»¿")
+            df.columns = cols

        # Filter to school-level data if the column exists
        if "geographic_level" in df.columns:
@@ -292,7 +297,7 @@ class EESAdmissionsStream(EESDatasetStream):
    primary_keys = ["school_urn", "time_period"]
    _publication_slug = "primary-and-secondary-school-applications-and-offers"
    _target_filename = "SchoolLevel"
-    _encoding = "latin-1"
+    _encoding = "utf-8-sig"  # UTF-8 with BOM — sig variant strips the BOM automatically
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
        th.Property("school_urn", th.StringType, required=True),