From f4f02574477a1dc6999ce6bb23d596c52a088206 Mon Sep 17 00:00:00 2001 From: Tudor Date: Fri, 27 Mar 2026 09:41:40 +0000 Subject: [PATCH] fix(ees-tap): add latin-1 encoding for census/admissions, default utf-8 for others DfE supporting-files CSVs (spc_school_level_underlying_data, AppsandOffers SchoolLevel) are Latin-1 encoded. Add _encoding class attribute to base stream class and override to 'latin-1' for census and admissions streams. Co-Authored-By: Claude Sonnet 4.6 --- pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py | 5 ++++- 1 file changed, 4 insertions(+), 1 deletion(-) diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py index f9e5756..9f550c2 100644 --- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py @@ -50,6 +50,7 @@ class EESDatasetStream(Stream): _publication_slug: str = "" _target_filename: str = "" # keyword that appears in the CSV path _urn_column: str = "school_urn" # column name for URN in the CSV + _encoding: str = "utf-8" # CSV file encoding (some DfE files use latin-1) def get_records(self, context): import pandas as pd @@ -80,7 +81,7 @@ class EESDatasetStream(Stream): self.logger.info("Reading %s from ZIP", target) with zf.open(target) as f: - df = pd.read_csv(f, dtype=str, keep_default_na=False) + df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding) # Filter to school-level data if the column exists if "geographic_level" in df.columns: @@ -266,6 +267,7 @@ class EESCensusStream(EESDatasetStream): _publication_slug = "school-pupils-and-their-characteristics" _target_filename = "spc_school_level_underlying_data" _urn_column = "urn" + _encoding = "latin-1" schema = th.PropertiesList( th.Property("time_period", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True), @@ -287,6 +289,7 @@ class EESAdmissionsStream(EESDatasetStream): primary_keys = ["school_urn", "time_period"] _publication_slug = "primary-and-secondary-school-applications-and-offers" _target_filename = "SchoolLevel" + _encoding = "latin-1" schema = th.PropertiesList( th.Property("time_period", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True),