diff --git a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py index f9e5756..9f550c2 100644 --- a/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py +++ b/pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py @@ -50,6 +50,7 @@ class EESDatasetStream(Stream): _publication_slug: str = "" _target_filename: str = "" # keyword that appears in the CSV path _urn_column: str = "school_urn" # column name for URN in the CSV + _encoding: str = "utf-8" # CSV file encoding (some DfE files use latin-1) def get_records(self, context): import pandas as pd @@ -80,7 +81,7 @@ class EESDatasetStream(Stream): self.logger.info("Reading %s from ZIP", target) with zf.open(target) as f: - df = pd.read_csv(f, dtype=str, keep_default_na=False) + df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding) # Filter to school-level data if the column exists if "geographic_level" in df.columns: @@ -266,6 +267,7 @@ class EESCensusStream(EESDatasetStream): _publication_slug = "school-pupils-and-their-characteristics" _target_filename = "spc_school_level_underlying_data" _urn_column = "urn" + _encoding = "latin-1" schema = th.PropertiesList( th.Property("time_period", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True), @@ -287,6 +289,7 @@ class EESAdmissionsStream(EESDatasetStream): primary_keys = ["school_urn", "time_period"] _publication_slug = "primary-and-secondary-school-applications-and-offers" _target_filename = "SchoolLevel" + _encoding = "latin-1" schema = th.PropertiesList( th.Property("time_period", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True),