fix(ees-tap): add latin-1 encoding for census/admissions, default utf-8 for others
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 52s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m8s
Build and Push Docker Images / Build Integrator (push) Successful in 55s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m40s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s

The DfE supporting-files CSVs (spc_school_level_underlying_data, AppsandOffers
SchoolLevel) are Latin-1 encoded. Add an _encoding class attribute to the base
stream class, defaulting to 'utf-8', and override it to 'latin-1' for the
census and admissions streams.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 09:41:40 +00:00
parent ca351e9d73
commit f4f0257447

View File

@@ -50,6 +50,7 @@ class EESDatasetStream(Stream):
_publication_slug: str = "" _publication_slug: str = ""
_target_filename: str = "" # keyword that appears in the CSV path _target_filename: str = "" # keyword that appears in the CSV path
_urn_column: str = "school_urn" # column name for URN in the CSV _urn_column: str = "school_urn" # column name for URN in the CSV
_encoding: str = "utf-8" # CSV file encoding (some DfE files use latin-1)
def get_records(self, context): def get_records(self, context):
import pandas as pd import pandas as pd
@@ -80,7 +81,7 @@ class EESDatasetStream(Stream):
self.logger.info("Reading %s from ZIP", target) self.logger.info("Reading %s from ZIP", target)
with zf.open(target) as f: with zf.open(target) as f:
df = pd.read_csv(f, dtype=str, keep_default_na=False) df = pd.read_csv(f, dtype=str, keep_default_na=False, encoding=self._encoding)
# Filter to school-level data if the column exists # Filter to school-level data if the column exists
if "geographic_level" in df.columns: if "geographic_level" in df.columns:
@@ -266,6 +267,7 @@ class EESCensusStream(EESDatasetStream):
_publication_slug = "school-pupils-and-their-characteristics" _publication_slug = "school-pupils-and-their-characteristics"
_target_filename = "spc_school_level_underlying_data" _target_filename = "spc_school_level_underlying_data"
_urn_column = "urn" _urn_column = "urn"
_encoding = "latin-1"
schema = th.PropertiesList( schema = th.PropertiesList(
th.Property("time_period", th.StringType, required=True), th.Property("time_period", th.StringType, required=True),
th.Property("school_urn", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True),
@@ -287,6 +289,7 @@ class EESAdmissionsStream(EESDatasetStream):
primary_keys = ["school_urn", "time_period"] primary_keys = ["school_urn", "time_period"]
_publication_slug = "primary-and-secondary-school-applications-and-offers" _publication_slug = "primary-and-secondary-school-applications-and-offers"
_target_filename = "SchoolLevel" _target_filename = "SchoolLevel"
_encoding = "latin-1"
schema = th.PropertiesList( schema = th.PropertiesList(
th.Property("time_period", th.StringType, required=True), th.Property("time_period", th.StringType, required=True),
th.Property("school_urn", th.StringType, required=True), th.Property("school_urn", th.StringType, required=True),