From 0062a5eabed31fed18a98de43167d5da1b84b34c Mon Sep 17 00:00:00 2001
From: Tudor
Date: Thu, 26 Mar 2026 14:03:26 +0000
Subject: [PATCH] fix(tap-gias): declare numeric CSV columns as StringType

CSV is read with dtype=str so all values arrive as strings. Declaring
LA (code) and EstablishmentNumber as IntegerType caused schema
validation failures in target-postgres. Use StringType for all columns
except URN (which is explicitly cast to int for the primary key). Type
casting happens in dbt staging models.

Co-Authored-By: Claude Opus 4.6
---
 pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py b/pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
index 1f014e3..7c01b78 100644
--- a/pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
+++ b/pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
@@ -26,14 +26,16 @@ class GIASEstablishmentsStream(Stream):
     replication_key = None
 
     # Schema is wide (~250 columns); we declare key columns and pass through the rest
+    # All columns are read as strings from CSV; dbt staging models handle type casting.
+    # Only URN is cast to int in get_records() for the primary key.
     schema = th.PropertiesList(
         th.Property("URN", th.IntegerType, required=True),
         th.Property("EstablishmentName", th.StringType),
         th.Property("TypeOfEstablishment (name)", th.StringType),
         th.Property("PhaseOfEducation (name)", th.StringType),
-        th.Property("LA (code)", th.IntegerType),
+        th.Property("LA (code)", th.StringType),
         th.Property("LA (name)", th.StringType),
-        th.Property("EstablishmentNumber", th.IntegerType),
+        th.Property("EstablishmentNumber", th.StringType),
         th.Property("EstablishmentStatus (name)", th.StringType),
         th.Property("Postcode", th.StringType),
     ).to_dict()