fix(pipeline): expand GIAS schema, handle empty strings, scope DAG selectors
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m8s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 34s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m39s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

- Declare all 34 columns needed by dbt in GIAS tap schema (target-postgres
  only persists columns present in the Singer schema message)
- Use nullif() for empty-string-to-integer/date casts in staging models
- Scope daily DAG dbt build to GIAS models only (stg_gias_establishments+
  stg_gias_links+) to avoid errors on unloaded sources
- Scope annual EES DAG similarly; remove redundant dbt test steps
- Make dim_school gracefully handle missing int_ofsted_latest table

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 20:43:24 +00:00
parent 24cfb83144
commit e7b1ab9f37
5 changed files with 59 additions and 35 deletions

View File

@@ -79,12 +79,7 @@ print(f'Validation passed: {{count}} GIAS rows')
dbt_build = BashOperator( dbt_build = BashOperator(
task_id="dbt_build", task_id="dbt_build",
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production", bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_gias_establishments+ stg_gias_links+",
)
dbt_test = BashOperator(
task_id="dbt_test",
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} test --profiles-dir . --target production",
) )
geocode_new = BashOperator( geocode_new = BashOperator(
@@ -97,7 +92,7 @@ print(f'Validation passed: {{count}} GIAS rows')
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py", bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
) )
extract_group >> validate_raw >> dbt_build >> dbt_test >> geocode_new >> sync_typesense extract_group >> validate_raw >> dbt_build >> geocode_new >> sync_typesense
# ── Monthly DAG (Ofsted) ─────────────────────────────────────────────── # ── Monthly DAG (Ofsted) ───────────────────────────────────────────────
@@ -150,12 +145,7 @@ with DAG(
dbt_build_ees = BashOperator( dbt_build_ees = BashOperator(
task_id="dbt_build", task_id="dbt_build",
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production", bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+ stg_ees_phonics+",
)
dbt_test_ees = BashOperator(
task_id="dbt_test",
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} test --profiles-dir . --target production",
) )
sync_typesense_ees = BashOperator( sync_typesense_ees = BashOperator(
@@ -163,4 +153,4 @@ with DAG(
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py", bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
) )
extract_ees_group >> dbt_build_ees >> dbt_test_ees >> sync_typesense_ees extract_ees_group >> dbt_build_ees >> sync_typesense_ees

View File

@@ -25,9 +25,9 @@ class GIASEstablishmentsStream(Stream):
primary_keys = ["URN"] primary_keys = ["URN"]
replication_key = None replication_key = None
# Schema is wide (~250 columns); we declare key columns and pass through the rest # All columns used by dbt staging models must be declared here —
# All columns are read as strings from CSV; dbt staging models handle type casting. # target-postgres only persists columns present in the Singer schema.
# Only URN is cast to int in get_records() for the primary key. # All non-PK columns are StringType; dbt handles type casting.
schema = th.PropertiesList( schema = th.PropertiesList(
th.Property("URN", th.IntegerType, required=True), th.Property("URN", th.IntegerType, required=True),
th.Property("EstablishmentName", th.StringType), th.Property("EstablishmentName", th.StringType),
@@ -38,6 +38,31 @@ class GIASEstablishmentsStream(Stream):
th.Property("EstablishmentNumber", th.StringType), th.Property("EstablishmentNumber", th.StringType),
th.Property("EstablishmentStatus (name)", th.StringType), th.Property("EstablishmentStatus (name)", th.StringType),
th.Property("Postcode", th.StringType), th.Property("Postcode", th.StringType),
th.Property("Gender (name)", th.StringType),
th.Property("ReligiousCharacter (name)", th.StringType),
th.Property("AdmissionsPolicy (name)", th.StringType),
th.Property("SchoolCapacity", th.StringType),
th.Property("NumberOfPupils", th.StringType),
th.Property("HeadTitle (name)", th.StringType),
th.Property("HeadFirstName", th.StringType),
th.Property("HeadLastName", th.StringType),
th.Property("TelephoneNum", th.StringType),
th.Property("SchoolWebsite", th.StringType),
th.Property("Street", th.StringType),
th.Property("Locality", th.StringType),
th.Property("Town", th.StringType),
th.Property("County (name)", th.StringType),
th.Property("OpenDate", th.StringType),
th.Property("CloseDate", th.StringType),
th.Property("Trusts (name)", th.StringType),
th.Property("Trusts (code)", th.StringType),
th.Property("UrbanRural (name)", th.StringType),
th.Property("ParliamentaryConstituency (name)", th.StringType),
th.Property("NurseryProvision (name)", th.StringType),
th.Property("Easting", th.StringType),
th.Property("Northing", th.StringType),
th.Property("StatutoryLowAge", th.StringType),
th.Property("StatutoryHighAge", th.StringType),
).to_dict() ).to_dict()
def get_records(self, context): def get_records(self, context):

View File

@@ -2,12 +2,14 @@
with schools as ( with schools as (
select * from {{ ref('stg_gias_establishments') }} select * from {{ ref('stg_gias_establishments') }}
),
latest_ofsted as (
select * from {{ ref('int_ofsted_latest') }}
) )
{% set ofsted_relation = adapter.get_relation(
database=target.database,
schema=target.schema,
identifier='int_ofsted_latest'
) %}
select select
s.urn, s.urn,
s.local_authority_code * 1000 + s.establishment_number as laestab, s.local_authority_code * 1000 + s.establishment_number as laestab,
@@ -30,11 +32,19 @@ select
s.nursery_provision, s.nursery_provision,
s.admissions_policy, s.admissions_policy,
-- Latest Ofsted -- Latest Ofsted (populated after monthly Ofsted pipeline runs)
{% if ofsted_relation is not none %}
o.overall_effectiveness as ofsted_grade, o.overall_effectiveness as ofsted_grade,
o.inspection_date as ofsted_date, o.inspection_date as ofsted_date,
o.framework as ofsted_framework o.framework as ofsted_framework
{% else %}
null::text as ofsted_grade,
null::date as ofsted_date,
null::text as ofsted_framework
{% endif %}
from schools s from schools s
left join latest_ofsted o on s.urn = o.urn {% if ofsted_relation is not none %}
left join {{ ref('int_ofsted_latest') }} o on s.urn = o.urn
{% endif %}
where s.status = 'Open' where s.status = 'Open'

View File

@@ -8,9 +8,9 @@ with source as (
renamed as ( renamed as (
select select
cast("URN" as integer) as urn, cast("URN" as integer) as urn,
cast("LA (code)" as integer) as local_authority_code, cast(nullif("LA (code)", '') as integer) as local_authority_code,
"LA (name)" as local_authority_name, "LA (name)" as local_authority_name,
cast("EstablishmentNumber" as integer) as establishment_number, cast(nullif("EstablishmentNumber", '') as integer) as establishment_number,
"EstablishmentName" as school_name, "EstablishmentName" as school_name,
"TypeOfEstablishment (name)" as school_type, "TypeOfEstablishment (name)" as school_type,
"PhaseOfEducation (name)" as phase, "PhaseOfEducation (name)" as phase,
@@ -18,7 +18,7 @@ renamed as (
"ReligiousCharacter (name)" as religious_character, "ReligiousCharacter (name)" as religious_character,
"AdmissionsPolicy (name)" as admissions_policy, "AdmissionsPolicy (name)" as admissions_policy,
"SchoolCapacity" as capacity, "SchoolCapacity" as capacity,
cast("NumberOfPupils" as integer) as total_pupils, cast(nullif("NumberOfPupils", '') as integer) as total_pupils,
"HeadTitle (name)" as head_title, "HeadTitle (name)" as head_title,
"HeadFirstName" as head_first_name, "HeadFirstName" as head_first_name,
"HeadLastName" as head_last_name, "HeadLastName" as head_last_name,
@@ -30,18 +30,17 @@ renamed as (
"County (name)" as county, "County (name)" as county,
"Postcode" as postcode, "Postcode" as postcode,
"EstablishmentStatus (name)" as status, "EstablishmentStatus (name)" as status,
cast("OpenDate" as date) as open_date, case when "OpenDate" = '' then null else cast("OpenDate" as date) end as open_date,
cast("CloseDate" as date) as close_date, case when "CloseDate" = '' then null else cast("CloseDate" as date) end as close_date,
"Trusts (name)" as academy_trust_name, "Trusts (name)" as academy_trust_name,
cast("Trusts (code)" as integer) as academy_trust_uid, cast(nullif("Trusts (code)", '') as integer) as academy_trust_uid,
"UrbanRural (name)" as urban_rural, "UrbanRural (name)" as urban_rural,
"ParliamentaryConstituency (name)" as parliamentary_constituency, "ParliamentaryConstituency (name)" as parliamentary_constituency,
"NurseryProvision (name)" as nursery_provision, "NurseryProvision (name)" as nursery_provision,
cast("Easting" as integer) as easting, cast(nullif("Easting", '') as integer) as easting,
cast("Northing" as integer) as northing, cast(nullif("Northing", '') as integer) as northing,
-- Age range cast(nullif("StatutoryLowAge", '') as integer) as statutory_low_age,
cast("StatutoryLowAge" as integer) as statutory_low_age, cast(nullif("StatutoryHighAge", '') as integer) as statutory_high_age
cast("StatutoryHighAge" as integer) as statutory_high_age
from source from source
where "URN" is not null where "URN" is not null
) )

View File

@@ -9,7 +9,7 @@ renamed as (
cast("URN" as integer) as urn, cast("URN" as integer) as urn,
cast("LinkURN" as integer) as linked_urn, cast("LinkURN" as integer) as linked_urn,
"LinkType" as link_type, "LinkType" as link_type,
cast("LinkEstablishedDate" as date) as link_date case when "LinkEstablishedDate" = '' then null else cast("LinkEstablishedDate" as date) end as link_date
from source from source
where "URN" is not null where "URN" is not null
and "LinkURN" is not null and "LinkURN" is not null