fix(pipeline): expand GIAS schema, handle empty strings, scope DAG selectors
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m8s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 34s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m39s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m8s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 34s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m39s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
- Declare all 34 columns needed by dbt in GIAS tap schema (target-postgres only persists columns present in the Singer schema message) - Use nullif() for empty-string-to-integer/date casts in staging models - Scope daily DAG dbt build to GIAS models only (stg_gias_establishments+ stg_gias_links+) to avoid errors on unloaded sources - Scope annual EES DAG similarly; remove redundant dbt test steps - Make dim_school gracefully handle missing int_ofsted_latest table Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -79,12 +79,7 @@ print(f'Validation passed: {{count}} GIAS rows')
|
|||||||
|
|
||||||
dbt_build = BashOperator(
|
dbt_build = BashOperator(
|
||||||
task_id="dbt_build",
|
task_id="dbt_build",
|
||||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production",
|
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_gias_establishments+ stg_gias_links+",
|
||||||
)
|
|
||||||
|
|
||||||
dbt_test = BashOperator(
|
|
||||||
task_id="dbt_test",
|
|
||||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} test --profiles-dir . --target production",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
geocode_new = BashOperator(
|
geocode_new = BashOperator(
|
||||||
@@ -97,7 +92,7 @@ print(f'Validation passed: {{count}} GIAS rows')
|
|||||||
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
|
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
|
||||||
)
|
)
|
||||||
|
|
||||||
extract_group >> validate_raw >> dbt_build >> dbt_test >> geocode_new >> sync_typesense
|
extract_group >> validate_raw >> dbt_build >> geocode_new >> sync_typesense
|
||||||
|
|
||||||
|
|
||||||
# ── Monthly DAG (Ofsted) ───────────────────────────────────────────────
|
# ── Monthly DAG (Ofsted) ───────────────────────────────────────────────
|
||||||
@@ -150,12 +145,7 @@ with DAG(
|
|||||||
|
|
||||||
dbt_build_ees = BashOperator(
|
dbt_build_ees = BashOperator(
|
||||||
task_id="dbt_build",
|
task_id="dbt_build",
|
||||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production",
|
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+ stg_ees_phonics+",
|
||||||
)
|
|
||||||
|
|
||||||
dbt_test_ees = BashOperator(
|
|
||||||
task_id="dbt_test",
|
|
||||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} test --profiles-dir . --target production",
|
|
||||||
)
|
)
|
||||||
|
|
||||||
sync_typesense_ees = BashOperator(
|
sync_typesense_ees = BashOperator(
|
||||||
@@ -163,4 +153,4 @@ with DAG(
|
|||||||
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
|
bash_command=f"cd {PIPELINE_DIR} && python scripts/sync_typesense.py",
|
||||||
)
|
)
|
||||||
|
|
||||||
extract_ees_group >> dbt_build_ees >> dbt_test_ees >> sync_typesense_ees
|
extract_ees_group >> dbt_build_ees >> sync_typesense_ees
|
||||||
|
|||||||
@@ -25,9 +25,9 @@ class GIASEstablishmentsStream(Stream):
|
|||||||
primary_keys = ["URN"]
|
primary_keys = ["URN"]
|
||||||
replication_key = None
|
replication_key = None
|
||||||
|
|
||||||
# Schema is wide (~250 columns); we declare key columns and pass through the rest
|
# All columns used by dbt staging models must be declared here —
|
||||||
# All columns are read as strings from CSV; dbt staging models handle type casting.
|
# target-postgres only persists columns present in the Singer schema.
|
||||||
# Only URN is cast to int in get_records() for the primary key.
|
# All non-PK columns are StringType; dbt handles type casting.
|
||||||
schema = th.PropertiesList(
|
schema = th.PropertiesList(
|
||||||
th.Property("URN", th.IntegerType, required=True),
|
th.Property("URN", th.IntegerType, required=True),
|
||||||
th.Property("EstablishmentName", th.StringType),
|
th.Property("EstablishmentName", th.StringType),
|
||||||
@@ -38,6 +38,31 @@ class GIASEstablishmentsStream(Stream):
|
|||||||
th.Property("EstablishmentNumber", th.StringType),
|
th.Property("EstablishmentNumber", th.StringType),
|
||||||
th.Property("EstablishmentStatus (name)", th.StringType),
|
th.Property("EstablishmentStatus (name)", th.StringType),
|
||||||
th.Property("Postcode", th.StringType),
|
th.Property("Postcode", th.StringType),
|
||||||
|
th.Property("Gender (name)", th.StringType),
|
||||||
|
th.Property("ReligiousCharacter (name)", th.StringType),
|
||||||
|
th.Property("AdmissionsPolicy (name)", th.StringType),
|
||||||
|
th.Property("SchoolCapacity", th.StringType),
|
||||||
|
th.Property("NumberOfPupils", th.StringType),
|
||||||
|
th.Property("HeadTitle (name)", th.StringType),
|
||||||
|
th.Property("HeadFirstName", th.StringType),
|
||||||
|
th.Property("HeadLastName", th.StringType),
|
||||||
|
th.Property("TelephoneNum", th.StringType),
|
||||||
|
th.Property("SchoolWebsite", th.StringType),
|
||||||
|
th.Property("Street", th.StringType),
|
||||||
|
th.Property("Locality", th.StringType),
|
||||||
|
th.Property("Town", th.StringType),
|
||||||
|
th.Property("County (name)", th.StringType),
|
||||||
|
th.Property("OpenDate", th.StringType),
|
||||||
|
th.Property("CloseDate", th.StringType),
|
||||||
|
th.Property("Trusts (name)", th.StringType),
|
||||||
|
th.Property("Trusts (code)", th.StringType),
|
||||||
|
th.Property("UrbanRural (name)", th.StringType),
|
||||||
|
th.Property("ParliamentaryConstituency (name)", th.StringType),
|
||||||
|
th.Property("NurseryProvision (name)", th.StringType),
|
||||||
|
th.Property("Easting", th.StringType),
|
||||||
|
th.Property("Northing", th.StringType),
|
||||||
|
th.Property("StatutoryLowAge", th.StringType),
|
||||||
|
th.Property("StatutoryHighAge", th.StringType),
|
||||||
).to_dict()
|
).to_dict()
|
||||||
|
|
||||||
def get_records(self, context):
|
def get_records(self, context):
|
||||||
|
|||||||
@@ -2,12 +2,14 @@
|
|||||||
|
|
||||||
with schools as (
|
with schools as (
|
||||||
select * from {{ ref('stg_gias_establishments') }}
|
select * from {{ ref('stg_gias_establishments') }}
|
||||||
),
|
|
||||||
|
|
||||||
latest_ofsted as (
|
|
||||||
select * from {{ ref('int_ofsted_latest') }}
|
|
||||||
)
|
)
|
||||||
|
|
||||||
|
{% set ofsted_relation = adapter.get_relation(
|
||||||
|
database=target.database,
|
||||||
|
schema=target.schema,
|
||||||
|
identifier='int_ofsted_latest'
|
||||||
|
) %}
|
||||||
|
|
||||||
select
|
select
|
||||||
s.urn,
|
s.urn,
|
||||||
s.local_authority_code * 1000 + s.establishment_number as laestab,
|
s.local_authority_code * 1000 + s.establishment_number as laestab,
|
||||||
@@ -30,11 +32,19 @@ select
|
|||||||
s.nursery_provision,
|
s.nursery_provision,
|
||||||
s.admissions_policy,
|
s.admissions_policy,
|
||||||
|
|
||||||
-- Latest Ofsted
|
-- Latest Ofsted (populated after monthly Ofsted pipeline runs)
|
||||||
|
{% if ofsted_relation is not none %}
|
||||||
o.overall_effectiveness as ofsted_grade,
|
o.overall_effectiveness as ofsted_grade,
|
||||||
o.inspection_date as ofsted_date,
|
o.inspection_date as ofsted_date,
|
||||||
o.framework as ofsted_framework
|
o.framework as ofsted_framework
|
||||||
|
{% else %}
|
||||||
|
null::text as ofsted_grade,
|
||||||
|
null::date as ofsted_date,
|
||||||
|
null::text as ofsted_framework
|
||||||
|
{% endif %}
|
||||||
|
|
||||||
from schools s
|
from schools s
|
||||||
left join latest_ofsted o on s.urn = o.urn
|
{% if ofsted_relation is not none %}
|
||||||
|
left join {{ ref('int_ofsted_latest') }} o on s.urn = o.urn
|
||||||
|
{% endif %}
|
||||||
where s.status = 'Open'
|
where s.status = 'Open'
|
||||||
|
|||||||
@@ -8,9 +8,9 @@ with source as (
|
|||||||
renamed as (
|
renamed as (
|
||||||
select
|
select
|
||||||
cast("URN" as integer) as urn,
|
cast("URN" as integer) as urn,
|
||||||
cast("LA (code)" as integer) as local_authority_code,
|
cast(nullif("LA (code)", '') as integer) as local_authority_code,
|
||||||
"LA (name)" as local_authority_name,
|
"LA (name)" as local_authority_name,
|
||||||
cast("EstablishmentNumber" as integer) as establishment_number,
|
cast(nullif("EstablishmentNumber", '') as integer) as establishment_number,
|
||||||
"EstablishmentName" as school_name,
|
"EstablishmentName" as school_name,
|
||||||
"TypeOfEstablishment (name)" as school_type,
|
"TypeOfEstablishment (name)" as school_type,
|
||||||
"PhaseOfEducation (name)" as phase,
|
"PhaseOfEducation (name)" as phase,
|
||||||
@@ -18,7 +18,7 @@ renamed as (
|
|||||||
"ReligiousCharacter (name)" as religious_character,
|
"ReligiousCharacter (name)" as religious_character,
|
||||||
"AdmissionsPolicy (name)" as admissions_policy,
|
"AdmissionsPolicy (name)" as admissions_policy,
|
||||||
"SchoolCapacity" as capacity,
|
"SchoolCapacity" as capacity,
|
||||||
cast("NumberOfPupils" as integer) as total_pupils,
|
cast(nullif("NumberOfPupils", '') as integer) as total_pupils,
|
||||||
"HeadTitle (name)" as head_title,
|
"HeadTitle (name)" as head_title,
|
||||||
"HeadFirstName" as head_first_name,
|
"HeadFirstName" as head_first_name,
|
||||||
"HeadLastName" as head_last_name,
|
"HeadLastName" as head_last_name,
|
||||||
@@ -30,18 +30,17 @@ renamed as (
|
|||||||
"County (name)" as county,
|
"County (name)" as county,
|
||||||
"Postcode" as postcode,
|
"Postcode" as postcode,
|
||||||
"EstablishmentStatus (name)" as status,
|
"EstablishmentStatus (name)" as status,
|
||||||
cast("OpenDate" as date) as open_date,
|
case when "OpenDate" = '' then null else cast("OpenDate" as date) end as open_date,
|
||||||
cast("CloseDate" as date) as close_date,
|
case when "CloseDate" = '' then null else cast("CloseDate" as date) end as close_date,
|
||||||
"Trusts (name)" as academy_trust_name,
|
"Trusts (name)" as academy_trust_name,
|
||||||
cast("Trusts (code)" as integer) as academy_trust_uid,
|
cast(nullif("Trusts (code)", '') as integer) as academy_trust_uid,
|
||||||
"UrbanRural (name)" as urban_rural,
|
"UrbanRural (name)" as urban_rural,
|
||||||
"ParliamentaryConstituency (name)" as parliamentary_constituency,
|
"ParliamentaryConstituency (name)" as parliamentary_constituency,
|
||||||
"NurseryProvision (name)" as nursery_provision,
|
"NurseryProvision (name)" as nursery_provision,
|
||||||
cast("Easting" as integer) as easting,
|
cast(nullif("Easting", '') as integer) as easting,
|
||||||
cast("Northing" as integer) as northing,
|
cast(nullif("Northing", '') as integer) as northing,
|
||||||
-- Age range
|
cast(nullif("StatutoryLowAge", '') as integer) as statutory_low_age,
|
||||||
cast("StatutoryLowAge" as integer) as statutory_low_age,
|
cast(nullif("StatutoryHighAge", '') as integer) as statutory_high_age
|
||||||
cast("StatutoryHighAge" as integer) as statutory_high_age
|
|
||||||
from source
|
from source
|
||||||
where "URN" is not null
|
where "URN" is not null
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -9,7 +9,7 @@ renamed as (
|
|||||||
cast("URN" as integer) as urn,
|
cast("URN" as integer) as urn,
|
||||||
cast("LinkURN" as integer) as linked_urn,
|
cast("LinkURN" as integer) as linked_urn,
|
||||||
"LinkType" as link_type,
|
"LinkType" as link_type,
|
||||||
cast("LinkEstablishedDate" as date) as link_date
|
case when "LinkEstablishedDate" = '' then null else cast("LinkEstablishedDate" as date) end as link_date
|
||||||
from source
|
from source
|
||||||
where "URN" is not null
|
where "URN" is not null
|
||||||
and "LinkURN" is not null
|
and "LinkURN" is not null
|
||||||
|
|||||||
Reference in New Issue
Block a user