feat(legacy-ks2): add stream for pre-COVID KS2 data (2015-2019)
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 46s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m17s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 2m26s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

- Add LegacyKS2Stream to tap-uk-ees: downloads old DfE england_ks2final.csv
  files from a configurable base URL, maps 318-column wide format to the
  same schema as stg_ees_ks2 output
- Add stg_legacy_ks2.sql staging model with safe_numeric casts
- Add legacy_ks2 source to _stg_sources.yml
- Update int_ks2_with_lineage.sql to union EES + legacy data
- Configurable via legacy_ks2_base_url and legacy_ks2_years tap settings

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
Tudor Sitaru
2026-03-31 14:36:41 +01:00
parent fc011c6547
commit 6d4962639c
4 changed files with 235 additions and 4 deletions

View File

@@ -1,7 +1,14 @@
-- Intermediate model: KS2 data chained across academy conversions
-- Maps predecessor URN data to the current active URN
-- Unions EES (2022+) and legacy (2015-2019) school-level data
with current_ks2 as (
with all_ks2 as (
select * from {{ ref('stg_ees_ks2') }}
union all
select * from {{ ref('stg_legacy_ks2') }}
),
current_ks2 as (
select
urn as current_urn,
urn as source_urn,
@@ -15,7 +22,7 @@ with current_ks2 as (
rwm_expected_boys_pct, rwm_high_boys_pct, rwm_expected_girls_pct, rwm_high_girls_pct,
rwm_expected_disadvantaged_pct, rwm_expected_non_disadvantaged_pct, disadvantaged_gap,
disadvantaged_pct, eal_pct, sen_support_pct, sen_ehcp_pct, stability_pct
from {{ ref('stg_ees_ks2') }}
from all_ks2
),
predecessor_ks2 as (
@@ -33,11 +40,11 @@ predecessor_ks2 as (
ks2.rwm_expected_boys_pct, ks2.rwm_high_boys_pct, ks2.rwm_expected_girls_pct, ks2.rwm_high_girls_pct,
ks2.rwm_expected_disadvantaged_pct, ks2.rwm_expected_non_disadvantaged_pct, ks2.disadvantaged_gap,
ks2.disadvantaged_pct, ks2.eal_pct, ks2.sen_support_pct, ks2.sen_ehcp_pct, ks2.stability_pct
from {{ ref('stg_ees_ks2') }} ks2
from all_ks2 ks2
inner join {{ ref('int_school_lineage') }} lin
on ks2.urn = lin.predecessor_urn
where not exists (
select 1 from {{ ref('stg_ees_ks2') }} curr
select 1 from all_ks2 curr
where curr.urn = lin.current_urn
and curr.year = ks2.year
)

View File

@@ -30,6 +30,9 @@ sources:
- name: ees_ks2_info
description: KS2 school information (wide format — context/demographics per school)
- name: legacy_ks2
description: Pre-COVID KS2 school-level data (2015-16 to 2018-19) from DfE performance tables
- name: ees_ks4_performance
description: KS4 performance tables (long format — one row per school × breakdown × sex)

View File

@@ -0,0 +1,56 @@
{{ config(materialized='table') }}

-- Staging model: legacy KS2 data from pre-COVID DfE performance tables.
--
-- The tap already renames the old columns to match stg_ees_ks2 output, so this
-- model only:
--   * casts urn / year to integer and runs every measure through the
--     safe_numeric macro,
--   * emits science_expected_pct, writing_absence_pct and science_absence_pct
--     as typed nulls,
--   * derives disadvantaged_gap from the two disadvantaged RWM measures,
--   * keeps only rows whose (trimmed) URN is present and purely numeric.
--
-- Column order and names are significant: int_ks2_with_lineage combines this
-- model with stg_ees_ks2 through a "select * ... union all", which matches
-- columns by position. Do not reorder or rename columns here without updating
-- both models.

select
    cast(trim(urn) as integer) as urn,
    -- NOTE(review): year is cast without validation; a non-numeric value would
    -- fail the whole build. Confirm the tap always emits integer-like years.
    cast(trim(year) as integer) as year,
    {{ safe_numeric('total_pupils') }}::integer as total_pupils,
    {{ safe_numeric('eligible_pupils') }}::integer as eligible_pupils,

    -- Reading / writing / maths combined
    {{ safe_numeric('rwm_expected_pct') }} as rwm_expected_pct,
    {{ safe_numeric('rwm_high_pct') }} as rwm_high_pct,

    -- Reading
    {{ safe_numeric('reading_expected_pct') }} as reading_expected_pct,
    {{ safe_numeric('reading_high_pct') }} as reading_high_pct,
    {{ safe_numeric('reading_avg_score') }} as reading_avg_score,
    {{ safe_numeric('reading_progress') }} as reading_progress,

    -- Writing
    {{ safe_numeric('writing_expected_pct') }} as writing_expected_pct,
    {{ safe_numeric('writing_high_pct') }} as writing_high_pct,
    {{ safe_numeric('writing_progress') }} as writing_progress,

    -- Maths
    {{ safe_numeric('maths_expected_pct') }} as maths_expected_pct,
    {{ safe_numeric('maths_high_pct') }} as maths_high_pct,
    {{ safe_numeric('maths_avg_score') }} as maths_avg_score,
    {{ safe_numeric('maths_progress') }} as maths_progress,

    -- Grammar, punctuation and spelling
    {{ safe_numeric('gps_expected_pct') }} as gps_expected_pct,
    {{ safe_numeric('gps_high_pct') }} as gps_high_pct,
    {{ safe_numeric('gps_avg_score') }} as gps_avg_score,

    -- Typed null placeholder keeps the column list aligned for the union.
    null::numeric as science_expected_pct,

    -- Absence measures (writing and science are typed null placeholders)
    {{ safe_numeric('reading_absence_pct') }} as reading_absence_pct,
    null::numeric as writing_absence_pct,
    {{ safe_numeric('maths_absence_pct') }} as maths_absence_pct,
    {{ safe_numeric('gps_absence_pct') }} as gps_absence_pct,
    null::numeric as science_absence_pct,

    -- RWM by sex
    {{ safe_numeric('rwm_expected_boys_pct') }} as rwm_expected_boys_pct,
    {{ safe_numeric('rwm_high_boys_pct') }} as rwm_high_boys_pct,
    {{ safe_numeric('rwm_expected_girls_pct') }} as rwm_expected_girls_pct,
    {{ safe_numeric('rwm_high_girls_pct') }} as rwm_high_girls_pct,

    -- RWM by disadvantage
    {{ safe_numeric('rwm_expected_disadvantaged_pct') }} as rwm_expected_disadvantaged_pct,
    {{ safe_numeric('rwm_expected_non_disadvantaged_pct') }} as rwm_expected_non_disadvantaged_pct,
    -- Gap = disadvantaged minus non-disadvantaged; null when either side is null.
    -- NOTE(review): confirm this sign convention matches stg_ees_ks2.
    ({{ safe_numeric('rwm_expected_disadvantaged_pct') }})
        - ({{ safe_numeric('rwm_expected_non_disadvantaged_pct') }}) as disadvantaged_gap,

    -- School context
    {{ safe_numeric('disadvantaged_pct') }} as disadvantaged_pct,
    {{ safe_numeric('eal_pct') }} as eal_pct,
    {{ safe_numeric('sen_support_pct') }} as sen_support_pct,
    {{ safe_numeric('sen_ehcp_pct') }} as sen_ehcp_pct,
    {{ safe_numeric('stability_pct') }} as stability_pct
from {{ source('raw', 'legacy_ks2') }}
where urn is not null
    -- Validate the trimmed value so the filter agrees with the cast above;
    -- testing the raw value would silently drop URNs padded with whitespace.
    and trim(urn) ~ '^[0-9]+$'