feat(ees): rewrite EES tap and KS2 models for actual data structure
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 31s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m8s
Build and Push Docker Images / Build Integrator (push) Successful in 55s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m45s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

- Fix publication slugs (KS4, Phonics, Admissions were wrong)
- Split KS2 into two streams: ees_ks2_attainment (long format) and
  ees_ks2_info (wide format context data)
- Target specific filenames instead of keyword matching
- Handle school_urn vs urn column naming
- Pivot KS2 attainment from long to wide format in dbt staging
- Add all ~40 KS2 columns the backend needs (GPS, absence, gender,
  disadvantaged breakdowns, context demographics)
- Pass through all columns in int_ks2_with_lineage and fact_ks2

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 23:08:50 +00:00
parent 719f06e480
commit d82e36e7b2
5 changed files with 354 additions and 99 deletions

View File

@@ -5,21 +5,16 @@ with current_ks2 as (
select
urn as current_urn,
urn as source_urn,
year,
total_pupils,
rwm_expected_pct,
reading_expected_pct,
writing_expected_pct,
maths_expected_pct,
rwm_high_pct,
reading_high_pct,
writing_high_pct,
maths_high_pct,
reading_progress,
writing_progress,
maths_progress,
reading_avg_score,
maths_avg_score
year, total_pupils, eligible_pupils,
rwm_expected_pct, rwm_high_pct,
reading_expected_pct, reading_high_pct, reading_avg_score, reading_progress,
writing_expected_pct, writing_high_pct, writing_progress,
maths_expected_pct, maths_high_pct, maths_avg_score, maths_progress,
gps_expected_pct, gps_high_pct, gps_avg_score, science_expected_pct,
reading_absence_pct, writing_absence_pct, maths_absence_pct, gps_absence_pct, science_absence_pct,
rwm_expected_boys_pct, rwm_high_boys_pct, rwm_expected_girls_pct, rwm_high_girls_pct,
rwm_expected_disadvantaged_pct, rwm_expected_non_disadvantaged_pct, disadvantaged_gap,
disadvantaged_pct, eal_pct, sen_support_pct, sen_ehcp_pct, stability_pct
from {{ ref('stg_ees_ks2') }}
),
@@ -27,25 +22,19 @@ predecessor_ks2 as (
select
lin.current_urn,
ks2.urn as source_urn,
ks2.year,
ks2.total_pupils,
ks2.rwm_expected_pct,
ks2.reading_expected_pct,
ks2.writing_expected_pct,
ks2.maths_expected_pct,
ks2.rwm_high_pct,
ks2.reading_high_pct,
ks2.writing_high_pct,
ks2.maths_high_pct,
ks2.reading_progress,
ks2.writing_progress,
ks2.maths_progress,
ks2.reading_avg_score,
ks2.maths_avg_score
ks2.year, ks2.total_pupils, ks2.eligible_pupils,
ks2.rwm_expected_pct, ks2.rwm_high_pct,
ks2.reading_expected_pct, ks2.reading_high_pct, ks2.reading_avg_score, ks2.reading_progress,
ks2.writing_expected_pct, ks2.writing_high_pct, ks2.writing_progress,
ks2.maths_expected_pct, ks2.maths_high_pct, ks2.maths_avg_score, ks2.maths_progress,
ks2.gps_expected_pct, ks2.gps_high_pct, ks2.gps_avg_score, ks2.science_expected_pct,
ks2.reading_absence_pct, ks2.writing_absence_pct, ks2.maths_absence_pct, ks2.gps_absence_pct, ks2.science_absence_pct,
ks2.rwm_expected_boys_pct, ks2.rwm_high_boys_pct, ks2.rwm_expected_girls_pct, ks2.rwm_high_girls_pct,
ks2.rwm_expected_disadvantaged_pct, ks2.rwm_expected_non_disadvantaged_pct, ks2.disadvantaged_gap,
ks2.disadvantaged_pct, ks2.eal_pct, ks2.sen_support_pct, ks2.sen_ehcp_pct, ks2.stability_pct
from {{ ref('stg_ees_ks2') }} ks2
inner join {{ ref('int_school_lineage') }} lin
on ks2.urn = lin.predecessor_urn
-- Only include predecessor data for years before the current URN has data
where not exists (
select 1 from {{ ref('stg_ees_ks2') }} curr
where curr.urn = lin.current_urn

View File

@@ -6,17 +6,50 @@ select
source_urn,
year,
total_pupils,
eligible_pupils,
-- Core attainment
rwm_expected_pct,
reading_expected_pct,
writing_expected_pct,
maths_expected_pct,
rwm_high_pct,
reading_expected_pct,
reading_high_pct,
writing_high_pct,
maths_high_pct,
reading_progress,
writing_progress,
maths_progress,
reading_avg_score,
maths_avg_score
reading_progress,
writing_expected_pct,
writing_high_pct,
writing_progress,
maths_expected_pct,
maths_high_pct,
maths_avg_score,
maths_progress,
gps_expected_pct,
gps_high_pct,
gps_avg_score,
science_expected_pct,
-- Absence
reading_absence_pct,
writing_absence_pct,
maths_absence_pct,
gps_absence_pct,
science_absence_pct,
-- Gender
rwm_expected_boys_pct,
rwm_high_boys_pct,
rwm_expected_girls_pct,
rwm_high_girls_pct,
-- Disadvantaged
rwm_expected_disadvantaged_pct,
rwm_expected_non_disadvantaged_pct,
disadvantaged_gap,
-- Context
disadvantaged_pct,
eal_pct,
sen_support_pct,
sen_ehcp_pct,
stability_pct
from {{ ref('int_ks2_with_lineage') }}

View File

@@ -24,8 +24,11 @@ sources:
- name: ofsted_inspections
description: Ofsted Management Information inspection records
- name: ees_ks2
description: KS2 attainment data from Explore Education Statistics
- name: ees_ks2_attainment
description: KS2 school attainment (long format — one row per school × subject × breakdown)
- name: ees_ks2_info
description: KS2 school information (wide format — context/demographics per school)
- name: ees_ks4
description: KS4 attainment data from Explore Education Statistics

View File

@@ -1,31 +1,185 @@
-- Staging model: KS2 attainment data from EES
-- Column names depend on the EES dataset schema; these will be finalised
-- once the tap-uk-ees extractor resolves the actual column names.
-- Staging model: KS2 attainment + information
-- Pivots long-format attainment data (one row per subject × breakdown) into
-- wide format (one row per school per year) and joins context from info table.
-- EES uses 'z' for suppressed values — cast to null via nullif.
with source as (
select * from {{ source('raw', 'ees_ks2') }}
with attainment as (
    -- Raw long-format KS2 attainment rows from the EES source.
    -- Rows with no school URN are discarded here, so every later CTE can key on school_urn.
    select src.*
    from {{ source('raw', 'ees_ks2_attainment') }} as src
    where src.school_urn is not null
),
renamed as (
-- Pivot: extract metrics for each subject where breakdown = 'Total'
all_pupils as (
select
cast(urn as integer) as urn,
cast(time_period as integer) as year,
cast(t_pupils as integer) as total_pupils,
cast(pt_rwm_met_expected_standard as numeric) as rwm_expected_pct,
cast(pt_read_met_expected_standard as numeric) as reading_expected_pct,
cast(pt_write_met_expected_standard as numeric) as writing_expected_pct,
cast(pt_maths_met_expected_standard as numeric) as maths_expected_pct,
cast(pt_rwm_met_higher_standard as numeric) as rwm_high_pct,
cast(pt_read_met_higher_standard as numeric) as reading_high_pct,
cast(pt_write_met_higher_standard as numeric) as writing_high_pct,
cast(pt_maths_met_higher_standard as numeric) as maths_high_pct,
cast(read_progress as numeric) as reading_progress,
cast(write_progress as numeric) as writing_progress,
cast(maths_progress as numeric) as maths_progress,
cast(read_average_score as numeric) as reading_avg_score,
cast(maths_average_score as numeric) as maths_avg_score
from source
where urn is not null
school_urn,
time_period,
subject,
-- 'z' is the EES marker for a suppressed value; nullif turns it into null so
-- the numeric casts applied to these columns later do not fail on the marker.
nullif(expected_standard_pupil_percent, 'z') as expected_pct,
nullif(higher_standard_pupil_percent, 'z') as higher_pct,
nullif(average_scaled_score, 'z') as avg_score,
nullif(progress_measure_score, 'z') as progress,
nullif(absent_or_not_able_to_access_percent, 'z') as absence_pct
from attainment
-- Keep only the whole-cohort rows; gender and disadvantage breakdowns are
-- read from attainment by their own CTEs.
where breakdown_topic = 'All pupils'
and breakdown = 'Total'
),
pivoted as (
    -- Long -> wide: collapse the all-pupils rows to one row per
    -- school_urn / time_period, spreading each subject's measures into its own
    -- columns. Inside every max(), rows of other subjects evaluate to null and
    -- are therefore ignored by the aggregate.
    select
        cast(school_urn as integer) as urn,
        cast(time_period as integer) as year,

        -- Reading, writing and maths combined
        max(case subject when 'Reading, writing and maths' then cast(expected_pct as numeric) end) as rwm_expected_pct,
        max(case subject when 'Reading, writing and maths' then cast(higher_pct as numeric) end) as rwm_high_pct,

        -- Reading
        max(case subject when 'Reading' then cast(expected_pct as numeric) end) as reading_expected_pct,
        max(case subject when 'Reading' then cast(higher_pct as numeric) end) as reading_high_pct,
        max(case subject when 'Reading' then cast(avg_score as numeric) end) as reading_avg_score,
        max(case subject when 'Reading' then cast(progress as numeric) end) as reading_progress,
        max(case subject when 'Reading' then cast(absence_pct as numeric) end) as reading_absence_pct,

        -- Writing (no average scaled score column is produced for this subject)
        max(case subject when 'Writing' then cast(expected_pct as numeric) end) as writing_expected_pct,
        max(case subject when 'Writing' then cast(higher_pct as numeric) end) as writing_high_pct,
        max(case subject when 'Writing' then cast(progress as numeric) end) as writing_progress,
        max(case subject when 'Writing' then cast(absence_pct as numeric) end) as writing_absence_pct,

        -- Maths
        max(case subject when 'Maths' then cast(expected_pct as numeric) end) as maths_expected_pct,
        max(case subject when 'Maths' then cast(higher_pct as numeric) end) as maths_high_pct,
        max(case subject when 'Maths' then cast(avg_score as numeric) end) as maths_avg_score,
        max(case subject when 'Maths' then cast(progress as numeric) end) as maths_progress,
        max(case subject when 'Maths' then cast(absence_pct as numeric) end) as maths_absence_pct,

        -- Grammar, punctuation and spelling: matched either by the literal
        -- label 'GPS' or by any subject label containing "grammar"
        max(case when subject = 'GPS' or subject ilike '%grammar%' then cast(expected_pct as numeric) end) as gps_expected_pct,
        max(case when subject = 'GPS' or subject ilike '%grammar%' then cast(higher_pct as numeric) end) as gps_high_pct,
        max(case when subject = 'GPS' or subject ilike '%grammar%' then cast(avg_score as numeric) end) as gps_avg_score,
        max(case when subject = 'GPS' or subject ilike '%grammar%' then cast(absence_pct as numeric) end) as gps_absence_pct,

        -- Science (expected standard and absence only)
        max(case subject when 'Science' then cast(expected_pct as numeric) end) as science_expected_pct,
        max(case subject when 'Science' then cast(absence_pct as numeric) end) as science_absence_pct
    from all_pupils
    group by
        school_urn,
        time_period
),
-- Gender breakdown for RWM
gender_boys as (
    -- RWM (reading, writing and maths) results for boys, reduced to exactly one
    -- row per school_urn / time_period.
    -- The final select left-joins this CTE on school and period, so any repeated
    -- source row for the same key would multiply output rows and break the
    -- one-row-per-school-per-year grain. Aggregating with max(), as the
    -- all-pupils pivot already does, removes that fan-out.
    -- 'z' (EES suppression marker) becomes null before the numeric cast.
    select
        school_urn,
        time_period,
        max(cast(nullif(expected_standard_pupil_percent, 'z') as numeric)) as rwm_expected_boys_pct,
        max(cast(nullif(higher_standard_pupil_percent, 'z') as numeric)) as rwm_high_boys_pct
    from attainment
    where subject = 'Reading, writing and maths'
      and breakdown = 'Boys'
    group by
        school_urn,
        time_period
),
gender_girls as (
    -- RWM (reading, writing and maths) results for girls, reduced to exactly one
    -- row per school_urn / time_period.
    -- The final select left-joins this CTE on school and period, so any repeated
    -- source row for the same key would multiply output rows and break the
    -- one-row-per-school-per-year grain. Aggregating with max(), as the
    -- all-pupils pivot already does, removes that fan-out.
    -- 'z' (EES suppression marker) becomes null before the numeric cast.
    select
        school_urn,
        time_period,
        max(cast(nullif(expected_standard_pupil_percent, 'z') as numeric)) as rwm_expected_girls_pct,
        max(cast(nullif(higher_standard_pupil_percent, 'z') as numeric)) as rwm_high_girls_pct
    from attainment
    where subject = 'Reading, writing and maths'
      and breakdown = 'Girls'
    group by
        school_urn,
        time_period
),
-- Disadvantaged breakdown for RWM
disadv as (
    -- RWM expected-standard percentage for disadvantaged pupils, reduced to
    -- exactly one row per school_urn / time_period.
    -- The final select left-joins this CTE on school and period, so any repeated
    -- source row for the same key would multiply output rows. Aggregating with
    -- max(), as the all-pupils pivot already does, removes that fan-out.
    -- 'z' (EES suppression marker) becomes null before the numeric cast.
    select
        school_urn,
        time_period,
        max(cast(nullif(expected_standard_pupil_percent, 'z') as numeric)) as rwm_expected_disadvantaged_pct
    from attainment
    where subject = 'Reading, writing and maths'
      and breakdown = 'Disadvantaged'
    group by
        school_urn,
        time_period
),
not_disadv as (
    -- RWM expected-standard percentage for pupils who are not disadvantaged,
    -- reduced to exactly one row per school_urn / time_period.
    -- The final select left-joins this CTE on school and period, so any repeated
    -- source row for the same key would multiply output rows. Aggregating with
    -- max(), as the all-pupils pivot already does, removes that fan-out.
    -- 'z' (EES suppression marker) becomes null before the numeric cast.
    select
        school_urn,
        time_period,
        max(cast(nullif(expected_standard_pupil_percent, 'z') as numeric)) as rwm_expected_non_disadvantaged_pct
    from attainment
    where subject = 'Reading, writing and maths'
      and breakdown = 'Not disadvantaged'
    group by
        school_urn,
        time_period
),
-- School info (context/demographics)
info as (
    -- Wide-format school context rows from the EES KS2 information source.
    -- Keys are cast to integer; every measure has the 'z' suppression marker
    -- replaced by null before it is cast to its numeric type.
    select
        cast(ctx.school_urn as integer) as urn,
        cast(ctx.time_period as integer) as year,

        -- Pupil counts
        cast(nullif(ctx.totpups, 'z') as integer) as total_pupils,
        cast(nullif(ctx.telig, 'z') as integer) as eligible_pupils,

        -- Context percentages, renamed from the source's short column codes
        cast(nullif(ctx.ptfsm6cla1a, 'z') as numeric) as disadvantaged_pct,
        cast(nullif(ctx.ptealgrp2, 'z') as numeric) as eal_pct,
        cast(nullif(ctx.psenelk, 'z') as numeric) as sen_support_pct,
        cast(nullif(ctx.psenele, 'z') as numeric) as sen_ehcp_pct,
        cast(nullif(ctx.ptmobn, 'z') as numeric) as stability_pct
    from {{ source('raw', 'ees_ks2_info') }} as ctx
    where ctx.school_urn is not null
)
select * from renamed
-- Final projection: the pivoted all-pupils row for each school/year drives the
-- result; context and the RWM gender / disadvantage breakdowns are attached
-- with left joins, so a school with no matching breakdown row keeps nulls.
select
    att.urn,
    att.year,
    ctx.total_pupils,
    ctx.eligible_pupils,

    -- Core attainment
    att.rwm_expected_pct,
    att.rwm_high_pct,
    att.reading_expected_pct,
    att.reading_high_pct,
    att.reading_avg_score,
    att.reading_progress,
    att.writing_expected_pct,
    att.writing_high_pct,
    att.writing_progress,
    att.maths_expected_pct,
    att.maths_high_pct,
    att.maths_avg_score,
    att.maths_progress,
    att.gps_expected_pct,
    att.gps_high_pct,
    att.gps_avg_score,
    att.science_expected_pct,

    -- Absence
    att.reading_absence_pct,
    att.writing_absence_pct,
    att.maths_absence_pct,
    att.gps_absence_pct,
    att.science_absence_pct,

    -- Gender
    cast(boys.rwm_expected_boys_pct as numeric) as rwm_expected_boys_pct,
    cast(boys.rwm_high_boys_pct as numeric) as rwm_high_boys_pct,
    cast(girls.rwm_expected_girls_pct as numeric) as rwm_expected_girls_pct,
    cast(girls.rwm_high_girls_pct as numeric) as rwm_high_girls_pct,

    -- Disadvantaged
    cast(dis.rwm_expected_disadvantaged_pct as numeric) as rwm_expected_disadvantaged_pct,
    cast(non_dis.rwm_expected_non_disadvantaged_pct as numeric) as rwm_expected_non_disadvantaged_pct,
    -- Gap is disadvantaged minus non-disadvantaged, so it is negative when
    -- disadvantaged pupils score lower; null if either side is missing.
    cast(dis.rwm_expected_disadvantaged_pct as numeric)
        - cast(non_dis.rwm_expected_non_disadvantaged_pct as numeric) as disadvantaged_gap,

    -- Context
    ctx.disadvantaged_pct,
    ctx.eal_pct,
    ctx.sen_support_pct,
    ctx.sen_ehcp_pct,
    ctx.stability_pct
from pivoted as att
left join info as ctx
    on att.urn = ctx.urn
    and att.year = ctx.year
-- The breakdown CTEs expose the raw school_urn / time_period columns, so they
-- are cast to integer here to line up with the pivot's urn / year keys.
left join gender_boys as boys
    on att.urn = cast(boys.school_urn as integer)
    and att.year = cast(boys.time_period as integer)
left join gender_girls as girls
    on att.urn = cast(girls.school_urn as integer)
    and att.year = cast(girls.time_period as integer)
left join disadv as dis
    on att.urn = cast(dis.school_urn as integer)
    and att.year = cast(dis.time_period as integer)
left join not_disadv as non_dis
    on att.urn = cast(non_dis.school_urn as integer)
    and att.year = cast(non_dis.time_period as integer)