From 6e720feca416894a8b3ada6a9a14d40467231221 Mon Sep 17 00:00:00 2001
From: Tudor
Date: Fri, 27 Mar 2026 11:42:40 +0000
Subject: [PATCH] perf(dbt): collapse stg_ees_ks2 to single-pass pivot
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

Previous version scanned ees_ks2_attainment (1.2M rows) 5 times via
separate CTEs (all_pupils, gender_boys, gender_girls, disadv, not_disadv)
and stitched the four breakdown CTEs back on with 4 LEFT JOINs.
Rewritten as one GROUP BY with conditional aggregation — single scan,
no self-joins. The LEFT JOIN to the info CTE is unchanged.

Behaviour notes:
- A HAVING clause keeps the output limited to school/years that have at
  least one 'All pupils' / 'Total' row, which is the set of rows the old
  all_pupils CTE fed into the pivot. Without it, school/years that only
  have breakdown rows would appear with NULL core attainment.
- Breakdown columns are now aggregated with max(). If a school/year has
  more than one matching breakdown row, the old joins duplicated the
  output row; the pivot now returns a single row with the max value.

Co-Authored-By: Claude Sonnet 4.6
---
 .../transform/models/staging/stg_ees_ks2.sql | 209 +++++++++---------
 1 file changed, 101 insertions(+), 108 deletions(-)

diff --git a/pipeline/transform/models/staging/stg_ees_ks2.sql b/pipeline/transform/models/staging/stg_ees_ks2.sql
index e098a8b..061168e 100644
--- a/pipeline/transform/models/staging/stg_ees_ks2.sql
+++ b/pipeline/transform/models/staging/stg_ees_ks2.sql
@@ -1,8 +1,9 @@
 {{ config(materialized='table') }}
 
 -- Staging model: KS2 attainment + information
--- Pivots long-format attainment data (one row per subject × breakdown) into
--- wide format (one row per school per year) and joins context from info table.
+-- Single-pass pivot: one GROUP BY scan of 1.2M rows handles all subjects
+-- and all breakdowns (All pupils, Boys, Girls, Disadvantaged, Not disadvantaged)
+-- instead of 5 separate CTE scans + 4 self-join LEFT JOINs.
 -- EES uses 'z' (not applicable) and 'c' (confidential) as suppression codes —
 -- safe_numeric handles both by treating any non-numeric string as NULL.
 
@@ -11,110 +12,111 @@ with attainment as (
     where school_urn is not null
 ),
 
--- Pivot: extract metrics for each subject where breakdown = 'Total'
-all_pupils as (
-    select
-        school_urn,
-        time_period,
-        subject,
-        {{ safe_numeric('expected_standard_pupil_percent') }} as expected_pct,
-        {{ safe_numeric('higher_standard_pupil_percent') }} as higher_pct,
-        {{ safe_numeric('average_scaled_score') }} as avg_score,
-        {{ safe_numeric('progress_measure_score') }} as progress,
-        {{ safe_numeric('absent_or_not_able_to_access_percent') }} as absence_pct
-    from attainment
-    where breakdown_topic = 'All pupils'
-      and breakdown = 'Total'
-),
-
 pivoted as (
     select
         cast(school_urn as integer) as urn,
         cast(time_period as integer) as year,
 
-        -- RWM combined
-        max(case when subject = 'Reading, writing and maths' then expected_pct end) as rwm_expected_pct,
-        max(case when subject = 'Reading, writing and maths' then higher_pct end) as rwm_high_pct,
+        -- RWM combined (All pupils / Total)
+        max(case when subject = 'Reading, writing and maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as rwm_expected_pct,
+        max(case when subject = 'Reading, writing and maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as rwm_high_pct,
 
-        -- Reading
-        max(case when subject = 'Reading' then expected_pct end) as reading_expected_pct,
-        max(case when subject = 'Reading' then higher_pct end) as reading_high_pct,
-        max(case when subject = 'Reading' then avg_score end) as reading_avg_score,
-        max(case when subject = 'Reading' then progress end) as reading_progress,
-        max(case when subject = 'Reading' then absence_pct end) as reading_absence_pct,
+        -- Reading (All pupils / Total)
+        max(case when subject = 'Reading'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as reading_expected_pct,
+        max(case when subject = 'Reading'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as reading_high_pct,
+        max(case when subject = 'Reading'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('average_scaled_score') }} end) as reading_avg_score,
+        max(case when subject = 'Reading'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('progress_measure_score') }} end) as reading_progress,
+        max(case when subject = 'Reading'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('absent_or_not_able_to_access_percent') }} end) as reading_absence_pct,
 
-        -- Writing
-        max(case when subject = 'Writing' then expected_pct end) as writing_expected_pct,
-        max(case when subject = 'Writing' then higher_pct end) as writing_high_pct,
-        max(case when subject = 'Writing' then progress end) as writing_progress,
-        max(case when subject = 'Writing' then absence_pct end) as writing_absence_pct,
+        -- Writing (All pupils / Total)
+        max(case when subject = 'Writing'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as writing_expected_pct,
+        max(case when subject = 'Writing'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as writing_high_pct,
+        max(case when subject = 'Writing'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('progress_measure_score') }} end) as writing_progress,
+        max(case when subject = 'Writing'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('absent_or_not_able_to_access_percent') }} end) as writing_absence_pct,
 
-        -- Maths
-        max(case when subject = 'Maths' then expected_pct end) as maths_expected_pct,
-        max(case when subject = 'Maths' then higher_pct end) as maths_high_pct,
-        max(case when subject = 'Maths' then avg_score end) as maths_avg_score,
-        max(case when subject = 'Maths' then progress end) as maths_progress,
-        max(case when subject = 'Maths' then absence_pct end) as maths_absence_pct,
+        -- Maths (All pupils / Total)
+        max(case when subject = 'Maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as maths_expected_pct,
+        max(case when subject = 'Maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as maths_high_pct,
+        max(case when subject = 'Maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('average_scaled_score') }} end) as maths_avg_score,
+        max(case when subject = 'Maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('progress_measure_score') }} end) as maths_progress,
+        max(case when subject = 'Maths'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('absent_or_not_able_to_access_percent') }} end) as maths_absence_pct,
 
-        -- GPS
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then expected_pct end) as gps_expected_pct,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then higher_pct end) as gps_high_pct,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then avg_score end) as gps_avg_score,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then absence_pct end) as gps_absence_pct,
+        -- GPS (All pupils / Total)
+        max(case when (subject ilike '%grammar%' or subject = 'GPS')
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as gps_expected_pct,
+        max(case when (subject ilike '%grammar%' or subject = 'GPS')
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as gps_high_pct,
+        max(case when (subject ilike '%grammar%' or subject = 'GPS')
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('average_scaled_score') }} end) as gps_avg_score,
+        max(case when (subject ilike '%grammar%' or subject = 'GPS')
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('absent_or_not_able_to_access_percent') }} end) as gps_absence_pct,
 
-        -- Science
-        max(case when subject = 'Science' then expected_pct end) as science_expected_pct,
-        max(case when subject = 'Science' then absence_pct end) as science_absence_pct
+        -- Science (All pupils / Total)
+        max(case when subject = 'Science'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as science_expected_pct,
+        max(case when subject = 'Science'
+                  and breakdown_topic = 'All pupils' and breakdown = 'Total'
+                 then {{ safe_numeric('absent_or_not_able_to_access_percent') }} end) as science_absence_pct,
 
-    from all_pupils
+        -- Gender breakdown for RWM
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Boys'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as rwm_expected_boys_pct,
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Boys'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as rwm_high_boys_pct,
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Girls'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as rwm_expected_girls_pct,
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Girls'
+                 then {{ safe_numeric('higher_standard_pupil_percent') }} end) as rwm_high_girls_pct,
+
+        -- Disadvantaged breakdown for RWM
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Disadvantaged'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as rwm_expected_disadvantaged_pct,
+        max(case when subject = 'Reading, writing and maths' and breakdown = 'Not disadvantaged'
+                 then {{ safe_numeric('expected_standard_pupil_percent') }} end) as rwm_expected_non_disadvantaged_pct
+
+    from attainment
     group by school_urn, time_period
+    -- Keep the pre-refactor row set: only school/years with an 'All pupils' / 'Total' row
+    having count(case when breakdown_topic = 'All pupils' and breakdown = 'Total' then 1 end) > 0
 ),
 
--- Gender breakdown for RWM
-gender_boys as (
-    select
-        school_urn,
-        time_period,
-        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_boys_pct,
-        {{ safe_numeric('higher_standard_pupil_percent') }} as rwm_high_boys_pct
-    from attainment
-    where subject = 'Reading, writing and maths'
-      and breakdown = 'Boys'
-),
-
-gender_girls as (
-    select
-        school_urn,
-        time_period,
-        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_girls_pct,
-        {{ safe_numeric('higher_standard_pupil_percent') }} as rwm_high_girls_pct
-    from attainment
-    where subject = 'Reading, writing and maths'
-      and breakdown = 'Girls'
-),
-
--- Disadvantaged breakdown for RWM
-disadv as (
-    select
-        school_urn,
-        time_period,
-        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_disadvantaged_pct
-    from attainment
-    where subject = 'Reading, writing and maths'
-      and breakdown = 'Disadvantaged'
-),
-
-not_disadv as (
-    select
-        school_urn,
-        time_period,
-        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_non_disadvantaged_pct
-    from attainment
-    where subject = 'Reading, writing and maths'
-      and breakdown = 'Not disadvantaged'
-),
-
--- School info (context/demographics)
 info as (
     select
         cast(school_urn as integer) as urn,
@@ -136,7 +138,6 @@ select
     i.total_pupils,
     i.eligible_pupils,
 
-    -- Core attainment
     p.rwm_expected_pct,
     p.rwm_high_pct,
     p.reading_expected_pct,
@@ -155,25 +156,21 @@ select
     p.gps_avg_score,
     p.science_expected_pct,
 
-    -- Absence
     p.reading_absence_pct,
     p.writing_absence_pct,
     p.maths_absence_pct,
     p.gps_absence_pct,
     p.science_absence_pct,
 
-    -- Gender
-    gb.rwm_expected_boys_pct,
-    gb.rwm_high_boys_pct,
-    gg.rwm_expected_girls_pct,
-    gg.rwm_high_girls_pct,
+    p.rwm_expected_boys_pct,
+    p.rwm_high_boys_pct,
+    p.rwm_expected_girls_pct,
+    p.rwm_high_girls_pct,
 
-    -- Disadvantaged
-    d.rwm_expected_disadvantaged_pct,
-    nd.rwm_expected_non_disadvantaged_pct,
-    d.rwm_expected_disadvantaged_pct - nd.rwm_expected_non_disadvantaged_pct as disadvantaged_gap,
+    p.rwm_expected_disadvantaged_pct,
+    p.rwm_expected_non_disadvantaged_pct,
+    p.rwm_expected_disadvantaged_pct - p.rwm_expected_non_disadvantaged_pct as disadvantaged_gap,
 
-    -- Context
     i.disadvantaged_pct,
     i.eal_pct,
     i.sen_support_pct,
@@ -182,7 +179,3 @@ select
 
 from pivoted p
 left join info i on p.urn = i.urn and p.year = i.year
-left join gender_boys gb on p.urn = cast(gb.school_urn as integer) and p.year = cast(gb.time_period as integer)
-left join gender_girls gg on p.urn = cast(gg.school_urn as integer) and p.year = cast(gg.time_period as integer)
-left join disadv d on p.urn = cast(d.school_urn as integer) and p.year = cast(d.time_period as integer)
-left join not_disadv nd on p.urn = cast(nd.school_urn as integer) and p.year = cast(nd.time_period as integer)