diff --git a/pipeline/transform/macros/safe_numeric.sql b/pipeline/transform/macros/safe_numeric.sql
new file mode 100644
index 0000000..5f14cf1
--- /dev/null
+++ b/pipeline/transform/macros/safe_numeric.sql
@@ -0,0 +1,25 @@
+{#
+  safe_numeric(col)
+
+  Casts a column to numeric, returning NULL for any value that is not a plain
+  decimal number (optional leading minus, digits, optional fractional part).
+  This handles all EES suppression codes (z, c, x, q, u, etc.) and empty
+  strings without needing an explicit list.
+
+  Args:
+    col: column reference or SQL expression, given as a string. Mixed-case
+         identifiers must be passed already double-quoted, e.g.
+         '"FSM_eligible_percent"'.
+
+  Notes:
+    - The input is cast to text before matching, so the macro also works on
+      columns the loader did not store as text.
+    - Leading and trailing spaces are trimmed, so ' 12 ' yields 12.
+    - Exponent notation, a leading '+', thousands separators and forms like
+      '.5' or '5.' are not recognised and yield NULL.
+    - The expansion is wrapped in parentheses so callers can apply casts and
+      operators to it without depending on operator precedence.
+#}
+{% macro safe_numeric(col) -%}
+    (case when btrim(({{ col }})::text) ~ '^-?[0-9]+(\.[0-9]+)?$' then btrim(({{ col }})::text)::numeric else null end)
+{%- endmacro %}
diff --git a/pipeline/transform/models/staging/stg_ees_admissions.sql b/pipeline/transform/models/staging/stg_ees_admissions.sql
index 58e5667..6ee07a1 100644
--- a/pipeline/transform/models/staging/stg_ees_admissions.sql
+++ b/pipeline/transform/models/staging/stg_ees_admissions.sql
@@ -1,6 +1,7 @@
 -- Staging model: Primary and secondary school admissions from EES
 -- Wide format, one row per school per year. No geographic_level column.
--- File is in supporting-files/ subdirectory of the release ZIP.
+-- Note: FSM_eligible_percent is stored with mixed case by target-postgres and
+-- must be double-quoted in SQL to avoid case-folding to fsm_eligible_percent.
 
 with source as (
     select * from {{ source('raw', 'ees_admissions') }}
@@ -15,32 +16,30 @@ renamed as (
         entry_year,
 
         -- Places and offers
-        cast(nullif(total_number_places_offered, 'z') as integer) as published_admission_number,
-        cast(nullif(number_preferred_offers, 'z') as integer) as total_offers,
-        cast(nullif(number_1st_preference_offers, 'z') as integer) as first_preference_offers,
-        cast(nullif(number_2nd_preference_offers, 'z') as integer) as second_preference_offers,
-        cast(nullif(number_3rd_preference_offers, 'z') as integer) as third_preference_offers,
+        cast({{ safe_numeric('total_number_places_offered') }} as integer) as published_admission_number,
+        cast({{ safe_numeric('number_preferred_offers') }} as integer) as total_offers,
+        cast({{ safe_numeric('number_1st_preference_offers') }} as integer) as first_preference_offers,
+        cast({{ safe_numeric('number_2nd_preference_offers') }} as integer) as second_preference_offers,
+        cast({{ safe_numeric('number_3rd_preference_offers') }} as integer) as third_preference_offers,
 
         -- Applications
-        cast(nullif(times_put_as_any_preferred_school, 'z') as integer) as total_applications,
-        cast(nullif(times_put_as_1st_preference, 'z') as integer) as first_preference_applications,
+        cast({{ safe_numeric('times_put_as_any_preferred_school') }} as integer) as total_applications,
+        cast({{ safe_numeric('times_put_as_1st_preference') }} as integer) as first_preference_applications,
 
         -- Proportions
-        cast(nullif(proportion_1stprefs_v_totaloffers, 'z') as numeric) as first_preference_offer_pct,
+        {{ safe_numeric('proportion_1stprefs_v_totaloffers') }} as first_preference_offer_pct,
 
-        -- Derived: oversubscribed if applications > places
-        case
-            when nullif(times_put_as_1st_preference, 'z') is not null
-                and nullif(total_number_places_offered, 'z') is not null
-                and cast(times_put_as_1st_preference as integer)
-                    > cast(total_number_places_offered as integer)
-            then true
-            else false
-        end as oversubscribed,
+        -- Derived: oversubscribed if 1st-preference applications > places offered.
+        -- The comparison is NULL when either side is suppressed; coalesce maps that to false.
+        coalesce(
+            {{ safe_numeric('times_put_as_1st_preference') }}
+                > {{ safe_numeric('total_number_places_offered') }},
+            false
+        ) as oversubscribed,
 
         -- Context
         admissions_policy,
-        nullif(FSM_eligible_percent, 'z') as fsm_eligible_pct
+        {{ safe_numeric('"FSM_eligible_percent"') }} as fsm_eligible_pct
 
     from source
 )
diff --git a/pipeline/transform/models/staging/stg_ees_ks2.sql b/pipeline/transform/models/staging/stg_ees_ks2.sql
index 6526b5e..4e33d89 100644
--- a/pipeline/transform/models/staging/stg_ees_ks2.sql
+++ b/pipeline/transform/models/staging/stg_ees_ks2.sql
@@ -1,7 +1,8 @@
 -- Staging model: KS2 attainment + information
 -- Pivots long-format attainment data (one row per subject × breakdown) into
 -- wide format (one row per school per year) and joins context from info table.
--- EES uses 'z' for suppressed values — cast to null via nullif.
+-- EES uses 'z' (not applicable) and 'c' (confidential) as suppression codes —
+-- safe_numeric handles both by treating any non-numeric string as NULL.
 
 with attainment as (
     select * from {{ source('raw', 'ees_ks2_attainment') }}
@@ -14,11 +15,11 @@ all_pupils as (
         school_urn,
         time_period,
         subject,
-        nullif(expected_standard_pupil_percent, 'z') as expected_pct,
-        nullif(higher_standard_pupil_percent, 'z') as higher_pct,
-        nullif(average_scaled_score, 'z') as avg_score,
-        nullif(progress_measure_score, 'z') as progress,
-        nullif(absent_or_not_able_to_access_percent, 'z') as absence_pct
+        {{ safe_numeric('expected_standard_pupil_percent') }} as expected_pct,
+        {{ safe_numeric('higher_standard_pupil_percent') }} as higher_pct,
+        {{ safe_numeric('average_scaled_score') }} as avg_score,
+        {{ safe_numeric('progress_measure_score') }} as progress,
+        {{ safe_numeric('absent_or_not_able_to_access_percent') }} as absence_pct
     from attainment
     where breakdown_topic = 'All pupils'
       and breakdown = 'Total'
@@ -30,38 +31,38 @@ pivoted as (
         cast(time_period as integer) as year,
 
         -- RWM combined
-        max(case when subject = 'Reading, writing and maths' then cast(expected_pct as numeric) end) as rwm_expected_pct,
-        max(case when subject = 'Reading, writing and maths' then cast(higher_pct as numeric) end) as rwm_high_pct,
+        max(case when subject = 'Reading, writing and maths' then expected_pct end) as rwm_expected_pct,
+        max(case when subject = 'Reading, writing and maths' then higher_pct end) as rwm_high_pct,
 
         -- Reading
-        max(case when subject = 'Reading' then cast(expected_pct as numeric) end) as reading_expected_pct,
-        max(case when subject = 'Reading' then cast(higher_pct as numeric) end) as reading_high_pct,
-        max(case when subject = 'Reading' then cast(avg_score as numeric) end) as reading_avg_score,
-        max(case when subject = 'Reading' then cast(progress as numeric) end) as reading_progress,
-        max(case when subject = 'Reading' then cast(absence_pct as numeric) end) as reading_absence_pct,
+        max(case when subject = 'Reading' then expected_pct end) as reading_expected_pct,
+        max(case when subject = 'Reading' then higher_pct end) as reading_high_pct,
+        max(case when subject = 'Reading' then avg_score end) as reading_avg_score,
+        max(case when subject = 'Reading' then progress end) as reading_progress,
+        max(case when subject = 'Reading' then absence_pct end) as reading_absence_pct,
 
         -- Writing
-        max(case when subject = 'Writing' then cast(expected_pct as numeric) end) as writing_expected_pct,
-        max(case when subject = 'Writing' then cast(higher_pct as numeric) end) as writing_high_pct,
-        max(case when subject = 'Writing' then cast(progress as numeric) end) as writing_progress,
-        max(case when subject = 'Writing' then cast(absence_pct as numeric) end) as writing_absence_pct,
+        max(case when subject = 'Writing' then expected_pct end) as writing_expected_pct,
+        max(case when subject = 'Writing' then higher_pct end) as writing_high_pct,
+        max(case when subject = 'Writing' then progress end) as writing_progress,
+        max(case when subject = 'Writing' then absence_pct end) as writing_absence_pct,
 
         -- Maths
-        max(case when subject = 'Maths' then cast(expected_pct as numeric) end) as maths_expected_pct,
-        max(case when subject = 'Maths' then cast(higher_pct as numeric) end) as maths_high_pct,
-        max(case when subject = 'Maths' then cast(avg_score as numeric) end) as maths_avg_score,
-        max(case when subject = 'Maths' then cast(progress as numeric) end) as maths_progress,
-        max(case when subject = 'Maths' then cast(absence_pct as numeric) end) as maths_absence_pct,
+        max(case when subject = 'Maths' then expected_pct end) as maths_expected_pct,
+        max(case when subject = 'Maths' then higher_pct end) as maths_high_pct,
+        max(case when subject = 'Maths' then avg_score end) as maths_avg_score,
+        max(case when subject = 'Maths' then progress end) as maths_progress,
+        max(case when subject = 'Maths' then absence_pct end) as maths_absence_pct,
 
         -- GPS
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then cast(expected_pct as numeric) end) as gps_expected_pct,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then cast(higher_pct as numeric) end) as gps_high_pct,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then cast(avg_score as numeric) end) as gps_avg_score,
-        max(case when subject ilike '%grammar%' or subject = 'GPS' then cast(absence_pct as numeric) end) as gps_absence_pct,
+        max(case when subject ilike '%grammar%' or subject = 'GPS' then expected_pct end) as gps_expected_pct,
+        max(case when subject ilike '%grammar%' or subject = 'GPS' then higher_pct end) as gps_high_pct,
+        max(case when subject ilike '%grammar%' or subject = 'GPS' then avg_score end) as gps_avg_score,
+        max(case when subject ilike '%grammar%' or subject = 'GPS' then absence_pct end) as gps_absence_pct,
 
         -- Science
-        max(case when subject = 'Science' then cast(expected_pct as numeric) end) as science_expected_pct,
-        max(case when subject = 'Science' then cast(absence_pct as numeric) end) as science_absence_pct
+        max(case when subject = 'Science' then expected_pct end) as science_expected_pct,
+        max(case when subject = 'Science' then absence_pct end) as science_absence_pct
 
     from all_pupils
     group by school_urn, time_period
@@ -72,8 +73,8 @@ gender_boys as (
     select
         school_urn,
         time_period,
-        nullif(expected_standard_pupil_percent, 'z') as rwm_expected_boys_pct,
-        nullif(higher_standard_pupil_percent, 'z') as rwm_high_boys_pct
+        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_boys_pct,
+        {{ safe_numeric('higher_standard_pupil_percent') }} as rwm_high_boys_pct
     from attainment
     where subject = 'Reading, writing and maths'
       and breakdown = 'Boys'
@@ -83,8 +84,8 @@ gender_girls as (
     select
         school_urn,
         time_period,
-        nullif(expected_standard_pupil_percent, 'z') as rwm_expected_girls_pct,
-        nullif(higher_standard_pupil_percent, 'z') as rwm_high_girls_pct
+        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_girls_pct,
+        {{ safe_numeric('higher_standard_pupil_percent') }} as rwm_high_girls_pct
     from attainment
     where subject = 'Reading, writing and maths'
       and breakdown = 'Girls'
@@ -95,7 +96,7 @@ disadv as (
     select
         school_urn,
         time_period,
-        nullif(expected_standard_pupil_percent, 'z') as rwm_expected_disadvantaged_pct
+        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_disadvantaged_pct
     from attainment
     where subject = 'Reading, writing and maths'
       and breakdown = 'Disadvantaged'
@@ -105,7 +106,7 @@ not_disadv as (
     select
         school_urn,
         time_period,
-        nullif(expected_standard_pupil_percent, 'z') as rwm_expected_non_disadvantaged_pct
+        {{ safe_numeric('expected_standard_pupil_percent') }} as rwm_expected_non_disadvantaged_pct
     from attainment
     where subject = 'Reading, writing and maths'
       and breakdown = 'Not disadvantaged'
@@ -116,13 +117,13 @@ info as (
     select
         cast(school_urn as integer) as urn,
         cast(time_period as integer) as year,
-        cast(nullif(totpups, 'z') as integer) as total_pupils,
-        cast(nullif(telig, 'z') as integer) as eligible_pupils,
-        cast(nullif(ptfsm6cla1a, 'z') as numeric) as disadvantaged_pct,
-        cast(nullif(ptealgrp2, 'z') as numeric) as eal_pct,
-        cast(nullif(psenelk, 'z') as numeric) as sen_support_pct,
-        cast(nullif(psenele, 'z') as numeric) as sen_ehcp_pct,
-        cast(nullif(ptmobn, 'z') as numeric) as stability_pct
+        cast({{ safe_numeric('totpups') }} as integer) as total_pupils,
+        cast({{ safe_numeric('telig') }} as integer) as eligible_pupils,
+        {{ safe_numeric('ptfsm6cla1a') }} as disadvantaged_pct,
+        {{ safe_numeric('ptealgrp2') }} as eal_pct,
+        {{ safe_numeric('psenelk') }} as sen_support_pct,
+        {{ safe_numeric('psenele') }} as sen_ehcp_pct,
+        {{ safe_numeric('ptmobn') }} as stability_pct
     from {{ source('raw', 'ees_ks2_info') }}
     where school_urn is not null
 )
@@ -160,15 +161,15 @@ select
     p.science_absence_pct,
 
     -- Gender
-    cast(gb.rwm_expected_boys_pct as numeric) as rwm_expected_boys_pct,
-    cast(gb.rwm_high_boys_pct as numeric) as rwm_high_boys_pct,
-    cast(gg.rwm_expected_girls_pct as numeric) as rwm_expected_girls_pct,
-    cast(gg.rwm_high_girls_pct as numeric) as rwm_high_girls_pct,
+    gb.rwm_expected_boys_pct,
+    gb.rwm_high_boys_pct,
+    gg.rwm_expected_girls_pct,
+    gg.rwm_high_girls_pct,
 
     -- Disadvantaged
-    cast(d.rwm_expected_disadvantaged_pct as numeric) as rwm_expected_disadvantaged_pct,
-    cast(nd.rwm_expected_non_disadvantaged_pct as numeric) as rwm_expected_non_disadvantaged_pct,
-    cast(d.rwm_expected_disadvantaged_pct as numeric) - cast(nd.rwm_expected_non_disadvantaged_pct as numeric) as disadvantaged_gap,
+    d.rwm_expected_disadvantaged_pct,
+    nd.rwm_expected_non_disadvantaged_pct,
+    d.rwm_expected_disadvantaged_pct - nd.rwm_expected_non_disadvantaged_pct as disadvantaged_gap,
 
     -- Context
     i.disadvantaged_pct,
diff --git a/pipeline/transform/models/staging/stg_ees_ks4.sql b/pipeline/transform/models/staging/stg_ees_ks4.sql
index a4aae9a..7f12346 100644
--- a/pipeline/transform/models/staging/stg_ees_ks4.sql
+++ b/pipeline/transform/models/staging/stg_ees_ks4.sql
@@ -2,7 +2,8 @@
 -- KS4 performance data is long-format with breakdown dimensions (breakdown_topic,
 -- breakdown, sex). Unlike KS2 which has a subject dimension, KS4 metrics are
 -- already in separate columns — we just filter to the 'All pupils' breakdown.
--- EES uses 'z' for suppressed values — cast to null via nullif.
+-- EES uses 'z' (not applicable) and 'c' (confidential) as suppression codes —
+-- safe_numeric handles both by treating any non-numeric string as NULL.
 
 with performance as (
     select * from {{ source('raw', 'ees_ks4_performance') }}
@@ -14,32 +15,32 @@ all_pupils as (
     select
         cast(school_urn as integer) as urn,
         cast(time_period as integer) as year,
-        cast(nullif(pupil_count, 'z') as integer) as total_pupils,
+        cast({{ safe_numeric('pupil_count') }} as integer) as total_pupils,
 
         -- Attainment 8
-        cast(nullif(attainment8_average, 'z') as numeric) as attainment_8_score,
+        {{ safe_numeric('attainment8_average') }} as attainment_8_score,
 
         -- Progress 8
-        cast(nullif(progress8_average, 'z') as numeric) as progress_8_score,
-        cast(nullif(progress8_lower_95_ci, 'z') as numeric) as progress_8_lower_ci,
-        cast(nullif(progress8_upper_95_ci, 'z') as numeric) as progress_8_upper_ci,
-        cast(nullif(progress8eng_average, 'z') as numeric) as progress_8_english,
-        cast(nullif(progress8mat_average, 'z') as numeric) as progress_8_maths,
-        cast(nullif(progress8ebacc_average, 'z') as numeric) as progress_8_ebacc,
-        cast(nullif(progress8open_average, 'z') as numeric) as progress_8_open,
+        {{ safe_numeric('progress8_average') }} as progress_8_score,
+        {{ safe_numeric('progress8_lower_95_ci') }} as progress_8_lower_ci,
+        {{ safe_numeric('progress8_upper_95_ci') }} as progress_8_upper_ci,
+        {{ safe_numeric('progress8eng_average') }} as progress_8_english,
+        {{ safe_numeric('progress8mat_average') }} as progress_8_maths,
+        {{ safe_numeric('progress8ebacc_average') }} as progress_8_ebacc,
+        {{ safe_numeric('progress8open_average') }} as progress_8_open,
 
         -- English & Maths pass rates
-        cast(nullif(engmath_95_percent, 'z') as numeric) as english_maths_strong_pass_pct,
-        cast(nullif(engmath_94_percent, 'z') as numeric) as english_maths_standard_pass_pct,
+        {{ safe_numeric('engmath_95_percent') }} as english_maths_strong_pass_pct,
+        {{ safe_numeric('engmath_94_percent') }} as english_maths_standard_pass_pct,
 
         -- EBacc
-        cast(nullif(ebacc_entering_percent, 'z') as numeric) as ebacc_entry_pct,
-        cast(nullif(ebacc_95_percent, 'z') as numeric) as ebacc_strong_pass_pct,
-        cast(nullif(ebacc_94_percent, 'z') as numeric) as ebacc_standard_pass_pct,
-        cast(nullif(ebacc_aps_average, 'z') as numeric) as ebacc_avg_score,
+        {{ safe_numeric('ebacc_entering_percent') }} as ebacc_entry_pct,
+        {{ safe_numeric('ebacc_95_percent') }} as ebacc_strong_pass_pct,
+        {{ safe_numeric('ebacc_94_percent') }} as ebacc_standard_pass_pct,
+        {{ safe_numeric('ebacc_aps_average') }} as ebacc_avg_score,
 
         -- GCSE grade 9-1
-        cast(nullif(gcse_91_percent, 'z') as numeric) as gcse_grade_91_pct
+        {{ safe_numeric('gcse_91_percent') }} as gcse_grade_91_pct
 
     from performance
     where breakdown_topic = 'All pupils'
@@ -52,11 +53,11 @@ info as (
     select
         cast(school_urn as integer) as urn,
         cast(time_period as integer) as year,
-        cast(nullif(endks4_pupil_count, 'z') as integer) as eligible_pupils,
-        cast(nullif(ks2_scaledscore_average, 'z') as numeric) as prior_attainment_avg,
-        cast(nullif(sen_pupil_percent, 'z') as numeric) as sen_pct,
-        cast(nullif(sen_with_ehcp_pupil_percent, 'z') as numeric) as sen_ehcp_pct,
-        cast(nullif(sen_no_ehcp_pupil_percent, 'z') as numeric) as sen_support_pct
+        cast({{ safe_numeric('endks4_pupil_count') }} as integer) as eligible_pupils,
+        {{ safe_numeric('ks2_scaledscore_average') }} as prior_attainment_avg,
+        {{ safe_numeric('sen_pupil_percent') }} as sen_pct,
+        {{ safe_numeric('sen_with_ehcp_pupil_percent') }} as sen_ehcp_pct,
+        {{ safe_numeric('sen_no_ehcp_pupil_percent') }} as sen_support_pct
     from {{ source('raw', 'ees_ks4_info') }}
     where school_urn is not null
 )