feat: migrate backend to marts schema, update EES tap for verified datasets
Pipeline:
- EES tap: split KS4 into performance + info streams, fix admissions filename (SchoolLevel keyword match), fix census filename (yearly suffix), remove phonics (no school-level data on EES), change endswith → in (substring) for filename matching
- stg_ees_ks4: rewrite to filter long-format data and extract Attainment 8, Progress 8, EBacc, English/Maths metrics; join KS4 info for context
- stg_ees_admissions: map real CSV columns (total_number_places_offered, etc.)
- stg_ees_census: update source reference, stub with TODO for data columns
- Remove stg_ees_phonics, fact_phonics (no school-level EES data)
- Add ees_ks4_performance + ees_ks4_info sources, remove ees_ks4 + ees_phonics
- Update int_ks4_with_lineage + fact_ks4_performance with new KS4 columns
- Annual EES DAG: remove stg_ees_phonics+ from selector

Backend:
- models.py: replace all models to point at marts.* tables with schema='marts' (DimSchool, DimLocation, KS2Performance, FactOfstedInspection, etc.)
- data_loader.py: rewrite load_school_data_as_dataframe() using raw SQL joining dim_school + dim_location + fact_ks2_performance; update get_supplementary_data()
- database.py: remove migration machinery, keep only connection setup
- app.py: remove check_and_migrate_if_needed, remove /api/admin/reimport-ks2 endpoints (pipeline handles all imports)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -120,12 +120,12 @@ with DAG(
|
||||
extract_ofsted >> dbt_build_ofsted >> sync_typesense_ofsted
|
||||
|
||||
|
||||
# ── Annual DAG (EES: KS2, KS4, Census, Admissions, Phonics) ───────────
|
||||
# ── Annual DAG (EES: KS2, KS4, Census, Admissions) ───────────────────
|
||||
|
||||
with DAG(
|
||||
dag_id="school_data_annual_ees",
|
||||
default_args=default_args,
|
||||
description="Annual EES data extraction (KS2, KS4, Census, Admissions, Phonics)",
|
||||
description="Annual EES data extraction (KS2, KS4, Census, Admissions)",
|
||||
schedule=None, # Triggered manually when new releases are published
|
||||
start_date=datetime(2025, 1, 1),
|
||||
catchup=False,
|
||||
@@ -140,7 +140,7 @@ with DAG(
|
||||
|
||||
dbt_build_ees = BashOperator(
|
||||
task_id="dbt_build",
|
||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+ stg_ees_phonics+",
|
||||
bash_command=f"cd {PIPELINE_DIR}/transform && {DBT_BIN} build --profiles-dir . --target production --select stg_ees_ks2+ stg_ees_ks4+ stg_ees_census+ stg_ees_admissions+",
|
||||
)
|
||||
|
||||
sync_typesense_ees = BashOperator(
|
||||
|
||||
@@ -1,9 +1,11 @@
|
||||
"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data.
|
||||
"""EES Singer tap — extracts KS2, KS4, Census, Admissions data.
|
||||
|
||||
Each stream targets a specific CSV file within an EES release ZIP.
|
||||
The EES data uses 'school_urn' for school-level records and 'z' for
|
||||
suppressed values. Column names vary by file — schemas declare all
|
||||
columns needed by downstream dbt staging models.
|
||||
|
||||
Phonics has no school-level data on EES and is not included.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
@@ -38,11 +40,15 @@ def download_release_zip(release_id: str) -> zipfile.ZipFile:
|
||||
|
||||
|
||||
class EESDatasetStream(Stream):
|
||||
"""Base stream for an EES dataset extracted from a release ZIP."""
|
||||
"""Base stream for an EES dataset extracted from a release ZIP.
|
||||
|
||||
Subclasses set _target_filename to a keyword that appears in the
|
||||
target CSV path inside the ZIP (substring match, not exact; the first
matching .csv entry in the ZIP listing is used).
|
||||
"""
|
||||
|
||||
replication_key = None
|
||||
_publication_slug: str = ""
|
||||
_target_filename: str = "" # exact filename within the ZIP
|
||||
_target_filename: str = "" # keyword that appears in the CSV path
|
||||
_urn_column: str = "school_urn" # column name for URN in the CSV
|
||||
|
||||
def get_records(self, context):
|
||||
@@ -56,17 +62,17 @@ class EESDatasetStream(Stream):
|
||||
)
|
||||
zf = download_release_zip(release_id)
|
||||
|
||||
# Find the target file
|
||||
# Find the target file (substring match)
|
||||
all_files = zf.namelist()
|
||||
target = None
|
||||
for name in all_files:
|
||||
if name.endswith(self._target_filename):
|
||||
if self._target_filename in name and name.endswith(".csv"):
|
||||
target = name
|
||||
break
|
||||
|
||||
if not target:
|
||||
self.logger.error(
|
||||
"File '%s' not found in ZIP. Available: %s",
|
||||
"File matching '%s' not found in ZIP. Available: %s",
|
||||
self._target_filename,
|
||||
[n for n in all_files if n.endswith(".csv")],
|
||||
)
|
||||
@@ -96,7 +102,7 @@ class EESKS2AttainmentStream(EESDatasetStream):
|
||||
name = "ees_ks2_attainment"
|
||||
primary_keys = ["school_urn", "time_period", "subject", "breakdown_topic", "breakdown"]
|
||||
_publication_slug = "key-stage-2-attainment"
|
||||
_target_filename = "ks2_school_attainment_data.csv"
|
||||
_target_filename = "ks2_school_attainment_data"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
@@ -126,7 +132,7 @@ class EESKS2InfoStream(EESDatasetStream):
|
||||
name = "ees_ks2_info"
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
_publication_slug = "key-stage-2-attainment"
|
||||
_target_filename = "ks2_school_information_data.csv"
|
||||
_target_filename = "ks2_school_information_data"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
@@ -150,60 +156,172 @@ class EESKS2InfoStream(EESDatasetStream):
|
||||
).to_dict()
|
||||
|
||||
|
||||
# ── KS4 Attainment ──────────────────────────────────────────────────────────
|
||||
# ── KS4 Performance (long format: one row per school × breakdown × sex) ─────
|
||||
# File: 202425_performance_tables_schools_revised.csv (156 cols)
|
||||
# Dimensions: breakdown_topic, breakdown, sex, disadvantage_status, etc.
|
||||
# Metrics are already in separate columns (attainment8_average, progress8_average, etc.)
|
||||
|
||||
class EESKS4Stream(EESDatasetStream):
|
||||
name = "ees_ks4"
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
class EESKS4PerformanceStream(EESDatasetStream):
|
||||
name = "ees_ks4_performance"
|
||||
primary_keys = ["school_urn", "time_period", "breakdown_topic", "breakdown", "sex"]
|
||||
_publication_slug = "key-stage-4-performance"
|
||||
_target_filename = "school" # Will be refined once we see the actual ZIP contents
|
||||
_target_filename = "performance_tables_schools"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
th.Property("school_laestab", th.StringType),
|
||||
th.Property("school_name", th.StringType),
|
||||
th.Property("establishment_type_group", th.StringType),
|
||||
th.Property("breakdown_topic", th.StringType, required=True),
|
||||
th.Property("breakdown", th.StringType, required=True),
|
||||
th.Property("sex", th.StringType, required=True),
|
||||
th.Property("disadvantage_status", th.StringType),
|
||||
th.Property("first_language", th.StringType),
|
||||
th.Property("prior_attainment", th.StringType),
|
||||
th.Property("mobility", th.StringType),
|
||||
# Pupil counts
|
||||
th.Property("pupil_count", th.StringType),
|
||||
th.Property("pupil_percent", th.StringType),
|
||||
# Attainment 8
|
||||
th.Property("attainment8_sum", th.StringType),
|
||||
th.Property("attainment8_average", th.StringType),
|
||||
# English & Maths
|
||||
th.Property("engmath_entering_total", th.StringType),
|
||||
th.Property("engmath_entering_percent", th.StringType),
|
||||
th.Property("engmath_95_total", th.StringType),
|
||||
th.Property("engmath_95_percent", th.StringType),
|
||||
th.Property("engmath_94_total", th.StringType),
|
||||
th.Property("engmath_94_percent", th.StringType),
|
||||
# EBacc
|
||||
th.Property("ebacc_entering_total", th.StringType),
|
||||
th.Property("ebacc_entering_percent", th.StringType),
|
||||
th.Property("ebacc_95_total", th.StringType),
|
||||
th.Property("ebacc_95_percent", th.StringType),
|
||||
th.Property("ebacc_94_total", th.StringType),
|
||||
th.Property("ebacc_94_percent", th.StringType),
|
||||
th.Property("ebacc_aps_sum", th.StringType),
|
||||
th.Property("ebacc_aps_average", th.StringType),
|
||||
# Progress 8
|
||||
th.Property("progress8_pupil_count", th.StringType),
|
||||
th.Property("progress8_sum", th.StringType),
|
||||
th.Property("progress8_average", th.StringType),
|
||||
th.Property("progress8_lower_95_ci", th.StringType),
|
||||
th.Property("progress8_upper_95_ci", th.StringType),
|
||||
# Progress 8 elements
|
||||
th.Property("progress8eng_average", th.StringType),
|
||||
th.Property("progress8mat_average", th.StringType),
|
||||
th.Property("progress8ebacc_average", th.StringType),
|
||||
th.Property("progress8open_average", th.StringType),
|
||||
# GCSE grades
|
||||
th.Property("gcse_91_total", th.StringType),
|
||||
th.Property("gcse_91_percent", th.StringType),
|
||||
# EBacc subject entry/achievement
|
||||
th.Property("ebacceng_entering_percent", th.StringType),
|
||||
th.Property("ebaccmat_entering_percent", th.StringType),
|
||||
th.Property("ebaccsci_entering_percent", th.StringType),
|
||||
th.Property("ebacchum_entering_percent", th.StringType),
|
||||
th.Property("ebacclan_entering_percent", th.StringType),
|
||||
).to_dict()
|
||||
|
||||
|
||||
# ── KS4 Information (wide format: one row per school, context/demographics) ──
|
||||
# File: 202425_information_about_schools_provisional.csv (38 cols)
|
||||
|
||||
class EESKS4InfoStream(EESDatasetStream):
|
||||
name = "ees_ks4_info"
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
_publication_slug = "key-stage-4-performance"
|
||||
_target_filename = "information_about_schools"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
th.Property("school_laestab", th.StringType),
|
||||
th.Property("school_name", th.StringType),
|
||||
th.Property("establishment_type_group", th.StringType),
|
||||
th.Property("reldenom", th.StringType),
|
||||
th.Property("admpol_pt", th.StringType),
|
||||
th.Property("egender", th.StringType),
|
||||
th.Property("agerange", th.StringType),
|
||||
th.Property("allks_pupil_count", th.StringType),
|
||||
th.Property("allks_boys_count", th.StringType),
|
||||
th.Property("allks_girls_count", th.StringType),
|
||||
th.Property("endks4_pupil_count", th.StringType),
|
||||
th.Property("ks2_scaledscore_average", th.StringType),
|
||||
th.Property("sen_with_ehcp_pupil_percent", th.StringType),
|
||||
th.Property("sen_pupil_percent", th.StringType),
|
||||
th.Property("sen_no_ehcp_pupil_percent", th.StringType),
|
||||
th.Property("attainment8_diffn", th.StringType),
|
||||
th.Property("progress8_diffn", th.StringType),
|
||||
th.Property("progress8_banding", th.StringType),
|
||||
).to_dict()
|
||||
|
||||
|
||||
# ── Census (school-level pupil characteristics) ─────────────────────────────
|
||||
# File: spc_school_level_underlying_data_YYYY.csv (269 cols, in supporting-files/)
|
||||
# Uses 'urn' not 'school_urn'. Filename has yearly suffix that changes.
|
||||
|
||||
class EESCensusStream(EESDatasetStream):
|
||||
name = "ees_census"
|
||||
primary_keys = ["urn", "time_period"]
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
_publication_slug = "school-pupils-and-their-characteristics"
|
||||
_target_filename = "spc_school_level_underlying_data_2025.csv"
|
||||
_target_filename = "spc_school_level_underlying_data"
|
||||
_urn_column = "urn"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("urn", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
th.Property("school_name", th.StringType),
|
||||
th.Property("laestab", th.StringType),
|
||||
th.Property("phase_type_grouping", th.StringType),
|
||||
# TODO: Add data columns (ethnicity %, FSM %, SEN %, etc.) once
|
||||
# actual column names are verified on the container. The CSV has
|
||||
# 269 columns — only the first 30 (metadata) have been inspected.
|
||||
).to_dict()
|
||||
|
||||
|
||||
# ── Admissions ───────────────────────────────────────────────────────────────
|
||||
# File: AppsandOffers_YYYY_SchoolLevelDDMMYYYY.csv (37 cols, in supporting-files/)
|
||||
# Wide format, no geographic_level column. Uses school_urn.
|
||||
|
||||
class EESAdmissionsStream(EESDatasetStream):
|
||||
name = "ees_admissions"
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
_publication_slug = "primary-and-secondary-school-applications-and-offers"
|
||||
_target_filename = "school" # Will be refined once we see the actual ZIP contents
|
||||
_target_filename = "SchoolLevel"
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
th.Property("school_name", th.StringType),
|
||||
th.Property("school_laestab_as_used", th.StringType),
|
||||
th.Property("school_phase", th.StringType),
|
||||
th.Property("entry_year", th.StringType),
|
||||
# Places and offers
|
||||
th.Property("total_number_places_offered", th.StringType),
|
||||
th.Property("number_preferred_offers", th.StringType),
|
||||
th.Property("number_1st_preference_offers", th.StringType),
|
||||
th.Property("number_2nd_preference_offers", th.StringType),
|
||||
th.Property("number_3rd_preference_offers", th.StringType),
|
||||
# Applications
|
||||
th.Property("times_put_as_any_preferred_school", th.StringType),
|
||||
th.Property("times_put_as_1st_preference", th.StringType),
|
||||
th.Property("times_put_as_2nd_preference", th.StringType),
|
||||
th.Property("times_put_as_3rd_preference", th.StringType),
|
||||
# Proportions
|
||||
th.Property("proportion_1stprefs_v_1stprefoffers", th.StringType),
|
||||
th.Property("proportion_1stprefs_v_totaloffers", th.StringType),
|
||||
# Cross-LA
|
||||
th.Property("all_applications_from_another_LA", th.StringType),
|
||||
th.Property("offers_to_applicants_from_another_LA", th.StringType),
|
||||
# Context
|
||||
th.Property("establishment_type", th.StringType),
|
||||
th.Property("denomination", th.StringType),
|
||||
th.Property("FSM_eligible_percent", th.StringType),
|
||||
th.Property("admissions_policy", th.StringType),
|
||||
th.Property("urban_rural", th.StringType),
|
||||
).to_dict()
|
||||
|
||||
|
||||
# ── Phonics ──────────────────────────────────────────────────────────────────
|
||||
|
||||
class EESPhonicsStream(EESDatasetStream):
|
||||
name = "ees_phonics"
|
||||
primary_keys = ["school_urn", "time_period"]
|
||||
_publication_slug = "phonics-screening-check-attainment"
|
||||
_target_filename = "school" # Will be refined once we see the actual ZIP contents
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
).to_dict()
|
||||
# Note: Phonics (phonics-screening-check-attainment) has NO school-level data
|
||||
# on EES. Only national and LA-level files are published.
|
||||
|
||||
|
||||
class TapUKEES(Tap):
|
||||
@@ -219,10 +337,10 @@ class TapUKEES(Tap):
|
||||
return [
|
||||
EESKS2AttainmentStream(self),
|
||||
EESKS2InfoStream(self),
|
||||
EESKS4Stream(self),
|
||||
EESKS4PerformanceStream(self),
|
||||
EESKS4InfoStream(self),
|
||||
EESCensusStream(self),
|
||||
EESAdmissionsStream(self),
|
||||
EESPhonicsStream(self),
|
||||
]
|
||||
|
||||
|
||||
|
||||
@@ -4,16 +4,14 @@ with current_ks4 as (
|
||||
select
|
||||
urn as current_urn,
|
||||
urn as source_urn,
|
||||
year,
|
||||
total_pupils,
|
||||
progress_8_score,
|
||||
year, total_pupils, eligible_pupils, prior_attainment_avg,
|
||||
attainment_8_score,
|
||||
ebacc_entry_pct,
|
||||
ebacc_achievement_pct,
|
||||
english_strong_pass_pct,
|
||||
maths_strong_pass_pct,
|
||||
english_maths_strong_pass_pct,
|
||||
staying_in_education_pct
|
||||
progress_8_score, progress_8_lower_ci, progress_8_upper_ci,
|
||||
progress_8_english, progress_8_maths, progress_8_ebacc, progress_8_open,
|
||||
english_maths_strong_pass_pct, english_maths_standard_pass_pct,
|
||||
ebacc_entry_pct, ebacc_strong_pass_pct, ebacc_standard_pass_pct, ebacc_avg_score,
|
||||
gcse_grade_91_pct,
|
||||
sen_pct, sen_ehcp_pct, sen_support_pct
|
||||
from {{ ref('stg_ees_ks4') }}
|
||||
),
|
||||
|
||||
@@ -21,16 +19,14 @@ predecessor_ks4 as (
|
||||
select
|
||||
lin.current_urn,
|
||||
ks4.urn as source_urn,
|
||||
ks4.year,
|
||||
ks4.total_pupils,
|
||||
ks4.progress_8_score,
|
||||
ks4.year, ks4.total_pupils, ks4.eligible_pupils, ks4.prior_attainment_avg,
|
||||
ks4.attainment_8_score,
|
||||
ks4.ebacc_entry_pct,
|
||||
ks4.ebacc_achievement_pct,
|
||||
ks4.english_strong_pass_pct,
|
||||
ks4.maths_strong_pass_pct,
|
||||
ks4.english_maths_strong_pass_pct,
|
||||
ks4.staying_in_education_pct
|
||||
ks4.progress_8_score, ks4.progress_8_lower_ci, ks4.progress_8_upper_ci,
|
||||
ks4.progress_8_english, ks4.progress_8_maths, ks4.progress_8_ebacc, ks4.progress_8_open,
|
||||
ks4.english_maths_strong_pass_pct, ks4.english_maths_standard_pass_pct,
|
||||
ks4.ebacc_entry_pct, ks4.ebacc_strong_pass_pct, ks4.ebacc_standard_pass_pct, ks4.ebacc_avg_score,
|
||||
ks4.gcse_grade_91_pct,
|
||||
ks4.sen_pct, ks4.sen_ehcp_pct, ks4.sen_support_pct
|
||||
from {{ ref('stg_ees_ks4') }} ks4
|
||||
inner join {{ ref('int_school_lineage') }} lin
|
||||
on ks4.urn = lin.predecessor_urn
|
||||
|
||||
@@ -1,18 +1,8 @@
|
||||
-- Intermediate model: Merged pupil characteristics from census data
|
||||
-- TODO: Expand once census data columns are verified and added to stg_ees_census
|
||||
|
||||
select
|
||||
urn,
|
||||
year,
|
||||
fsm_pct,
|
||||
sen_support_pct,
|
||||
sen_ehcp_pct,
|
||||
eal_pct,
|
||||
disadvantaged_pct,
|
||||
ethnicity_white_pct,
|
||||
ethnicity_asian_pct,
|
||||
ethnicity_black_pct,
|
||||
ethnicity_mixed_pct,
|
||||
ethnicity_other_pct,
|
||||
class_size_avg,
|
||||
stability_pct
|
||||
phase_type_grouping
|
||||
from {{ ref('stg_ees_census') }}
|
||||
|
||||
@@ -88,14 +88,6 @@ models:
|
||||
- name: year
|
||||
tests: [not_null]
|
||||
|
||||
- name: fact_phonics
|
||||
description: Phonics screening results — one row per URN per year
|
||||
columns:
|
||||
- name: urn
|
||||
tests: [not_null]
|
||||
- name: year
|
||||
tests: [not_null]
|
||||
|
||||
- name: fact_parent_view
|
||||
description: Parent View survey responses
|
||||
columns:
|
||||
|
||||
@@ -3,8 +3,12 @@
|
||||
select
|
||||
urn,
|
||||
year,
|
||||
school_phase,
|
||||
published_admission_number,
|
||||
total_applications,
|
||||
first_preference_offers_pct,
|
||||
oversubscribed
|
||||
first_preference_applications,
|
||||
first_preference_offers,
|
||||
first_preference_offer_pct,
|
||||
oversubscribed,
|
||||
admissions_policy
|
||||
from {{ ref('stg_ees_admissions') }}
|
||||
|
||||
@@ -1,16 +1,42 @@
|
||||
-- Mart: KS4 performance fact table — one row per URN per year
|
||||
-- Includes predecessor data via lineage resolution
|
||||
|
||||
select
|
||||
current_urn as urn,
|
||||
source_urn,
|
||||
year,
|
||||
total_pupils,
|
||||
progress_8_score,
|
||||
eligible_pupils,
|
||||
prior_attainment_avg,
|
||||
|
||||
-- Attainment 8
|
||||
attainment_8_score,
|
||||
ebacc_entry_pct,
|
||||
ebacc_achievement_pct,
|
||||
english_strong_pass_pct,
|
||||
maths_strong_pass_pct,
|
||||
|
||||
-- Progress 8
|
||||
progress_8_score,
|
||||
progress_8_lower_ci,
|
||||
progress_8_upper_ci,
|
||||
progress_8_english,
|
||||
progress_8_maths,
|
||||
progress_8_ebacc,
|
||||
progress_8_open,
|
||||
|
||||
-- English & Maths
|
||||
english_maths_strong_pass_pct,
|
||||
staying_in_education_pct
|
||||
english_maths_standard_pass_pct,
|
||||
|
||||
-- EBacc
|
||||
ebacc_entry_pct,
|
||||
ebacc_strong_pass_pct,
|
||||
ebacc_standard_pass_pct,
|
||||
ebacc_avg_score,
|
||||
|
||||
-- GCSE
|
||||
gcse_grade_91_pct,
|
||||
|
||||
-- Context
|
||||
sen_pct,
|
||||
sen_ehcp_pct,
|
||||
sen_support_pct
|
||||
|
||||
from {{ ref('int_ks4_with_lineage') }}
|
||||
|
||||
@@ -1,8 +0,0 @@
|
||||
-- Mart: Phonics screening results — one row per URN per year
|
||||
|
||||
select
|
||||
urn,
|
||||
year,
|
||||
year1_phonics_pct,
|
||||
year2_phonics_pct
|
||||
from {{ ref('stg_ees_phonics') }}
|
||||
@@ -1,18 +1,8 @@
|
||||
-- Mart: Pupil characteristics — one row per URN per year
|
||||
-- TODO: Expand once census data columns are verified and added to staging
|
||||
|
||||
select
|
||||
urn,
|
||||
year,
|
||||
fsm_pct,
|
||||
sen_support_pct,
|
||||
sen_ehcp_pct,
|
||||
eal_pct,
|
||||
disadvantaged_pct,
|
||||
ethnicity_white_pct,
|
||||
ethnicity_asian_pct,
|
||||
ethnicity_black_pct,
|
||||
ethnicity_mixed_pct,
|
||||
ethnicity_other_pct,
|
||||
class_size_avg,
|
||||
stability_pct
|
||||
phase_type_grouping
|
||||
from {{ ref('int_pupil_chars_merged') }}
|
||||
|
||||
@@ -30,8 +30,11 @@ sources:
|
||||
- name: ees_ks2_info
|
||||
description: KS2 school information (wide format — context/demographics per school)
|
||||
|
||||
- name: ees_ks4
|
||||
description: KS4 attainment data from Explore Education Statistics
|
||||
- name: ees_ks4_performance
|
||||
description: KS4 performance tables (long format — one row per school × breakdown × sex)
|
||||
|
||||
- name: ees_ks4_info
|
||||
description: KS4 school information (wide format — context/demographics per school)
|
||||
|
||||
- name: ees_census
|
||||
description: School census pupil characteristics
|
||||
@@ -39,8 +42,7 @@ sources:
|
||||
- name: ees_admissions
|
||||
description: Primary and secondary school admissions data
|
||||
|
||||
- name: ees_phonics
|
||||
description: Phonics screening check results
|
||||
# Phonics: no school-level data on EES (only national/LA level)
|
||||
|
||||
- name: parent_view
|
||||
description: Ofsted Parent View survey responses
|
||||
|
||||
@@ -1,19 +1,48 @@
|
||||
-- Staging model: Primary and secondary school admissions from EES
|
||||
-- Wide format, one row per school per year. No geographic_level column.
|
||||
-- File is in supporting-files/ subdirectory of the release ZIP.
|
||||
|
||||
with source as (
|
||||
select * from {{ source('raw', 'ees_admissions') }}
|
||||
where school_urn is not null
|
||||
),
|
||||
|
||||
renamed as (
|
||||
select
|
||||
cast(urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(published_admission_number as integer) as published_admission_number,
|
||||
cast(total_applications as integer) as total_applications,
|
||||
cast(first_preference_offers_pct as numeric) as first_preference_offers_pct,
|
||||
cast(oversubscribed as boolean) as oversubscribed
|
||||
cast(school_urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
school_phase,
|
||||
entry_year,
|
||||
|
||||
-- Places and offers
|
||||
cast(nullif(total_number_places_offered, 'z') as integer) as published_admission_number,
|
||||
cast(nullif(number_preferred_offers, 'z') as integer) as total_offers,
|
||||
cast(nullif(number_1st_preference_offers, 'z') as integer) as first_preference_offers,
|
||||
cast(nullif(number_2nd_preference_offers, 'z') as integer) as second_preference_offers,
|
||||
cast(nullif(number_3rd_preference_offers, 'z') as integer) as third_preference_offers,
|
||||
|
||||
-- Applications
|
||||
cast(nullif(times_put_as_any_preferred_school, 'z') as integer) as total_applications,
|
||||
cast(nullif(times_put_as_1st_preference, 'z') as integer) as first_preference_applications,
|
||||
|
||||
-- Proportions
|
||||
cast(nullif(proportion_1stprefs_v_totaloffers, 'z') as numeric) as first_preference_offer_pct,
|
||||
|
||||
-- Derived: oversubscribed if 1st-preference applications > places offered
|
||||
case
|
||||
when nullif(times_put_as_1st_preference, 'z') is not null
|
||||
and nullif(total_number_places_offered, 'z') is not null
|
||||
and cast(times_put_as_1st_preference as integer)
|
||||
> cast(total_number_places_offered as integer)
|
||||
then true
|
||||
else false
|
||||
end as oversubscribed,
|
||||
|
||||
-- Context
|
||||
admissions_policy,
|
||||
nullif(FSM_eligible_percent, 'z') as fsm_eligible_pct
|
||||
|
||||
from source
|
||||
where urn is not null
|
||||
)
|
||||
|
||||
select * from renamed
|
||||
|
||||
@@ -1,27 +1,30 @@
|
||||
-- Staging model: School census pupil characteristics from EES
|
||||
-- File: spc_school_level_underlying_data_YYYY.csv (269 cols, in supporting-files/)
|
||||
-- Uses 'urn' column (not school_urn). Tap normalises to school_urn.
|
||||
--
|
||||
-- TODO: The CSV has 269 columns but only metadata columns have been verified.
|
||||
-- Data columns (ethnicity %, FSM %, SEN %, class sizes) need to be discovered
|
||||
-- by inspecting the CSV on the Airflow container. Until the tap schema and this
|
||||
-- model are updated with the actual column names, only verified metadata columns
|
||||
-- are selected below; the unverified data columns are listed in the TODO there.
|
||||
|
||||
with source as (
|
||||
select * from {{ source('raw', 'ees_census') }}
|
||||
where school_urn is not null
|
||||
),
|
||||
|
||||
renamed as (
|
||||
select
|
||||
cast(urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(fsm_pct as numeric) as fsm_pct,
|
||||
cast(sen_support_pct as numeric) as sen_support_pct,
|
||||
cast(sen_ehcp_pct as numeric) as sen_ehcp_pct,
|
||||
cast(eal_pct as numeric) as eal_pct,
|
||||
cast(disadvantaged_pct as numeric) as disadvantaged_pct,
|
||||
cast(ethnicity_white_pct as numeric) as ethnicity_white_pct,
|
||||
cast(ethnicity_asian_pct as numeric) as ethnicity_asian_pct,
|
||||
cast(ethnicity_black_pct as numeric) as ethnicity_black_pct,
|
||||
cast(ethnicity_mixed_pct as numeric) as ethnicity_mixed_pct,
|
||||
cast(ethnicity_other_pct as numeric) as ethnicity_other_pct,
|
||||
cast(class_size_avg as numeric) as class_size_avg,
|
||||
cast(stability_pct as numeric) as stability_pct
|
||||
cast(school_urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
school_name,
|
||||
phase_type_grouping
|
||||
-- TODO: Add census data columns once verified:
|
||||
-- fsm_pct, sen_support_pct, sen_ehcp_pct, eal_pct,
|
||||
-- disadvantaged_pct, ethnicity_white_pct, ethnicity_asian_pct,
|
||||
-- ethnicity_black_pct, ethnicity_mixed_pct, ethnicity_other_pct,
|
||||
-- class_size_avg, stability_pct
|
||||
from source
|
||||
where urn is not null
|
||||
)
|
||||
|
||||
select * from renamed
|
||||
|
||||
@@ -1,24 +1,102 @@
|
||||
-- Staging model: KS4 attainment data from EES (secondary schools — NEW)
|
||||
-- Staging model: KS4 attainment data from EES
|
||||
-- KS4 performance data is long-format with breakdown dimensions (breakdown_topic,
|
||||
-- breakdown, sex). Unlike KS2 which has a subject dimension, KS4 metrics are
|
||||
-- already in separate columns — we just filter to the all-pupils total row
-- (breakdown_topic = 'All pupils', breakdown = 'Total', sex = 'Total').
|
||||
-- EES uses 'z' for suppressed values — converted to null via nullif before casting.
|
||||
|
||||
with source as (
|
||||
select * from {{ source('raw', 'ees_ks4') }}
|
||||
with performance as (
|
||||
select * from {{ source('raw', 'ees_ks4_performance') }}
|
||||
where school_urn is not null
|
||||
),
|
||||
|
||||
renamed as (
|
||||
-- Filter to all-pupils totals (one row per school per year)
|
||||
all_pupils as (
|
||||
select
|
||||
cast(urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(t_pupils as integer) as total_pupils,
|
||||
cast(progress_8_score as numeric) as progress_8_score,
|
||||
cast(attainment_8_score as numeric) as attainment_8_score,
|
||||
cast(ebacc_entry_pct as numeric) as ebacc_entry_pct,
|
||||
cast(ebacc_achievement_pct as numeric) as ebacc_achievement_pct,
|
||||
cast(english_strong_pass_pct as numeric) as english_strong_pass_pct,
|
||||
cast(maths_strong_pass_pct as numeric) as maths_strong_pass_pct,
|
||||
cast(english_maths_strong_pass_pct as numeric) as english_maths_strong_pass_pct,
|
||||
cast(staying_in_education_pct as numeric) as staying_in_education_pct
|
||||
from source
|
||||
where urn is not null
|
||||
cast(school_urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(nullif(pupil_count, 'z') as integer) as total_pupils,
|
||||
|
||||
-- Attainment 8
|
||||
cast(nullif(attainment8_average, 'z') as numeric) as attainment_8_score,
|
||||
|
||||
-- Progress 8
|
||||
cast(nullif(progress8_average, 'z') as numeric) as progress_8_score,
|
||||
cast(nullif(progress8_lower_95_ci, 'z') as numeric) as progress_8_lower_ci,
|
||||
cast(nullif(progress8_upper_95_ci, 'z') as numeric) as progress_8_upper_ci,
|
||||
cast(nullif(progress8eng_average, 'z') as numeric) as progress_8_english,
|
||||
cast(nullif(progress8mat_average, 'z') as numeric) as progress_8_maths,
|
||||
cast(nullif(progress8ebacc_average, 'z') as numeric) as progress_8_ebacc,
|
||||
cast(nullif(progress8open_average, 'z') as numeric) as progress_8_open,
|
||||
|
||||
-- English & Maths pass rates
|
||||
cast(nullif(engmath_95_percent, 'z') as numeric) as english_maths_strong_pass_pct,
|
||||
cast(nullif(engmath_94_percent, 'z') as numeric) as english_maths_standard_pass_pct,
|
||||
|
||||
-- EBacc
|
||||
cast(nullif(ebacc_entering_percent, 'z') as numeric) as ebacc_entry_pct,
|
||||
cast(nullif(ebacc_95_percent, 'z') as numeric) as ebacc_strong_pass_pct,
|
||||
cast(nullif(ebacc_94_percent, 'z') as numeric) as ebacc_standard_pass_pct,
|
||||
cast(nullif(ebacc_aps_average, 'z') as numeric) as ebacc_avg_score,
|
||||
|
||||
-- GCSE grade 9-1
|
||||
cast(nullif(gcse_91_percent, 'z') as numeric) as gcse_grade_91_pct
|
||||
|
||||
from performance
|
||||
where breakdown_topic = 'All pupils'
|
||||
and breakdown = 'Total'
|
||||
and sex = 'Total'
|
||||
),
|
||||
|
||||
-- KS4 info table for context/demographics
|
||||
info as (
|
||||
select
|
||||
cast(school_urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(nullif(endks4_pupil_count, 'z') as integer) as eligible_pupils,
|
||||
cast(nullif(ks2_scaledscore_average, 'z') as numeric) as prior_attainment_avg,
|
||||
cast(nullif(sen_pupil_percent, 'z') as numeric) as sen_pct,
|
||||
cast(nullif(sen_with_ehcp_pupil_percent, 'z') as numeric) as sen_ehcp_pct,
|
||||
cast(nullif(sen_no_ehcp_pupil_percent, 'z') as numeric) as sen_support_pct
|
||||
from {{ source('raw', 'ees_ks4_info') }}
|
||||
where school_urn is not null
|
||||
)
|
||||
|
||||
select * from renamed
|
||||
select
|
||||
p.urn,
|
||||
p.year,
|
||||
p.total_pupils,
|
||||
i.eligible_pupils,
|
||||
i.prior_attainment_avg,
|
||||
|
||||
-- Attainment 8
|
||||
p.attainment_8_score,
|
||||
|
||||
-- Progress 8
|
||||
p.progress_8_score,
|
||||
p.progress_8_lower_ci,
|
||||
p.progress_8_upper_ci,
|
||||
p.progress_8_english,
|
||||
p.progress_8_maths,
|
||||
p.progress_8_ebacc,
|
||||
p.progress_8_open,
|
||||
|
||||
-- English & Maths
|
||||
p.english_maths_strong_pass_pct,
|
||||
p.english_maths_standard_pass_pct,
|
||||
|
||||
-- EBacc
|
||||
p.ebacc_entry_pct,
|
||||
p.ebacc_strong_pass_pct,
|
||||
p.ebacc_standard_pass_pct,
|
||||
p.ebacc_avg_score,
|
||||
|
||||
-- GCSE
|
||||
p.gcse_grade_91_pct,
|
||||
|
||||
-- Context
|
||||
i.sen_pct,
|
||||
i.sen_ehcp_pct,
|
||||
i.sen_support_pct
|
||||
|
||||
from all_pupils p
|
||||
left join info i on p.urn = i.urn and p.year = i.year
|
||||
|
||||
@@ -1,17 +0,0 @@
|
||||
-- Staging model: Phonics screening check results from EES
|
||||
|
||||
with source as (
|
||||
select * from {{ source('raw', 'ees_phonics') }}
|
||||
),
|
||||
|
||||
renamed as (
|
||||
select
|
||||
cast(urn as integer) as urn,
|
||||
cast(time_period as integer) as year,
|
||||
cast(year1_phonics_pct as numeric) as year1_phonics_pct,
|
||||
cast(year2_phonics_pct as numeric) as year2_phonics_pct
|
||||
from source
|
||||
where urn is not null
|
||||
)
|
||||
|
||||
select * from renamed
|
||||
Reference in New Issue
Block a user