feat(census): add demographic columns to EES census tap and staging models
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m7s
Build and Push Docker Images / Build Integrator (push) Successful in 55s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m39s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m7s
Build and Push Docker Images / Build Integrator (push) Successful in 55s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m39s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
tap-uk-ees: EESCensusStream now declares 27 data columns (pupil counts, FSM count and %, EAL %, young-carer %, ethnicity breakdowns) with clean Singer field names mapped from the verbose CSV column names (e.g. '% of pupils known to be eligible for free school meals' → fsm_pct) via a new _column_renames mechanism on the base stream class.

stg_ees_census: materialised as table, applies safe_numeric to all percentage/count columns, filters to numeric URNs.

int_pupil_chars_merged + fact_pupil_characteristics: pass all columns through from staging (previously stubs with only 3 columns).

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -44,6 +44,8 @@ class EESDatasetStream(Stream):
|
||||
|
||||
Subclasses set _target_filename to a keyword that appears in the
|
||||
target CSV path inside the ZIP (substring match, not exact).
|
||||
Subclasses may set _column_renames to map messy CSV column names to
|
||||
clean Singer field names before yielding records.
|
||||
"""
|
||||
|
||||
replication_key = None
|
||||
@@ -51,6 +53,7 @@ class EESDatasetStream(Stream):
|
||||
_target_filename: str = "" # keyword that appears in the CSV path
|
||||
_urn_column: str = "school_urn" # column name for URN in the CSV
|
||||
_encoding: str = "utf-8" # CSV file encoding (some DfE files use latin-1)
|
||||
_column_renames: dict = {} # CSV column name → Singer field name
|
||||
|
||||
def get_records(self, context):
|
||||
import pandas as pd
|
||||
@@ -104,6 +107,10 @@ class EESDatasetStream(Stream):
|
||||
|
||||
for _, row in df.iterrows():
|
||||
record = row.to_dict()
|
||||
# Apply subclass column renames (messy CSV names → clean Singer fields)
|
||||
for old, new in self._column_renames.items():
|
||||
if old in record:
|
||||
record[new] = record.pop(old)
|
||||
# Normalise URN column to 'school_urn' for consistency
|
||||
if self._urn_column in record and self._urn_column != "school_urn":
|
||||
record["school_urn"] = record.pop(self._urn_column)
|
||||
@@ -281,15 +288,75 @@ class EESCensusStream(EESDatasetStream):
|
||||
_target_filename = "spc_school_level_underlying_data"
|
||||
_urn_column = "urn"
|
||||
_encoding = "latin-1"
|
||||
|
||||
# Map verbose CSV column names to clean Singer field names
|
||||
_column_renames = {
|
||||
"headcount of pupils": "total_pupils",
|
||||
"headcount total female": "female_pupils",
|
||||
"headcount total male": "male_pupils",
|
||||
"full-time pupils": "full_time_pupils",
|
||||
"part-time pupils": "part_time_pupils",
|
||||
"total boarders": "total_boarders",
|
||||
"number of pupils known to be eligible for free school meals": "fsm_eligible_n",
|
||||
"% of pupils known to be eligible for free school meals": "fsm_pct",
|
||||
"% of pupils whose first language is known or believed to be other than English": "eal_pct",
|
||||
"% of pupils who are a young carer": "young_carer_pct",
|
||||
"% of pupils classified as white British ethnic origin": "ethnicity_white_british_pct",
|
||||
"% of pupils classified as any other white background ethnic origin": "ethnicity_white_other_pct",
|
||||
"% of pupils classified as Gypsy/Roma ethnic origin": "ethnicity_gypsy_roma_pct",
|
||||
"% of pupils classified as white and black Caribbean ethnic origin": "ethnicity_mixed_wbc_pct",
|
||||
"% of pupils classified as white and black African ethnic origin": "ethnicity_mixed_wba_pct",
|
||||
"% of pupils classified as white and Asian ethnic origin": "ethnicity_mixed_wa_pct",
|
||||
"% of pupils classified as any other mixed background ethnic origin": "ethnicity_mixed_other_pct",
|
||||
"% of pupils classified as Indian ethnic origin": "ethnicity_asian_indian_pct",
|
||||
"% of pupils classified as Pakistani ethnic origin": "ethnicity_asian_pakistani_pct",
|
||||
"% of pupils classified as Bangladeshi ethnic origin": "ethnicity_asian_bangladeshi_pct",
|
||||
"% of pupils classified as any other Asian background ethnic origin": "ethnicity_asian_other_pct",
|
||||
"% of pupils classified as Caribbean ethnic origin": "ethnicity_black_caribbean_pct",
|
||||
"% of pupils classified as African ethnic origin": "ethnicity_black_african_pct",
|
||||
"% of pupils classified as any other black background ethnic origin": "ethnicity_black_other_pct",
|
||||
"% of pupils classified as Chinese ethnic origin": "ethnicity_chinese_pct",
|
||||
"% of pupils classified as any other ethnic group ethnic origin": "ethnicity_other_pct",
|
||||
"% of pupils unclassified": "ethnicity_unclassified_pct",
|
||||
}
|
||||
|
||||
schema = th.PropertiesList(
|
||||
th.Property("time_period", th.StringType, required=True),
|
||||
th.Property("school_urn", th.StringType, required=True),
|
||||
th.Property("school_name", th.StringType),
|
||||
th.Property("laestab", th.StringType),
|
||||
th.Property("phase_type_grouping", th.StringType),
|
||||
# TODO: Add data columns (ethnicity %, FSM %, SEN %, etc.) once
|
||||
# actual column names are verified on the container. The CSV has
|
||||
# 269 columns — only the first 30 (metadata) have been inspected.
|
||||
# Pupil counts
|
||||
th.Property("total_pupils", th.StringType),
|
||||
th.Property("female_pupils", th.StringType),
|
||||
th.Property("male_pupils", th.StringType),
|
||||
th.Property("full_time_pupils", th.StringType),
|
||||
th.Property("part_time_pupils", th.StringType),
|
||||
th.Property("total_boarders", th.StringType),
|
||||
# FSM
|
||||
th.Property("fsm_eligible_n", th.StringType),
|
||||
th.Property("fsm_pct", th.StringType),
|
||||
# EAL & young carers
|
||||
th.Property("eal_pct", th.StringType),
|
||||
th.Property("young_carer_pct", th.StringType),
|
||||
# Ethnicity percentages
|
||||
th.Property("ethnicity_white_british_pct", th.StringType),
|
||||
th.Property("ethnicity_white_other_pct", th.StringType),
|
||||
th.Property("ethnicity_gypsy_roma_pct", th.StringType),
|
||||
th.Property("ethnicity_mixed_wbc_pct", th.StringType),
|
||||
th.Property("ethnicity_mixed_wba_pct", th.StringType),
|
||||
th.Property("ethnicity_mixed_wa_pct", th.StringType),
|
||||
th.Property("ethnicity_mixed_other_pct", th.StringType),
|
||||
th.Property("ethnicity_asian_indian_pct", th.StringType),
|
||||
th.Property("ethnicity_asian_pakistani_pct", th.StringType),
|
||||
th.Property("ethnicity_asian_bangladeshi_pct", th.StringType),
|
||||
th.Property("ethnicity_asian_other_pct", th.StringType),
|
||||
th.Property("ethnicity_black_caribbean_pct", th.StringType),
|
||||
th.Property("ethnicity_black_african_pct", th.StringType),
|
||||
th.Property("ethnicity_black_other_pct", th.StringType),
|
||||
th.Property("ethnicity_chinese_pct", th.StringType),
|
||||
th.Property("ethnicity_other_pct", th.StringType),
|
||||
th.Property("ethnicity_unclassified_pct", th.StringType),
|
||||
).to_dict()
|
||||
|
||||
|
||||
|
||||
Reference in New Issue
Block a user