feat: migrate backend to marts schema, update EES tap for verified datasets

Pipeline:
- EES tap: split KS4 into performance + info streams, fix admissions filename (SchoolLevel keyword match), fix census filename (yearly suffix), remove phonics (no school-level data on EES), change endswith → in for matching
- stg_ees_ks4: rewrite to filter long-format data and extract Attainment 8, Progress 8, EBacc, English/Maths metrics; join KS4 info for context
- stg_ees_admissions: map real CSV columns (total_number_places_offered, etc.)
- stg_ees_census: update source reference, stub with TODO for data columns
- Remove stg_ees_phonics, fact_phonics (no school-level EES data)
- Add ees_ks4_performance + ees_ks4_info sources, remove ees_ks4 + ees_phonics
- Update int_ks4_with_lineage + fact_ks4_performance with new KS4 columns
- Annual EES DAG: remove stg_ees_phonics+ from selector

Backend:
- models.py: replace all models to point at marts.* tables with schema='marts' (DimSchool, DimLocation, KS2Performance, FactOfstedInspection, etc.)
- data_loader.py: rewrite load_school_data_as_dataframe() using raw SQL joining dim_school + dim_location + fact_ks2_performance; update get_supplementary_data()
- database.py: remove migration machinery, keep only connection setup
- app.py: remove check_and_migrate_if_needed, remove /api/admin/reimport-ks2 endpoints (pipeline handles all imports)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 09:29:27 +00:00
parent d82e36e7b2
commit ca351e9d73
18 changed files with 805 additions and 1245 deletions
@@ -1,9 +1,11 @@
-"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data.
+"""EES Singer tap — extracts KS2, KS4, Census, Admissions data.

 Each stream targets a specific CSV file within an EES release ZIP.
 The EES data uses 'school_urn' for school-level records and 'z' for
 suppressed values. Column names vary by file — schemas declare all
 columns needed by downstream dbt staging models.
+
+Phonics has no school-level data on EES and is not included.
 """

 from __future__ import annotations
@@ -38,11 +40,15 @@ def download_release_zip(release_id: str) -> zipfile.ZipFile:


 class EESDatasetStream(Stream):
-    """Base stream for an EES dataset extracted from a release ZIP."""
+    """Base stream for an EES dataset extracted from a release ZIP.
+
+    Subclasses set _target_filename to a keyword that appears in the
+    target CSV path inside the ZIP (substring match, not exact).
+    """

    replication_key = None
    _publication_slug: str = ""
-    _target_filename: str = ""  # exact filename within the ZIP
+    _target_filename: str = ""  # keyword that appears in the CSV path
    _urn_column: str = "school_urn"  # column name for URN in the CSV

    def get_records(self, context):
@@ -56,17 +62,17 @@ class EESDatasetStream(Stream):
        )
        zf = download_release_zip(release_id)

-        # Find the target file
+        # Find the target file (substring match)
        all_files = zf.namelist()
        target = None
        for name in all_files:
-            if name.endswith(self._target_filename):
+            if self._target_filename in name and name.endswith(".csv"):
                target = name
                break

        if not target:
            self.logger.error(
-                "File '%s' not found in ZIP. Available: %s",
+                "File matching '%s' not found in ZIP. Available: %s",
                self._target_filename,
                [n for n in all_files if n.endswith(".csv")],
            )
@@ -96,7 +102,7 @@ class EESKS2AttainmentStream(EESDatasetStream):
    name = "ees_ks2_attainment"
    primary_keys = ["school_urn", "time_period", "subject", "breakdown_topic", "breakdown"]
    _publication_slug = "key-stage-2-attainment"
-    _target_filename = "ks2_school_attainment_data.csv"
+    _target_filename = "ks2_school_attainment_data"
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
        th.Property("school_urn", th.StringType, required=True),
@@ -126,7 +132,7 @@ class EESKS2InfoStream(EESDatasetStream):
    name = "ees_ks2_info"
    primary_keys = ["school_urn", "time_period"]
    _publication_slug = "key-stage-2-attainment"
-    _target_filename = "ks2_school_information_data.csv"
+    _target_filename = "ks2_school_information_data"
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
        th.Property("school_urn", th.StringType, required=True),
@@ -150,60 +156,172 @@ class EESKS2InfoStream(EESDatasetStream):
    ).to_dict()


-# ── KS4 Attainment ──────────────────────────────────────────────────────────
+# ── KS4 Performance (long format: one row per school × breakdown × sex) ─────
+# File: 202425_performance_tables_schools_revised.csv (156 cols)
+# Dimensions: breakdown_topic, breakdown, sex, disadvantage_status, etc.
+# Metrics are already in separate columns (attainment8_average, progress8_average, etc.)

-class EESKS4Stream(EESDatasetStream):
-    name = "ees_ks4"
-    primary_keys = ["school_urn", "time_period"]
+class EESKS4PerformanceStream(EESDatasetStream):
+    name = "ees_ks4_performance"
+    primary_keys = ["school_urn", "time_period", "breakdown_topic", "breakdown", "sex"]
    _publication_slug = "key-stage-4-performance"
-    _target_filename = "school"  # Will be refined once we see the actual ZIP contents
+    _target_filename = "performance_tables_schools"
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
        th.Property("school_urn", th.StringType, required=True),
+        th.Property("school_laestab", th.StringType),
+        th.Property("school_name", th.StringType),
+        th.Property("establishment_type_group", th.StringType),
+        th.Property("breakdown_topic", th.StringType, required=True),
+        th.Property("breakdown", th.StringType, required=True),
+        th.Property("sex", th.StringType, required=True),
+        th.Property("disadvantage_status", th.StringType),
+        th.Property("first_language", th.StringType),
+        th.Property("prior_attainment", th.StringType),
+        th.Property("mobility", th.StringType),
+        # Pupil counts
+        th.Property("pupil_count", th.StringType),
+        th.Property("pupil_percent", th.StringType),
+        # Attainment 8
+        th.Property("attainment8_sum", th.StringType),
+        th.Property("attainment8_average", th.StringType),
+        # English & Maths
+        th.Property("engmath_entering_total", th.StringType),
+        th.Property("engmath_entering_percent", th.StringType),
+        th.Property("engmath_95_total", th.StringType),
+        th.Property("engmath_95_percent", th.StringType),
+        th.Property("engmath_94_total", th.StringType),
+        th.Property("engmath_94_percent", th.StringType),
+        # EBacc
+        th.Property("ebacc_entering_total", th.StringType),
+        th.Property("ebacc_entering_percent", th.StringType),
+        th.Property("ebacc_95_total", th.StringType),
+        th.Property("ebacc_95_percent", th.StringType),
+        th.Property("ebacc_94_total", th.StringType),
+        th.Property("ebacc_94_percent", th.StringType),
+        th.Property("ebacc_aps_sum", th.StringType),
+        th.Property("ebacc_aps_average", th.StringType),
+        # Progress 8
+        th.Property("progress8_pupil_count", th.StringType),
+        th.Property("progress8_sum", th.StringType),
+        th.Property("progress8_average", th.StringType),
+        th.Property("progress8_lower_95_ci", th.StringType),
+        th.Property("progress8_upper_95_ci", th.StringType),
+        # Progress 8 elements
+        th.Property("progress8eng_average", th.StringType),
+        th.Property("progress8mat_average", th.StringType),
+        th.Property("progress8ebacc_average", th.StringType),
+        th.Property("progress8open_average", th.StringType),
+        # GCSE grades
+        th.Property("gcse_91_total", th.StringType),
+        th.Property("gcse_91_percent", th.StringType),
+        # EBacc subject entry/achievement
+        th.Property("ebacceng_entering_percent", th.StringType),
+        th.Property("ebaccmat_entering_percent", th.StringType),
+        th.Property("ebaccsci_entering_percent", th.StringType),
+        th.Property("ebacchum_entering_percent", th.StringType),
+        th.Property("ebacclan_entering_percent", th.StringType),
+    ).to_dict()
+
+
+# ── KS4 Information (wide format: one row per school, context/demographics) ──
+# File: 202425_information_about_schools_provisional.csv (38 cols)
+
+class EESKS4InfoStream(EESDatasetStream):
+    name = "ees_ks4_info"
+    primary_keys = ["school_urn", "time_period"]
+    _publication_slug = "key-stage-4-performance"
+    _target_filename = "information_about_schools"
+    schema = th.PropertiesList(
+        th.Property("time_period", th.StringType, required=True),
+        th.Property("school_urn", th.StringType, required=True),
+        th.Property("school_laestab", th.StringType),
+        th.Property("school_name", th.StringType),
+        th.Property("establishment_type_group", th.StringType),
+        th.Property("reldenom", th.StringType),
+        th.Property("admpol_pt", th.StringType),
+        th.Property("egender", th.StringType),
+        th.Property("agerange", th.StringType),
+        th.Property("allks_pupil_count", th.StringType),
+        th.Property("allks_boys_count", th.StringType),
+        th.Property("allks_girls_count", th.StringType),
+        th.Property("endks4_pupil_count", th.StringType),
+        th.Property("ks2_scaledscore_average", th.StringType),
+        th.Property("sen_with_ehcp_pupil_percent", th.StringType),
+        th.Property("sen_pupil_percent", th.StringType),
+        th.Property("sen_no_ehcp_pupil_percent", th.StringType),
+        th.Property("attainment8_diffn", th.StringType),
+        th.Property("progress8_diffn", th.StringType),
+        th.Property("progress8_banding", th.StringType),
    ).to_dict()


 # ── Census (school-level pupil characteristics) ─────────────────────────────
+# File: spc_school_level_underlying_data_YYYY.csv (269 cols, in supporting-files/)
+# Uses 'urn' not 'school_urn'. Filename has yearly suffix that changes.

 class EESCensusStream(EESDatasetStream):
    name = "ees_census"
-    primary_keys = ["urn", "time_period"]
+    primary_keys = ["school_urn", "time_period"]
    _publication_slug = "school-pupils-and-their-characteristics"
-    _target_filename = "spc_school_level_underlying_data_2025.csv"
+    _target_filename = "spc_school_level_underlying_data"
    _urn_column = "urn"
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
-        th.Property("urn", th.StringType, required=True),
+        th.Property("school_urn", th.StringType, required=True),
        th.Property("school_name", th.StringType),
        th.Property("laestab", th.StringType),
        th.Property("phase_type_grouping", th.StringType),
+        # TODO: Add data columns (ethnicity %, FSM %, SEN %, etc.) once
+        # actual column names are verified on the container. The CSV has
+        # 269 columns — only the first 30 (metadata) have been inspected.
    ).to_dict()


 # ── Admissions ───────────────────────────────────────────────────────────────
+# File: AppsandOffers_YYYY_SchoolLevelDDMMYYYY.csv (37 cols, in supporting-files/)
+# Wide format, no geographic_level column. Uses school_urn.

 class EESAdmissionsStream(EESDatasetStream):
    name = "ees_admissions"
    primary_keys = ["school_urn", "time_period"]
    _publication_slug = "primary-and-secondary-school-applications-and-offers"
-    _target_filename = "school"  # Will be refined once we see the actual ZIP contents
+    _target_filename = "SchoolLevel"
    schema = th.PropertiesList(
        th.Property("time_period", th.StringType, required=True),
        th.Property("school_urn", th.StringType, required=True),
+        th.Property("school_name", th.StringType),
+        th.Property("school_laestab_as_used", th.StringType),
+        th.Property("school_phase", th.StringType),
+        th.Property("entry_year", th.StringType),
+        # Places and offers
+        th.Property("total_number_places_offered", th.StringType),
+        th.Property("number_preferred_offers", th.StringType),
+        th.Property("number_1st_preference_offers", th.StringType),
+        th.Property("number_2nd_preference_offers", th.StringType),
+        th.Property("number_3rd_preference_offers", th.StringType),
+        # Applications
+        th.Property("times_put_as_any_preferred_school", th.StringType),
+        th.Property("times_put_as_1st_preference", th.StringType),
+        th.Property("times_put_as_2nd_preference", th.StringType),
+        th.Property("times_put_as_3rd_preference", th.StringType),
+        # Proportions
+        th.Property("proportion_1stprefs_v_1stprefoffers", th.StringType),
+        th.Property("proportion_1stprefs_v_totaloffers", th.StringType),
+        # Cross-LA
+        th.Property("all_applications_from_another_LA", th.StringType),
+        th.Property("offers_to_applicants_from_another_LA", th.StringType),
+        # Context
+        th.Property("establishment_type", th.StringType),
+        th.Property("denomination", th.StringType),
+        th.Property("FSM_eligible_percent", th.StringType),
+        th.Property("admissions_policy", th.StringType),
+        th.Property("urban_rural", th.StringType),
    ).to_dict()


-# ── Phonics ──────────────────────────────────────────────────────────────────
-
-class EESPhonicsStream(EESDatasetStream):
-    name = "ees_phonics"
-    primary_keys = ["school_urn", "time_period"]
-    _publication_slug = "phonics-screening-check-attainment"
-    _target_filename = "school"  # Will be refined once we see the actual ZIP contents
-    schema = th.PropertiesList(
-        th.Property("time_period", th.StringType, required=True),
-        th.Property("school_urn", th.StringType, required=True),
-    ).to_dict()
+# Note: Phonics (phonics-screening-check-attainment) has NO school-level data
+# on EES. Only national and LA-level files are published.


 class TapUKEES(Tap):
@@ -219,10 +337,10 @@ class TapUKEES(Tap):
        return [
            EESKS2AttainmentStream(self),
            EESKS2InfoStream(self),
-            EESKS4Stream(self),
+            EESKS4PerformanceStream(self),
+            EESKS4InfoStream(self),
            EESCensusStream(self),
            EESAdmissionsStream(self),
-            EESPhonicsStream(self),
        ]