feat: migrate backend to marts schema, update EES tap for verified datasets

Pipeline:
- EES tap: split KS4 into performance + info streams, fix admissions filename
  (SchoolLevel keyword match), fix census filename (yearly suffix), remove
  phonics (no school-level data on EES), change endswith → in for matching
- stg_ees_ks4: rewrite to filter long-format data and extract Attainment 8,
  Progress 8, EBacc, English/Maths metrics; join KS4 info for context
- stg_ees_admissions: map real CSV columns (total_number_places_offered, etc.)
- stg_ees_census: update source reference, stub with TODO for data columns
- Remove stg_ees_phonics, fact_phonics (no school-level EES data)
- Add ees_ks4_performance + ees_ks4_info sources, remove ees_ks4 + ees_phonics
- Update int_ks4_with_lineage + fact_ks4_performance with new KS4 columns
- Annual EES DAG: remove stg_ees_phonics+ from selector

Backend:
- models.py: replace all models to point at marts.* tables with schema='marts'
  (DimSchool, DimLocation, KS2Performance, FactOfstedInspection, etc.)
- data_loader.py: rewrite load_school_data_as_dataframe() using raw SQL joining
  dim_school + dim_location + fact_ks2_performance; update get_supplementary_data()
- database.py: remove migration machinery, keep only connection setup
- app.py: remove check_and_migrate_if_needed, remove /api/admin/reimport-ks2
  endpoints (pipeline handles all imports)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 09:29:27 +00:00
parent d82e36e7b2
commit ca351e9d73
18 changed files with 805 additions and 1245 deletions

View File

@@ -1,408 +1,216 @@
"""
SQLAlchemy database models for school data.
Normalized schema with separate tables for schools and yearly results.
SQLAlchemy models — all tables live in the marts schema, built by dbt.
Read-only: the pipeline writes to these tables; the backend only reads.
"""
from datetime import datetime
from sqlalchemy import Column, Integer, String, Float, Boolean, Date, Text, Index
from sqlalchemy import (
Column, Integer, String, Float, ForeignKey, Index, UniqueConstraint,
Text, Boolean, DateTime, Date
)
from sqlalchemy.orm import relationship
from .database import Base
MARTS = {"schema": "marts"}
class School(Base):
"""
Core school information - relatively static data.
"""
__tablename__ = "schools"
id = Column(Integer, primary_key=True, autoincrement=True)
urn = Column(Integer, unique=True, nullable=False, index=True)
class DimSchool(Base):
"""Canonical school dimension — one row per active URN."""
__tablename__ = "dim_school"
__table_args__ = MARTS
urn = Column(Integer, primary_key=True)
school_name = Column(String(255), nullable=False)
local_authority = Column(String(100))
local_authority_code = Column(Integer)
phase = Column(String(100))
school_type = Column(String(100))
school_type_code = Column(String(10))
religious_denomination = Column(String(100))
academy_trust_name = Column(String(255))
academy_trust_uid = Column(String(20))
religious_character = Column(String(100))
gender = Column(String(20))
age_range = Column(String(20))
# Address
address1 = Column(String(255))
address2 = Column(String(255))
capacity = Column(Integer)
total_pupils = Column(Integer)
headteacher_name = Column(String(200))
website = Column(String(255))
telephone = Column(String(30))
status = Column(String(50))
nursery_provision = Column(Boolean)
admissions_policy = Column(String(50))
# Denormalised Ofsted summary (updated by monthly pipeline)
ofsted_grade = Column(Integer)
ofsted_date = Column(Date)
ofsted_framework = Column(String(20))
class DimLocation(Base):
"""School location — address, lat/lng from easting/northing (BNG→WGS84)."""
__tablename__ = "dim_location"
__table_args__ = MARTS
urn = Column(Integer, primary_key=True)
address_line1 = Column(String(255))
address_line2 = Column(String(255))
town = Column(String(100))
postcode = Column(String(20), index=True)
# Geocoding (cached)
county = Column(String(100))
postcode = Column(String(20))
local_authority_code = Column(Integer)
local_authority_name = Column(String(100))
parliamentary_constituency = Column(String(100))
urban_rural = Column(String(50))
easting = Column(Integer)
northing = Column(Integer)
latitude = Column(Float)
longitude = Column(Float)
# GIAS enrichment fields
website = Column(String(255))
headteacher_name = Column(String(200))
capacity = Column(Integer)
trust_name = Column(String(255))
trust_uid = Column(String(20))
gender = Column(String(20)) # Mixed / Girls / Boys
nursery_provision = Column(Boolean)
# Relationships
results = relationship("SchoolResult", back_populates="school", cascade="all, delete-orphan")
def __repr__(self):
return f"<School(urn={self.urn}, name='{self.school_name}')>"
@property
def address(self):
"""Combine address fields into single string."""
parts = [self.address1, self.address2, self.town, self.postcode]
return ", ".join(p for p in parts if p)
# geom is a PostGIS geometry — not mapped to SQLAlchemy (accessed via raw SQL)
class SchoolResult(Base):
"""
Yearly KS2 results for a school.
Each school can have multiple years of results.
"""
__tablename__ = "school_results"
id = Column(Integer, primary_key=True, autoincrement=True)
school_id = Column(Integer, ForeignKey("schools.id", ondelete="CASCADE"), nullable=False)
year = Column(Integer, nullable=False, index=True)
# Pupil numbers
class KS2Performance(Base):
"""KS2 attainment — one row per URN per year (includes predecessor data)."""
__tablename__ = "fact_ks2_performance"
__table_args__ = (
Index("ix_ks2_urn_year", "urn", "year"),
MARTS,
)
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
source_urn = Column(Integer)
total_pupils = Column(Integer)
eligible_pupils = Column(Integer)
# Core KS2 metrics - Expected Standard
# Core attainment
rwm_expected_pct = Column(Float)
reading_expected_pct = Column(Float)
writing_expected_pct = Column(Float)
maths_expected_pct = Column(Float)
gps_expected_pct = Column(Float)
science_expected_pct = Column(Float)
# Higher Standard
rwm_high_pct = Column(Float)
reading_expected_pct = Column(Float)
reading_high_pct = Column(Float)
writing_high_pct = Column(Float)
maths_high_pct = Column(Float)
gps_high_pct = Column(Float)
# Progress Scores
reading_progress = Column(Float)
writing_progress = Column(Float)
maths_progress = Column(Float)
# Average Scores
reading_avg_score = Column(Float)
reading_progress = Column(Float)
writing_expected_pct = Column(Float)
writing_high_pct = Column(Float)
writing_progress = Column(Float)
maths_expected_pct = Column(Float)
maths_high_pct = Column(Float)
maths_avg_score = Column(Float)
maths_progress = Column(Float)
gps_expected_pct = Column(Float)
gps_high_pct = Column(Float)
gps_avg_score = Column(Float)
# School Context
science_expected_pct = Column(Float)
# Absence
reading_absence_pct = Column(Float)
writing_absence_pct = Column(Float)
maths_absence_pct = Column(Float)
gps_absence_pct = Column(Float)
science_absence_pct = Column(Float)
# Gender
rwm_expected_boys_pct = Column(Float)
rwm_high_boys_pct = Column(Float)
rwm_expected_girls_pct = Column(Float)
rwm_high_girls_pct = Column(Float)
# Disadvantaged
rwm_expected_disadvantaged_pct = Column(Float)
rwm_expected_non_disadvantaged_pct = Column(Float)
disadvantaged_gap = Column(Float)
# Context
disadvantaged_pct = Column(Float)
eal_pct = Column(Float)
sen_support_pct = Column(Float)
sen_ehcp_pct = Column(Float)
stability_pct = Column(Float)
# Pupil Absence from Tests
reading_absence_pct = Column(Float)
gps_absence_pct = Column(Float)
maths_absence_pct = Column(Float)
writing_absence_pct = Column(Float)
science_absence_pct = Column(Float)
# Gender Breakdown
rwm_expected_boys_pct = Column(Float)
rwm_expected_girls_pct = Column(Float)
rwm_high_boys_pct = Column(Float)
rwm_high_girls_pct = Column(Float)
# Disadvantaged Performance
rwm_expected_disadvantaged_pct = Column(Float)
rwm_expected_non_disadvantaged_pct = Column(Float)
disadvantaged_gap = Column(Float)
# 3-Year Averages
rwm_expected_3yr_pct = Column(Float)
reading_avg_3yr = Column(Float)
maths_avg_3yr = Column(Float)
# Relationship
school = relationship("School", back_populates="results")
# Constraints
class FactOfstedInspection(Base):
"""Full Ofsted inspection history — one row per inspection."""
__tablename__ = "fact_ofsted_inspection"
__table_args__ = (
UniqueConstraint('school_id', 'year', name='uq_school_year'),
Index('ix_school_results_school_year', 'school_id', 'year'),
Index("ix_ofsted_urn_date", "urn", "inspection_date"),
MARTS,
)
def __repr__(self):
return f"<SchoolResult(school_id={self.school_id}, year={self.year})>"
class SchemaVersion(Base):
"""
Tracks database schema version for automatic migrations.
Single-row table that stores the current schema version.
"""
__tablename__ = "schema_version"
id = Column(Integer, primary_key=True, default=1)
version = Column(Integer, nullable=False)
migrated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
def __repr__(self):
return f"<SchemaVersion(version={self.version}, migrated_at={self.migrated_at})>"
# ---------------------------------------------------------------------------
# Supplementary data tables (populated by the Kestra data integrator)
# ---------------------------------------------------------------------------
class OfstedInspection(Base):
"""Latest Ofsted inspection judgement per school."""
__tablename__ = "ofsted_inspections"
urn = Column(Integer, primary_key=True)
inspection_date = Column(Date)
publication_date = Column(Date)
inspection_type = Column(String(100)) # Section 5 / Section 8 etc.
# Which inspection framework was used: 'OEIF' or 'ReportCard'
inspection_date = Column(Date, primary_key=True)
inspection_type = Column(String(100))
framework = Column(String(20))
# --- OEIF grades (old framework, pre-Nov 2025) ---
# 1=Outstanding 2=Good 3=Requires improvement 4=Inadequate
overall_effectiveness = Column(Integer)
quality_of_education = Column(Integer)
behaviour_attitudes = Column(Integer)
personal_development = Column(Integer)
leadership_management = Column(Integer)
early_years_provision = Column(Integer) # nullable — not all schools
previous_overall = Column(Integer) # for trend display
# --- Report Card grades (new framework, from Nov 2025) ---
# 1=Exceptional 2=Strong 3=Expected standard 4=Needs attention 5=Urgent improvement
rc_safeguarding_met = Column(Boolean) # True=Met, False=Not met
early_years_provision = Column(Integer)
sixth_form_provision = Column(Integer)
rc_safeguarding_met = Column(Boolean)
rc_inclusion = Column(Integer)
rc_curriculum_teaching = Column(Integer)
rc_achievement = Column(Integer)
rc_attendance_behaviour = Column(Integer)
rc_personal_development = Column(Integer)
rc_leadership_governance = Column(Integer)
rc_early_years = Column(Integer) # nullable — not all schools
rc_sixth_form = Column(Integer) # nullable — secondary only
def __repr__(self):
return f"<OfstedInspection(urn={self.urn}, framework={self.framework}, overall={self.overall_effectiveness})>"
rc_early_years = Column(Integer)
rc_sixth_form = Column(Integer)
report_url = Column(Text)
class OfstedParentView(Base):
"""Ofsted Parent View survey — latest per school. 14 questions, % saying Yes."""
__tablename__ = "ofsted_parent_view"
class FactParentView(Base):
"""Ofsted Parent View survey — latest per school."""
__tablename__ = "fact_parent_view"
__table_args__ = MARTS
urn = Column(Integer, primary_key=True)
survey_date = Column(Date)
total_responses = Column(Integer)
q_happy_pct = Column(Float) # My child is happy at this school
q_safe_pct = Column(Float) # My child feels safe at this school
q_bullying_pct = Column(Float) # School deals with bullying well
q_communication_pct = Column(Float) # School keeps me informed
q_progress_pct = Column(Float) # My child does well / good progress
q_teaching_pct = Column(Float) # Teaching is good
q_information_pct = Column(Float) # I receive valuable info about progress
q_curriculum_pct = Column(Float) # Broad range of subjects taught
q_future_pct = Column(Float) # Prepares child well for the future
q_leadership_pct = Column(Float) # Led and managed effectively
q_wellbeing_pct = Column(Float) # Supports wider personal development
q_behaviour_pct = Column(Float) # Pupils are well behaved
q_recommend_pct = Column(Float) # I would recommend this school
q_sen_pct = Column(Float) # Good information about child's SEN (where applicable)
def __repr__(self):
return f"<OfstedParentView(urn={self.urn}, responses={self.total_responses})>"
q_happy_pct = Column(Float)
q_safe_pct = Column(Float)
q_behaviour_pct = Column(Float)
q_bullying_pct = Column(Float)
q_communication_pct = Column(Float)
q_progress_pct = Column(Float)
q_teaching_pct = Column(Float)
q_information_pct = Column(Float)
q_curriculum_pct = Column(Float)
q_future_pct = Column(Float)
q_leadership_pct = Column(Float)
q_wellbeing_pct = Column(Float)
q_recommend_pct = Column(Float)
class SchoolCensus(Base):
"""Annual school census snapshot — class sizes and ethnicity breakdown."""
__tablename__ = "school_census"
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
class_size_avg = Column(Float)
ethnicity_white_pct = Column(Float)
ethnicity_asian_pct = Column(Float)
ethnicity_black_pct = Column(Float)
ethnicity_mixed_pct = Column(Float)
ethnicity_other_pct = Column(Float)
class FactAdmissions(Base):
"""School admissions — one row per URN per year."""
__tablename__ = "fact_admissions"
__table_args__ = (
Index('ix_school_census_urn_year', 'urn', 'year'),
Index("ix_admissions_urn_year", "urn", "year"),
MARTS,
)
def __repr__(self):
return f"<SchoolCensus(urn={self.urn}, year={self.year})>"
class SchoolAdmissions(Base):
"""Annual admissions statistics per school."""
__tablename__ = "school_admissions"
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
published_admission_number = Column(Integer) # PAN
school_phase = Column(String(50))
published_admission_number = Column(Integer)
total_applications = Column(Integer)
first_preference_offers_pct = Column(Float) # % receiving 1st choice
first_preference_applications = Column(Integer)
first_preference_offers = Column(Integer)
first_preference_offer_pct = Column(Float)
oversubscribed = Column(Boolean)
__table_args__ = (
Index('ix_school_admissions_urn_year', 'urn', 'year'),
)
def __repr__(self):
return f"<SchoolAdmissions(urn={self.urn}, year={self.year})>"
admissions_policy = Column(String(100))
class SenDetail(Base):
"""SEN primary need type breakdown — more granular than school_results context fields."""
__tablename__ = "sen_detail"
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
primary_need_speech_pct = Column(Float) # SLCN
primary_need_autism_pct = Column(Float) # ASD
primary_need_mld_pct = Column(Float) # Moderate learning difficulty
primary_need_spld_pct = Column(Float) # Specific learning difficulty (dyslexia etc.)
primary_need_semh_pct = Column(Float) # Social, emotional, mental health
primary_need_physical_pct = Column(Float) # Physical/sensory
primary_need_other_pct = Column(Float)
__table_args__ = (
Index('ix_sen_detail_urn_year', 'urn', 'year'),
)
def __repr__(self):
return f"<SenDetail(urn={self.urn}, year={self.year})>"
class Phonics(Base):
"""Phonics Screening Check pass rates."""
__tablename__ = "phonics"
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
year1_phonics_pct = Column(Float) # % reaching expected standard in Year 1
year2_phonics_pct = Column(Float) # % reaching standard in Year 2 (re-takers)
__table_args__ = (
Index('ix_phonics_urn_year', 'urn', 'year'),
)
def __repr__(self):
return f"<Phonics(urn={self.urn}, year={self.year})>"
class SchoolDeprivation(Base):
"""IDACI deprivation index — derived via postcode → LSOA lookup."""
__tablename__ = "school_deprivation"
class FactDeprivation(Base):
"""IDACI deprivation index — one row per URN."""
__tablename__ = "fact_deprivation"
__table_args__ = MARTS
urn = Column(Integer, primary_key=True)
lsoa_code = Column(String(20))
idaci_score = Column(Float) # 01, higher = more deprived
idaci_decile = Column(Integer) # 1 = most deprived, 10 = least deprived
def __repr__(self):
return f"<SchoolDeprivation(urn={self.urn}, decile={self.idaci_decile})>"
idaci_score = Column(Float)
idaci_decile = Column(Integer)
class SchoolFinance(Base):
"""FBIT financial benchmarking data."""
__tablename__ = "school_finance"
class FactFinance(Base):
"""FBIT financial benchmarking — one row per URN per year."""
__tablename__ = "fact_finance"
__table_args__ = (
Index("ix_finance_urn_year", "urn", "year"),
MARTS,
)
urn = Column(Integer, primary_key=True)
year = Column(Integer, primary_key=True)
per_pupil_spend = Column(Float) # £ total expenditure per pupil
staff_cost_pct = Column(Float) # % of budget on all staff
teacher_cost_pct = Column(Float) # % on teachers specifically
per_pupil_spend = Column(Float)
staff_cost_pct = Column(Float)
teacher_cost_pct = Column(Float)
support_staff_cost_pct = Column(Float)
premises_cost_pct = Column(Float)
__table_args__ = (
Index('ix_school_finance_urn_year', 'urn', 'year'),
)
def __repr__(self):
return f"<SchoolFinance(urn={self.urn}, year={self.year})>"
# Mapping from CSV columns to model fields
SCHOOL_FIELD_MAPPING = {
'urn': 'urn',
'school_name': 'school_name',
'local_authority': 'local_authority',
'local_authority_code': 'local_authority_code',
'school_type': 'school_type',
'school_type_code': 'school_type_code',
'religious_denomination': 'religious_denomination',
'age_range': 'age_range',
'address1': 'address1',
'address2': 'address2',
'town': 'town',
'postcode': 'postcode',
}
RESULT_FIELD_MAPPING = {
'year': 'year',
'total_pupils': 'total_pupils',
'eligible_pupils': 'eligible_pupils',
# Expected Standard
'rwm_expected_pct': 'rwm_expected_pct',
'reading_expected_pct': 'reading_expected_pct',
'writing_expected_pct': 'writing_expected_pct',
'maths_expected_pct': 'maths_expected_pct',
'gps_expected_pct': 'gps_expected_pct',
'science_expected_pct': 'science_expected_pct',
# Higher Standard
'rwm_high_pct': 'rwm_high_pct',
'reading_high_pct': 'reading_high_pct',
'writing_high_pct': 'writing_high_pct',
'maths_high_pct': 'maths_high_pct',
'gps_high_pct': 'gps_high_pct',
# Progress
'reading_progress': 'reading_progress',
'writing_progress': 'writing_progress',
'maths_progress': 'maths_progress',
# Averages
'reading_avg_score': 'reading_avg_score',
'maths_avg_score': 'maths_avg_score',
'gps_avg_score': 'gps_avg_score',
# Context
'disadvantaged_pct': 'disadvantaged_pct',
'eal_pct': 'eal_pct',
'sen_support_pct': 'sen_support_pct',
'sen_ehcp_pct': 'sen_ehcp_pct',
'stability_pct': 'stability_pct',
# Absence
'reading_absence_pct': 'reading_absence_pct',
'gps_absence_pct': 'gps_absence_pct',
'maths_absence_pct': 'maths_absence_pct',
'writing_absence_pct': 'writing_absence_pct',
'science_absence_pct': 'science_absence_pct',
# Gender
'rwm_expected_boys_pct': 'rwm_expected_boys_pct',
'rwm_expected_girls_pct': 'rwm_expected_girls_pct',
'rwm_high_boys_pct': 'rwm_high_boys_pct',
'rwm_high_girls_pct': 'rwm_high_girls_pct',
# Disadvantaged
'rwm_expected_disadvantaged_pct': 'rwm_expected_disadvantaged_pct',
'rwm_expected_non_disadvantaged_pct': 'rwm_expected_non_disadvantaged_pct',
'disadvantaged_gap': 'disadvantaged_gap',
# 3-Year
'rwm_expected_3yr_pct': 'rwm_expected_3yr_pct',
'reading_avg_3yr': 'reading_avg_3yr',
'maths_avg_3yr': 'maths_avg_3yr',
}