feat: migrate backend to marts schema, update EES tap for verified datasets

Pipeline:
- EES tap: split KS4 into performance + info streams, fix admissions filename
  (SchoolLevel keyword match), fix census filename (yearly suffix), remove
  phonics (no school-level data on EES), switch matching from endswith() to a
  substring (`in`) check
- stg_ees_ks4: rewrite to filter long-format data and extract Attainment 8,
  Progress 8, EBacc, English/Maths metrics; join KS4 info for context
- stg_ees_admissions: map real CSV columns (total_number_places_offered, etc.)
- stg_ees_census: update source reference, stub with TODO for data columns
- Remove stg_ees_phonics, fact_phonics (no school-level EES data)
- Add ees_ks4_performance + ees_ks4_info sources, remove ees_ks4 + ees_phonics
- Update int_ks4_with_lineage + fact_ks4_performance with new KS4 columns
- Annual EES DAG: remove stg_ees_phonics+ from selector

Backend:
- models.py: replace all models with ones that point at marts.* tables
  (schema='marts'): DimSchool, DimLocation, KS2Performance,
  FactOfstedInspection, etc.
- data_loader.py: rewrite load_school_data_as_dataframe() using raw SQL joining
  dim_school + dim_location + fact_ks2_performance; update get_supplementary_data()
- database.py: remove migration machinery, keep only connection setup
- app.py: remove check_and_migrate_if_needed, remove /api/admin/reimport-ks2
  endpoints (pipeline handles all imports)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 09:29:27 +00:00
parent d82e36e7b2
commit ca351e9d73
18 changed files with 805 additions and 1245 deletions

View File

@@ -1,36 +1,30 @@
"""
Database connection setup using SQLAlchemy.
The schema is managed by dbt — the backend only reads from marts.* tables.
"""
from datetime import datetime
from typing import Optional
from sqlalchemy import create_engine, inspect
from sqlalchemy.orm import sessionmaker, declarative_base
from contextlib import contextmanager
from sqlalchemy import create_engine
from sqlalchemy.orm import sessionmaker, declarative_base
from .config import settings
# Create the shared engine. Each keyword is passed exactly once: repeating
# pool_pre_ping / echo in the same call is a SyntaxError in Python.
engine = create_engine(
    settings.database_url,
    pool_size=10,
    max_overflow=20,
    pool_pre_ping=True,  # Verify connections before use
    echo=False,  # Set to True for SQL debugging
)

# Session factory bound to the engine above.
SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)

# Base class for models
Base = declarative_base()
def get_db():
"""
Dependency for FastAPI routes to get a database session.
"""
"""Dependency for FastAPI routes."""
db = SessionLocal()
try:
yield db
@@ -40,10 +34,7 @@ def get_db():
@contextmanager
def get_db_session():
"""
Context manager for database sessions.
Use in non-FastAPI contexts (scripts, etc).
"""
"""Context manager for non-FastAPI contexts."""
db = SessionLocal()
try:
yield db
@@ -53,95 +44,3 @@ def get_db_session():
raise
finally:
db.close()
def init_db():
    """Create all tables registered on ``Base.metadata`` via the module engine."""
    metadata = Base.metadata
    metadata.create_all(bind=engine)
def drop_db():
    """Drop all tables registered on ``Base.metadata`` - destructive, use with caution!"""
    metadata = Base.metadata
    metadata.drop_all(bind=engine)
def get_db_schema_version() -> Optional[int]:
    """
    Return the schema version recorded in the database.

    Returns:
        The ``version`` of the first ``SchemaVersion`` row, or ``None`` when
        the ``schema_version`` table is missing, holds no row, or the read
        fails.
    """
    import logging

    from .models import SchemaVersion  # Import here to avoid circular imports

    # A missing table means no version has ever been recorded.
    inspector = inspect(engine)
    if "schema_version" not in inspector.get_table_names():
        return None

    try:
        with get_db_session() as db:
            row = db.query(SchemaVersion).first()
            return row.version if row else None
    except Exception:
        # Still best-effort (callers get None), but record the failure so a
        # genuine database error is not mistaken for "no version recorded".
        logging.getLogger(__name__).exception(
            "Could not read schema version from the database"
        )
        return None
def set_db_schema_version(version: int):
    """
    Record ``version`` as the current schema version.

    Updates the existing ``SchemaVersion`` row when one is found; otherwise
    adds a new row with ``id=1``. ``migrated_at`` is stamped with the current
    naive UTC time in both cases.

    NOTE(review): persistence presumably relies on ``get_db_session``
    committing on exit - the commit is not visible in this block; confirm.
    """
    from .models import SchemaVersion

    with get_db_session() as session:
        existing = session.query(SchemaVersion).first()
        if not existing:
            # No row yet: create the single version record.
            fresh = SchemaVersion(
                id=1,
                version=version,
                migrated_at=datetime.utcnow(),
            )
            session.add(fresh)
        else:
            existing.version = version
            existing.migrated_at = datetime.utcnow()
def check_and_migrate_if_needed():
    """
    Compare the stored schema version with ``SCHEMA_VERSION`` and migrate on mismatch.

    When the versions match, only ``init_db()`` is run. Otherwise the tables
    are initialised, the new version is recorded, and a full migration is run
    with ``geocode=False``.

    Raises:
        Exception: any error raised during initialisation, version recording
            or migration is reported on stdout and re-raised.
    """
    from .version import SCHEMA_VERSION
    from .migration import run_full_migration

    current_version = get_db_schema_version()

    if current_version == SCHEMA_VERSION:
        print(f"Schema version {SCHEMA_VERSION} matches. Fast startup.")
        # Tables should already exist when the version matches; ensure it anyway.
        init_db()
        return

    if current_version is not None:
        print(f"Schema mismatch: DB has v{current_version}, code expects v{SCHEMA_VERSION}")
        print("Running full migration...")
    else:
        print(f"No schema version found. Running initial migration (v{SCHEMA_VERSION})...")

    try:
        # Record the schema version BEFORE migrating so that a crash
        # mid-migration does not trigger a re-migration on every restart.
        init_db()
        set_db_schema_version(SCHEMA_VERSION)

        migrated = run_full_migration(geocode=False)
        print(
            f"Migration complete. Schema version {SCHEMA_VERSION}."
            if migrated
            else "Warning: Migration completed but no data was imported."
        )
    except Exception as exc:
        print(f"FATAL: Migration failed: {exc}")
        print("Application cannot start. Please check database and CSV files.")
        raise