feat: migrate backend to marts schema, update EES tap for verified datasets

Pipeline:
- EES tap: split KS4 into performance + info streams, fix admissions filename
  (SchoolLevel keyword match), fix census filename (yearly suffix), remove
  phonics (no school-level data on EES), match using `in` instead of `endswith`
- stg_ees_ks4: rewrite to filter long-format data and extract Attainment 8,
  Progress 8, EBacc, English/Maths metrics; join KS4 info for context
- stg_ees_admissions: map real CSV columns (total_number_places_offered, etc.)
- stg_ees_census: update source reference, stub with TODO for data columns
- Remove stg_ees_phonics, fact_phonics (no school-level EES data)
- Add ees_ks4_performance + ees_ks4_info sources, remove ees_ks4 + ees_phonics
- Update int_ks4_with_lineage + fact_ks4_performance with new KS4 columns
- Annual EES DAG: remove stg_ees_phonics+ from selector

Backend:
- models.py: replace all models with versions that point at marts.* tables,
  using schema='marts' (DimSchool, DimLocation, KS2Performance,
  FactOfstedInspection, etc.)
- data_loader.py: rewrite load_school_data_as_dataframe() using raw SQL joining
  dim_school + dim_location + fact_ks2_performance; update get_supplementary_data()
- database.py: remove migration machinery, keep only connection setup
- app.py: remove check_and_migrate_if_needed, remove /api/admin/reimport-ks2
  endpoints (pipeline handles all imports)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 09:29:27 +00:00
parent d82e36e7b2
commit ca351e9d73
18 changed files with 805 additions and 1245 deletions

View File

@@ -28,8 +28,6 @@ from .data_loader import (
get_supplementary_data,
)
from .data_loader import get_data_info as get_db_info
from .database import check_and_migrate_if_needed
from .migration import run_full_migration
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .utils import clean_for_json
@@ -138,20 +136,15 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]:
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan - startup and shutdown events."""
# Startup: check schema version and migrate if needed
print("Starting up: Checking database schema...")
check_and_migrate_if_needed()
print("Loading school data from database...")
print("Loading school data from marts...")
df = load_school_data()
if df.empty:
print("Warning: No data in database. Check CSV files in data/ folder.")
print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
else:
print(f"Data loaded successfully: {len(df)} records.")
yield # Application runs here
yield
# Shutdown: cleanup if needed
print("Shutting down...")
@@ -585,7 +578,7 @@ async def get_data_info(request: Request):
if db_info["total_schools"] == 0:
return {
"status": "no_data",
"message": "No data in database. Run the migration script: python scripts/migrate_csv_to_db.py",
"message": "No data in marts. Run the annual EES pipeline to load KS2 data.",
"data_source": "PostgreSQL",
}
@@ -635,56 +628,6 @@ async def reload_data(
return {"status": "reloaded"}
_reimport_status: dict = {"running": False, "done": False, "error": None}
@app.post("/api/admin/reimport-ks2")
@limiter.limit("2/minute")
async def reimport_ks2(
request: Request,
geocode: bool = True,
_: bool = Depends(verify_admin_api_key)
):
"""
Start a full KS2 CSV migration in the background and return immediately.
Poll GET /api/admin/reimport-ks2/status to check progress.
Pass ?geocode=false to skip postcode → lat/lng resolution.
Requires X-API-Key header with valid admin API key.
"""
global _reimport_status
if _reimport_status["running"]:
return {"status": "already_running"}
_reimport_status = {"running": True, "done": False, "error": None}
def _run():
global _reimport_status
try:
success = run_full_migration(geocode=geocode)
if not success:
_reimport_status = {"running": False, "done": False, "error": "No CSV data found"}
return
clear_cache()
load_school_data()
_reimport_status = {"running": False, "done": True, "error": None}
except Exception as exc:
_reimport_status = {"running": False, "done": False, "error": str(exc)}
import threading
threading.Thread(target=_run, daemon=True).start()
return {"status": "started"}
@app.get("/api/admin/reimport-ks2/status")
async def reimport_ks2_status(
request: Request,
_: bool = Depends(verify_admin_api_key)
):
"""Poll this endpoint to check reimport progress."""
s = _reimport_status
if s["error"]:
raise HTTPException(status_code=500, detail=s["error"])
return {"running": s["running"], "done": s["done"]}
# =============================================================================