Add automatic schema versioning with startup migration

On startup, the app now checks if the database schema version matches the code. If there's a mismatch or no version exists, it automatically runs a full data migration before starting. - Add backend/version.py with SCHEMA_VERSION constant - Add backend/migration.py with extracted migration logic - Add SchemaVersion model to track DB version - Add version check functions to database.py - Update app.py lifespan to use check_and_migrate_if_needed() - Simplify migrate_csv_to_db.py to use shared logic Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
2026-01-16 10:23:02 +00:00
parent 352eeec2db
commit f4919db3b9
6 changed files with 564 additions and 406 deletions
@@ -26,7 +26,7 @@ from .data_loader import (
    geocode_single_postcode,
 )
 from .data_loader import get_data_info as get_db_info
-from .database import init_db
+from .database import check_and_migrate_if_needed
 from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
 from .utils import clean_for_json
@@ -135,16 +135,16 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]:
@asynccontextmanager
 async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events."""
-    # Startup: initialize database and pre-load data
+    # Startup: check schema version and migrate if needed
-    print("Starting up: Initializing database...")
+    print("Starting up: Checking database schema...")
-    init_db()  # Ensure tables exist
+    check_and_migrate_if_needed()
    print("Loading school data from database...")
    df = load_school_data()
    if df.empty:
-        print("Warning: No data in database. Run the migration script to import data.")
+        print("Warning: No data in database. Check CSV files in data/ folder.")
    else:
-        print("Data loaded successfully.")
+        print(f"Data loaded successfully: {len(df)} records.")
    yield  # Application runs here
@@ -2,7 +2,10 @@
 Database connection setup using SQLAlchemy.
 """
-from sqlalchemy import create_engine
+from datetime import datetime
 from typing import Optional
 from sqlalchemy import create_engine, inspect
 from sqlalchemy.orm import sessionmaker, declarative_base
 from contextlib import contextmanager
@@ -65,3 +68,80 @@ def drop_db():
    """
    Base.metadata.drop_all(bind=engine)
 def get_db_schema_version() -> Optional[int]:
    """
    Get the current schema version from the database.
    Returns None if table doesn't exist or no version is set.
    """
    from .models import SchemaVersion  # Import here to avoid circular imports
    # Check if schema_version table exists
    inspector = inspect(engine)
    if "schema_version" not in inspector.get_table_names():
        return None
    try:
        with get_db_session() as db:
            row = db.query(SchemaVersion).first()
            return row.version if row else None
    except Exception:
        return None
 def set_db_schema_version(version: int):
    """
    Set/update the schema version in the database.
    Creates the row if it doesn't exist.
    """
    from .models import SchemaVersion
    with get_db_session() as db:
        row = db.query(SchemaVersion).first()
        if row:
            row.version = version
            row.migrated_at = datetime.utcnow()
        else:
            db.add(SchemaVersion(id=1, version=version, migrated_at=datetime.utcnow()))
 def check_and_migrate_if_needed():
    """
    Check schema version and run migration if needed.
    Called during application startup.
    """
    from .version import SCHEMA_VERSION
    from .migration import run_full_migration
    db_version = get_db_schema_version()
    if db_version == SCHEMA_VERSION:
        print(f"Schema version {SCHEMA_VERSION} matches. Fast startup.")
        # Still ensure tables exist (they should if version matches)
        init_db()
        return
    if db_version is None:
        print(f"No schema version found. Running initial migration (v{SCHEMA_VERSION})...")
    else:
        print(f"Schema mismatch: DB has v{db_version}, code expects v{SCHEMA_VERSION}")
        print("Running full migration...")
    try:
        success = run_full_migration(geocode=False)
        if success:
            # Ensure schema_version table exists before writing
            init_db()
            set_db_schema_version(SCHEMA_VERSION)
            print(f"Migration complete. Schema version set to {SCHEMA_VERSION}")
        else:
            print("Warning: Migration completed but no data was imported.")
            init_db()
            set_db_schema_version(SCHEMA_VERSION)
    except Exception as e:
        print(f"FATAL: Migration failed: {e}")
        print("Application cannot start. Please check database and CSV files.")
        raise
@@ -0,0 +1,413 @@
 """
 Database migration logic for importing CSV data.
 Used by both CLI script and automatic startup migration.
 """
 import re
 from pathlib import Path
 from typing import Dict, Optional
 import numpy as np
 import pandas as pd
 import requests
 from .config import settings
 from .database import Base, engine, get_db_session
 from .models import School, SchoolResult
 from .schemas import (
    COLUMN_MAPPINGS,
    LA_CODE_TO_NAME,
    NULL_VALUES,
    SCHOOL_TYPE_MAP,
 )
 def parse_numeric(value) -> Optional[float]:
    """Parse a numeric value, handling special cases."""
    if pd.isna(value):
        return None
    if isinstance(value, (int, float)):
        return float(value) if not np.isnan(value) else None
    str_val = str(value).strip().upper()
    if str_val in NULL_VALUES or str_val == "":
        return None
    # Remove percentage signs if present
    str_val = str_val.replace("%", "")
    try:
        return float(str_val)
    except ValueError:
        return None
 def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract year from folder name like '2023-2024'."""
    match = re.search(r"(\d{4})-(\d{4})", folder_name)
    if match:
        return int(match.group(2))
    match = re.search(r"(\d{4})", folder_name)
    if match:
        return int(match.group(1))
    return None
 def geocode_postcodes_bulk(postcodes: list) -> Dict[str, tuple]:
    """
    Geocode postcodes in bulk using postcodes.io API.
    Returns dict of postcode -> (latitude, longitude).
    """
    results = {}
    valid_postcodes = [
        p.strip().upper()
        for p in postcodes
        if p and isinstance(p, str) and len(p.strip()) >= 5
    ]
    valid_postcodes = list(set(valid_postcodes))
    if not valid_postcodes:
        return results
    batch_size = 100
    total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size
    for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)):
        batch = valid_postcodes[batch_start : batch_start + batch_size]
        print(
            f"    Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)..."
        )
        try:
            response = requests.post(
                "https://api.postcodes.io/postcodes",
                json={"postcodes": batch},
                timeout=30,
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get("result", []):
                    if item and item.get("result"):
                        pc = item["query"].upper()
                        lat = item["result"].get("latitude")
                        lon = item["result"].get("longitude")
                        if lat and lon:
                            results[pc] = (lat, lon)
        except Exception as e:
            print(f"    Warning: Geocoding batch failed: {e}")
    return results
 def load_csv_data(data_dir: Path) -> pd.DataFrame:
    """Load all CSV data from data directory."""
    all_data = []
    for folder in sorted(data_dir.iterdir()):
        if not folder.is_dir():
            continue
        year = extract_year_from_folder(folder.name)
        if not year:
            continue
        # Specifically look for the KS2 results file
        ks2_file = folder / "england_ks2final.csv"
        if not ks2_file.exists():
            continue
        csv_file = ks2_file
        print(f"  Loading {csv_file.name} (year {year})...")
        try:
            df = pd.read_csv(csv_file, encoding="latin-1", low_memory=False)
        except Exception as e:
            print(f"    Error loading {csv_file}: {e}")
            continue
        # Rename columns
        df.rename(columns=COLUMN_MAPPINGS, inplace=True)
        df["year"] = year
        # Handle local authority name
        la_name_cols = ["LANAME", "LA (name)", "LA_NAME", "LA NAME"]
        la_name_col = next((c for c in la_name_cols if c in df.columns), None)
        if la_name_col and la_name_col != "local_authority":
            df["local_authority"] = df[la_name_col]
        elif "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")
            df["local_authority"] = (
                df["local_authority_code"]
                .map(LA_CODE_TO_NAME)
                .fillna(df["LEA"].astype(str))
            )
        # Store LEA code
        if "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")
        # Map school type
        if "school_type_code" in df.columns:
            df["school_type"] = (
                df["school_type_code"]
                .map(SCHOOL_TYPE_MAP)
                .fillna(df["school_type_code"])
            )
        # Create combined address
        addr_parts = ["address1", "address2", "town", "postcode"]
        for col in addr_parts:
            if col not in df.columns:
                df[col] = None
        df["address"] = df.apply(
            lambda r: ", ".join(
                str(v)
                for v in [
                    r.get("address1"),
                    r.get("address2"),
                    r.get("town"),
                    r.get("postcode"),
                ]
                if pd.notna(v) and str(v).strip()
            ),
            axis=1,
        )
        all_data.append(df)
        print(f"    Loaded {len(df)} records")
    if all_data:
        result = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(result)}")
        print(f"Unique schools: {result['urn'].nunique()}")
        print(f"Years: {sorted(result['year'].unique())}")
        return result
    return pd.DataFrame()
 def migrate_data(df: pd.DataFrame, geocode: bool = False):
    """Migrate DataFrame data to database."""
    # Clean URN column - convert to integer, drop invalid values
    df = df.copy()
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)
    # Group by URN to get unique schools (use latest year's data)
    school_data = (
        df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
    )
    print(f"\nMigrating {len(school_data)} unique schools...")
    # Geocode if requested
    geocoded = {}
    if geocode and "postcode" in df.columns:
        print("\nGeocoding postcodes...")
        postcodes = df["postcode"].dropna().unique().tolist()
        geocoded = geocode_postcodes_bulk(postcodes)
        print(f"  Successfully geocoded {len(geocoded)} postcodes")
    with get_db_session() as db:
        # Create schools
        urn_to_school_id = {}
        schools_created = 0
        for _, row in school_data.iterrows():
            # Safely parse URN - handle None, NaN, whitespace, and invalid values
            urn_val = row.get("urn")
            urn = None
            if pd.notna(urn_val):
                try:
                    urn_str = str(urn_val).strip()
                    if urn_str:
                        urn = int(float(urn_str))  # Handle "12345.0" format
                except (ValueError, TypeError):
                    pass
            if not urn:
                continue
            # Skip if we've already added this URN (handles duplicates in source data)
            if urn in urn_to_school_id:
                continue
            # Get geocoding data
            postcode = row.get("postcode")
            lat, lon = None, None
            if postcode and pd.notna(postcode):
                coords = geocoded.get(str(postcode).strip().upper())
                if coords:
                    lat, lon = coords
            # Safely parse local_authority_code
            la_code = None
            la_code_val = row.get("local_authority_code")
            if pd.notna(la_code_val):
                try:
                    la_code_str = str(la_code_val).strip()
                    if la_code_str:
                        la_code = int(float(la_code_str))
                except (ValueError, TypeError):
                    pass
            school = School(
                urn=urn,
                school_name=row.get("school_name")
                if pd.notna(row.get("school_name"))
                else "Unknown",
                local_authority=row.get("local_authority")
                if pd.notna(row.get("local_authority"))
                else None,
                local_authority_code=la_code,
                school_type=row.get("school_type")
                if pd.notna(row.get("school_type"))
                else None,
                school_type_code=row.get("school_type_code")
                if pd.notna(row.get("school_type_code"))
                else None,
                religious_denomination=row.get("religious_denomination")
                if pd.notna(row.get("religious_denomination"))
                else None,
                age_range=row.get("age_range")
                if pd.notna(row.get("age_range"))
                else None,
                address1=row.get("address1") if pd.notna(row.get("address1")) else None,
                address2=row.get("address2") if pd.notna(row.get("address2")) else None,
                town=row.get("town") if pd.notna(row.get("town")) else None,
                postcode=row.get("postcode") if pd.notna(row.get("postcode")) else None,
                latitude=lat,
                longitude=lon,
            )
            db.add(school)
            db.flush()  # Get the ID
            urn_to_school_id[urn] = school.id
            schools_created += 1
            if schools_created % 1000 == 0:
                print(f"  Created {schools_created} schools...")
        print(f"  Created {schools_created} schools")
        # Create results
        print(f"\nMigrating {len(df)} yearly results...")
        results_created = 0
        for _, row in df.iterrows():
            # Safely parse URN
            urn_val = row.get("urn")
            urn = None
            if pd.notna(urn_val):
                try:
                    urn_str = str(urn_val).strip()
                    if urn_str:
                        urn = int(float(urn_str))
                except (ValueError, TypeError):
                    pass
            if not urn or urn not in urn_to_school_id:
                continue
            school_id = urn_to_school_id[urn]
            # Safely parse year
            year_val = row.get("year")
            year = None
            if pd.notna(year_val):
                try:
                    year = int(float(str(year_val).strip()))
                except (ValueError, TypeError):
                    pass
            if not year:
                continue
            result = SchoolResult(
                school_id=school_id,
                year=year,
                total_pupils=parse_numeric(row.get("total_pupils")),
                eligible_pupils=parse_numeric(row.get("eligible_pupils")),
                # Expected Standard
                rwm_expected_pct=parse_numeric(row.get("rwm_expected_pct")),
                reading_expected_pct=parse_numeric(row.get("reading_expected_pct")),
                writing_expected_pct=parse_numeric(row.get("writing_expected_pct")),
                maths_expected_pct=parse_numeric(row.get("maths_expected_pct")),
                gps_expected_pct=parse_numeric(row.get("gps_expected_pct")),
                science_expected_pct=parse_numeric(row.get("science_expected_pct")),
                # Higher Standard
                rwm_high_pct=parse_numeric(row.get("rwm_high_pct")),
                reading_high_pct=parse_numeric(row.get("reading_high_pct")),
                writing_high_pct=parse_numeric(row.get("writing_high_pct")),
                maths_high_pct=parse_numeric(row.get("maths_high_pct")),
                gps_high_pct=parse_numeric(row.get("gps_high_pct")),
                # Progress
                reading_progress=parse_numeric(row.get("reading_progress")),
                writing_progress=parse_numeric(row.get("writing_progress")),
                maths_progress=parse_numeric(row.get("maths_progress")),
                # Averages
                reading_avg_score=parse_numeric(row.get("reading_avg_score")),
                maths_avg_score=parse_numeric(row.get("maths_avg_score")),
                gps_avg_score=parse_numeric(row.get("gps_avg_score")),
                # Context
                disadvantaged_pct=parse_numeric(row.get("disadvantaged_pct")),
                eal_pct=parse_numeric(row.get("eal_pct")),
                sen_support_pct=parse_numeric(row.get("sen_support_pct")),
                sen_ehcp_pct=parse_numeric(row.get("sen_ehcp_pct")),
                stability_pct=parse_numeric(row.get("stability_pct")),
                # Absence
                reading_absence_pct=parse_numeric(row.get("reading_absence_pct")),
                gps_absence_pct=parse_numeric(row.get("gps_absence_pct")),
                maths_absence_pct=parse_numeric(row.get("maths_absence_pct")),
                writing_absence_pct=parse_numeric(row.get("writing_absence_pct")),
                science_absence_pct=parse_numeric(row.get("science_absence_pct")),
                # Gender
                rwm_expected_boys_pct=parse_numeric(row.get("rwm_expected_boys_pct")),
                rwm_expected_girls_pct=parse_numeric(row.get("rwm_expected_girls_pct")),
                rwm_high_boys_pct=parse_numeric(row.get("rwm_high_boys_pct")),
                rwm_high_girls_pct=parse_numeric(row.get("rwm_high_girls_pct")),
                # Disadvantaged
                rwm_expected_disadvantaged_pct=parse_numeric(
                    row.get("rwm_expected_disadvantaged_pct")
                ),
                rwm_expected_non_disadvantaged_pct=parse_numeric(
                    row.get("rwm_expected_non_disadvantaged_pct")
                ),
                disadvantaged_gap=parse_numeric(row.get("disadvantaged_gap")),
                # 3-Year
                rwm_expected_3yr_pct=parse_numeric(row.get("rwm_expected_3yr_pct")),
                reading_avg_3yr=parse_numeric(row.get("reading_avg_3yr")),
                maths_avg_3yr=parse_numeric(row.get("maths_avg_3yr")),
            )
            db.add(result)
            results_created += 1
            if results_created % 10000 == 0:
                print(f"  Created {results_created} results...")
                db.flush()
        print(f"  Created {results_created} results")
        # Commit all changes
        db.commit()
        print("\nMigration complete!")
 def run_full_migration(geocode: bool = False) -> bool:
    """
    Run a complete migration: drop all tables and reimport from CSV.
    Returns True if successful, False if no data found.
    Raises exception on error.
    """
    print("Dropping existing tables...")
    Base.metadata.drop_all(bind=engine)
    print("Creating tables...")
    Base.metadata.create_all(bind=engine)
    print("\nLoading CSV data...")
    df = load_csv_data(settings.data_dir)
    if df.empty:
        print("Warning: No CSV data found to migrate!")
        return False
    migrate_data(df, geocode=geocode)
    return True
@@ -3,9 +3,11 @@ SQLAlchemy database models for school data.
 Normalized schema with separate tables for schools and yearly results.
 """
 from datetime import datetime
 from sqlalchemy import (
    Column, Integer, String, Float, ForeignKey, Index, UniqueConstraint,
-    Text, Boolean
+    Text, Boolean, DateTime
 )
 from sqlalchemy.orm import relationship
 from .database import Base
@@ -133,6 +135,21 @@ class SchoolResult(Base):
        return f"<SchoolResult(school_id={self.school_id}, year={self.year})>"
 class SchemaVersion(Base):
    """
    Tracks database schema version for automatic migrations.
    Single-row table that stores the current schema version.
    """
    __tablename__ = "schema_version"
    id = Column(Integer, primary_key=True, default=1)
    version = Column(Integer, nullable=False)
    migrated_at = Column(DateTime, default=datetime.utcnow, onupdate=datetime.utcnow)
    def __repr__(self):
        return f"<SchemaVersion(version={self.version}, migrated_at={self.migrated_at})>"
 # Mapping from CSV columns to model fields
 SCHOOL_FIELD_MAPPING = {
    'urn': 'urn',
@@ -0,0 +1,22 @@
 """
 Schema versioning for database migrations.
 HOW TO USE:
 - Bump SCHEMA_VERSION when making changes to database models
 - This triggers an automatic full data reimport on next app startup
 WHEN TO BUMP:
 - Adding/removing columns in models.py
 - Changing column types or constraints
 - Modifying CSV column mappings in schemas.py
 - Any change that requires fresh data import
 """
 # Current schema version - increment when models change
 SCHEMA_VERSION = 2
 # Changelog for documentation
 SCHEMA_CHANGELOG = {
    1: "Initial schema with School and SchoolResult tables",
    2: "Added pupil absence fields (reading, maths, gps, writing, science)",
 }
@@ -1,16 +1,15 @@
 #!/usr/bin/env python3
 """
-Migration script to import CSV data into PostgreSQL database.
+CLI script for manual database migration.
 Usage:
    python scripts/migrate_csv_to_db.py [--drop] [--geocode]
 Options:
-    --drop      Drop existing tables before migration
+    --drop      Drop existing tables before migration (full reimport)
    --geocode   Geocode postcodes (requires network access)
 """
 import os
 import sys
 from pathlib import Path
@@ -18,389 +17,11 @@ from pathlib import Path
 sys.path.insert(0, str(Path(__file__).parent.parent))
 import argparse
 import re
 from typing import Dict, Optional
 import numpy as np
 import pandas as pd
 import requests
 from backend.config import settings
-from backend.database import Base, engine, get_db_session
+from backend.database import Base, engine, init_db, set_db_schema_version
-from backend.models import (
+from backend.migration import load_csv_data, migrate_data, run_full_migration
-    RESULT_FIELD_MAPPING,
+from backend.version import SCHEMA_VERSION
    SCHOOL_FIELD_MAPPING,
    School,
    SchoolResult,
 )
 from backend.schemas import (
    COLUMN_MAPPINGS,
    LA_CODE_TO_NAME,
    NULL_VALUES,
    NUMERIC_COLUMNS,
    SCHOOL_TYPE_MAP,
 )
 def parse_numeric(value) -> Optional[float]:
    """Parse a numeric value, handling special cases."""
    if pd.isna(value):
        return None
    if isinstance(value, (int, float)):
        return float(value) if not np.isnan(value) else None
    str_val = str(value).strip().upper()
    if str_val in NULL_VALUES or str_val == "":
        return None
    # Remove percentage signs if present
    str_val = str_val.replace("%", "")
    try:
        return float(str_val)
    except ValueError:
        return None
 def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract year from folder name like '2023-2024'."""
    match = re.search(r"(\d{4})-(\d{4})", folder_name)
    if match:
        return int(match.group(2))
    match = re.search(r"(\d{4})", folder_name)
    if match:
        return int(match.group(1))
    return None
 def geocode_postcodes_bulk(postcodes: list) -> Dict[str, tuple]:
    """
    Geocode postcodes in bulk using postcodes.io API.
    Returns dict of postcode -> (latitude, longitude).
    """
    results = {}
    valid_postcodes = [
        p.strip().upper()
        for p in postcodes
        if p and isinstance(p, str) and len(p.strip()) >= 5
    ]
    valid_postcodes = list(set(valid_postcodes))
    if not valid_postcodes:
        return results
    batch_size = 100
    total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size
    for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)):
        batch = valid_postcodes[batch_start : batch_start + batch_size]
        print(
            f"    Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)..."
        )
        try:
            response = requests.post(
                "https://api.postcodes.io/postcodes",
                json={"postcodes": batch},
                timeout=30,
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get("result", []):
                    if item and item.get("result"):
                        pc = item["query"].upper()
                        lat = item["result"].get("latitude")
                        lon = item["result"].get("longitude")
                        if lat and lon:
                            results[pc] = (lat, lon)
        except Exception as e:
            print(f"    Warning: Geocoding batch failed: {e}")
    return results
 def load_csv_data(data_dir: Path) -> pd.DataFrame:
    """Load all CSV data from data directory."""
    all_data = []
    for folder in sorted(data_dir.iterdir()):
        if not folder.is_dir():
            continue
        year = extract_year_from_folder(folder.name)
        if not year:
            continue
        # Specifically look for the KS2 results file
        ks2_file = folder / "england_ks2final.csv"
        if not ks2_file.exists():
            continue
        csv_file = ks2_file
        print(f"  Loading {csv_file.name} (year {year})...")
        try:
            df = pd.read_csv(csv_file, encoding="latin-1", low_memory=False)
        except Exception as e:
            print(f"    Error loading {csv_file}: {e}")
            continue
        # Rename columns
        df.rename(columns=COLUMN_MAPPINGS, inplace=True)
        df["year"] = year
        # Handle local authority name
        la_name_cols = ["LANAME", "LA (name)", "LA_NAME", "LA NAME"]
        la_name_col = next((c for c in la_name_cols if c in df.columns), None)
        if la_name_col and la_name_col != "local_authority":
            df["local_authority"] = df[la_name_col]
        elif "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")
            df["local_authority"] = (
                df["local_authority_code"]
                .map(LA_CODE_TO_NAME)
                .fillna(df["LEA"].astype(str))
            )
        # Store LEA code
        if "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")
        # Map school type
        if "school_type_code" in df.columns:
            df["school_type"] = (
                df["school_type_code"]
                .map(SCHOOL_TYPE_MAP)
                .fillna(df["school_type_code"])
            )
        # Create combined address
        addr_parts = ["address1", "address2", "town", "postcode"]
        for col in addr_parts:
            if col not in df.columns:
                df[col] = None
        df["address"] = df.apply(
            lambda r: ", ".join(
                str(v)
                for v in [
                    r.get("address1"),
                    r.get("address2"),
                    r.get("town"),
                    r.get("postcode"),
                ]
                if pd.notna(v) and str(v).strip()
            ),
            axis=1,
        )
        all_data.append(df)
        print(f"    Loaded {len(df)} records")
    if all_data:
        result = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(result)}")
        print(f"Unique schools: {result['urn'].nunique()}")
        print(f"Years: {sorted(result['year'].unique())}")
        return result
    return pd.DataFrame()
 def migrate_data(df: pd.DataFrame, geocode: bool = False):
    """Migrate DataFrame data to database."""
    # Clean URN column - convert to integer, drop invalid values
    df = df.copy()
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)
    # Group by URN to get unique schools (use latest year's data)
    school_data = (
        df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
    )
    print(f"\nMigrating {len(school_data)} unique schools...")
    # Geocode if requested
    geocoded = {}
    if geocode and "postcode" in df.columns:
        print("\nGeocoding postcodes...")
        postcodes = df["postcode"].dropna().unique().tolist()
        geocoded = geocode_postcodes_bulk(postcodes)
        print(f"  Successfully geocoded {len(geocoded)} postcodes")
    with get_db_session() as db:
        # Create schools
        urn_to_school_id = {}
        schools_created = 0
        for _, row in school_data.iterrows():
            # Safely parse URN - handle None, NaN, whitespace, and invalid values
            urn_val = row.get("urn")
            urn = None
            if pd.notna(urn_val):
                try:
                    urn_str = str(urn_val).strip()
                    if urn_str:
                        urn = int(float(urn_str))  # Handle "12345.0" format
                except (ValueError, TypeError):
                    pass
            if not urn:
                continue
            # Skip if we've already added this URN (handles duplicates in source data)
            if urn in urn_to_school_id:
                continue
            # Get geocoding data
            postcode = row.get("postcode")
            lat, lon = None, None
            if postcode and pd.notna(postcode):
                coords = geocoded.get(str(postcode).strip().upper())
                if coords:
                    lat, lon = coords
            # Safely parse local_authority_code
            la_code = None
            la_code_val = row.get("local_authority_code")
            if pd.notna(la_code_val):
                try:
                    la_code_str = str(la_code_val).strip()
                    if la_code_str:
                        la_code = int(float(la_code_str))
                except (ValueError, TypeError):
                    pass
            school = School(
                urn=urn,
                school_name=row.get("school_name")
                if pd.notna(row.get("school_name"))
                else "Unknown",
                local_authority=row.get("local_authority")
                if pd.notna(row.get("local_authority"))
                else None,
                local_authority_code=la_code,
                school_type=row.get("school_type")
                if pd.notna(row.get("school_type"))
                else None,
                school_type_code=row.get("school_type_code")
                if pd.notna(row.get("school_type_code"))
                else None,
                religious_denomination=row.get("religious_denomination")
                if pd.notna(row.get("religious_denomination"))
                else None,
                age_range=row.get("age_range")
                if pd.notna(row.get("age_range"))
                else None,
                address1=row.get("address1") if pd.notna(row.get("address1")) else None,
                address2=row.get("address2") if pd.notna(row.get("address2")) else None,
                town=row.get("town") if pd.notna(row.get("town")) else None,
                postcode=row.get("postcode") if pd.notna(row.get("postcode")) else None,
                latitude=lat,
                longitude=lon,
            )
            db.add(school)
            db.flush()  # Get the ID
            urn_to_school_id[urn] = school.id
            schools_created += 1
            if schools_created % 1000 == 0:
                print(f"  Created {schools_created} schools...")
        print(f"  Created {schools_created} schools")
        # Create results
        print(f"\nMigrating {len(df)} yearly results...")
        results_created = 0
        for _, row in df.iterrows():
            # Safely parse URN
            urn_val = row.get("urn")
            urn = None
            if pd.notna(urn_val):
                try:
                    urn_str = str(urn_val).strip()
                    if urn_str:
                        urn = int(float(urn_str))
                except (ValueError, TypeError):
                    pass
            if not urn or urn not in urn_to_school_id:
                continue
            school_id = urn_to_school_id[urn]
            # Safely parse year
            year_val = row.get("year")
            year = None
            if pd.notna(year_val):
                try:
                    year = int(float(str(year_val).strip()))
                except (ValueError, TypeError):
                    pass
            if not year:
                continue
            result = SchoolResult(
                school_id=school_id,
                year=year,
                total_pupils=parse_numeric(row.get("total_pupils")),
                eligible_pupils=parse_numeric(row.get("eligible_pupils")),
                # Expected Standard
                rwm_expected_pct=parse_numeric(row.get("rwm_expected_pct")),
                reading_expected_pct=parse_numeric(row.get("reading_expected_pct")),
                writing_expected_pct=parse_numeric(row.get("writing_expected_pct")),
                maths_expected_pct=parse_numeric(row.get("maths_expected_pct")),
                gps_expected_pct=parse_numeric(row.get("gps_expected_pct")),
                science_expected_pct=parse_numeric(row.get("science_expected_pct")),
                # Higher Standard
                rwm_high_pct=parse_numeric(row.get("rwm_high_pct")),
                reading_high_pct=parse_numeric(row.get("reading_high_pct")),
                writing_high_pct=parse_numeric(row.get("writing_high_pct")),
                maths_high_pct=parse_numeric(row.get("maths_high_pct")),
                gps_high_pct=parse_numeric(row.get("gps_high_pct")),
                # Progress
                reading_progress=parse_numeric(row.get("reading_progress")),
                writing_progress=parse_numeric(row.get("writing_progress")),
                maths_progress=parse_numeric(row.get("maths_progress")),
                # Averages
                reading_avg_score=parse_numeric(row.get("reading_avg_score")),
                maths_avg_score=parse_numeric(row.get("maths_avg_score")),
                gps_avg_score=parse_numeric(row.get("gps_avg_score")),
                # Context
                disadvantaged_pct=parse_numeric(row.get("disadvantaged_pct")),
                eal_pct=parse_numeric(row.get("eal_pct")),
                sen_support_pct=parse_numeric(row.get("sen_support_pct")),
                sen_ehcp_pct=parse_numeric(row.get("sen_ehcp_pct")),
                stability_pct=parse_numeric(row.get("stability_pct")),
                # Gender
                rwm_expected_boys_pct=parse_numeric(row.get("rwm_expected_boys_pct")),
                rwm_expected_girls_pct=parse_numeric(row.get("rwm_expected_girls_pct")),
                rwm_high_boys_pct=parse_numeric(row.get("rwm_high_boys_pct")),
                rwm_high_girls_pct=parse_numeric(row.get("rwm_high_girls_pct")),
                # Disadvantaged
                rwm_expected_disadvantaged_pct=parse_numeric(
                    row.get("rwm_expected_disadvantaged_pct")
                ),
                rwm_expected_non_disadvantaged_pct=parse_numeric(
                    row.get("rwm_expected_non_disadvantaged_pct")
                ),
                disadvantaged_gap=parse_numeric(row.get("disadvantaged_gap")),
                # 3-Year
                rwm_expected_3yr_pct=parse_numeric(row.get("rwm_expected_3yr_pct")),
                reading_avg_3yr=parse_numeric(row.get("reading_avg_3yr")),
                maths_avg_3yr=parse_numeric(row.get("maths_avg_3yr")),
            )
            db.add(result)
            results_created += 1
            if results_created % 10000 == 0:
                print(f"  Created {results_created} results...")
                db.flush()
        print(f"  Created {results_created} results")
        # Commit all changes
        db.commit()
        print("\nMigration complete!")
 def main():
@@ -418,24 +39,29 @@ def main():
    print("=" * 60)
    print(f"\nDatabase: {settings.database_url.split('@')[-1]}")
    print(f"Data directory: {settings.data_dir}")
    print(f"Target schema version: {SCHEMA_VERSION}")
    if args.drop:
-        print("\n⚠️  Dropping existing tables...")
+        print("\nRunning full migration (drop + reimport)...")
-        Base.metadata.drop_all(bind=engine)
+        success = run_full_migration(geocode=args.geocode)
-
+    else:
-    print("\nCreating tables...")
+        print("\nCreating tables (preserving existing data)...")
-    Base.metadata.create_all(bind=engine)
+        init_db()
        print("\nLoading CSV data...")
        df = load_csv_data(settings.data_dir)
        if df.empty:
            print("No data found to migrate!")
            return 1
        migrate_data(df, geocode=args.geocode)
        success = True
-    return 0
+    if success:
        # Ensure schema_version table exists
        init_db()
        set_db_schema_version(SCHEMA_VERSION)
        print(f"\nSchema version set to {SCHEMA_VERSION}")
    return 0 if success else 1
 if __name__ == "__main__":