Refactoring and bug fixes

2026-01-06 16:30:32 +00:00
parent 0ea4720ac1
commit 54e4bc2e77
11 changed files with 1246 additions and 534 deletions
@@ -0,0 +1,2 @@
+# Backend package
+
@@ -4,284 +4,83 @@ Serves primary school (KS2) performance data for comparing schools.
 Uses real data from UK Government Compare School Performance downloads.
 """

+from contextlib import asynccontextmanager
 from fastapi import FastAPI, HTTPException, Query
 from fastapi.staticfiles import StaticFiles
 from fastapi.responses import FileResponse
 from fastapi.middleware.cors import CORSMiddleware
-import pandas as pd
-import numpy as np
-from pathlib import Path
 from typing import Optional
-import os
-import re

-# No longer filtering by specific LA codes - load all available schools
+from .config import settings
+from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
+from .data_loader import load_school_data, clear_cache
+from .utils import clean_for_json
+
+
+@asynccontextmanager
+async def lifespan(app: FastAPI):
+    """
+    Application lifespan - startup and shutdown events.
+
+    On startup this mounts the frontend directory at ``/static`` (when the
+    directory exists) and pre-loads the school data. Control is then yielded
+    to the running application; code after the ``yield`` runs on shutdown.
+
+    Args:
+        app: The FastAPI application being started.
+    """
+    # Static files are mounted here instead of in an ``on_event("startup")``
+    # handler: once a ``lifespan`` is passed to FastAPI, startup/shutdown
+    # event handlers are no longer called, so this is the only startup hook
+    # that runs. The name check keeps the mount from being added twice.
+    already_mounted = any(getattr(route, "name", None) == "static" for route in app.routes)
+    if not already_mounted and settings.frontend_dir.exists():
+        app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")
+
+    # Startup: pre-load data
+    print("Starting up: Loading school data...")
+    load_school_data()
+    print("Data loaded successfully.")
+
+    yield  # Application runs here
+
+    # Shutdown: cleanup if needed
+    print("Shutting down...")
+

 app = FastAPI(
    title="SchoolCompare API",
    description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
-    version="1.0.0"
+    version="2.0.0",
+    lifespan=lifespan,
 )

-# CORS middleware for development
+# CORS middleware with configurable origins
 app.add_middleware(
    CORSMiddleware,
-    allow_origins=["*"],
+    allow_origins=settings.allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
 )

-# Data directory
-DATA_DIR = Path(__file__).parent.parent / "data"
-FRONTEND_DIR = Path(__file__).parent.parent / "frontend"
-
-# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
-_data_cache: Optional[pd.DataFrame] = None
-
-
-def convert_to_native(value):
-    """Convert numpy types to native Python types for JSON serialization."""
-    if pd.isna(value):
-        return None
-    if isinstance(value, (np.integer,)):
-        return int(value)
-    if isinstance(value, (np.floating,)):
-        if np.isnan(value) or np.isinf(value):
-            return None
-        return float(value)
-    if isinstance(value, np.ndarray):
-        return value.tolist()
-    if value == "SUPP" or value == "NE" or value == "NA" or value == "NP":
-        return None
-    return value
-
-
-def clean_for_json(df: pd.DataFrame) -> list:
-    """Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
-    records = df.to_dict(orient="records")
-    cleaned = []
-    for record in records:
-        clean_record = {}
-        for key, value in record.items():
-            clean_record[key] = convert_to_native(value)
-        cleaned.append(clean_record)
-    return cleaned
-
-
-def parse_numeric(value):
-    """Parse a value to numeric, handling SUPP, NE, NA, %, etc."""
-    if pd.isna(value):
-        return None
-    if isinstance(value, (int, float)):
-        if np.isnan(value) or np.isinf(value):
-            return None
-        return value
-    if isinstance(value, str):
-        value = value.strip()
-        if value in ["SUPP", "NE", "NA", "NP", "NEW", "LOW", ""]:
-            return None
-        # Remove % sign if present
-        if value.endswith('%'):
-            value = value[:-1]
-        try:
-            return float(value)
-        except ValueError:
-            return None
-    return None
-
-
-def extract_year_from_folder(folder_name: str) -> Optional[int]:
-    """Extract the end year from folder name like '2023-2024' -> 2024."""
-    match = re.search(r'(\d{4})-(\d{4})', folder_name)
-    if match:
-        return int(match.group(2))
-    return None
-
-
-def load_school_data() -> pd.DataFrame:
-    """Load and combine all school data from CSV files in year folders."""
-    global _data_cache
-    
-    if _data_cache is not None:
-        return _data_cache
-    
-    all_data = []
-    
-    # Look for year folders in data directory
-    if DATA_DIR.exists():
-        for year_folder in DATA_DIR.iterdir():
-            if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
-                year = extract_year_from_folder(year_folder.name)
-                if year is None:
-                    continue
-                
-                # Look for KS2 data file
-                ks2_file = year_folder / "england_ks2final.csv"
-                if ks2_file.exists():
-                    try:
-                        print(f"Loading data from {ks2_file}")
-                        df = pd.read_csv(ks2_file, low_memory=False)
-                        
-                        # Handle both string and integer columns
-                        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
-                            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
-                        if 'URN' in df.columns and df['URN'].dtype == 'object':
-                            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
-                        
-                        # Filter to schools only (RECTYPE == 1 means school level data)
-                        if 'RECTYPE' in df.columns:
-                            df = df[df['RECTYPE'] == 1]
-                        
-                        # Add year and local authority name from LANAME column
-                        df['year'] = year
-                        if 'LANAME' in df.columns:
-                            df['local_authority'] = df['LANAME']
-                        elif 'LEA' in df.columns:
-                            df['local_authority'] = df['LEA'].astype(str)
-                        
-                        # Standardize column names for our API
-                        df = df.rename(columns={
-                            'URN': 'urn',
-                            'SCHNAME': 'school_name',
-                            'ADDRESS1': 'address1',
-                            'ADDRESS2': 'address2',
-                            'TOWN': 'town',
-                            'PCODE': 'postcode',
-                            'NFTYPE': 'school_type_code',
-                            'RELDENOM': 'religious_denomination',
-                            'AGERANGE': 'age_range',
-                            'TOTPUPS': 'total_pupils',
-                            'TELIG': 'eligible_pupils',
-                            # Core KS2 metrics
-                            'PTRWM_EXP': 'rwm_expected_pct',
-                            'PTRWM_HIGH': 'rwm_high_pct',
-                            'READPROG': 'reading_progress',
-                            'WRITPROG': 'writing_progress',
-                            'MATPROG': 'maths_progress',
-                            'PTREAD_EXP': 'reading_expected_pct',
-                            'PTWRITTA_EXP': 'writing_expected_pct',
-                            'PTMAT_EXP': 'maths_expected_pct',
-                            'READ_AVERAGE': 'reading_avg_score',
-                            'MAT_AVERAGE': 'maths_avg_score',
-                            'PTREAD_HIGH': 'reading_high_pct',
-                            'PTWRITTA_HIGH': 'writing_high_pct',
-                            'PTMAT_HIGH': 'maths_high_pct',
-                            # GPS (Grammar, Punctuation & Spelling)
-                            'PTGPS_EXP': 'gps_expected_pct',
-                            'PTGPS_HIGH': 'gps_high_pct',
-                            'GPS_AVERAGE': 'gps_avg_score',
-                            # Science
-                            'PTSCITA_EXP': 'science_expected_pct',
-                            # School context
-                            'PTFSM6CLA1A': 'disadvantaged_pct',
-                            'PTEALGRP2': 'eal_pct',
-                            'PSENELK': 'sen_support_pct',
-                            'PSENELE': 'sen_ehcp_pct',
-                            'PTMOBN': 'stability_pct',
-                            # Gender breakdown
-                            'PTRWM_EXP_B': 'rwm_expected_boys_pct',
-                            'PTRWM_EXP_G': 'rwm_expected_girls_pct',
-                            'PTRWM_HIGH_B': 'rwm_high_boys_pct',
-                            'PTRWM_HIGH_G': 'rwm_high_girls_pct',
-                            # Disadvantaged performance
-                            'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
-                            'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
-                            'DIFFN_RWM_EXP': 'disadvantaged_gap',
-                            # 3-year averages
-                            'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
-                            'READ_AVERAGE_3YR': 'reading_avg_3yr',
-                            'MAT_AVERAGE_3YR': 'maths_avg_3yr',
-                        })
-                        
-                        # Create address field
-                        def make_address(row):
-                            parts = []
-                            if pd.notna(row.get('address1')) and row.get('address1'):
-                                parts.append(str(row['address1']))
-                            if pd.notna(row.get('town')) and row.get('town'):
-                                parts.append(str(row['town']))
-                            if pd.notna(row.get('postcode')) and row.get('postcode'):
-                                parts.append(str(row['postcode']))
-                            return ', '.join(parts) if parts else ''
-                        
-                        df['address'] = df.apply(make_address, axis=1)
-                        
-                        # Map school type codes to names
-                        school_type_map = {
-                            'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
-                            'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
-                            'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
-                        }
-                        df['school_type'] = df['school_type_code'].map(school_type_map).fillna('Other')
-                        
-                        # Parse numeric columns
-                        numeric_cols = [
-                            # Core metrics
-                            'rwm_expected_pct', 'rwm_high_pct', 'reading_progress', 
-                            'writing_progress', 'maths_progress', 'reading_expected_pct',
-                            'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
-                            'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
-                            # GPS & Science
-                            'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
-                            # School context
-                            'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
-                            'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
-                            # Gender breakdown
-                            'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
-                            'rwm_high_boys_pct', 'rwm_high_girls_pct',
-                            # Disadvantaged performance
-                            'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
-                            # 3-year averages
-                            'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
-                        ]
-                        
-                        for col in numeric_cols:
-                            if col in df.columns:
-                                df[col] = df[col].apply(parse_numeric)
-                        
-                        all_data.append(df)
-                        print(f"  Loaded {len(df)} schools for year {year}")
-                        
-                    except Exception as e:
-                        print(f"Error loading {ks2_file}: {e}")
-    
-    if all_data:
-        _data_cache = pd.concat(all_data, ignore_index=True)
-        print(f"\nTotal records loaded: {len(_data_cache)}")
-        print(f"Unique schools: {_data_cache['urn'].nunique()}")
-        print(f"Years: {sorted(_data_cache['year'].unique())}")
-    else:
-        print("No data files found. Creating empty DataFrame.")
-        _data_cache = pd.DataFrame()
-    
-    return _data_cache
-

@app.get("/")
 async def root():
    """Serve the frontend."""
-    return FileResponse(FRONTEND_DIR / "index.html")
+    return FileResponse(settings.frontend_dir / "index.html")


@app.get("/api/schools")
 async def get_schools(
    search: Optional[str] = Query(None, description="Search by school name"),
-    local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
+    local_authority: Optional[str] = Query(None, description="Filter by local authority"),
    school_type: Optional[str] = Query(None, description="Filter by school type"),
+    page: int = Query(1, ge=1, description="Page number"),
+    page_size: int = Query(None, ge=1, le=100, description="Results per page"),
 ):
-    """Get list of unique primary schools in Wandsworth and Merton."""
+    """
+    Get list of unique primary schools with pagination.
+    
+    Returns paginated results with total count for efficient loading.
+    """
    df = load_school_data()
    
    if df.empty:
-        return {"schools": []}
+        return {"schools": [], "total": 0, "page": page, "page_size": 0}
+    
+    # Use configured default if not specified
+    if page_size is None:
+        page_size = settings.default_page_size
    
    # Get unique schools (latest year data for each)
    latest_year = df.groupby('urn')['year'].max().reset_index()
    df_latest = df.merge(latest_year, on=['urn', 'year'])
    
-    school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
-    available_cols = [c for c in school_cols if c in df_latest.columns]
+    available_cols = [c for c in SCHOOL_COLUMNS if c in df_latest.columns]
    schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
    
    # Apply filters
@@ -298,7 +97,19 @@ async def get_schools(
    if school_type:
        schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
    
-    return {"schools": clean_for_json(schools_df)}
+    # Pagination
+    total = len(schools_df)
+    start_idx = (page - 1) * page_size
+    end_idx = start_idx + page_size
+    schools_df = schools_df.iloc[start_idx:end_idx]
+    
+    return {
+        "schools": clean_for_json(schools_df),
+        "total": total,
+        "page": page,
+        "page_size": page_size,
+        "total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
+    }


@app.get("/api/schools/{urn}")
@@ -390,56 +201,18 @@ async def get_filter_options():

@app.get("/api/metrics")
 async def get_available_metrics():
-    """Get list of available KS2 performance metrics for primary schools."""
+    """
+    Get list of available KS2 performance metrics for primary schools.
+    
+    This is the single source of truth for metric definitions.
+    Frontend should consume this to avoid duplication.
+    """
    df = load_school_data()
    
-    # Define KS2 metric metadata organized by category
-    metric_info = {
-        # Expected Standard
-        "rwm_expected_pct": {"name": "RWM Combined %", "description": "% meeting expected standard in reading, writing and maths", "type": "percentage", "category": "expected"},
-        "reading_expected_pct": {"name": "Reading Expected %", "description": "% meeting expected standard in reading", "type": "percentage", "category": "expected"},
-        "writing_expected_pct": {"name": "Writing Expected %", "description": "% meeting expected standard in writing", "type": "percentage", "category": "expected"},
-        "maths_expected_pct": {"name": "Maths Expected %", "description": "% meeting expected standard in maths", "type": "percentage", "category": "expected"},
-        "gps_expected_pct": {"name": "GPS Expected %", "description": "% meeting expected standard in grammar, punctuation & spelling", "type": "percentage", "category": "expected"},
-        "science_expected_pct": {"name": "Science Expected %", "description": "% meeting expected standard in science", "type": "percentage", "category": "expected"},
-        # Higher Standard
-        "rwm_high_pct": {"name": "RWM Combined Higher %", "description": "% achieving higher standard in RWM combined", "type": "percentage", "category": "higher"},
-        "reading_high_pct": {"name": "Reading Higher %", "description": "% achieving higher standard in reading", "type": "percentage", "category": "higher"},
-        "writing_high_pct": {"name": "Writing Higher %", "description": "% achieving greater depth in writing", "type": "percentage", "category": "higher"},
-        "maths_high_pct": {"name": "Maths Higher %", "description": "% achieving higher standard in maths", "type": "percentage", "category": "higher"},
-        "gps_high_pct": {"name": "GPS Higher %", "description": "% achieving higher standard in GPS", "type": "percentage", "category": "higher"},
-        # Progress Scores
-        "reading_progress": {"name": "Reading Progress", "description": "Progress in reading from KS1 to KS2", "type": "score", "category": "progress"},
-        "writing_progress": {"name": "Writing Progress", "description": "Progress in writing from KS1 to KS2", "type": "score", "category": "progress"},
-        "maths_progress": {"name": "Maths Progress", "description": "Progress in maths from KS1 to KS2", "type": "score", "category": "progress"},
-        # Average Scores
-        "reading_avg_score": {"name": "Reading Avg Score", "description": "Average scaled score in reading", "type": "score", "category": "average"},
-        "maths_avg_score": {"name": "Maths Avg Score", "description": "Average scaled score in maths", "type": "score", "category": "average"},
-        "gps_avg_score": {"name": "GPS Avg Score", "description": "Average scaled score in GPS", "type": "score", "category": "average"},
-        # Gender Performance
-        "rwm_expected_boys_pct": {"name": "RWM Expected % (Boys)", "description": "% of boys meeting expected standard", "type": "percentage", "category": "gender"},
-        "rwm_expected_girls_pct": {"name": "RWM Expected % (Girls)", "description": "% of girls meeting expected standard", "type": "percentage", "category": "gender"},
-        "rwm_high_boys_pct": {"name": "RWM Higher % (Boys)", "description": "% of boys at higher standard", "type": "percentage", "category": "gender"},
-        "rwm_high_girls_pct": {"name": "RWM Higher % (Girls)", "description": "% of girls at higher standard", "type": "percentage", "category": "gender"},
-        # Disadvantaged Performance
-        "rwm_expected_disadvantaged_pct": {"name": "RWM Expected % (Disadvantaged)", "description": "% of disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
-        "rwm_expected_non_disadvantaged_pct": {"name": "RWM Expected % (Non-Disadvantaged)", "description": "% of non-disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
-        "disadvantaged_gap": {"name": "Disadvantaged Gap", "description": "Gap between disadvantaged and national non-disadvantaged", "type": "score", "category": "equity"},
-        # School Context
-        "disadvantaged_pct": {"name": "% Disadvantaged Pupils", "description": "% of pupils eligible for free school meals or looked after", "type": "percentage", "category": "context"},
-        "eal_pct": {"name": "% EAL Pupils", "description": "% of pupils with English as additional language", "type": "percentage", "category": "context"},
-        "sen_support_pct": {"name": "% SEN Support", "description": "% of pupils with SEN support", "type": "percentage", "category": "context"},
-        "stability_pct": {"name": "% Pupil Stability", "description": "% of non-mobile pupils (stayed at school)", "type": "percentage", "category": "context"},
-        # 3-Year Averages
-        "rwm_expected_3yr_pct": {"name": "RWM Expected % (3-Year Avg)", "description": "3-year average % meeting expected", "type": "percentage", "category": "trends"},
-        "reading_avg_3yr": {"name": "Reading Score (3-Year Avg)", "description": "3-year average reading score", "type": "score", "category": "trends"},
-        "maths_avg_3yr": {"name": "Maths Score (3-Year Avg)", "description": "3-year average maths score", "type": "score", "category": "trends"},
-    }
-    
    available = []
-    for col, info in metric_info.items():
-        if df.empty or col in df.columns:
-            available.append({"key": col, **info})
+    for key, info in METRIC_DEFINITIONS.items():
+        if df.empty or key in df.columns:
+            available.append({"key": key, **info})
    
    return {"metrics": available}

@@ -448,13 +221,14 @@ async def get_available_metrics():
 async def get_rankings(
    metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
    year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
-    limit: int = Query(20, description="Number of schools to return"),
+    limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
+    local_authority: Optional[str] = Query(None, description="Filter by local authority"),
 ):
    """Get primary school rankings by a specific KS2 metric."""
    df = load_school_data()
    
    if df.empty:
-        return {"metric": metric, "year": None, "rankings": []}
+        return {"metric": metric, "year": None, "rankings": [], "total": 0}
    
    if metric not in df.columns:
        raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
@@ -467,39 +241,26 @@ async def get_rankings(
        max_year = df["year"].max()
        df = df[df["year"] == max_year]
    
+    # Filter by local authority if specified
+    if local_authority:
+        df = df[df["local_authority"].str.lower() == local_authority.lower()]
+    
    # Sort and rank (exclude rows with no data for this metric)
    df = df.dropna(subset=[metric])
+    total = len(df)
    
    # For progress scores, higher is better. For percentages, higher is also better.
    df = df.sort_values(metric, ascending=False).head(limit)
    
    # Return only relevant fields for rankings
-    ranking_cols = [
-        "urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
-        # Core expected
-        "rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
-        "gps_expected_pct", "science_expected_pct",
-        # Core higher
-        "rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
-        # Progress & averages
-        "reading_progress", "writing_progress", "maths_progress",
-        "reading_avg_score", "maths_avg_score", "gps_avg_score",
-        # Gender
-        "rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
-        # Equity
-        "rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
-        # Context
-        "disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
-        # 3-year
-        "rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
-    ]
-    available_cols = [c for c in ranking_cols if c in df.columns]
+    available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
    df = df[available_cols]
    
    return {
        "metric": metric,
        "year": int(df["year"].iloc[0]) if not df.empty else None,
-        "rankings": clean_for_json(df)
+        "rankings": clean_for_json(df),
+        "total": total,
    }


@@ -512,7 +273,7 @@ async def get_data_info():
        return {
            "status": "no_data",
            "message": "No data files found in data folder. Please download KS2 data from the government website.",
-            "data_folder": str(DATA_DIR),
+            "data_folder": str(settings.data_dir),
        }
    
    years = [int(y) for y in sorted(df["year"].unique())]
@@ -529,17 +290,22 @@ async def get_data_info():
    }


-# Mount static files
-@app.on_event("startup")
-async def startup():
-    """Setup static file serving and load data on startup."""
-    if FRONTEND_DIR.exists():
-        app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")
-    
-    # Pre-load data
+@app.post("/api/admin/reload")
+async def reload_data():
+    """
+    Admin endpoint to force data reload (useful after data updates).
+
+    Clears the cached dataset, loads it again immediately, and returns
+    ``{"status": "reloaded"}``.
+
+    NOTE(review): no authentication or authorisation check is visible on this
+    route -- confirm it is protected elsewhere before exposing it publicly.
+    """
+    # Drop the cached data first so the next load re-reads it.
+    clear_cache()
     load_school_data()
+    return {"status": "reloaded"}
+
+
+# Mount static files after all routes are defined
+@app.on_event("startup")
+async def mount_static():
+    """
+    Mount the frontend directory at ``/static`` if it exists on disk.
+
+    NOTE(review): the application is constructed with ``lifespan=lifespan``.
+    FastAPI's documentation states that startup/shutdown event handlers are
+    not called when a lifespan is provided -- confirm this handler still runs.
+    """
+    if settings.frontend_dir.exists():
+        app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")


 if __name__ == "__main__":
    import uvicorn
-    uvicorn.run(app, host="0.0.0.0", port=8000)
+    uvicorn.run(app, host=settings.host, port=settings.port)
@@ -0,0 +1,38 @@
+"""
+Application configuration using pydantic-settings.
+Loads from environment variables and .env file.
+"""
+
+from pathlib import Path
+from typing import List
+from pydantic_settings import BaseSettings
+import os
+
+
+class Settings(BaseSettings):
+    """
+    Application settings loaded from environment variables and ``.env``.
+
+    Each field below can be overridden by an environment variable (or a line
+    in the ``.env`` file); the values shown are the defaults.
+    """
+
+    # Settings-source configuration. Declared as ``model_config`` (the
+    # pydantic v2 form) rather than an inner ``class Config``, which pydantic
+    # v2 only accepts with a deprecation warning. Unknown variables in the
+    # environment or .env file are ignored instead of raising.
+    model_config = {
+        "env_file": ".env",
+        "env_file_encoding": "utf-8",
+        "extra": "ignore",
+    }
+
+    # Paths (resolved relative to the directory above this package)
+    data_dir: Path = Path(__file__).parent.parent / "data"
+    frontend_dir: Path = Path(__file__).parent.parent / "frontend"
+
+    # Server bind address and port
+    host: str = "0.0.0.0"
+    port: int = 80
+
+    # CORS: origins allowed to call the API from a browser
+    allowed_origins: List[str] = ["https://schoolcompare.co.uk", "http://localhost:8000", "http://localhost:3000"]
+
+    # API pagination limits
+    default_page_size: int = 50
+    max_page_size: int = 100
+
+
+# Singleton instance
+settings = Settings()
+
@@ -0,0 +1,196 @@
+"""
+Data loading module with optimized pandas operations.
+Uses vectorized operations instead of .apply() for performance.
+"""
+
+import pandas as pd
+import numpy as np
+from pathlib import Path
+from functools import lru_cache
+import re
+from typing import Optional
+
+from .config import settings
+from .schemas import (
+    COLUMN_MAPPINGS,
+    NUMERIC_COLUMNS,
+    SCHOOL_TYPE_MAP,
+    NULL_VALUES,
+    LA_CODE_TO_NAME,
+)
+
+
+def extract_year_from_folder(folder_name: str) -> Optional[int]:
+    """
+    Return the end year of the first 'YYYY-YYYY' span in *folder_name*.
+
+    For example '2023-2024' gives 2024. Returns None when the name contains
+    no such span.
+    """
+    span = re.search(r'(\d{4})-(\d{4})', folder_name)
+    return int(span.group(2)) if span else None
+
+
+def parse_numeric_vectorized(series: pd.Series) -> pd.Series:
+    """
+    Vectorized numeric parsing - much faster than .apply().
+
+    Values are converted to text and trimmed, entries equal to one of
+    ``NULL_VALUES`` become NaN, trailing ``%`` signs are removed, and the
+    remainder is parsed with ``pd.to_numeric``. Anything that still cannot be
+    parsed becomes NaN rather than raising.
+
+    Args:
+        series: Raw column values (strings and/or numbers).
+
+    Returns:
+        A numeric Series with the same index as *series*.
+    """
+    # Trim first so padded markers (" SUPP ") and padded percentages
+    # ("85% ") are recognised; without this the trailing '%' is not at the
+    # end of the string and the value would be lost as NaN.
+    text = series.astype(str).str.strip()
+
+    # Blank out every null marker in a single pass.
+    text = text.mask(text.isin(list(NULL_VALUES)))
+
+    # Remove % signs, then any space that sat between the number and the sign.
+    text = text.str.rstrip('%').str.strip()
+
+    # Convert to numeric; unparseable text is coerced to NaN.
+    return pd.to_numeric(text, errors='coerce')
+
+
+def create_address_vectorized(df: pd.DataFrame) -> pd.Series:
+    """
+    Build a one-line address per row from address1, town and postcode.
+
+    Only the columns that exist in *df* are used. Missing values and
+    blank/whitespace-only parts are skipped; the remaining parts are joined
+    with ', '.
+
+    Args:
+        df: Frame that may contain 'address1', 'town' and 'postcode' columns.
+
+    Returns:
+        A Series of address strings sharing ``df.index``. Every entry is ''
+        when none of the three columns is present.
+    """
+    columns = [
+        df[name].fillna('').astype(str)
+        for name in ('address1', 'town', 'postcode')
+        if name in df.columns
+    ]
+
+    if not columns:
+        return pd.Series([''] * len(df), index=df.index)
+
+    # Walk the columns in lockstep once instead of doing a positional
+    # get/set on the Series for every row.
+    joined = [
+        ', '.join(part for part in row if part and part.strip())
+        for row in zip(*columns)
+    ]
+    return pd.Series(joined, index=df.index, dtype=object)
+
+
+def create_address_fast(df: pd.DataFrame) -> pd.Series:
+    """
+    Fast vectorized address creation using string concatenation.
+
+    Joins the trimmed 'address1', 'town' and 'postcode' values of each row
+    with ', ', skipping parts that are missing or blank. Columns absent from
+    *df* are treated as empty.
+
+    Args:
+        df: Frame that may contain 'address1', 'town' and 'postcode' columns.
+
+    Returns:
+        A Series of address strings sharing ``df.index``.
+    """
+    def _text_column(name: str) -> pd.Series:
+        # Return the trimmed text of a column, or blanks on df's own index.
+        if name in df.columns:
+            return df[name].fillna('').astype(str).str.strip()
+        # The fallback must carry df.index: a default RangeIndex would not
+        # line up with a filtered frame and would inject NaN on alignment.
+        return pd.Series('', index=df.index, dtype=object)
+
+    result = _text_column('address1')
+
+    # Append town, then postcode, only on rows where the part is non-empty.
+    for part in (_text_column('town'), _text_column('postcode')):
+        has_part = part != ''
+        result = result.where(~has_part, result + ', ' + part)
+
+    # Clean up the leading separator left when address1 was empty.
+    return result.str.lstrip(', ')
+
+
+def load_year_data(year_folder: Path, year: int) -> Optional[pd.DataFrame]:
+    """
+    Read and normalise the KS2 CSV for one academic year.
+
+    Args:
+        year_folder: Directory expected to contain ``england_ks2final.csv``.
+        year: Year value stored in the frame's ``year`` column.
+
+    Returns:
+        The processed frame, or None when the file is absent or any step of
+        loading/processing raises (the error is printed, not propagated).
+    """
+    ks2_file = year_folder / "england_ks2final.csv"
+    if not ks2_file.exists():
+        return None
+
+    try:
+        print(f"Loading data from {ks2_file}")
+        df = pd.read_csv(ks2_file, low_memory=False)
+
+        # Identifier columns read as text are coerced to numbers.
+        for id_col in ('LEA', 'URN'):
+            if id_col in df.columns and df[id_col].dtype == 'object':
+                df[id_col] = pd.to_numeric(df[id_col], errors='coerce')
+
+        # Keep school-level rows only (RECTYPE == 1).
+        if 'RECTYPE' in df.columns:
+            df = df[df['RECTYPE'] == 1].copy()
+
+        df['year'] = year
+
+        # Local authority: prefer a name column (first spelling present),
+        # otherwise translate the LEA code, keeping the code as text when
+        # the mapping has no entry for it.
+        la_source = next(
+            (name for name in ('LANAME', 'LA (name)', 'LA_NAME', 'LA NAME') if name in df.columns),
+            None,
+        )
+        if la_source:
+            df['local_authority'] = df[la_source]
+        elif 'LEA' in df.columns:
+            lea_codes = df['LEA']
+            df['local_authority'] = lea_codes.map(LA_CODE_TO_NAME).fillna(lea_codes.astype(str))
+
+        # Rename only the source columns that are actually present.
+        df = df.rename(columns={src: dst for src, dst in COLUMN_MAPPINGS.items() if src in df.columns})
+
+        df['address'] = create_address_fast(df)
+
+        # Codes missing from the mapping are labelled 'Other'.
+        if 'school_type_code' in df.columns:
+            df['school_type'] = df['school_type_code'].map(SCHOOL_TYPE_MAP).fillna('Other')
+
+        for col in [name for name in NUMERIC_COLUMNS if name in df.columns]:
+            df[col] = parse_numeric_vectorized(df[col])
+
+        print(f"  Loaded {len(df)} schools for year {year}")
+        return df
+
+    except Exception as e:
+        print(f"Error loading {ks2_file}: {e}")
+        return None
+
+
+@lru_cache(maxsize=1)
+def load_school_data() -> pd.DataFrame:
+    """
+    Load and combine all school data from CSV files in year folders.
+
+    Scans ``settings.data_dir`` for sub-directories whose names start with a
+    'YYYY-YYYY' span, loads each one with ``load_year_data`` and concatenates
+    the results. The result is memoised with ``lru_cache``, so repeated calls
+    return the same frame until the cache is cleared.
+
+    Returns:
+        The combined frame, or an empty DataFrame when nothing was loaded.
+    """
+    frames = []
+
+    data_dir = settings.data_dir
+    if data_dir.exists():
+        # iterdir() yields entries in arbitrary order; sort so the combined
+        # frame has the same row order on every run and every host.
+        for year_folder in sorted(data_dir.iterdir()):
+            if not (year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name)):
+                continue
+
+            year = extract_year_from_folder(year_folder.name)
+            if year is None:
+                continue
+
+            df = load_year_data(year_folder, year)
+            if df is not None:
+                frames.append(df)
+
+    if not frames:
+        print("No data files found. Creating empty DataFrame.")
+        return pd.DataFrame()
+
+    result = pd.concat(frames, ignore_index=True)
+    print(f"\nTotal records loaded: {len(result)}")
+    print(f"Unique schools: {result['urn'].nunique()}")
+    print(f"Years: {sorted(result['year'].unique())}")
+    return result
+
+
+def clear_cache():
+    """
+    Discard the memoised result of ``load_school_data``.
+
+    The next call to ``load_school_data`` runs the loader again.
+    """
+    load_school_data.cache_clear()
+
@@ -0,0 +1,398 @@
+"""
+Schema definitions: column mappings, metric definitions, school type mappings.
+Single source of truth for all data transformations.
+"""
+
+# Column name mappings from DfE CSV to API field names
+COLUMN_MAPPINGS = {
+    'URN': 'urn',
+    'SCHNAME': 'school_name',
+    'ADDRESS1': 'address1',
+    'ADDRESS2': 'address2',
+    'TOWN': 'town',
+    'PCODE': 'postcode',
+    'NFTYPE': 'school_type_code',
+    'RELDENOM': 'religious_denomination',
+    'AGERANGE': 'age_range',
+    'TOTPUPS': 'total_pupils',
+    'TELIG': 'eligible_pupils',
+    # Core KS2 metrics
+    'PTRWM_EXP': 'rwm_expected_pct',
+    'PTRWM_HIGH': 'rwm_high_pct',
+    'READPROG': 'reading_progress',
+    'WRITPROG': 'writing_progress',
+    'MATPROG': 'maths_progress',
+    'PTREAD_EXP': 'reading_expected_pct',
+    'PTWRITTA_EXP': 'writing_expected_pct',
+    'PTMAT_EXP': 'maths_expected_pct',
+    'READ_AVERAGE': 'reading_avg_score',
+    'MAT_AVERAGE': 'maths_avg_score',
+    'PTREAD_HIGH': 'reading_high_pct',
+    'PTWRITTA_HIGH': 'writing_high_pct',
+    'PTMAT_HIGH': 'maths_high_pct',
+    # GPS (Grammar, Punctuation & Spelling)
+    'PTGPS_EXP': 'gps_expected_pct',
+    'PTGPS_HIGH': 'gps_high_pct',
+    'GPS_AVERAGE': 'gps_avg_score',
+    # Science
+    'PTSCITA_EXP': 'science_expected_pct',
+    # School context
+    'PTFSM6CLA1A': 'disadvantaged_pct',
+    'PTEALGRP2': 'eal_pct',
+    'PSENELK': 'sen_support_pct',
+    'PSENELE': 'sen_ehcp_pct',
+    'PTMOBN': 'stability_pct',
+    # Gender breakdown
+    'PTRWM_EXP_B': 'rwm_expected_boys_pct',
+    'PTRWM_EXP_G': 'rwm_expected_girls_pct',
+    'PTRWM_HIGH_B': 'rwm_high_boys_pct',
+    'PTRWM_HIGH_G': 'rwm_high_girls_pct',
+    # Disadvantaged performance
+    'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
+    'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
+    'DIFFN_RWM_EXP': 'disadvantaged_gap',
+    # 3-year averages
+    'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
+    'READ_AVERAGE_3YR': 'reading_avg_3yr',
+    'MAT_AVERAGE_3YR': 'maths_avg_3yr',
+}
+
+# Numeric columns that need parsing
+NUMERIC_COLUMNS = [
+    # Core metrics
+    'rwm_expected_pct', 'rwm_high_pct', 'reading_progress', 
+    'writing_progress', 'maths_progress', 'reading_expected_pct',
+    'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
+    'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
+    # GPS & Science
+    'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
+    # School context
+    'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
+    'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
+    # Gender breakdown
+    'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
+    'rwm_high_boys_pct', 'rwm_high_girls_pct',
+    # Disadvantaged performance
+    'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
+    # 3-year averages
+    'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
+]
+
+# School type code to name mapping
+SCHOOL_TYPE_MAP = {
+    'AC': 'Academy',
+    'ACC': 'Academy Converter',
+    'ACS': 'Academy Sponsor Led',
+    'CY': 'Community School',
+    'VA': 'Voluntary Aided',
+    'VC': 'Voluntary Controlled',
+    'FD': 'Foundation',
+    'F': 'Foundation',
+    'FS': 'Free School',
+}
+
+# Special values to treat as null
+NULL_VALUES = ['SUPP', 'NE', 'NA', 'NP', 'NEW', 'LOW', '']
+
+# KS2 Metric definitions - single source of truth
+# Used by both backend API and frontend
+METRIC_DEFINITIONS = {
+    # Expected Standard
+    "rwm_expected_pct": {
+        "name": "RWM Combined %",
+        "short_name": "RWM %",
+        "description": "% meeting expected standard in reading, writing and maths",
+        "type": "percentage",
+        "category": "expected"
+    },
+    "reading_expected_pct": {
+        "name": "Reading Expected %",
+        "short_name": "Reading %",
+        "description": "% meeting expected standard in reading",
+        "type": "percentage",
+        "category": "expected"
+    },
+    "writing_expected_pct": {
+        "name": "Writing Expected %",
+        "short_name": "Writing %",
+        "description": "% meeting expected standard in writing",
+        "type": "percentage",
+        "category": "expected"
+    },
+    "maths_expected_pct": {
+        "name": "Maths Expected %",
+        "short_name": "Maths %",
+        "description": "% meeting expected standard in maths",
+        "type": "percentage",
+        "category": "expected"
+    },
+    "gps_expected_pct": {
+        "name": "GPS Expected %",
+        "short_name": "GPS %",
+        "description": "% meeting expected standard in grammar, punctuation & spelling",
+        "type": "percentage",
+        "category": "expected"
+    },
+    "science_expected_pct": {
+        "name": "Science Expected %",
+        "short_name": "Science %",
+        "description": "% meeting expected standard in science",
+        "type": "percentage",
+        "category": "expected"
+    },
+    # Higher Standard
+    "rwm_high_pct": {
+        "name": "RWM Combined Higher %",
+        "short_name": "RWM Higher %",
+        "description": "% achieving higher standard in RWM combined",
+        "type": "percentage",
+        "category": "higher"
+    },
+    "reading_high_pct": {
+        "name": "Reading Higher %",
+        "short_name": "Reading Higher %",
+        "description": "% achieving higher standard in reading",
+        "type": "percentage",
+        "category": "higher"
+    },
+    "writing_high_pct": {
+        "name": "Writing Higher %",
+        "short_name": "Writing Higher %",
+        "description": "% achieving greater depth in writing",
+        "type": "percentage",
+        "category": "higher"
+    },
+    "maths_high_pct": {
+        "name": "Maths Higher %",
+        "short_name": "Maths Higher %",
+        "description": "% achieving higher standard in maths",
+        "type": "percentage",
+        "category": "higher"
+    },
+    "gps_high_pct": {
+        "name": "GPS Higher %",
+        "short_name": "GPS Higher %",
+        "description": "% achieving higher standard in GPS",
+        "type": "percentage",
+        "category": "higher"
+    },
+    # Progress Scores
+    "reading_progress": {
+        "name": "Reading Progress",
+        "short_name": "Reading Progress",
+        "description": "Progress in reading from KS1 to KS2",
+        "type": "score",
+        "category": "progress"
+    },
+    "writing_progress": {
+        "name": "Writing Progress",
+        "short_name": "Writing Progress",
+        "description": "Progress in writing from KS1 to KS2",
+        "type": "score",
+        "category": "progress"
+    },
+    "maths_progress": {
+        "name": "Maths Progress",
+        "short_name": "Maths Progress",
+        "description": "Progress in maths from KS1 to KS2",
+        "type": "score",
+        "category": "progress"
+    },
+    # Average Scores
+    "reading_avg_score": {
+        "name": "Reading Average Score",
+        "short_name": "Reading Avg",
+        "description": "Average scaled score in reading",
+        "type": "score",
+        "category": "average"
+    },
+    "maths_avg_score": {
+        "name": "Maths Average Score",
+        "short_name": "Maths Avg",
+        "description": "Average scaled score in maths",
+        "type": "score",
+        "category": "average"
+    },
+    "gps_avg_score": {
+        "name": "GPS Average Score",
+        "short_name": "GPS Avg",
+        "description": "Average scaled score in GPS",
+        "type": "score",
+        "category": "average"
+    },
+    # Gender Performance
+    "rwm_expected_boys_pct": {
+        "name": "RWM Expected % (Boys)",
+        "short_name": "Boys RWM %",
+        "description": "% of boys meeting expected standard",
+        "type": "percentage",
+        "category": "gender"
+    },
+    "rwm_expected_girls_pct": {
+        "name": "RWM Expected % (Girls)",
+        "short_name": "Girls RWM %",
+        "description": "% of girls meeting expected standard",
+        "type": "percentage",
+        "category": "gender"
+    },
+    "rwm_high_boys_pct": {
+        "name": "RWM Higher % (Boys)",
+        "short_name": "Boys Higher %",
+        "description": "% of boys at higher standard",
+        "type": "percentage",
+        "category": "gender"
+    },
+    "rwm_high_girls_pct": {
+        "name": "RWM Higher % (Girls)",
+        "short_name": "Girls Higher %",
+        "description": "% of girls at higher standard",
+        "type": "percentage",
+        "category": "gender"
+    },
+    # Disadvantaged Performance
+    "rwm_expected_disadvantaged_pct": {
+        "name": "RWM Expected % (Disadvantaged)",
+        "short_name": "Disadvantaged %",
+        "description": "% of disadvantaged pupils meeting expected",
+        "type": "percentage",
+        "category": "equity"
+    },
+    "rwm_expected_non_disadvantaged_pct": {
+        "name": "RWM Expected % (Non-Disadvantaged)",
+        "short_name": "Non-Disadv %",
+        "description": "% of non-disadvantaged pupils meeting expected",
+        "type": "percentage",
+        "category": "equity"
+    },
+    "disadvantaged_gap": {
+        "name": "Disadvantaged Gap",
+        "short_name": "Disadv Gap",
+        "description": "Gap between disadvantaged and national non-disadvantaged",
+        "type": "score",
+        "category": "equity"
+    },
+    # School Context
+    "disadvantaged_pct": {
+        "name": "% Disadvantaged Pupils",
+        "short_name": "% Disadvantaged",
+        "description": "% of pupils eligible for free school meals or looked after",
+        "type": "percentage",
+        "category": "context"
+    },
+    "eal_pct": {
+        "name": "% EAL Pupils",
+        "short_name": "% EAL",
+        "description": "% of pupils with English as additional language",
+        "type": "percentage",
+        "category": "context"
+    },
+    "sen_support_pct": {
+        "name": "% SEN Support",
+        "short_name": "% SEN",
+        "description": "% of pupils with SEN support",
+        "type": "percentage",
+        "category": "context"
+    },
+    "stability_pct": {
+        "name": "% Pupil Stability",
+        "short_name": "% Stable",
+        "description": "% of non-mobile pupils (stayed at school)",
+        "type": "percentage",
+        "category": "context"
+    },
+    # 3-Year Averages
+    "rwm_expected_3yr_pct": {
+        "name": "RWM Expected % (3-Year Avg)",
+        "short_name": "RWM 3yr %",
+        "description": "3-year average % meeting expected",
+        "type": "percentage",
+        "category": "trends"
+    },
+    "reading_avg_3yr": {
+        "name": "Reading Score (3-Year Avg)",
+        "short_name": "Reading 3yr",
+        "description": "3-year average reading score",
+        "type": "score",
+        "category": "trends"
+    },
+    "maths_avg_3yr": {
+        "name": "Maths Score (3-Year Avg)",
+        "short_name": "Maths 3yr",
+        "description": "3-year average maths score",
+        "type": "score",
+        "category": "trends"
+    },
+}
+
+# Ranking columns to include in rankings response
+RANKING_COLUMNS = [
+    "urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
+    # Core expected
+    "rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
+    "gps_expected_pct", "science_expected_pct",
+    # Core higher
+    "rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
+    # Progress & averages
+    "reading_progress", "writing_progress", "maths_progress",
+    "reading_avg_score", "maths_avg_score", "gps_avg_score",
+    # Gender
+    "rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
+    # Equity
+    "rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
+    # Context
+    "disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
+    # 3-year
+    "rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
+]
+
+# School listing columns
+SCHOOL_COLUMNS = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
+
+# Local Authority code to name mapping (for fallback when LANAME column missing)
+# Source: https://www.gov.uk/government/publications/local-authority-codes
+LA_CODE_TO_NAME = {
+    201: "City of London", 202: "Camden", 203: "Greenwich", 204: "Hackney",
+    205: "Hammersmith and Fulham", 206: "Islington", 207: "Kensington and Chelsea",
+    208: "Lambeth", 209: "Lewisham", 210: "Southwark", 211: "Tower Hamlets",
+    212: "Wandsworth", 213: "Westminster", 301: "Barking and Dagenham", 302: "Barnet",
+    303: "Bexley", 304: "Brent", 305: "Bromley", 306: "Croydon", 307: "Ealing",
+    308: "Enfield", 309: "Haringey", 310: "Harrow", 311: "Havering", 312: "Hillingdon",
+    313: "Hounslow", 314: "Kingston upon Thames", 315: "Merton", 316: "Newham",
+    317: "Redbridge", 318: "Richmond upon Thames", 319: "Sutton", 320: "Waltham Forest",
+    330: "Birmingham", 331: "Coventry", 332: "Dudley", 333: "Sandwell", 334: "Solihull",
+    335: "Walsall", 336: "Wolverhampton", 340: "Knowsley", 341: "Liverpool",
+    342: "St. Helens", 343: "Sefton", 344: "Wirral", 350: "Bolton", 351: "Bury",
+    352: "Manchester", 353: "Oldham", 354: "Rochdale", 355: "Salford", 356: "Stockport",
+    357: "Tameside", 358: "Trafford", 359: "Wigan", 370: "Barnsley", 371: "Doncaster",
+    372: "Rotherham", 373: "Sheffield", 380: "Bradford", 381: "Calderdale",
+    382: "Kirklees", 383: "Leeds", 384: "Wakefield", 390: "Gateshead",
+    391: "Newcastle upon Tyne", 392: "North Tyneside", 393: "South Tyneside",
+    394: "Sunderland", 420: "Isles of Scilly", 800: "Bath and North East Somerset",
+    801: "Bristol, City of", 802: "North Somerset", 803: "South Gloucestershire",
+    805: "Hartlepool", 806: "Middlesbrough", 807: "Redcar and Cleveland",
+    808: "Stockton-on-Tees", 810: "Kingston Upon Hull, City of", 811: "East Riding of Yorkshire",
+    812: "North East Lincolnshire", 813: "North Lincolnshire", 815: "North Yorkshire",
+    816: "York", 820: "Bedford", 821: "Central Bedfordshire", 822: "Luton",
+    825: "Buckinghamshire", 826: "Milton Keynes", 830: "Derbyshire", 831: "Derby",
+    835: "Dorset", 836: "Bournemouth, Christchurch and Poole", 837: "Poole",
+    838: "Bournemouth", 839: "Durham", 840: "Darlington", 841: "East Sussex",
+    845: "Brighton and Hove", 846: "Hampshire", 850: "Portsmouth", 851: "Southampton",
+    852: "Isle of Wight", 855: "Leicestershire", 856: "Leicester", 857: "Rutland",
+    860: "Staffordshire", 861: "Stoke-on-Trent", 865: "Wiltshire", 866: "Swindon",
+    867: "Bracknell Forest", 868: "Windsor and Maidenhead", 869: "West Berkshire",
+    870: "Reading", 871: "Slough", 872: "Wokingham", 873: "Cambridgeshire",
+    874: "Peterborough", 876: "Halton", 877: "Warrington", 878: "Devon",
+    879: "Plymouth", 880: "Torbay", 881: "Essex", 882: "Southend-on-Sea",
+    883: "Thurrock", 884: "Herefordshire", 885: "Worcestershire", 886: "Kent",
+    887: "Medway", 888: "Lancashire", 889: "Blackburn with Darwen", 890: "Blackpool",
+    891: "Nottinghamshire", 892: "Nottingham", 893: "Shropshire", 894: "Telford and Wrekin",
+    895: "Cheshire East", 896: "Cheshire West and Chester", 908: "Cornwall",
+    909: "Cumbria", 916: "Gloucestershire", 919: "Hertfordshire", 921: "Norfolk",
+    925: "Lincolnshire", 926: "Northamptonshire", 928: "Northumberland",
+    929: "Oxfordshire", 931: "Somerset", 933: "Suffolk", 935: "Surrey",
+    936: "Warwickshire", 937: "West Sussex", 938: "Westmorland and Furness",
+    940: "Cumberland",
+    # Additional codes
+    420: "Isles of Scilly",
+}
+
@@ -0,0 +1,37 @@
+"""
+Utility functions for data conversion and JSON serialization.
+"""
+
+import pandas as pd
+import numpy as np
+from typing import Any, List
+
+
+def convert_to_native(value: Any) -> Any:
+    """Convert numpy types to native Python types for JSON serialization."""
+    if pd.isna(value):
+        return None
+    if isinstance(value, (np.integer,)):
+        return int(value)
+    if isinstance(value, (np.floating,)):
+        if np.isnan(value) or np.isinf(value):
+            return None
+        return float(value)
+    if isinstance(value, np.ndarray):
+        return value.tolist()
+    if value == "SUPP" or value == "NE" or value == "NA" or value == "NP":
+        return None
+    return value
+
+
+def clean_for_json(df: pd.DataFrame) -> List[dict]:
+    """Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
+    records = df.to_dict(orient="records")
+    cleaned = []
+    for record in records:
+        clean_record = {}
+        for key, value in record.items():
+            clean_record[key] = convert_to_native(value)
+        cleaned.append(clean_record)
+    return cleaned
+