Refactoring and bug fixes
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 1m7s

This commit is contained in:
Tudor Sitaru
2026-01-06 16:30:32 +00:00
parent 0ea4720ac1
commit 54e4bc2e77
11 changed files with 1246 additions and 534 deletions

View File

Serves primary school (KS2) performance data for comparing schools.
Uses real data from UK Government Compare School Performance downloads.
"""
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import os
import re
# No longer filtering by specific LA codes - load all available schools
from .config import settings
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .data_loader import load_school_data, clear_cache
from .utils import clean_for_json
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events.

    Pre-loads the school dataset once at startup (so the first request does
    not pay the CSV-parsing cost), yields while the application serves
    traffic, then logs on shutdown. No resources are held open, so no
    teardown beyond logging is needed.
    """
    # Startup: pre-load data
    print("Starting up: Loading school data...")
    load_school_data()
    print("Data loaded successfully.")
    yield  # Application runs here
    # Shutdown: cleanup if needed
    print("Shutting down...")
# FastAPI application instance. The ASGI `lifespan` handler defined above
# replaces the deprecated @app.on_event("startup"/"shutdown") hooks.
# NOTE: the merged diff left two `version=` keyword arguments here, which is
# a SyntaxError (duplicate keyword argument); the newer value is kept.
app = FastAPI(
    title="SchoolCompare API",
    description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
    version="2.0.0",
    lifespan=lifespan,
)
# CORS middleware with configurable origins.
# NOTE: the merged diff left both the old wildcard `allow_origins=["*"]` and
# the new settings-driven line - a duplicate keyword argument (SyntaxError).
# The configurable list is kept; a literal "*" origin is also incompatible
# with allow_credentials=True under the CORS spec.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Data directory: expected to contain one sub-folder per academic year
# (e.g. "2023-2024"), each holding the government KS2 CSV download.
DATA_DIR = Path(__file__).parent.parent / "data"
# Frontend assets directory, served at "/" (index.html) and "/static".
FRONTEND_DIR = Path(__file__).parent.parent / "frontend"
# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
# Lazily populated by load_school_data(); stays None until the first load.
_data_cache: Optional[pd.DataFrame] = None
def convert_to_native(value):
    """Convert numpy/pandas scalar types to native Python types for JSON.

    Returns None for missing values (None/NaN/pd.NA), non-finite floats,
    and the DfE suppression sentinels ("SUPP", "NE", "NA", "NP"). Numpy
    integers/floats become Python int/float, ndarrays become lists, and
    anything else passes through unchanged.
    """
    # BUG FIX: arrays must be handled *before* pd.isna() - pd.isna(ndarray)
    # returns an element-wise boolean array whose truth value is ambiguous,
    # so the original order raised ValueError and never reached tolist().
    if isinstance(value, np.ndarray):
        return value.tolist()
    if pd.isna(value):
        return None
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        # NaN is already caught by pd.isna above; keep the inf check explicit.
        if np.isnan(value) or np.isinf(value):
            return None
        return float(value)
    # Suppressed / not-entered / not-applicable markers from the DfE export.
    if isinstance(value, str) and value in {"SUPP", "NE", "NA", "NP"}:
        return None
    return value
def clean_for_json(df: pd.DataFrame) -> list:
    """Serialize *df* into a JSON-safe list of row dictionaries.

    Every cell goes through convert_to_native so numpy scalars become
    Python natives and NaN/inf/sentinel values become None.
    """
    return [
        {field: convert_to_native(cell) for field, cell in row.items()}
        for row in df.to_dict(orient="records")
    ]
def parse_numeric(value):
    """Best-effort conversion of a raw CSV cell into a number.

    Missing values, non-finite numbers, DfE sentinel strings ("SUPP", "NE",
    "NA", "NP", "NEW", "LOW"), blanks and unparseable text all yield None.
    Finite ints/floats pass through unchanged; percentage strings such as
    "73%" lose the sign and parse as floats.
    """
    if pd.isna(value):
        return None
    if isinstance(value, (int, float)):
        # Already numeric: only reject NaN/inf.
        return None if np.isnan(value) or np.isinf(value) else value
    if not isinstance(value, str):
        return None
    text = value.strip()
    if text in {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}:
        return None
    # Percentages are stored like "73%" - drop the sign before parsing.
    if text.endswith('%'):
        text = text[:-1]
    try:
        return float(text)
    except ValueError:
        return None
def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Return the end year of an academic-year folder name.

    '2023-2024' -> 2024; names without a 'YYYY-YYYY' pattern yield None.
    """
    found = re.search(r'(\d{4})-(\d{4})', folder_name)
    return int(found.group(2)) if found else None
def load_school_data() -> pd.DataFrame:
    """Load and combine all school data from CSV files in year folders.

    Scans DATA_DIR for folders named like 'YYYY-YYYY', reads each folder's
    england_ks2final.csv, keeps school-level rows, renames the government
    column names to the API's snake_case schema, parses the numeric metric
    columns, and concatenates everything into one DataFrame that is cached
    in the module-level _data_cache.

    Returns:
        pd.DataFrame: one row per school per year; an empty DataFrame when
        no data files were found.
    """
    global _data_cache
    # Serve the cached frame on repeat calls; clear the cache to force reload.
    if _data_cache is not None:
        return _data_cache
    all_data = []
    # Look for year folders in data directory
    if DATA_DIR.exists():
        for year_folder in DATA_DIR.iterdir():
            if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
                year = extract_year_from_folder(year_folder.name)
                if year is None:
                    continue
                # Look for KS2 data file
                ks2_file = year_folder / "england_ks2final.csv"
                if ks2_file.exists():
                    try:
                        print(f"Loading data from {ks2_file}")
                        # low_memory=False: read in one pass so mixed-type
                        # columns get a single consistent dtype.
                        df = pd.read_csv(ks2_file, low_memory=False)
                        # Handle both string and integer columns
                        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
                            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
                        if 'URN' in df.columns and df['URN'].dtype == 'object':
                            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
                        # Filter to schools only (RECTYPE == 1 means school level data)
                        if 'RECTYPE' in df.columns:
                            df = df[df['RECTYPE'] == 1]
                        # Add year and local authority name from LANAME column
                        df['year'] = year
                        if 'LANAME' in df.columns:
                            df['local_authority'] = df['LANAME']
                        elif 'LEA' in df.columns:
                            # Fall back to the numeric LA code when no name column exists.
                            df['local_authority'] = df['LEA'].astype(str)
                        # Standardize column names for our API
                        df = df.rename(columns={
                            'URN': 'urn',
                            'SCHNAME': 'school_name',
                            'ADDRESS1': 'address1',
                            'ADDRESS2': 'address2',
                            'TOWN': 'town',
                            'PCODE': 'postcode',
                            'NFTYPE': 'school_type_code',
                            'RELDENOM': 'religious_denomination',
                            'AGERANGE': 'age_range',
                            'TOTPUPS': 'total_pupils',
                            'TELIG': 'eligible_pupils',
                            # Core KS2 metrics
                            'PTRWM_EXP': 'rwm_expected_pct',
                            'PTRWM_HIGH': 'rwm_high_pct',
                            'READPROG': 'reading_progress',
                            'WRITPROG': 'writing_progress',
                            'MATPROG': 'maths_progress',
                            'PTREAD_EXP': 'reading_expected_pct',
                            'PTWRITTA_EXP': 'writing_expected_pct',
                            'PTMAT_EXP': 'maths_expected_pct',
                            'READ_AVERAGE': 'reading_avg_score',
                            'MAT_AVERAGE': 'maths_avg_score',
                            'PTREAD_HIGH': 'reading_high_pct',
                            'PTWRITTA_HIGH': 'writing_high_pct',
                            'PTMAT_HIGH': 'maths_high_pct',
                            # GPS (Grammar, Punctuation & Spelling)
                            'PTGPS_EXP': 'gps_expected_pct',
                            'PTGPS_HIGH': 'gps_high_pct',
                            'GPS_AVERAGE': 'gps_avg_score',
                            # Science
                            'PTSCITA_EXP': 'science_expected_pct',
                            # School context
                            'PTFSM6CLA1A': 'disadvantaged_pct',
                            'PTEALGRP2': 'eal_pct',
                            'PSENELK': 'sen_support_pct',
                            'PSENELE': 'sen_ehcp_pct',
                            'PTMOBN': 'stability_pct',
                            # Gender breakdown
                            'PTRWM_EXP_B': 'rwm_expected_boys_pct',
                            'PTRWM_EXP_G': 'rwm_expected_girls_pct',
                            'PTRWM_HIGH_B': 'rwm_high_boys_pct',
                            'PTRWM_HIGH_G': 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
                            'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
                            'DIFFN_RWM_EXP': 'disadvantaged_gap',
                            # 3-year averages
                            'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
                            'READ_AVERAGE_3YR': 'reading_avg_3yr',
                            'MAT_AVERAGE_3YR': 'maths_avg_3yr',
                        })
                        # Create address field
                        def make_address(row):
                            # Join the non-empty address parts with ", ".
                            parts = []
                            if pd.notna(row.get('address1')) and row.get('address1'):
                                parts.append(str(row['address1']))
                            if pd.notna(row.get('town')) and row.get('town'):
                                parts.append(str(row['town']))
                            if pd.notna(row.get('postcode')) and row.get('postcode'):
                                parts.append(str(row['postcode']))
                            return ', '.join(parts) if parts else ''
                        df['address'] = df.apply(make_address, axis=1)
                        # Map school type codes to names
                        school_type_map = {
                            'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
                            'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
                            'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
                        }
                        df['school_type'] = df['school_type_code'].map(school_type_map).fillna('Other')
                        # Parse numeric columns
                        numeric_cols = [
                            # Core metrics
                            'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
                            'writing_progress', 'maths_progress', 'reading_expected_pct',
                            'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
                            'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
                            # GPS & Science
                            'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
                            # School context
                            'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
                            'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
                            # Gender breakdown
                            'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
                            'rwm_high_boys_pct', 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
                            # 3-year averages
                            'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
                        ]
                        for col in numeric_cols:
                            if col in df.columns:
                                # parse_numeric turns "SUPP"/"NE"/percent strings into floats or None.
                                df[col] = df[col].apply(parse_numeric)
                        all_data.append(df)
                        print(f" Loaded {len(df)} schools for year {year}")
                    except Exception as e:
                        # Best-effort: a malformed year file is logged and skipped
                        # rather than aborting the whole load.
                        print(f"Error loading {ks2_file}: {e}")
    if all_data:
        _data_cache = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(_data_cache)}")
        print(f"Unique schools: {_data_cache['urn'].nunique()}")
        print(f"Years: {sorted(_data_cache['year'].unique())}")
    else:
        print("No data files found. Creating empty DataFrame.")
        _data_cache = pd.DataFrame()
    return _data_cache
@app.get("/")
async def root():
    """Serve the frontend single-page application entry point."""
    # NOTE: the merged diff left two consecutive return statements here
    # (old FRONTEND_DIR path + new configurable path); the second was dead
    # code. Keep the settings-driven location, which works for both
    # container and local-dev layouts.
    return FileResponse(settings.frontend_dir / "index.html")
@app.get("/api/schools")
async def get_schools(
search: Optional[str] = Query(None, description="Search by school name"),
local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
school_type: Optional[str] = Query(None, description="Filter by school type"),
page: int = Query(1, ge=1, description="Page number"),
page_size: int = Query(None, ge=1, le=100, description="Results per page"),
):
"""Get list of unique primary schools in Wandsworth and Merton."""
"""
Get list of unique primary schools with pagination.
Returns paginated results with total count for efficient loading.
"""
df = load_school_data()
if df.empty:
return {"schools": []}
return {"schools": [], "total": 0, "page": page, "page_size": 0}
# Use configured default if not specified
if page_size is None:
page_size = settings.default_page_size
# Get unique schools (latest year data for each)
latest_year = df.groupby('urn')['year'].max().reset_index()
df_latest = df.merge(latest_year, on=['urn', 'year'])
school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
available_cols = [c for c in school_cols if c in df_latest.columns]
available_cols = [c for c in SCHOOL_COLUMNS if c in df_latest.columns]
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
# Apply filters
@@ -298,7 +97,19 @@ async def get_schools(
if school_type:
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
return {"schools": clean_for_json(schools_df)}
# Pagination
total = len(schools_df)
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
schools_df = schools_df.iloc[start_idx:end_idx]
return {
"schools": clean_for_json(schools_df),
"total": total,
"page": page,
"page_size": page_size,
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
}
@app.get("/api/schools/{urn}")
@@ -390,56 +201,18 @@ async def get_filter_options():
@app.get("/api/metrics")
async def get_available_metrics():
    """
    Get list of available KS2 performance metrics for primary schools.

    This is the single source of truth for metric definitions.
    Frontend should consume this to avoid duplication.
    """
    # NOTE: the merged diff left both the removed inline `metric_info` dict
    # (with its loop) and the new METRIC_DEFINITIONS loop, which would have
    # emitted every metric twice. Only the schema-driven version is kept.
    df = load_school_data()
    available = []
    for key, info in METRIC_DEFINITIONS.items():
        # Only advertise metrics whose column exists in the loaded data;
        # with no data at all, advertise the full catalogue.
        if df.empty or key in df.columns:
            available.append({"key": key, **info})
    return {"metrics": available}
@@ -448,13 +221,14 @@ async def get_available_metrics():
async def get_rankings(
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
limit: int = Query(20, description="Number of schools to return"),
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
):
"""Get primary school rankings by a specific KS2 metric."""
df = load_school_data()
if df.empty:
return {"metric": metric, "year": None, "rankings": []}
return {"metric": metric, "year": None, "rankings": [], "total": 0}
if metric not in df.columns:
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
@@ -467,39 +241,26 @@ async def get_rankings(
max_year = df["year"].max()
df = df[df["year"] == max_year]
# Filter by local authority if specified
if local_authority:
df = df[df["local_authority"].str.lower() == local_authority.lower()]
# Sort and rank (exclude rows with no data for this metric)
df = df.dropna(subset=[metric])
total = len(df)
# For progress scores, higher is better. For percentages, higher is also better.
df = df.sort_values(metric, ascending=False).head(limit)
# Return only relevant fields for rankings
ranking_cols = [
"urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
# Core expected
"rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
"gps_expected_pct", "science_expected_pct",
# Core higher
"rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
# Progress & averages
"reading_progress", "writing_progress", "maths_progress",
"reading_avg_score", "maths_avg_score", "gps_avg_score",
# Gender
"rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
# Equity
"rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
# Context
"disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
# 3-year
"rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]
available_cols = [c for c in ranking_cols if c in df.columns]
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
df = df[available_cols]
return {
"metric": metric,
"year": int(df["year"].iloc[0]) if not df.empty else None,
"rankings": clean_for_json(df)
"rankings": clean_for_json(df),
"total": total,
}
@@ -512,7 +273,7 @@ async def get_data_info():
return {
"status": "no_data",
"message": "No data files found in data folder. Please download KS2 data from the government website.",
"data_folder": str(DATA_DIR),
"data_folder": str(settings.data_dir),
}
years = [int(y) for y in sorted(df["year"].unique())]
@@ -529,17 +290,22 @@ async def get_data_info():
}
@app.post("/api/admin/reload")
async def reload_data():
    """Admin endpoint to force data reload (useful after data updates)."""
    # NOTE(review): this endpoint is unauthenticated - confirm it is
    # protected upstream (reverse proxy / network policy) before exposing.
    clear_cache()
    load_school_data()
    return {"status": "reloaded"}


# Mount static files after all routes are defined so API routes take
# precedence over the static mount.
# NOTE: the merged diff left TWO @app.on_event("startup") handlers (the old
# `startup` using FRONTEND_DIR and this one using settings.frontend_dir),
# both mounting "/static"; only the new configurable handler is kept.
@app.on_event("startup")
async def mount_static():
    """Mount static file serving."""
    if settings.frontend_dir.exists():
        app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")
if __name__ == "__main__":
    # Local/dev entry point; in production the container's ASGI server runs
    # the app. NOTE: the merged diff left two uvicorn.run() calls (old
    # hard-coded host/port + new settings-driven); the second could never
    # execute. Keep the configurable call.
    import uvicorn
    uvicorn.run(app, host=settings.host, port=settings.port)