# school_compare/backend/app.py

"""
SchoolCompare.co.uk API
Serves primary school (KS2) performance data for comparing schools.
Uses real data from UK Government Compare School Performance downloads.
"""

from fastapi import FastAPI, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import os
import re

# No longer filtering by specific LA codes - load all available schools

# FastAPI application object that every route in this module is registered on.
app = FastAPI(
    version="1.0.0",
    title="SchoolCompare API",
    description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
)

# Development CORS policy: any origin, method and header, with credentials.
# NOTE(review): this is very permissive - confirm it is tightened outside
# of development.
_cors_options = {
    "allow_origins": ["*"],
    "allow_credentials": True,
    "allow_methods": ["*"],
    "allow_headers": ["*"],
}
app.add_middleware(CORSMiddleware, **_cors_options)

# The data and frontend folders sit next to this file's parent directory.
_project_root = Path(__file__).parent.parent
DATA_DIR = _project_root / "data"
FRONTEND_DIR = _project_root / "frontend"

# Module-level cache of the combined DataFrame; None until
# load_school_data() has populated it.
_data_cache: Optional[pd.DataFrame] = None


def convert_to_native(value):
    """Convert a pandas/numpy value into a JSON-serializable Python value.

    Args:
        value: A scalar (typically one cell of a DataFrame record) or a
            numpy array.

    Returns:
        ``None`` for missing values, for infinite floats and for the marker
        strings ``"SUPP"``, ``"NE"``, ``"NA"`` and ``"NP"``; a native
        ``bool``/``int``/``float`` for numpy scalars; a (nested) list with
        each element converted the same way for numpy arrays; otherwise
        ``value`` unchanged.
    """
    # Arrays have to be handled before pd.isna(): for an array, pd.isna()
    # returns an element-wise mask whose truth value is ambiguous.
    if isinstance(value, np.ndarray):
        if value.ndim == 0:
            return convert_to_native(value.item())
        return [convert_to_native(item) for item in value]
    if pd.isna(value):
        return None
    if isinstance(value, np.bool_):
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    # Plain Python floats are checked too: infinity is not valid JSON
    # whichever float type carries it. NaN was already caught by pd.isna().
    if isinstance(value, (float, np.floating)):
        return None if np.isinf(value) else float(value)
    if isinstance(value, str) and value in ("SUPP", "NE", "NA", "NP"):
        return None
    return value


def clean_for_json(df: pd.DataFrame) -> list:
    """Return the rows of *df* as dicts whose values are JSON-safe.

    Every cell is passed through ``convert_to_native``, so missing and
    non-finite values come back as ``None``.
    """
    return [
        {column: convert_to_native(cell) for column, cell in row.items()}
        for row in df.to_dict(orient="records")
    ]


def parse_numeric(value):
    """Parse a raw cell value into a finite number.

    Args:
        value: A number (Python or numpy), a string such as ``"85%"`` or
            ``" 12.5 "``, or a missing value.

    Returns:
        The numeric value, or ``None`` when the input is missing, is one of
        the marker strings (``SUPP``, ``NE``, ``NA``, ``NP``, ``NEW``,
        ``LOW`` or empty), cannot be parsed, or is not finite.
    """
    if pd.isna(value):
        return None
    # numpy integers are not instances of int, so normalise numpy scalars to
    # native numbers before the type check below.
    if isinstance(value, np.integer):
        value = int(value)
    elif isinstance(value, np.floating):
        value = float(value)
    if isinstance(value, (int, float)):
        # Only floats can be non-finite; NaN was already caught by pd.isna().
        if isinstance(value, float) and not np.isfinite(value):
            return None
        return value
    if isinstance(value, str):
        text = value.strip()
        if text in ("SUPP", "NE", "NA", "NP", "NEW", "LOW", ""):
            return None
        # Remove % sign if present
        if text.endswith('%'):
            text = text[:-1]
        try:
            number = float(text)
        except ValueError:
            return None
        # float() accepts "inf" and "nan"; neither is a usable metric value.
        return number if np.isfinite(number) else None
    return None


def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Return the closing year of a ``YYYY-YYYY`` span in *folder_name*.

    For example ``'2023-2024'`` gives ``2024``. ``None`` is returned when the
    name contains no such span.
    """
    year_span = re.search(r'(\d{4})-(\d{4})', folder_name)
    if year_span is None:
        return None
    _, end_year = year_span.groups()
    return int(end_year)


# Maps raw column headers in england_ks2final.csv to the API's field names.
_KS2_COLUMN_RENAMES = {
    'URN': 'urn',
    'SCHNAME': 'school_name',
    'ADDRESS1': 'address1',
    'ADDRESS2': 'address2',
    'TOWN': 'town',
    'PCODE': 'postcode',
    'NFTYPE': 'school_type_code',
    'RELDENOM': 'religious_denomination',
    'AGERANGE': 'age_range',
    'TOTPUPS': 'total_pupils',
    'TELIG': 'eligible_pupils',
    # Core KS2 metrics
    'PTRWM_EXP': 'rwm_expected_pct',
    'PTRWM_HIGH': 'rwm_high_pct',
    'READPROG': 'reading_progress',
    'WRITPROG': 'writing_progress',
    'MATPROG': 'maths_progress',
    'PTREAD_EXP': 'reading_expected_pct',
    'PTWRITTA_EXP': 'writing_expected_pct',
    'PTMAT_EXP': 'maths_expected_pct',
    'READ_AVERAGE': 'reading_avg_score',
    'MAT_AVERAGE': 'maths_avg_score',
    'PTREAD_HIGH': 'reading_high_pct',
    'PTWRITTA_HIGH': 'writing_high_pct',
    'PTMAT_HIGH': 'maths_high_pct',
    # GPS (Grammar, Punctuation & Spelling)
    'PTGPS_EXP': 'gps_expected_pct',
    'PTGPS_HIGH': 'gps_high_pct',
    'GPS_AVERAGE': 'gps_avg_score',
    # Science
    'PTSCITA_EXP': 'science_expected_pct',
    # School context
    'PTFSM6CLA1A': 'disadvantaged_pct',
    'PTEALGRP2': 'eal_pct',
    'PSENELK': 'sen_support_pct',
    'PSENELE': 'sen_ehcp_pct',
    'PTMOBN': 'stability_pct',
    # Gender breakdown
    'PTRWM_EXP_B': 'rwm_expected_boys_pct',
    'PTRWM_EXP_G': 'rwm_expected_girls_pct',
    'PTRWM_HIGH_B': 'rwm_high_boys_pct',
    'PTRWM_HIGH_G': 'rwm_high_girls_pct',
    # Disadvantaged performance
    'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
    'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
    'DIFFN_RWM_EXP': 'disadvantaged_gap',
    # 3-year averages
    'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
    'READ_AVERAGE_3YR': 'reading_avg_3yr',
    'MAT_AVERAGE_3YR': 'maths_avg_3yr',
}

# Display names for the school type codes; unmapped codes become 'Other'.
_SCHOOL_TYPE_NAMES = {
    'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
    'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
    'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
}

# Renamed columns whose cells are run through parse_numeric().
_NUMERIC_COLUMNS = [
    # Core metrics
    'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
    'writing_progress', 'maths_progress', 'reading_expected_pct',
    'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
    'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
    # GPS & Science
    'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
    # School context
    'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
    'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
    # Gender breakdown
    'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
    'rwm_high_boys_pct', 'rwm_high_girls_pct',
    # Disadvantaged performance
    'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
    # 3-year averages
    'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
]

# Columns joined, in this order, to build the 'address' field.
_ADDRESS_PARTS = ('address1', 'town', 'postcode')


def _build_addresses(df: pd.DataFrame) -> list:
    """Return one comma-separated address string per row of *df*."""
    present = [df[col] for col in _ADDRESS_PARTS if col in df.columns]
    if not present:
        return [''] * len(df)
    # Iterating the columns in lockstep avoids building a Series per row and
    # also works when the frame has no rows.
    return [
        ', '.join(str(part) for part in parts if pd.notna(part) and part)
        for parts in zip(*present)
    ]


def _load_ks2_file(ks2_file: Path, year: int) -> pd.DataFrame:
    """Read one year's KS2 CSV and return it in the API's column layout."""
    df = pd.read_csv(ks2_file, low_memory=False)

    # Coerce the identifier columns to numbers regardless of the dtype pandas
    # inferred; to_numeric leaves already-numeric columns unchanged.
    for id_col in ('LEA', 'URN'):
        if id_col in df.columns:
            df[id_col] = pd.to_numeric(df[id_col], errors='coerce')

    # Filter to schools only (RECTYPE == 1 means school level data). The
    # column is coerced first so a text-typed column still matches.
    if 'RECTYPE' in df.columns:
        df = df[pd.to_numeric(df['RECTYPE'], errors='coerce') == 1]

    # Detach from the frame that was read so the column assignments below
    # never write to a view of it.
    df = df.copy()

    # Add year and local authority name from LANAME column
    df['year'] = year
    if 'LANAME' in df.columns:
        df['local_authority'] = df['LANAME']
    elif 'LEA' in df.columns:
        df['local_authority'] = df['LEA'].astype(str)

    # Standardize column names for our API
    df = df.rename(columns=_KS2_COLUMN_RENAMES)

    df['address'] = _build_addresses(df)

    # A file without the type-code column still loads; its schools are
    # simply labelled 'Other'.
    if 'school_type_code' in df.columns:
        df['school_type'] = df['school_type_code'].map(_SCHOOL_TYPE_NAMES).fillna('Other')
    else:
        df['school_type'] = 'Other'

    for col in _NUMERIC_COLUMNS:
        if col in df.columns:
            df[col] = df[col].apply(parse_numeric)

    return df


def load_school_data() -> pd.DataFrame:
    """Load and combine all school data from CSV files in year folders.

    Each sub-folder of ``DATA_DIR`` whose name starts with ``YYYY-YYYY`` is
    searched for ``england_ks2final.csv``. The combined result is stored in
    the module-level cache, so later calls return it without re-reading.

    Returns:
        One DataFrame with a row per school per year, or an empty DataFrame
        when no data file could be loaded.
    """
    global _data_cache

    if _data_cache is not None:
        return _data_cache

    all_data = []

    if DATA_DIR.exists():
        # iterdir() yields entries in arbitrary order; sorting keeps the load
        # order (and so the row order of the combined frame) deterministic.
        for year_folder in sorted(DATA_DIR.iterdir()):
            if not (year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name)):
                continue
            year = extract_year_from_folder(year_folder.name)
            if year is None:
                continue

            ks2_file = year_folder / "england_ks2final.csv"
            if not ks2_file.exists():
                continue

            try:
                print(f"Loading data from {ks2_file}")
                df = _load_ks2_file(ks2_file, year)
            except Exception as e:
                # Best effort: one unreadable file must not stop the other
                # years from loading.
                print(f"Error loading {ks2_file}: {e}")
                continue

            all_data.append(df)
            print(f"  Loaded {len(df)} schools for year {year}")

    if all_data:
        _data_cache = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(_data_cache)}")
        if 'urn' in _data_cache.columns:
            print(f"Unique schools: {_data_cache['urn'].nunique()}")
        print(f"Years: {sorted(_data_cache['year'].unique())}")
    else:
        print("No data files found. Creating empty DataFrame.")
        _data_cache = pd.DataFrame()

    return _data_cache


@app.get("/")
async def root():
    """Return the frontend's ``index.html`` page."""
    index_page = FRONTEND_DIR / "index.html"
    return FileResponse(index_page)


@app.get("/api/schools")
async def get_schools(
    search: Optional[str] = Query(None, description="Search by school name"),
    local_authority: Optional[str] = Query(None, description="Filter by local authority name"),
    school_type: Optional[str] = Query(None, description="Filter by school type"),
):
    """Get the list of unique primary schools, optionally filtered.

    Args:
        search: Case-insensitive text matched literally against the school
            name and, when available, the address.
        local_authority: Case-insensitive exact local authority name.
        school_type: Case-insensitive exact school type name.

    Returns:
        ``{"schools": [...]}`` with one entry per URN, taken from the most
        recent year that school appears in.
    """
    df = load_school_data()

    if df.empty:
        return {"schools": []}

    # Get unique schools (latest year data for each)
    latest_year = df.groupby('urn')['year'].max().reset_index()
    df_latest = df.merge(latest_year, on=['urn', 'year'])

    school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
    available_cols = [c for c in school_cols if c in df_latest.columns]
    schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])

    # Apply filters
    if search:
        search_lower = search.lower()
        # regex=False: the text comes straight from the request, so it must
        # be matched literally. As a pattern, "(" would raise and "." would
        # match any character.
        mask = schools_df["school_name"].str.lower().str.contains(
            search_lower, na=False, regex=False
        )
        if "address" in schools_df.columns:
            mask = mask | schools_df["address"].str.lower().str.contains(
                search_lower, na=False, regex=False
            )
        schools_df = schools_df[mask]

    if local_authority:
        schools_df = schools_df[schools_df["local_authority"].str.lower() == local_authority.lower()]

    if school_type:
        schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]

    return {"schools": clean_for_json(schools_df)}


@app.get("/api/schools/{urn}")
async def get_school_details(urn: int):
    """Get detailed KS2 data for a specific primary school across all years.

    Args:
        urn: The school's URN.

    Returns:
        ``school_info`` taken from the school's most recent year, plus
        ``yearly_data`` with one record per year in ascending order.

    Raises:
        HTTPException: 404 when no data is loaded or the URN is unknown.
    """
    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    school_data = df[df["urn"] == urn]

    if school_data.empty:
        raise HTTPException(status_code=404, detail="School not found")

    # Sort by year
    school_data = school_data.sort_values("year")

    # Get latest info for the school
    latest = school_data.iloc[-1]

    # Run the values through convert_to_native like the yearly records, so a
    # missing (NaN) or numpy-typed cell cannot break JSON encoding.
    school_info = {"urn": urn}
    for field in ("school_name", "local_authority", "school_type", "address"):
        school_info[field] = convert_to_native(latest.get(field, ""))
    school_info["phase"] = "Primary"

    return {
        "school_info": school_info,
        "yearly_data": clean_for_json(school_data)
    }


@app.get("/api/compare")
async def compare_schools(urns: str = Query(..., description="Comma-separated URNs")):
    """Compare multiple primary schools side by side.

    Args:
        urns: Comma-separated URNs. Blank entries (for example from a
            trailing comma) are ignored.

    Returns:
        ``{"comparison": {...}}`` keyed by URN (as a string). Each entry holds
        ``school_info`` from the school's most recent year and ``yearly_data``
        in ascending year order. URNs with no data are omitted.

    Raises:
        HTTPException: 400 when no valid URN is supplied; 404 when no data is
            loaded or none of the URNs match.
    """
    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    try:
        tokens = (part.strip() for part in urns.split(","))
        urn_list = [int(token) for token in tokens if token]
    except ValueError:
        # The parsing error adds nothing for the client; drop the chain.
        raise HTTPException(status_code=400, detail="Invalid URN format") from None

    if not urn_list:
        raise HTTPException(status_code=400, detail="Invalid URN format")

    # A repeated URN maps to the same result key, so process each one once
    # while keeping the requested order.
    urn_list = list(dict.fromkeys(urn_list))

    comparison_data = df[df["urn"].isin(urn_list)]

    if comparison_data.empty:
        raise HTTPException(status_code=404, detail="No schools found")

    result = {}
    for urn in urn_list:
        school_data = comparison_data[comparison_data["urn"] == urn].sort_values("year")
        if school_data.empty:
            continue
        latest = school_data.iloc[-1]
        # convert_to_native keeps a missing (NaN) or numpy-typed cell from
        # breaking JSON encoding, matching the yearly records.
        school_info = {"urn": urn}
        for field in ("school_name", "local_authority", "address"):
            school_info[field] = convert_to_native(latest.get(field, ""))
        result[str(urn)] = {
            "school_info": school_info,
            "yearly_data": clean_for_json(school_data)
        }

    return {"comparison": result}


@app.get("/api/filters")
async def get_filter_options():
    """List the distinct local authorities, school types and years loaded.

    Each list is sorted and excludes missing values; all three are empty when
    no data is loaded.
    """
    df = load_school_data()

    # Response key -> DataFrame column it is derived from.
    option_sources = {
        "local_authorities": "local_authority",
        "school_types": "school_type",
        "years": "year",
    }

    if df.empty:
        return {option: [] for option in option_sources}

    return {
        option: sorted(df[column].dropna().unique().tolist())
        for option, column in option_sources.items()
    }


@app.get("/api/metrics")
async def get_available_metrics():
    """Describe the KS2 performance metrics that can be requested.

    Returns ``{"metrics": [...]}`` where each entry carries the metric's
    ``key`` plus its name, description, type and category. When data is
    loaded, only metrics present as columns are listed; with no data, every
    known metric is listed.
    """
    df = load_school_data()

    # KS2 metric metadata, grouped by category.
    metric_info = {
        # Expected Standard
        "rwm_expected_pct": {"name": "RWM Combined %", "description": "% meeting expected standard in reading, writing and maths", "type": "percentage", "category": "expected"},
        "reading_expected_pct": {"name": "Reading Expected %", "description": "% meeting expected standard in reading", "type": "percentage", "category": "expected"},
        "writing_expected_pct": {"name": "Writing Expected %", "description": "% meeting expected standard in writing", "type": "percentage", "category": "expected"},
        "maths_expected_pct": {"name": "Maths Expected %", "description": "% meeting expected standard in maths", "type": "percentage", "category": "expected"},
        "gps_expected_pct": {"name": "GPS Expected %", "description": "% meeting expected standard in grammar, punctuation & spelling", "type": "percentage", "category": "expected"},
        "science_expected_pct": {"name": "Science Expected %", "description": "% meeting expected standard in science", "type": "percentage", "category": "expected"},
        # Higher Standard
        "rwm_high_pct": {"name": "RWM Combined Higher %", "description": "% achieving higher standard in RWM combined", "type": "percentage", "category": "higher"},
        "reading_high_pct": {"name": "Reading Higher %", "description": "% achieving higher standard in reading", "type": "percentage", "category": "higher"},
        "writing_high_pct": {"name": "Writing Higher %", "description": "% achieving greater depth in writing", "type": "percentage", "category": "higher"},
        "maths_high_pct": {"name": "Maths Higher %", "description": "% achieving higher standard in maths", "type": "percentage", "category": "higher"},
        "gps_high_pct": {"name": "GPS Higher %", "description": "% achieving higher standard in GPS", "type": "percentage", "category": "higher"},
        # Progress Scores
        "reading_progress": {"name": "Reading Progress", "description": "Progress in reading from KS1 to KS2", "type": "score", "category": "progress"},
        "writing_progress": {"name": "Writing Progress", "description": "Progress in writing from KS1 to KS2", "type": "score", "category": "progress"},
        "maths_progress": {"name": "Maths Progress", "description": "Progress in maths from KS1 to KS2", "type": "score", "category": "progress"},
        # Average Scores
        "reading_avg_score": {"name": "Reading Avg Score", "description": "Average scaled score in reading", "type": "score", "category": "average"},
        "maths_avg_score": {"name": "Maths Avg Score", "description": "Average scaled score in maths", "type": "score", "category": "average"},
        "gps_avg_score": {"name": "GPS Avg Score", "description": "Average scaled score in GPS", "type": "score", "category": "average"},
        # Gender Performance
        "rwm_expected_boys_pct": {"name": "RWM Expected % (Boys)", "description": "% of boys meeting expected standard", "type": "percentage", "category": "gender"},
        "rwm_expected_girls_pct": {"name": "RWM Expected % (Girls)", "description": "% of girls meeting expected standard", "type": "percentage", "category": "gender"},
        "rwm_high_boys_pct": {"name": "RWM Higher % (Boys)", "description": "% of boys at higher standard", "type": "percentage", "category": "gender"},
        "rwm_high_girls_pct": {"name": "RWM Higher % (Girls)", "description": "% of girls at higher standard", "type": "percentage", "category": "gender"},
        # Disadvantaged Performance
        "rwm_expected_disadvantaged_pct": {"name": "RWM Expected % (Disadvantaged)", "description": "% of disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
        "rwm_expected_non_disadvantaged_pct": {"name": "RWM Expected % (Non-Disadvantaged)", "description": "% of non-disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
        "disadvantaged_gap": {"name": "Disadvantaged Gap", "description": "Gap between disadvantaged and national non-disadvantaged", "type": "score", "category": "equity"},
        # School Context
        "disadvantaged_pct": {"name": "% Disadvantaged Pupils", "description": "% of pupils eligible for free school meals or looked after", "type": "percentage", "category": "context"},
        "eal_pct": {"name": "% EAL Pupils", "description": "% of pupils with English as additional language", "type": "percentage", "category": "context"},
        "sen_support_pct": {"name": "% SEN Support", "description": "% of pupils with SEN support", "type": "percentage", "category": "context"},
        "stability_pct": {"name": "% Pupil Stability", "description": "% of non-mobile pupils (stayed at school)", "type": "percentage", "category": "context"},
        # 3-Year Averages
        "rwm_expected_3yr_pct": {"name": "RWM Expected % (3-Year Avg)", "description": "3-year average % meeting expected", "type": "percentage", "category": "trends"},
        "reading_avg_3yr": {"name": "Reading Score (3-Year Avg)", "description": "3-year average reading score", "type": "score", "category": "trends"},
        "maths_avg_3yr": {"name": "Maths Score (3-Year Avg)", "description": "3-year average maths score", "type": "score", "category": "trends"},
    }

    # With no data loaded there are no columns to check against, so every
    # known metric is reported.
    nothing_loaded = df.empty
    listed = [
        {"key": metric_key, **details}
        for metric_key, details in metric_info.items()
        if nothing_loaded or metric_key in df.columns
    ]

    return {"metrics": listed}


@app.get("/api/rankings")
async def get_rankings(
    metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
    year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
    limit: int = Query(20, description="Number of schools to return"),
):
    """Get primary school rankings by a specific KS2 metric.

    Args:
        metric: Column to rank by, highest value first.
        year: Year to rank within; the most recent loaded year when omitted.
        limit: Maximum number of schools to return. Negative values are
            treated as zero.

    Returns:
        The ``metric``, the ``year`` of the returned rows (``None`` when
        there are none) and the ``rankings`` records.

    Raises:
        HTTPException: 400 when ``metric`` is not a loaded column.
    """
    df = load_school_data()

    if df.empty:
        return {"metric": metric, "year": None, "rankings": []}

    if metric not in df.columns:
        raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")

    # Compare against None rather than truthiness so an explicit year is
    # always honoured; fall back to the most recent year otherwise.
    target_year = year if year is not None else df["year"].max()
    df = df[df["year"] == target_year]

    # Sort and rank (exclude rows with no data for this metric)
    df = df.dropna(subset=[metric])

    # For progress scores, higher is better. For percentages, higher is also better.
    # head() with a negative count means "all but the last n", which is never
    # what a limit should do, so clamp at zero.
    df = df.sort_values(metric, ascending=False).head(max(limit, 0))

    # Return only relevant fields for rankings
    ranking_cols = [
        "urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
        # Core expected
        "rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
        "gps_expected_pct", "science_expected_pct",
        # Core higher
        "rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
        # Progress & averages
        "reading_progress", "writing_progress", "maths_progress",
        "reading_avg_score", "maths_avg_score", "gps_avg_score",
        # Gender
        "rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
        # Equity
        "rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
        # Context
        "disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
        # 3-year
        "rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
    ]
    available_cols = [c for c in ranking_cols if c in df.columns]
    # Any loaded column may be ranked by (e.g. sen_ehcp_pct, which is not in
    # the list above); make sure the ranked value is part of the output.
    if metric not in available_cols:
        available_cols.append(metric)
    df = df[available_cols]

    return {
        "metric": metric,
        "year": int(df["year"].iloc[0]) if not df.empty else None,
        "rankings": clean_for_json(df)
    }


@app.get("/api/data-info")
async def get_data_info():
    """Summarise the data currently loaded.

    Reports record and school counts, the years available, schools per year
    and records per local authority, or a ``no_data`` status when nothing is
    loaded.
    """
    df = load_school_data()

    if df.empty:
        return {
            "status": "no_data",
            "message": "No data files found in data folder. Please download KS2 data from the government website.",
            "data_folder": str(DATA_DIR),
        }

    distinct_years = sorted(df["year"].unique())
    urns_by_year = df.groupby("year")["urn"].nunique().to_dict()
    rows_by_authority = df["local_authority"].value_counts().to_dict()

    # Everything is cast to plain int/str so the response serialises cleanly.
    summary = {"status": "loaded"}
    summary["total_records"] = int(len(df))
    summary["unique_schools"] = int(df["urn"].nunique())
    summary["years_available"] = [int(year_value) for year_value in distinct_years]
    summary["schools_per_year"] = {
        str(int(year_value)): int(count) for year_value, count in urns_by_year.items()
    }
    summary["local_authorities"] = {
        str(authority): int(count) for authority, count in rows_by_authority.items()
    }
    return summary


# Startup hook: static file mount and data warm-up
@app.on_event("startup")
async def startup():
    """Mount the frontend as static files (if present) and pre-load the data."""
    frontend_present = FRONTEND_DIR.exists()
    if frontend_present:
        static_files = StaticFiles(directory=FRONTEND_DIR)
        app.mount("/static", static_files, name="static")

    # Fill the data cache now rather than on the first request.
    load_school_data()


if __name__ == "__main__":
    # Running this file directly serves the app with uvicorn on port 8000,
    # bound to all interfaces.
    import uvicorn

    server_options = {"host": "0.0.0.0", "port": 8000}
    uvicorn.run(app, **server_options)