school_compare/backend/app.py

"""
SchoolCompare.co.uk API
Serves primary and secondary school performance data for comparing schools.
Uses real data from UK Government Compare School Performance downloads.
"""

import re
from contextlib import asynccontextmanager
from typing import Optional

import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException, Query, Request, Depends, Header
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse, Response
from fastapi.staticfiles import StaticFiles
from slowapi import Limiter, _rate_limit_exceeded_handler
from slowapi.util import get_remote_address
from slowapi.errors import RateLimitExceeded
from starlette.middleware.base import BaseHTTPMiddleware

import asyncio
from .config import settings
from .data_loader import (
    clear_cache,
    load_school_data,
    geocode_single_postcode,
    get_supplementary_data,
    search_schools_typesense,
)
from .data_loader import get_data_info as get_db_info
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .utils import clean_for_json

# Values to exclude from filter dropdowns (empty strings, non-applicable labels)
EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"}

# Maps user-facing phase filter values to the GIAS PhaseOfEducation values they include.
# All-through schools appear in both primary and secondary results.
PHASE_GROUPS: dict[str, set[str]] = {
    "primary": {"primary", "middle deemed primary", "all-through"},
    "secondary": {"secondary", "middle deemed secondary", "all-through", "16 plus"},
    "all-through": {"all-through"},
}

BASE_URL = "https://schoolcompare.co.uk"
MAX_SLUG_LENGTH = 60

# In-memory sitemap cache
_sitemap_xml: str | None = None


def _slugify(text: str) -> str:
    text = text.lower()
    text = re.sub(r"[^\w\s-]", "", text)
    text = re.sub(r"\s+", "-", text)
    text = re.sub(r"-+", "-", text)
    return text.strip("-")


def _school_url(urn: int, school_name: str) -> str:
    slug = _slugify(school_name)
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip("-")
    return f"/school/{urn}-{slug}"


def build_sitemap() -> str:
    """Generate sitemap XML from in-memory school data. Returns the XML string."""
    df = load_school_data()

    static_urls = [
        (BASE_URL + "/",          "daily",   "1.0"),
        (BASE_URL + "/rankings",  "weekly",  "0.8"),
        (BASE_URL + "/compare",   "weekly",  "0.8"),
    ]

    lines = ['<?xml version="1.0" encoding="UTF-8"?>',
             '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">']

    for url, freq, priority in static_urls:
        lines.append(
            f"  <url><loc>{url}</loc>"
            f"<changefreq>{freq}</changefreq>"
            f"<priority>{priority}</priority></url>"
        )

    if not df.empty and "urn" in df.columns and "school_name" in df.columns:
        seen = set()
        for _, row in df[["urn", "school_name"]].drop_duplicates(subset="urn").iterrows():
            urn = int(row["urn"])
            name = str(row["school_name"])
            if urn in seen:
                continue
            seen.add(urn)
            path = _school_url(urn, name)
            lines.append(
                f"  <url><loc>{BASE_URL}{path}</loc>"
                f"<changefreq>monthly</changefreq>"
                f"<priority>0.6</priority></url>"
            )

    lines.append("</urlset>")
    return "\n".join(lines)


def clean_filter_values(series: pd.Series) -> list[str]:
    """Return sorted unique values from a Series, excluding NaN and junk labels."""
    return sorted(
        v for v in series.dropna().unique().tolist()
        if v not in EXCLUDED_FILTER_VALUES
    )


# =============================================================================
# SECURITY MIDDLEWARE & HELPERS
# =============================================================================

# Rate limiter
limiter = Limiter(key_func=get_remote_address)


class SecurityHeadersMiddleware(BaseHTTPMiddleware):
    """Add security headers to all responses."""

    async def dispatch(self, request: Request, call_next):
        response = await call_next(request)

        # Prevent clickjacking
        response.headers["X-Frame-Options"] = "DENY"

        # Prevent MIME type sniffing
        response.headers["X-Content-Type-Options"] = "nosniff"

        # XSS Protection (legacy browsers)
        response.headers["X-XSS-Protection"] = "1; mode=block"

        # Referrer policy
        response.headers["Referrer-Policy"] = "strict-origin-when-cross-origin"

        # Permissions policy (restrict browser features)
        response.headers["Permissions-Policy"] = (
            "geolocation=(), microphone=(), camera=(), payment=()"
        )

        # Content Security Policy
        response.headers["Content-Security-Policy"] = (
            "default-src 'self'; "
            "script-src 'self' 'unsafe-inline' https://cdn.jsdelivr.net https://unpkg.com https://analytics.schoolcompare.co.uk; "
            "style-src 'self' 'unsafe-inline' https://fonts.googleapis.com https://cdn.jsdelivr.net https://unpkg.com; "
            "font-src 'self' https://fonts.gstatic.com; "
            "img-src 'self' data: https://*.tile.openstreetmap.org https://unpkg.com; "
            "connect-src 'self' https://cdn.jsdelivr.net https://*.tile.openstreetmap.org https://unpkg.com https://analytics.schoolcompare.co.uk; "
            "frame-ancestors 'none'; "
            "base-uri 'self'; "
            "form-action 'self' https://formsubmit.co;"
        )

        # HSTS (only enable if using HTTPS in production)
        response.headers["Strict-Transport-Security"] = (
            "max-age=31536000; includeSubDomains"
        )

        return response


class RequestSizeLimitMiddleware(BaseHTTPMiddleware):
    """Limit request body size to prevent DoS attacks."""

    async def dispatch(self, request: Request, call_next):
        content_length = request.headers.get("content-length")
        if content_length:
            if int(content_length) > settings.max_request_size:
                return Response(
                    content="Request too large",
                    status_code=413,
                )
        return await call_next(request)


def verify_admin_api_key(x_api_key: str = Header(None)) -> bool:
    """Verify admin API key for protected endpoints."""
    if not x_api_key or x_api_key != settings.admin_api_key:
        raise HTTPException(
            status_code=401,
            detail="Invalid or missing API key",
            headers={"WWW-Authenticate": "ApiKey"},
        )
    return True


# Input validation helpers
def sanitize_search_input(value: Optional[str], max_length: int = 100) -> Optional[str]:
    """Sanitize search input to prevent injection attacks."""
    if value is None:
        return None
    # Strip whitespace and limit length
    value = value.strip()[:max_length]
    # Remove potentially dangerous characters (allow alphanumeric, spaces, common punctuation)
    value = re.sub(r"[^\w\s\-\',\.]", "", value)
    return value if value else None


def validate_postcode(postcode: Optional[str]) -> Optional[str]:
    """Validate and normalize UK postcode format."""
    if not postcode:
        return None
    postcode = postcode.strip().upper()
    # UK postcode pattern
    pattern = r"^[A-Z]{1,2}[0-9][A-Z0-9]?\s*[0-9][A-Z]{2}$"
    if not re.match(pattern, postcode):
        return None
    return postcode


@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events."""
    global _sitemap_xml
    print("Loading school data from marts...")
    df = load_school_data()
    if df.empty:
        print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
    else:
        print(f"Data loaded successfully: {len(df)} records.")
    try:
        _sitemap_xml = build_sitemap()
        n = _sitemap_xml.count("<url>")
        print(f"Sitemap built: {n} URLs.")
    except Exception as e:
        print(f"Warning: sitemap build failed on startup: {e}")

    yield

    print("Shutting down...")


app = FastAPI(
    title="SchoolCompare API",
    description="API for comparing primary and secondary school performance data - schoolcompare.co.uk",
    version="2.0.0",
    lifespan=lifespan,
    # Disable docs in production for security
    docs_url="/docs" if settings.debug else None,
    redoc_url="/redoc" if settings.debug else None,
    openapi_url="/openapi.json" if settings.debug else None,
)

# Add rate limiter
app.state.limiter = limiter
app.add_exception_handler(RateLimitExceeded, _rate_limit_exceeded_handler)

# Security middleware (order matters - these run in reverse order)
app.add_middleware(SecurityHeadersMiddleware)
app.add_middleware(RequestSizeLimitMiddleware)

# CORS middleware - restricted for production
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_credentials=False,  # Don't allow credentials unless needed
    allow_methods=["GET", "POST"],  # Only allow needed methods
    allow_headers=["Content-Type", "X-API-Key"],  # Only allow needed headers
)


@app.get("/")
async def root():
    """Serve the frontend."""
    return FileResponse(settings.frontend_dir / "index.html")


@app.get("/compare")
async def serve_compare():
    """Serve the frontend for /compare route (SPA routing)."""
    return FileResponse(settings.frontend_dir / "index.html")


@app.get("/rankings")
async def serve_rankings():
    """Serve the frontend for /rankings route (SPA routing)."""
    return FileResponse(settings.frontend_dir / "index.html")


@app.get("/api/config")
async def get_config():
    """Return public configuration for the frontend."""
    return {
        "ga_measurement_id": settings.ga_measurement_id
    }


@app.get("/api/schools")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_schools(
    request: Request,
    search: Optional[str] = Query(None, description="Search by school name", max_length=100),
    local_authority: Optional[str] = Query(
        None, description="Filter by local authority", max_length=100
    ),
    school_type: Optional[str] = Query(None, description="Filter by school type", max_length=100),
    phase: Optional[str] = Query(None, description="Filter by phase: primary, secondary, all-through", max_length=50),
    postcode: Optional[str] = Query(None, description="Search near postcode", max_length=10),
    radius: float = Query(5.0, ge=0.1, le=5, description="Search radius in miles"),
    page: int = Query(1, ge=1, le=1000, description="Page number"),
    page_size: int = Query(25, ge=1, le=500, description="Results per page"),
    gender: Optional[str] = Query(None, description="Filter by gender (Mixed/Boys/Girls)", max_length=50),
    admissions_policy: Optional[str] = Query(None, description="Filter by admissions policy", max_length=100),
    has_sixth_form: Optional[str] = Query(None, description="Filter by sixth form presence: yes/no", max_length=3),
):
    """
    Get list of schools with pagination.

    Returns paginated results with total count for efficient loading.
    Supports location-based search using postcode and phase filtering.
    """
    # Sanitize inputs
    search = sanitize_search_input(search)
    local_authority = sanitize_search_input(local_authority)
    school_type = sanitize_search_input(school_type)
    phase = sanitize_search_input(phase)
    postcode = validate_postcode(postcode)

    df = load_school_data()

    if df.empty:
        return {"schools": [], "total": 0, "page": page, "page_size": 0}

    # Use configured default if not specified
    if page_size is None:
        page_size = settings.default_page_size

    # Schools with no performance data (special schools, PRUs, newly opened, etc.)
    # have NULL year from the LEFT JOIN — keep them but skip the groupby/trend logic.
    df_no_perf = df[df["year"].isna()].drop_duplicates(subset=["urn"])
    df = df[df["year"].notna()]

    # Get unique schools (latest year data for each)
    latest_year = df.groupby("urn")["year"].max().reset_index()
    df_latest = df.merge(latest_year, on=["urn", "year"])

    # Calculate trend by comparing to previous year
    # Get second-latest year for each school
    df_sorted = df.sort_values(["urn", "year"], ascending=[True, False])
    df_prev = df_sorted.groupby("urn").nth(1).reset_index()
    if not df_prev.empty and "rwm_expected_pct" in df_prev.columns:
        prev_rwm = df_prev[["urn", "rwm_expected_pct"]].rename(
            columns={"rwm_expected_pct": "prev_rwm_expected_pct"}
        )
        if "attainment_8_score" in df_prev.columns:
            prev_rwm = prev_rwm.merge(
                df_prev[["urn", "attainment_8_score"]].rename(
                    columns={"attainment_8_score": "prev_attainment_8_score"}
                ),
                on="urn", how="outer"
            )
        df_latest = df_latest.merge(prev_rwm, on="urn", how="left")

    # Merge back schools with no performance data
    df_latest = pd.concat([df_latest, df_no_perf], ignore_index=True)

    # Phase filter — uses PHASE_GROUPS so all-through/middle schools appear
    # in the correct phase(s) rather than being invisible to both filters.
    if phase:
        phase_lower = phase.lower().replace("_", "-")
        allowed = PHASE_GROUPS.get(phase_lower)
        if allowed:
            df_latest = df_latest[df_latest["phase"].str.lower().isin(allowed)]

    # Secondary-specific filters (after phase filter)
    if gender:
        df_latest = df_latest[df_latest["gender"].str.lower() == gender.lower()]
    if admissions_policy:
        df_latest = df_latest[df_latest["admissions_policy"].str.lower() == admissions_policy.lower()]
    if has_sixth_form == "yes":
        df_latest = df_latest[df_latest["age_range"].str.contains("18", na=False)]
    elif has_sixth_form == "no":
        df_latest = df_latest[~df_latest["age_range"].str.contains("18", na=False)]

    # Include key result metrics for display on cards
    location_cols = ["latitude", "longitude"]
    result_cols = [
        "phase",
        "year",
        "rwm_expected_pct",
        "rwm_high_pct",
        "prev_rwm_expected_pct",
        "prev_attainment_8_score",
        "reading_expected_pct",
        "writing_expected_pct",
        "maths_expected_pct",
        "total_pupils",
        "attainment_8_score",
        "english_maths_standard_pass_pct",
    ]
    available_cols = [
        c
        for c in SCHOOL_COLUMNS + location_cols + result_cols
        if c in df_latest.columns
    ]
    # fact_performance guarantees one row per (urn, year); df_latest has one row per urn.
    schools_df = df_latest[available_cols]

    # Location-based search (uses pre-geocoded data from database)
    search_coords = None
    if postcode:
        coords = geocode_single_postcode(postcode)
        if coords:
            search_coords = coords
            schools_df = schools_df.copy()

            # Filter by distance using pre-geocoded lat/long from database
            # Use vectorized haversine calculation for better performance
            lat1, lon1 = search_coords

            # Handle potential duplicate columns by taking first occurrence
            lat_col = schools_df.loc[:, "latitude"]
            lon_col = schools_df.loc[:, "longitude"]
            if isinstance(lat_col, pd.DataFrame):
                lat_col = lat_col.iloc[:, 0]
            if isinstance(lon_col, pd.DataFrame):
                lon_col = lon_col.iloc[:, 0]

            lat2 = lat_col.values
            lon2 = lon_col.values

            # Vectorized haversine formula
            R = 3959  # Earth's radius in miles
            lat1_rad = np.radians(lat1)
            lat2_rad = np.radians(lat2)
            dlat = np.radians(lat2 - lat1)
            dlon = np.radians(lon2 - lon1)

            a = np.sin(dlat / 2) ** 2 + np.cos(lat1_rad) * np.cos(lat2_rad) * np.sin(dlon / 2) ** 2
            c = 2 * np.arctan2(np.sqrt(a), np.sqrt(1 - a))
            distances = R * c

            # Handle missing coordinates
            has_coords = ~(pd.isna(lat_col) | pd.isna(lon_col))
            distances = np.where(has_coords.values, distances, float("inf"))
            schools_df["distance"] = distances
            schools_df = schools_df[schools_df["distance"] <= radius]
            schools_df = schools_df.sort_values("distance")

    # Apply filters
    if search:
        ts_urns = search_schools_typesense(search)
        if ts_urns:
            urn_order = {urn: i for i, urn in enumerate(ts_urns)}
            schools_df = schools_df[schools_df["urn"].isin(set(ts_urns))].copy()
            schools_df["_ts_rank"] = schools_df["urn"].map(urn_order)
            schools_df = schools_df.sort_values("_ts_rank").drop(columns=["_ts_rank"])
        else:
            # Fallback: Typesense unavailable, use substring match
            search_lower = search.lower()
            mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
            if "address" in schools_df.columns:
                mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False)
            schools_df = schools_df[mask]

    if local_authority:
        schools_df = schools_df[
            schools_df["local_authority"].str.lower() == local_authority.lower()
        ]

    if school_type:
        schools_df = schools_df[
            schools_df["school_type"].str.lower() == school_type.lower()
        ]

    # Compute result-scoped filter values (before pagination).
    # Gender and admissions are secondary-only filters — scope them to schools
    # with KS4 data so they don't appear for purely primary result sets.
    _sec_mask = schools_df["attainment_8_score"].notna() if "attainment_8_score" in schools_df.columns else pd.Series(False, index=schools_df.index)
    result_filters = {
        "local_authorities": clean_filter_values(schools_df["local_authority"]) if "local_authority" in schools_df.columns else [],
        "school_types": clean_filter_values(schools_df["school_type"]) if "school_type" in schools_df.columns else [],
        "phases": clean_filter_values(schools_df["phase"]) if "phase" in schools_df.columns else [],
        "genders": clean_filter_values(schools_df.loc[_sec_mask, "gender"]) if "gender" in schools_df.columns and _sec_mask.any() else [],
        "admissions_policies": clean_filter_values(schools_df.loc[_sec_mask, "admissions_policy"]) if "admissions_policy" in schools_df.columns and _sec_mask.any() else [],
    }

    # Pagination
    total = len(schools_df)
    start_idx = (page - 1) * page_size
    end_idx = start_idx + page_size
    schools_df = schools_df.iloc[start_idx:end_idx]

    return {
        "schools": clean_for_json(schools_df),
        "total": total,
        "page": page,
        "page_size": page_size,
        "total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
        "result_filters": result_filters,
        "location_info": {
            "postcode": postcode,
            "radius": radius * 1.60934,  # Convert miles to km for frontend display
            "coordinates": [search_coords[0], search_coords[1]]
        }
        if search_coords
        else None,
    }


@app.get("/api/schools/{urn}")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_school_details(request: Request, urn: int):
    """Get detailed performance data for a specific school across all years."""
    # Validate URN range (UK school URNs are 6 digits)
    if not (100000 <= urn <= 999999):
        raise HTTPException(status_code=400, detail="Invalid URN format")

    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    school_data = df[df["urn"] == urn]

    if school_data.empty:
        raise HTTPException(status_code=404, detail="School not found")

    # Sort by year
    school_data = school_data.sort_values("year")

    # Get latest info for the school
    latest = school_data.iloc[-1]

    # Fetch supplementary data (Ofsted, Parent View, admissions, etc.)
    from .database import SessionLocal
    supplementary = {}
    try:
        db = SessionLocal()
        supplementary = get_supplementary_data(db, urn)
        db.close()
    except Exception:
        pass

    return {
        "school_info": {
            "urn": urn,
            "school_name": latest.get("school_name", ""),
            "local_authority": latest.get("local_authority", ""),
            "school_type": latest.get("school_type", ""),
            "address": latest.get("address", ""),
            "religious_denomination": latest.get("religious_denomination", ""),
            "age_range": latest.get("age_range", ""),
            "latitude": latest.get("latitude"),
            "longitude": latest.get("longitude"),
            "phase": latest.get("phase"),
            # GIAS fields
            "website": latest.get("website"),
            "headteacher_name": latest.get("headteacher_name"),
            "capacity": latest.get("capacity"),
            "trust_name": latest.get("trust_name"),
            "gender": latest.get("gender"),
        },
        "yearly_data": clean_for_json(school_data),
        # Supplementary data (null if not yet populated by Kestra)
        "ofsted": supplementary.get("ofsted"),
        "parent_view": supplementary.get("parent_view"),
        "census": supplementary.get("census"),
        "admissions": supplementary.get("admissions"),
        "sen_detail": supplementary.get("sen_detail"),
        "phonics": supplementary.get("phonics"),
        "deprivation": supplementary.get("deprivation"),
        "finance": supplementary.get("finance"),
    }


@app.get("/api/compare")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def compare_schools(
    request: Request,
    urns: str = Query(..., description="Comma-separated URNs", max_length=100)
):
    """Compare multiple schools side by side."""
    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    try:
        urn_list = [int(u.strip()) for u in urns.split(",")]
        # Limit number of schools to compare
        if len(urn_list) > 10:
            raise HTTPException(status_code=400, detail="Maximum 10 schools can be compared")
        # Validate URN format
        for urn in urn_list:
            if not (100000 <= urn <= 999999):
                raise HTTPException(status_code=400, detail="Invalid URN format")
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid URN format")

    comparison_data = df[df["urn"].isin(urn_list)]

    if comparison_data.empty:
        raise HTTPException(status_code=404, detail="No schools found")

    result = {}
    for urn in urn_list:
        school_data = comparison_data[comparison_data["urn"] == urn].sort_values("year")
        if not school_data.empty:
            latest = school_data.iloc[-1]
            result[str(urn)] = {
                "school_info": {
                    "urn": urn,
                    "school_name": latest.get("school_name", ""),
                    "local_authority": latest.get("local_authority", ""),
                    "school_type": latest.get("school_type", ""),
                    "address": latest.get("address", ""),
                    "phase": latest.get("phase", ""),
                    "attainment_8_score": float(latest["attainment_8_score"]) if pd.notna(latest.get("attainment_8_score")) else None,
                    "rwm_expected_pct": float(latest["rwm_expected_pct"]) if pd.notna(latest.get("rwm_expected_pct")) else None,
                },
                "yearly_data": clean_for_json(school_data),
            }

    return {"comparison": result}


@app.get("/api/filters")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_filter_options(request: Request):
    """Get available filter options (local authorities, school types, years)."""
    df = load_school_data()

    if df.empty:
        return {
            "local_authorities": [],
            "school_types": [],
            "years": [],
        }

    # Phases: return values from data, ordered sensibly
    phases = clean_filter_values(df["phase"]) if "phase" in df.columns else []

    secondary_df = df[df["attainment_8_score"].notna()] if "attainment_8_score" in df.columns else df.iloc[0:0]
    genders = clean_filter_values(secondary_df["gender"]) if "gender" in secondary_df.columns else []
    admissions_policies = clean_filter_values(secondary_df["admissions_policy"]) if "admissions_policy" in secondary_df.columns else []

    return {
        "local_authorities": clean_filter_values(df["local_authority"]) if "local_authority" in df.columns else [],
        "school_types": clean_filter_values(df["school_type"]) if "school_type" in df.columns else [],
        "years": sorted(df["year"].dropna().unique().tolist()),
        "phases": phases,
        "genders": genders,
        "admissions_policies": admissions_policies,
    }


@app.get("/api/la-averages")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_la_averages(request: Request):
    """Get per-LA average Attainment 8 score for secondary schools in the latest year."""
    df = load_school_data()
    if df.empty:
        return {"year": 0, "secondary": {"attainment_8_by_la": {}}}
    latest_year = int(df["year"].max())
    sec_df = df[(df["year"] == latest_year) & df["attainment_8_score"].notna()]
    la_avg = sec_df.groupby("local_authority")["attainment_8_score"].mean().round(1).to_dict()
    return {"year": latest_year, "secondary": {"attainment_8_by_la": la_avg}}


@app.get("/api/national-averages")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_national_averages(request: Request):
    """
    Compute national average for each metric from the latest data year.
    Returns separate averages for primary (KS2) and secondary (KS4) schools.
    Values are derived from the loaded DataFrame so they automatically
    stay current when new data is loaded.
    """
    df = load_school_data()
    if df.empty:
        return {"primary": {}, "secondary": {}}

    latest_year = int(df["year"].max())
    df_latest = df[df["year"] == latest_year]

    ks2_metrics = [
        "rwm_expected_pct", "rwm_high_pct",
        "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
        "reading_avg_score", "maths_avg_score", "gps_avg_score",
        "reading_progress", "writing_progress", "maths_progress",
        "overall_absence_pct", "persistent_absence_pct",
        "disadvantaged_gap", "disadvantaged_pct", "sen_support_pct",
    ]
    ks4_metrics = [
        "attainment_8_score", "progress_8_score",
        "english_maths_standard_pass_pct", "english_maths_strong_pass_pct",
        "ebacc_entry_pct", "ebacc_standard_pass_pct", "ebacc_strong_pass_pct",
        "ebacc_avg_score", "gcse_grade_91_pct",
    ]

    def _means(sub_df, metric_list):
        out = {}
        for col in metric_list:
            if col in sub_df.columns:
                val = sub_df[col].dropna()
                if len(val) > 0:
                    out[col] = round(float(val.mean()), 2)
        return out

    # Primary: schools where KS2 data is non-null
    primary_df = df_latest[df_latest["rwm_expected_pct"].notna()]
    # Secondary: schools where KS4 data is non-null
    secondary_df = df_latest[df_latest["attainment_8_score"].notna()]

    return {
        "year": latest_year,
        "primary": _means(primary_df, ks2_metrics),
        "secondary": _means(secondary_df, ks4_metrics),
    }


@app.get("/api/metrics")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_available_metrics(request: Request):
    """
    Get list of available performance metrics for schools.

    This is the single source of truth for metric definitions.
    Frontend should consume this to avoid duplication.
    """
    df = load_school_data()

    available = []
    for key, info in METRIC_DEFINITIONS.items():
        if df.empty or key in df.columns:
            available.append({"key": key, **info})

    return {"metrics": available}


@app.get("/api/rankings")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_rankings(
    request: Request,
    metric: str = Query("rwm_expected_pct", description="Metric to rank by", max_length=50),
    year: Optional[int] = Query(
        None, description="Specific year (defaults to most recent)", ge=2000, le=2100
    ),
    limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
    local_authority: Optional[str] = Query(
        None, description="Filter by local authority", max_length=100
    ),
    phase: Optional[str] = Query(
        None, description="Filter by phase: primary or secondary", max_length=20
    ),
):
    """Get school rankings by a specific metric."""
    # Sanitize local authority input
    local_authority = sanitize_search_input(local_authority)

    # Validate metric name (only allow alphanumeric and underscore)
    if not re.match(r"^[a-z0-9_]+$", metric):
        raise HTTPException(status_code=400, detail="Invalid metric name")

    df = load_school_data()

    if df.empty:
        return {"metric": metric, "year": None, "rankings": [], "total": 0}

    if metric not in df.columns:
        raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")

    # Filter by year
    if year:
        df = df[df["year"] == year]
    else:
        # Use most recent year
        max_year = df["year"].max()
        df = df[df["year"] == max_year]

    # Filter by local authority if specified
    if local_authority:
        df = df[df["local_authority"].str.lower() == local_authority.lower()]

    # Filter by phase
    if phase == "primary" and "rwm_expected_pct" in df.columns:
        df = df[df["rwm_expected_pct"].notna()]
    elif phase == "secondary" and "attainment_8_score" in df.columns:
        df = df[df["attainment_8_score"].notna()]

    # Sort and rank (exclude rows with no data for this metric)
    df = df.dropna(subset=[metric])
    total = len(df)

    # For progress scores, higher is better. For percentages, higher is also better.
    df = df.sort_values(metric, ascending=False).head(limit)

    # Return only relevant fields for rankings
    available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
    df = df[available_cols]

    return {
        "metric": metric,
        "year": int(df["year"].iloc[0]) if not df.empty else None,
        "rankings": clean_for_json(df),
        "total": total,
    }


@app.get("/api/data-info")
@limiter.limit(f"{settings.rate_limit_per_minute}/minute")
async def get_data_info(request: Request):
    """Get information about loaded data."""
    # Get info directly from database
    db_info = get_db_info()

    if db_info["total_schools"] == 0:
        return {
            "status": "no_data",
            "message": "No data in marts. Run the annual EES pipeline to load KS2 data.",
            "data_source": "PostgreSQL",
        }

    # Also get DataFrame-based stats for backwards compatibility
    df = load_school_data()

    if df.empty:
        return {
            "status": "no_data",
            "message": "No data available",
            "data_source": "PostgreSQL",
        }

    years = [int(y) for y in sorted(df["year"].dropna().unique())]
    schools_per_year = {
        str(int(k)): int(v)
        for k, v in df.dropna(subset=["year"]).groupby("year")["urn"].nunique().to_dict().items()
    }
    la_counts = {
        str(k): int(v)
        for k, v in df["local_authority"].value_counts().to_dict().items()
    }

    return {
        "status": "loaded",
        "data_source": "PostgreSQL",
        "total_records": int(len(df)),
        "unique_schools": int(df["urn"].nunique()),
        "years_available": years,
        "schools_per_year": schools_per_year,
        "local_authorities": la_counts,
    }


@app.post("/api/admin/reload")
@limiter.limit("5/minute")
async def reload_data(
    request: Request,
    _: bool = Depends(verify_admin_api_key)
):
    """
    Admin endpoint to force data reload (useful after data updates).
    Requires X-API-Key header with valid admin API key.
    """
    clear_cache()
    load_school_data()
    return {"status": "reloaded"}


# =============================================================================
# SEO FILES
# =============================================================================


@app.get("/favicon.svg")
async def favicon():
    """Serve favicon."""
    return FileResponse(settings.frontend_dir / "favicon.svg", media_type="image/svg+xml")


@app.get("/robots.txt")
async def robots_txt():
    """Serve robots.txt for search engine crawlers."""
    return FileResponse(settings.frontend_dir / "robots.txt", media_type="text/plain")


@app.get("/sitemap.xml")
async def sitemap_xml():
    """Serve sitemap.xml for search engine indexing."""
    global _sitemap_xml
    if _sitemap_xml is None:
        try:
            _sitemap_xml = build_sitemap()
        except Exception as e:
            raise HTTPException(status_code=503, detail=f"Sitemap unavailable: {e}")
    return Response(content=_sitemap_xml, media_type="application/xml")


@app.post("/api/admin/regenerate-sitemap")
@limiter.limit("10/minute")
async def regenerate_sitemap(
    request: Request,
    _: bool = Depends(verify_admin_api_key),
):
    """Rebuild and cache the sitemap from current school data. Called by Airflow after data updates."""
    global _sitemap_xml
    _sitemap_xml = build_sitemap()
    n = _sitemap_xml.count("<url>")
    return {"status": "ok", "urls": n}


# Mount static files directly (must be after all routes to avoid catching API calls)
if settings.frontend_dir.exists():
    app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host=settings.host, port=settings.port)