"""
Primary School Performance Comparison API
Serves primary school (KS2) performance data for Wandsworth and Merton.
Uses real data from UK Government Compare School Performance downloads.
"""

import os
import re
from pathlib import Path
from typing import Optional

import numpy as np
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles

# Local Authority codes for Wandsworth and Merton
LA_CODES = {
    212: "Wandsworth",
    315: "Merton",
}
ALLOWED_LA_CODES = list(LA_CODES.keys())

app = FastAPI(
    title="Primary School Performance API - Wandsworth & Merton",
    description="API for comparing primary school (KS2) performance data in Wandsworth and Merton",
    version="1.0.0",
)

# CORS middleware for development
# NOTE(review): wildcard origins combined with allow_credentials=True is very
# permissive; confirm this configuration is not used outside development.
app.add_middleware(
    CORSMiddleware,
    allow_origins=["*"],
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)

# Data directory
DATA_DIR = Path(__file__).parent.parent / "data"
FRONTEND_DIR = Path(__file__).parent.parent / "frontend"

# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
_data_cache: Optional[pd.DataFrame] = None

# Placeholder strings that convert_to_native() reports as missing (None).
_SUPPRESSED_MARKERS = frozenset({"SUPP", "NE", "NA", "NP"})

# Strings that parse_numeric() treats as "no value".
_NON_NUMERIC_MARKERS = frozenset({"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""})

# Source CSV column -> column name exposed by this API.
COLUMN_RENAMES = {
    'URN': 'urn',
    'SCHNAME': 'school_name',
    'ADDRESS1': 'address1',
    'ADDRESS2': 'address2',
    'TOWN': 'town',
    'PCODE': 'postcode',
    'NFTYPE': 'school_type_code',
    'RELDENOM': 'religious_denomination',
    'AGERANGE': 'age_range',
    'TOTPUPS': 'total_pupils',
    'TELIG': 'eligible_pupils',
    # Core KS2 metrics
    'PTRWM_EXP': 'rwm_expected_pct',
    'PTRWM_HIGH': 'rwm_high_pct',
    'READPROG': 'reading_progress',
    'WRITPROG': 'writing_progress',
    'MATPROG': 'maths_progress',
    'PTREAD_EXP': 'reading_expected_pct',
    'PTWRITTA_EXP': 'writing_expected_pct',
    'PTMAT_EXP': 'maths_expected_pct',
    'READ_AVERAGE': 'reading_avg_score',
    'MAT_AVERAGE': 'maths_avg_score',
    'PTREAD_HIGH': 'reading_high_pct',
    'PTWRITTA_HIGH': 'writing_high_pct',
    'PTMAT_HIGH': 'maths_high_pct',
    # GPS (Grammar, Punctuation & Spelling)
    'PTGPS_EXP': 'gps_expected_pct',
    'PTGPS_HIGH': 'gps_high_pct',
    'GPS_AVERAGE': 'gps_avg_score',
    # Science
    'PTSCITA_EXP': 'science_expected_pct',
    # School context
    'PTFSM6CLA1A': 'disadvantaged_pct',
    'PTEALGRP2': 'eal_pct',
    'PSENELK': 'sen_support_pct',
    'PSENELE': 'sen_ehcp_pct',
    'PTMOBN': 'stability_pct',
    # Gender breakdown
    'PTRWM_EXP_B': 'rwm_expected_boys_pct',
    'PTRWM_EXP_G': 'rwm_expected_girls_pct',
    'PTRWM_HIGH_B': 'rwm_high_boys_pct',
    'PTRWM_HIGH_G': 'rwm_high_girls_pct',
    # Disadvantaged performance
    'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
    'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
    'DIFFN_RWM_EXP': 'disadvantaged_gap',
    # 3-year averages
    'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
    'READ_AVERAGE_3YR': 'reading_avg_3yr',
    'MAT_AVERAGE_3YR': 'maths_avg_3yr',
}

# School type code (NFTYPE) -> display name; unknown codes become 'Other'.
SCHOOL_TYPE_NAMES = {
    'AC': 'Academy',
    'ACC': 'Academy Converter',
    'ACS': 'Academy Sponsor Led',
    'CY': 'Community School',
    'VA': 'Voluntary Aided',
    'VC': 'Voluntary Controlled',
    'FD': 'Foundation',
    'F': 'Foundation',
    'FS': 'Free School',
}

# Renamed columns that are run through parse_numeric() after loading.
NUMERIC_COLUMNS = [
    # Core metrics
    'rwm_expected_pct', 'rwm_high_pct',
    'reading_progress', 'writing_progress', 'maths_progress',
    'reading_expected_pct', 'writing_expected_pct', 'maths_expected_pct',
    'reading_avg_score', 'maths_avg_score',
    'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
    # GPS & Science
    'gps_expected_pct', 'gps_high_pct', 'gps_avg_score',
    'science_expected_pct',
    # School context
    'total_pupils', 'eligible_pupils',
    'disadvantaged_pct', 'eal_pct',
    'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
    # Gender breakdown
    'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
    'rwm_high_boys_pct', 'rwm_high_girls_pct',
    # Disadvantaged performance
    'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct',
    'disadvantaged_gap',
    # 3-year averages
    'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
]

# KS2 metric metadata organized by category (served by /api/metrics).
METRIC_INFO = {
    # Expected Standard
    "rwm_expected_pct": {"name": "RWM Combined %", "description": "% meeting expected standard in reading, writing and maths", "type": "percentage", "category": "expected"},
    "reading_expected_pct": {"name": "Reading Expected %", "description": "% meeting expected standard in reading", "type": "percentage", "category": "expected"},
    "writing_expected_pct": {"name": "Writing Expected %", "description": "% meeting expected standard in writing", "type": "percentage", "category": "expected"},
    "maths_expected_pct": {"name": "Maths Expected %", "description": "% meeting expected standard in maths", "type": "percentage", "category": "expected"},
    "gps_expected_pct": {"name": "GPS Expected %", "description": "% meeting expected standard in grammar, punctuation & spelling", "type": "percentage", "category": "expected"},
    "science_expected_pct": {"name": "Science Expected %", "description": "% meeting expected standard in science", "type": "percentage", "category": "expected"},
    # Higher Standard
    "rwm_high_pct": {"name": "RWM Combined Higher %", "description": "% achieving higher standard in RWM combined", "type": "percentage", "category": "higher"},
    "reading_high_pct": {"name": "Reading Higher %", "description": "% achieving higher standard in reading", "type": "percentage", "category": "higher"},
    "writing_high_pct": {"name": "Writing Higher %", "description": "% achieving greater depth in writing", "type": "percentage", "category": "higher"},
    "maths_high_pct": {"name": "Maths Higher %", "description": "% achieving higher standard in maths", "type": "percentage", "category": "higher"},
    "gps_high_pct": {"name": "GPS Higher %", "description": "% achieving higher standard in GPS", "type": "percentage", "category": "higher"},
    # Progress Scores
    "reading_progress": {"name": "Reading Progress", "description": "Progress in reading from KS1 to KS2", "type": "score", "category": "progress"},
    "writing_progress": {"name": "Writing Progress", "description": "Progress in writing from KS1 to KS2", "type": "score", "category": "progress"},
    "maths_progress": {"name": "Maths Progress", "description": "Progress in maths from KS1 to KS2", "type": "score", "category": "progress"},
    # Average Scores
    "reading_avg_score": {"name": "Reading Avg Score", "description": "Average scaled score in reading", "type": "score", "category": "average"},
    "maths_avg_score": {"name": "Maths Avg Score", "description": "Average scaled score in maths", "type": "score", "category": "average"},
    "gps_avg_score": {"name": "GPS Avg Score", "description": "Average scaled score in GPS", "type": "score", "category": "average"},
    # Gender Performance
    "rwm_expected_boys_pct": {"name": "RWM Expected % (Boys)", "description": "% of boys meeting expected standard", "type": "percentage", "category": "gender"},
    "rwm_expected_girls_pct": {"name": "RWM Expected % (Girls)", "description": "% of girls meeting expected standard", "type": "percentage", "category": "gender"},
    "rwm_high_boys_pct": {"name": "RWM Higher % (Boys)", "description": "% of boys at higher standard", "type": "percentage", "category": "gender"},
    "rwm_high_girls_pct": {"name": "RWM Higher % (Girls)", "description": "% of girls at higher standard", "type": "percentage", "category": "gender"},
    # Disadvantaged Performance
    "rwm_expected_disadvantaged_pct": {"name": "RWM Expected % (Disadvantaged)", "description": "% of disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
    "rwm_expected_non_disadvantaged_pct": {"name": "RWM Expected % (Non-Disadvantaged)", "description": "% of non-disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
    "disadvantaged_gap": {"name": "Disadvantaged Gap", "description": "Gap between disadvantaged and national non-disadvantaged", "type": "score", "category": "equity"},
    # School Context
    "disadvantaged_pct": {"name": "% Disadvantaged Pupils", "description": "% of pupils eligible for free school meals or looked after", "type": "percentage", "category": "context"},
    "eal_pct": {"name": "% EAL Pupils", "description": "% of pupils with English as additional language", "type": "percentage", "category": "context"},
    "sen_support_pct": {"name": "% SEN Support", "description": "% of pupils with SEN support", "type": "percentage", "category": "context"},
    "stability_pct": {"name": "% Pupil Stability", "description": "% of non-mobile pupils (stayed at school)", "type": "percentage", "category": "context"},
    # 3-Year Averages
    "rwm_expected_3yr_pct": {"name": "RWM Expected % (3-Year Avg)", "description": "3-year average % meeting expected", "type": "percentage", "category": "trends"},
    "reading_avg_3yr": {"name": "Reading Score (3-Year Avg)", "description": "3-year average reading score", "type": "score", "category": "trends"},
    "maths_avg_3yr": {"name": "Maths Score (3-Year Avg)", "description": "3-year average maths score", "type": "score", "category": "trends"},
}

# Columns returned for each school by /api/rankings (when present in the data).
RANKING_COLUMNS = [
    "urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
    # Core expected
    "rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
    "gps_expected_pct", "science_expected_pct",
    # Core higher
    "rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
    # Progress & averages
    "reading_progress", "writing_progress", "maths_progress",
    "reading_avg_score", "maths_avg_score", "gps_avg_score",
    # Gender
    "rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
    # Equity
    "rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
    # Context
    "disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
    # 3-year
    "rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]


def convert_to_native(value):
    """Convert numpy types to native Python types for JSON serialization.

    Missing values (None/NaN/NaT/pd.NA), non-finite floats and the placeholder
    strings "SUPP", "NE", "NA" and "NP" all become None. Arrays, lists and
    tuples are converted element by element into a list. Any other value is
    returned unchanged.
    """
    # Containers are handled first: pd.isna() on an array returns an array,
    # whose truth value is ambiguous.
    if isinstance(value, np.ndarray):
        value = value.tolist()
    if isinstance(value, (list, tuple)):
        return [convert_to_native(item) for item in value]
    if isinstance(value, str):
        return None if value in _SUPPRESSED_MARKERS else value
    if pd.isna(value):
        return None
    if isinstance(value, (bool, np.bool_)):
        # np.bool_ is not JSON serializable; a plain bool is.
        return bool(value)
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, (float, np.floating)):
        # Covers native floats too: DataFrame.to_dict() may already have
        # unboxed numpy scalars, and +/-inf is not valid JSON.
        return float(value) if np.isfinite(value) else None
    return value


def clean_for_json(df: pd.DataFrame) -> list:
    """Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
    return [
        {key: convert_to_native(value) for key, value in record.items()}
        for record in df.to_dict(orient="records")
    ]


def parse_numeric(value):
    """Parse a value to numeric, handling SUPP, NE, NA, %, etc.

    Returns a native int/float, or None when the value is missing, is a
    placeholder marker, is not finite, or cannot be parsed as a number.
    """
    if isinstance(value, str):
        text = value.strip()
        if text in _NON_NUMERIC_MARKERS:
            return None
        # Remove % sign if present
        if text.endswith('%'):
            text = text[:-1]
        try:
            number = float(text)
        except ValueError:
            return None
        # float() accepts "nan"/"inf"; those are not usable measurements.
        return number if np.isfinite(number) else None
    if isinstance(value, (int, float, np.integer, np.floating)):
        if not np.isfinite(value):
            return None
        # Unbox numpy scalars so callers always get native numbers.
        return value.item() if isinstance(value, np.generic) else value
    # None, pd.NA and any other type carry no numeric value.
    return None


def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract the end year from folder name like '2023-2024' -> 2024."""
    match = re.search(r'(\d{4})-(\d{4})', folder_name)
    if match:
        return int(match.group(2))
    return None


def _make_address(row) -> str:
    """Join the non-empty address1, town and postcode of a row with ', '."""
    parts = []
    for field in ('address1', 'town', 'postcode'):
        value = row.get(field)
        if pd.notna(value) and value:
            parts.append(str(value))
    return ', '.join(parts) if parts else ''


def _load_year_file(ks2_file: Path, year: int) -> Optional[pd.DataFrame]:
    """Load one KS2 CSV and normalise it to the API's column names.

    Returns None when the file contains no school-level rows for the allowed
    local authorities. Raises KeyError if the 'LEA' or 'URN' column is
    missing; pandas read/parse errors propagate to the caller.
    """
    df = pd.read_csv(ks2_file, low_memory=False)

    # Filter to Wandsworth (212) and Merton (315)
    # Coerce unconditionally so string, object and numeric columns all work.
    df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
    df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
    keep = df['LEA'].isin(ALLOWED_LA_CODES) & df['URN'].notna()

    # Filter to schools only (RECTYPE == 1 means school level data)
    if 'RECTYPE' in df.columns:
        keep &= pd.to_numeric(df['RECTYPE'], errors='coerce') == 1

    # Copy so the column assignments below do not write into a view of the
    # unfiltered frame.
    df = df[keep].copy()
    if df.empty:
        return None

    # Integer identifiers serialise as 100123 rather than 100123.0.
    df['URN'] = df['URN'].astype(int)
    df['LEA'] = df['LEA'].astype(int)

    # Add year and local authority name
    df['year'] = year
    df['local_authority'] = df['LEA'].map(LA_CODES)

    # Standardize column names for our API
    df = df.rename(columns=COLUMN_RENAMES)

    # Create address field
    df['address'] = df.apply(_make_address, axis=1)

    # Map school type codes to names
    if 'school_type_code' in df.columns:
        df['school_type'] = df['school_type_code'].map(SCHOOL_TYPE_NAMES).fillna('Other')
    else:
        df['school_type'] = 'Other'

    # Parse numeric columns
    for col in NUMERIC_COLUMNS:
        if col in df.columns:
            df[col] = df[col].apply(parse_numeric)

    return df


def load_school_data() -> pd.DataFrame:
    """Load and combine all school data from CSV files in year folders.

    The combined frame is stored in the module-level cache on first use and
    returned from there afterwards. A file that fails to load is reported and
    skipped. Returns an empty DataFrame when nothing could be loaded.
    """
    global _data_cache

    if _data_cache is not None:
        return _data_cache

    all_data = []

    # Look for year folders in data directory
    if DATA_DIR.exists():
        # Sorted so load order (and row order) does not depend on the filesystem.
        for year_folder in sorted(DATA_DIR.iterdir()):
            if not (year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name)):
                continue
            year = extract_year_from_folder(year_folder.name)
            if year is None:
                continue

            # Look for KS2 data file
            ks2_file = year_folder / "england_ks2final.csv"
            if not ks2_file.exists():
                continue

            try:
                print(f"Loading data from {ks2_file}")
                df = _load_year_file(ks2_file, year)
            except Exception as e:
                # Best effort: one unreadable file must not stop the others loading.
                print(f"Error loading {ks2_file}: {e}")
                continue

            if df is None:
                print(f"  No Wandsworth or Merton schools found in {ks2_file}")
                continue

            all_data.append(df)
            print(f"  Loaded {len(df)} schools for year {year}")

    if all_data:
        _data_cache = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(_data_cache)}")
        print(f"Unique schools: {_data_cache['urn'].nunique()}")
        print(f"Years: {sorted(_data_cache['year'].unique())}")
    else:
        print("No data files found. Creating empty DataFrame.")
        _data_cache = pd.DataFrame()

    return _data_cache


def _school_info(latest: pd.Series, urn: int, fields) -> dict:
    """Build a JSON-safe school summary from a school's most recent row.

    Fields that are absent or hold a missing value are returned as "".
    """
    info = {"urn": urn}
    for field in fields:
        value = convert_to_native(latest.get(field, ""))
        info[field] = "" if value is None else value
    return info


@app.get("/")
async def root():
    """Serve the frontend."""
    index_file = FRONTEND_DIR / "index.html"
    if not index_file.is_file():
        raise HTTPException(status_code=404, detail="Frontend not found")
    return FileResponse(index_file)


@app.get("/api/schools")
async def get_schools(
    search: Optional[str] = Query(None, description="Search by school name"),
    local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
    school_type: Optional[str] = Query(None, description="Filter by school type"),
):
    """Get list of unique primary schools in Wandsworth and Merton."""
    df = load_school_data()

    if df.empty:
        return {"schools": []}

    # Get unique schools (latest year data for each)
    latest_year = df.groupby('urn')['year'].max().reset_index()
    df_latest = df.merge(latest_year, on=['urn', 'year'])

    school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
    available_cols = [c for c in school_cols if c in df_latest.columns]
    schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])

    # Apply filters
    if search:
        search_lower = search.lower()
        # regex=False: the search text is user input and must be matched
        # literally; characters such as "(" would otherwise raise re.error.
        mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False, regex=False)
        if "address" in schools_df.columns:
            mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False, regex=False)
        schools_df = schools_df[mask]

    if local_authority:
        schools_df = schools_df[schools_df["local_authority"].str.lower() == local_authority.lower()]

    if school_type:
        schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]

    return {"schools": clean_for_json(schools_df)}


@app.get("/api/schools/{urn}")
async def get_school_details(urn: int):
    """Get detailed KS2 data for a specific primary school across all years."""
    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    school_data = df[df["urn"] == urn]

    if school_data.empty:
        raise HTTPException(status_code=404, detail="School not found")

    # Sort by year
    school_data = school_data.sort_values("year")

    # Get latest info for the school
    latest = school_data.iloc[-1]

    school_info = _school_info(
        latest, urn, ("school_name", "local_authority", "school_type", "address")
    )
    school_info["phase"] = "Primary"

    return {
        "school_info": school_info,
        "yearly_data": clean_for_json(school_data),
    }


@app.get("/api/compare")
async def compare_schools(urns: str = Query(..., description="Comma-separated URNs")):
    """Compare multiple primary schools side by side."""
    df = load_school_data()

    if df.empty:
        raise HTTPException(status_code=404, detail="No data available")

    try:
        urn_list = [int(u.strip()) for u in urns.split(",")]
    except ValueError:
        raise HTTPException(status_code=400, detail="Invalid URN format")

    comparison_data = df[df["urn"].isin(urn_list)]

    if comparison_data.empty:
        raise HTTPException(status_code=404, detail="No schools found")

    result = {}
    for urn in urn_list:
        school_data = comparison_data[comparison_data["urn"] == urn].sort_values("year")
        if school_data.empty:
            # Requested URNs that are not in the data are simply left out.
            continue
        latest = school_data.iloc[-1]
        result[str(urn)] = {
            "school_info": _school_info(
                latest, urn, ("school_name", "local_authority", "address")
            ),
            "yearly_data": clean_for_json(school_data),
        }

    return {"comparison": result}


@app.get("/api/filters")
async def get_filter_options():
    """Get available filter options (local authorities, school types, years)."""
    df = load_school_data()

    if df.empty:
        return {
            "local_authorities": ["Wandsworth", "Merton"],
            "school_types": [],
            "years": [],
        }

    return {
        "local_authorities": sorted(df["local_authority"].dropna().unique().tolist()),
        "school_types": sorted(df["school_type"].dropna().unique().tolist()),
        "years": sorted(df["year"].dropna().unique().tolist()),
    }


@app.get("/api/metrics")
async def get_available_metrics():
    """Get list of available KS2 performance metrics for primary schools."""
    df = load_school_data()

    # With no data loaded every known metric is listed; otherwise only the
    # metrics that exist as columns in the loaded data.
    available = [
        {"key": col, **info}
        for col, info in METRIC_INFO.items()
        if df.empty or col in df.columns
    ]

    return {"metrics": available}


@app.get("/api/rankings")
async def get_rankings(
    metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
    year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
    limit: int = Query(20, description="Number of schools to return"),
):
    """Get primary school rankings by a specific KS2 metric."""
    df = load_school_data()

    if df.empty:
        return {"metric": metric, "year": None, "rankings": []}

    if metric not in df.columns:
        raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")

    # Filter by year
    if year is not None:
        df = df[df["year"] == year]
    else:
        # Use most recent year
        max_year = df["year"].max()
        df = df[df["year"] == max_year]

    # Sort and rank (exclude rows with no data for this metric)
    df = df.dropna(subset=[metric])

    # For progress scores, higher is better. For percentages, higher is also better.
    # A negative limit would make head() return "all but the last N" rows,
    # so it is clamped to zero.
    df = df.sort_values(metric, ascending=False).head(max(limit, 0))

    # Return only relevant fields for rankings
    available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
    df = df[available_cols]

    return {
        "metric": metric,
        "year": int(df["year"].iloc[0]) if not df.empty else None,
        "rankings": clean_for_json(df),
    }


@app.get("/api/data-info")
async def get_data_info():
    """Get information about loaded data."""
    df = load_school_data()

    if df.empty:
        return {
            "status": "no_data",
            "message": "No data files found in data folder. Please download KS2 data from the government website.",
            "data_folder": str(DATA_DIR),
        }

    years = [int(y) for y in sorted(df["year"].unique())]
    schools_per_year = {
        str(int(k)): int(v)
        for k, v in df.groupby("year")["urn"].nunique().to_dict().items()
    }
    la_counts = {
        str(k): int(v)
        for k, v in df["local_authority"].value_counts().to_dict().items()
    }

    return {
        "status": "loaded",
        "total_records": int(len(df)),
        "unique_schools": int(df["urn"].nunique()),
        "years_available": years,
        "schools_per_year": schools_per_year,
        "local_authorities": la_counts,
    }


# Mount static files
@app.on_event("startup")
async def startup():
    """Setup static file serving and load data on startup."""
    if FRONTEND_DIR.exists():
        app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")

    # Pre-load data
    load_school_data()


if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, host="0.0.0.0", port=8000)