Refactoring and bug fixes
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 1m7s

This commit is contained in:
Tudor Sitaru
2026-01-06 16:30:32 +00:00
parent 0ea4720ac1
commit 54e4bc2e77
11 changed files with 1246 additions and 534 deletions

View File

Serves primary school (KS2) performance data for comparing schools.
Uses real data from UK Government Compare School Performance downloads.
"""
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import os
import re
# No longer filtering by specific LA codes - load all available schools
from .config import settings
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .data_loader import load_school_data, clear_cache
from .utils import clean_for_json
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events.

    Pre-loads the school dataset once at startup (so the first request does
    not pay the CSV-parsing cost), yields while the application serves
    traffic, then logs on shutdown. No resources are held open, so no
    teardown beyond logging is needed.
    """
    # Startup: pre-load data
    print("Starting up: Loading school data...")
    load_school_data()
    print("Data loaded successfully.")
    yield  # Application runs here
    # Shutdown: cleanup if needed
    print("Shutting down...")
# FastAPI application instance. The ASGI `lifespan` handler defined above
# replaces the deprecated @app.on_event("startup"/"shutdown") hooks.
# NOTE: the merged diff left two `version=` keyword arguments here, which is
# a SyntaxError (duplicate keyword argument); the newer value is kept.
app = FastAPI(
    title="SchoolCompare API",
    description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
    version="2.0.0",
    lifespan=lifespan,
)
# CORS middleware with configurable origins.
# NOTE: the merged diff left both the old wildcard `allow_origins=["*"]` and
# the new settings-driven line - a duplicate keyword argument (SyntaxError).
# The configurable list is kept; a literal "*" origin is also incompatible
# with allow_credentials=True under the CORS spec.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
# Data directory: expected to contain one sub-folder per academic year
# (e.g. "2023-2024"), each holding the government KS2 CSV download.
DATA_DIR = Path(__file__).parent.parent / "data"
# Frontend assets directory, served at "/" (index.html) and "/static".
FRONTEND_DIR = Path(__file__).parent.parent / "frontend"
# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
# Lazily populated by load_school_data(); stays None until the first load.
_data_cache: Optional[pd.DataFrame] = None
def convert_to_native(value):
    """Convert numpy/pandas scalar types to native Python types for JSON.

    Returns None for missing values (None/NaN/pd.NA), non-finite floats,
    and the DfE suppression sentinels ("SUPP", "NE", "NA", "NP"). Numpy
    integers/floats become Python int/float, ndarrays become lists, and
    anything else passes through unchanged.
    """
    # BUG FIX: arrays must be handled *before* pd.isna() - pd.isna(ndarray)
    # returns an element-wise boolean array whose truth value is ambiguous,
    # so the original order raised ValueError and never reached tolist().
    if isinstance(value, np.ndarray):
        return value.tolist()
    if pd.isna(value):
        return None
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        # NaN is already caught by pd.isna above; keep the inf check explicit.
        if np.isnan(value) or np.isinf(value):
            return None
        return float(value)
    # Suppressed / not-entered / not-applicable markers from the DfE export.
    if isinstance(value, str) and value in {"SUPP", "NE", "NA", "NP"}:
        return None
    return value
def clean_for_json(df: pd.DataFrame) -> list:
    """Serialize *df* into a JSON-safe list of row dictionaries.

    Every cell goes through convert_to_native so numpy scalars become
    Python natives and NaN/inf/sentinel values become None.
    """
    return [
        {field: convert_to_native(cell) for field, cell in row.items()}
        for row in df.to_dict(orient="records")
    ]
def parse_numeric(value):
    """Best-effort conversion of a raw CSV cell into a number.

    Missing values, non-finite numbers, DfE sentinel strings ("SUPP", "NE",
    "NA", "NP", "NEW", "LOW"), blanks and unparseable text all yield None.
    Finite ints/floats pass through unchanged; percentage strings such as
    "73%" lose the sign and parse as floats.
    """
    if pd.isna(value):
        return None
    if isinstance(value, (int, float)):
        # Already numeric: only reject NaN/inf.
        return None if np.isnan(value) or np.isinf(value) else value
    if not isinstance(value, str):
        return None
    text = value.strip()
    if text in {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}:
        return None
    # Percentages are stored like "73%" - drop the sign before parsing.
    if text.endswith('%'):
        text = text[:-1]
    try:
        return float(text)
    except ValueError:
        return None
def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Return the end year of an academic-year folder name.

    '2023-2024' -> 2024; names without a 'YYYY-YYYY' pattern yield None.
    """
    found = re.search(r'(\d{4})-(\d{4})', folder_name)
    return int(found.group(2)) if found else None
def load_school_data() -> pd.DataFrame:
    """Load and combine all school data from CSV files in year folders.

    Scans DATA_DIR for folders named like 'YYYY-YYYY', reads each folder's
    england_ks2final.csv, keeps school-level rows, renames the government
    column names to the API's snake_case schema, parses the numeric metric
    columns, and concatenates everything into one DataFrame that is cached
    in the module-level _data_cache.

    Returns:
        pd.DataFrame: one row per school per year; an empty DataFrame when
        no data files were found.
    """
    global _data_cache
    # Serve the cached frame on repeat calls; clear the cache to force reload.
    if _data_cache is not None:
        return _data_cache
    all_data = []
    # Look for year folders in data directory
    if DATA_DIR.exists():
        for year_folder in DATA_DIR.iterdir():
            if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
                year = extract_year_from_folder(year_folder.name)
                if year is None:
                    continue
                # Look for KS2 data file
                ks2_file = year_folder / "england_ks2final.csv"
                if ks2_file.exists():
                    try:
                        print(f"Loading data from {ks2_file}")
                        # low_memory=False: read in one pass so mixed-type
                        # columns get a single consistent dtype.
                        df = pd.read_csv(ks2_file, low_memory=False)
                        # Handle both string and integer columns
                        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
                            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
                        if 'URN' in df.columns and df['URN'].dtype == 'object':
                            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
                        # Filter to schools only (RECTYPE == 1 means school level data)
                        if 'RECTYPE' in df.columns:
                            df = df[df['RECTYPE'] == 1]
                        # Add year and local authority name from LANAME column
                        df['year'] = year
                        if 'LANAME' in df.columns:
                            df['local_authority'] = df['LANAME']
                        elif 'LEA' in df.columns:
                            # Fall back to the numeric LA code when no name column exists.
                            df['local_authority'] = df['LEA'].astype(str)
                        # Standardize column names for our API
                        df = df.rename(columns={
                            'URN': 'urn',
                            'SCHNAME': 'school_name',
                            'ADDRESS1': 'address1',
                            'ADDRESS2': 'address2',
                            'TOWN': 'town',
                            'PCODE': 'postcode',
                            'NFTYPE': 'school_type_code',
                            'RELDENOM': 'religious_denomination',
                            'AGERANGE': 'age_range',
                            'TOTPUPS': 'total_pupils',
                            'TELIG': 'eligible_pupils',
                            # Core KS2 metrics
                            'PTRWM_EXP': 'rwm_expected_pct',
                            'PTRWM_HIGH': 'rwm_high_pct',
                            'READPROG': 'reading_progress',
                            'WRITPROG': 'writing_progress',
                            'MATPROG': 'maths_progress',
                            'PTREAD_EXP': 'reading_expected_pct',
                            'PTWRITTA_EXP': 'writing_expected_pct',
                            'PTMAT_EXP': 'maths_expected_pct',
                            'READ_AVERAGE': 'reading_avg_score',
                            'MAT_AVERAGE': 'maths_avg_score',
                            'PTREAD_HIGH': 'reading_high_pct',
                            'PTWRITTA_HIGH': 'writing_high_pct',
                            'PTMAT_HIGH': 'maths_high_pct',
                            # GPS (Grammar, Punctuation & Spelling)
                            'PTGPS_EXP': 'gps_expected_pct',
                            'PTGPS_HIGH': 'gps_high_pct',
                            'GPS_AVERAGE': 'gps_avg_score',
                            # Science
                            'PTSCITA_EXP': 'science_expected_pct',
                            # School context
                            'PTFSM6CLA1A': 'disadvantaged_pct',
                            'PTEALGRP2': 'eal_pct',
                            'PSENELK': 'sen_support_pct',
                            'PSENELE': 'sen_ehcp_pct',
                            'PTMOBN': 'stability_pct',
                            # Gender breakdown
                            'PTRWM_EXP_B': 'rwm_expected_boys_pct',
                            'PTRWM_EXP_G': 'rwm_expected_girls_pct',
                            'PTRWM_HIGH_B': 'rwm_high_boys_pct',
                            'PTRWM_HIGH_G': 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
                            'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
                            'DIFFN_RWM_EXP': 'disadvantaged_gap',
                            # 3-year averages
                            'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
                            'READ_AVERAGE_3YR': 'reading_avg_3yr',
                            'MAT_AVERAGE_3YR': 'maths_avg_3yr',
                        })
                        # Create address field
                        def make_address(row):
                            # Join the non-empty address parts with ", ".
                            parts = []
                            if pd.notna(row.get('address1')) and row.get('address1'):
                                parts.append(str(row['address1']))
                            if pd.notna(row.get('town')) and row.get('town'):
                                parts.append(str(row['town']))
                            if pd.notna(row.get('postcode')) and row.get('postcode'):
                                parts.append(str(row['postcode']))
                            return ', '.join(parts) if parts else ''
                        df['address'] = df.apply(make_address, axis=1)
                        # Map school type codes to names
                        school_type_map = {
                            'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
                            'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
                            'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
                        }
                        df['school_type'] = df['school_type_code'].map(school_type_map).fillna('Other')
                        # Parse numeric columns
                        numeric_cols = [
                            # Core metrics
                            'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
                            'writing_progress', 'maths_progress', 'reading_expected_pct',
                            'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
                            'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
                            # GPS & Science
                            'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
                            # School context
                            'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
                            'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
                            # Gender breakdown
                            'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
                            'rwm_high_boys_pct', 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
                            # 3-year averages
                            'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
                        ]
                        for col in numeric_cols:
                            if col in df.columns:
                                # parse_numeric turns "SUPP"/"NE"/percent strings into floats or None.
                                df[col] = df[col].apply(parse_numeric)
                        all_data.append(df)
                        print(f" Loaded {len(df)} schools for year {year}")
                    except Exception as e:
                        # Best-effort: a malformed year file is logged and skipped
                        # rather than aborting the whole load.
                        print(f"Error loading {ks2_file}: {e}")
    if all_data:
        _data_cache = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(_data_cache)}")
        print(f"Unique schools: {_data_cache['urn'].nunique()}")
        print(f"Years: {sorted(_data_cache['year'].unique())}")
    else:
        print("No data files found. Creating empty DataFrame.")
        _data_cache = pd.DataFrame()
    return _data_cache
@app.get("/")
async def root():
    """Serve the frontend single-page application entry point."""
    # NOTE: the merged diff left two consecutive return statements here
    # (old FRONTEND_DIR path + new configurable path); the second was dead
    # code. Keep the settings-driven location, which works for both
    # container and local-dev layouts.
    return FileResponse(settings.frontend_dir / "index.html")
@app.get("/api/schools")
async def get_schools(
search: Optional[str] = Query(None, description="Search by school name"),
local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
school_type: Optional[str] = Query(None, description="Filter by school type"),
page: int = Query(1, ge=1, description="Page number"),
page_size: int = Query(None, ge=1, le=100, description="Results per page"),
):
"""Get list of unique primary schools in Wandsworth and Merton."""
"""
Get list of unique primary schools with pagination.
Returns paginated results with total count for efficient loading.
"""
df = load_school_data()
if df.empty:
return {"schools": []}
return {"schools": [], "total": 0, "page": page, "page_size": 0}
# Use configured default if not specified
if page_size is None:
page_size = settings.default_page_size
# Get unique schools (latest year data for each)
latest_year = df.groupby('urn')['year'].max().reset_index()
df_latest = df.merge(latest_year, on=['urn', 'year'])
school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
available_cols = [c for c in school_cols if c in df_latest.columns]
available_cols = [c for c in SCHOOL_COLUMNS if c in df_latest.columns]
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
# Apply filters
@@ -298,7 +97,19 @@ async def get_schools(
if school_type:
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
return {"schools": clean_for_json(schools_df)}
# Pagination
total = len(schools_df)
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
schools_df = schools_df.iloc[start_idx:end_idx]
return {
"schools": clean_for_json(schools_df),
"total": total,
"page": page,
"page_size": page_size,
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
}
@app.get("/api/schools/{urn}")
@@ -390,56 +201,18 @@ async def get_filter_options():
@app.get("/api/metrics")
async def get_available_metrics():
    """
    Get list of available KS2 performance metrics for primary schools.

    This is the single source of truth for metric definitions.
    Frontend should consume this to avoid duplication.
    """
    # NOTE: the merged diff left both the removed inline `metric_info` dict
    # (with its loop) and the new METRIC_DEFINITIONS loop, which would have
    # emitted every metric twice. Only the schema-driven version is kept.
    df = load_school_data()
    available = []
    for key, info in METRIC_DEFINITIONS.items():
        # Only advertise metrics whose column exists in the loaded data;
        # with no data at all, advertise the full catalogue.
        if df.empty or key in df.columns:
            available.append({"key": key, **info})
    return {"metrics": available}
@@ -448,13 +221,14 @@ async def get_available_metrics():
async def get_rankings(
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
limit: int = Query(20, description="Number of schools to return"),
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
):
"""Get primary school rankings by a specific KS2 metric."""
df = load_school_data()
if df.empty:
return {"metric": metric, "year": None, "rankings": []}
return {"metric": metric, "year": None, "rankings": [], "total": 0}
if metric not in df.columns:
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
@@ -467,39 +241,26 @@ async def get_rankings(
max_year = df["year"].max()
df = df[df["year"] == max_year]
# Filter by local authority if specified
if local_authority:
df = df[df["local_authority"].str.lower() == local_authority.lower()]
# Sort and rank (exclude rows with no data for this metric)
df = df.dropna(subset=[metric])
total = len(df)
# For progress scores, higher is better. For percentages, higher is also better.
df = df.sort_values(metric, ascending=False).head(limit)
# Return only relevant fields for rankings
ranking_cols = [
"urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
# Core expected
"rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
"gps_expected_pct", "science_expected_pct",
# Core higher
"rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
# Progress & averages
"reading_progress", "writing_progress", "maths_progress",
"reading_avg_score", "maths_avg_score", "gps_avg_score",
# Gender
"rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
# Equity
"rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
# Context
"disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
# 3-year
"rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]
available_cols = [c for c in ranking_cols if c in df.columns]
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
df = df[available_cols]
return {
"metric": metric,
"year": int(df["year"].iloc[0]) if not df.empty else None,
"rankings": clean_for_json(df)
"rankings": clean_for_json(df),
"total": total,
}
@@ -512,7 +273,7 @@ async def get_data_info():
return {
"status": "no_data",
"message": "No data files found in data folder. Please download KS2 data from the government website.",
"data_folder": str(DATA_DIR),
"data_folder": str(settings.data_dir),
}
years = [int(y) for y in sorted(df["year"].unique())]
@@ -529,17 +290,22 @@ async def get_data_info():
}
@app.post("/api/admin/reload")
async def reload_data():
    """Admin endpoint to force data reload (useful after data updates)."""
    # NOTE(review): this endpoint is unauthenticated - confirm it is
    # protected upstream (reverse proxy / network policy) before exposing.
    clear_cache()
    load_school_data()
    return {"status": "reloaded"}


# Mount static files after all routes are defined so API routes take
# precedence over the static mount.
# NOTE: the merged diff left TWO @app.on_event("startup") handlers (the old
# `startup` using FRONTEND_DIR and this one using settings.frontend_dir),
# both mounting "/static"; only the new configurable handler is kept.
@app.on_event("startup")
async def mount_static():
    """Mount static file serving."""
    if settings.frontend_dir.exists():
        app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")
if __name__ == "__main__":
    # Local/dev entry point; in production the container's ASGI server runs
    # the app. NOTE: the merged diff left two uvicorn.run() calls (old
    # hard-coded host/port + new settings-driven); the second could never
    # execute. Keep the configurable call.
    import uvicorn
    uvicorn.run(app, host=settings.host, port=settings.port)