Refactoring and bug fixes
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 1m7s

This commit is contained in:
Tudor Sitaru
2026-01-06 16:30:32 +00:00
parent 0ea4720ac1
commit 54e4bc2e77
11 changed files with 1246 additions and 534 deletions

2
backend/__init__.py Normal file
View File

@@ -0,0 +1,2 @@
# Backend package

View File

@@ -4,284 +4,83 @@ Serves primary school (KS2) performance data for comparing schools.
Uses real data from UK Government Compare School Performance downloads.
"""
from contextlib import asynccontextmanager
from fastapi import FastAPI, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
import pandas as pd
import numpy as np
from pathlib import Path
from typing import Optional
import os
import re
# No longer filtering by specific LA codes - load all available schools
from .config import settings
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .data_loader import load_school_data, clear_cache
from .utils import clean_for_json
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan - startup and shutdown events."""
# Startup: pre-load data
print("Starting up: Loading school data...")
load_school_data()
print("Data loaded successfully.")
yield # Application runs here
# Shutdown: cleanup if needed
print("Shutting down...")
app = FastAPI(
title="SchoolCompare API",
description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
version="1.0.0"
version="2.0.0",
lifespan=lifespan,
)
# CORS middleware for development
# CORS middleware with configurable origins
app.add_middleware(
CORSMiddleware,
allow_origins=["*"],
allow_origins=settings.allowed_origins,
allow_credentials=True,
allow_methods=["*"],
allow_headers=["*"],
)
# Data directory
DATA_DIR = Path(__file__).parent.parent / "data"
FRONTEND_DIR = Path(__file__).parent.parent / "frontend"
# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
_data_cache: Optional[pd.DataFrame] = None
def convert_to_native(value):
"""Convert numpy types to native Python types for JSON serialization."""
if pd.isna(value):
return None
if isinstance(value, (np.integer,)):
return int(value)
if isinstance(value, (np.floating,)):
if np.isnan(value) or np.isinf(value):
return None
return float(value)
if isinstance(value, np.ndarray):
return value.tolist()
if value == "SUPP" or value == "NE" or value == "NA" or value == "NP":
return None
return value
def clean_for_json(df: pd.DataFrame) -> list:
"""Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
records = df.to_dict(orient="records")
cleaned = []
for record in records:
clean_record = {}
for key, value in record.items():
clean_record[key] = convert_to_native(value)
cleaned.append(clean_record)
return cleaned
def parse_numeric(value):
"""Parse a value to numeric, handling SUPP, NE, NA, %, etc."""
if pd.isna(value):
return None
if isinstance(value, (int, float)):
if np.isnan(value) or np.isinf(value):
return None
return value
if isinstance(value, str):
value = value.strip()
if value in ["SUPP", "NE", "NA", "NP", "NEW", "LOW", ""]:
return None
# Remove % sign if present
if value.endswith('%'):
value = value[:-1]
try:
return float(value)
except ValueError:
return None
return None
def extract_year_from_folder(folder_name: str) -> Optional[int]:
"""Extract the end year from folder name like '2023-2024' -> 2024."""
match = re.search(r'(\d{4})-(\d{4})', folder_name)
if match:
return int(match.group(2))
return None
def load_school_data() -> pd.DataFrame:
"""Load and combine all school data from CSV files in year folders."""
global _data_cache
if _data_cache is not None:
return _data_cache
all_data = []
# Look for year folders in data directory
if DATA_DIR.exists():
for year_folder in DATA_DIR.iterdir():
if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
year = extract_year_from_folder(year_folder.name)
if year is None:
continue
# Look for KS2 data file
ks2_file = year_folder / "england_ks2final.csv"
if ks2_file.exists():
try:
print(f"Loading data from {ks2_file}")
df = pd.read_csv(ks2_file, low_memory=False)
# Handle both string and integer columns
if 'LEA' in df.columns and df['LEA'].dtype == 'object':
df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
if 'URN' in df.columns and df['URN'].dtype == 'object':
df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
# Filter to schools only (RECTYPE == 1 means school level data)
if 'RECTYPE' in df.columns:
df = df[df['RECTYPE'] == 1]
# Add year and local authority name from LANAME column
df['year'] = year
if 'LANAME' in df.columns:
df['local_authority'] = df['LANAME']
elif 'LEA' in df.columns:
df['local_authority'] = df['LEA'].astype(str)
# Standardize column names for our API
df = df.rename(columns={
'URN': 'urn',
'SCHNAME': 'school_name',
'ADDRESS1': 'address1',
'ADDRESS2': 'address2',
'TOWN': 'town',
'PCODE': 'postcode',
'NFTYPE': 'school_type_code',
'RELDENOM': 'religious_denomination',
'AGERANGE': 'age_range',
'TOTPUPS': 'total_pupils',
'TELIG': 'eligible_pupils',
# Core KS2 metrics
'PTRWM_EXP': 'rwm_expected_pct',
'PTRWM_HIGH': 'rwm_high_pct',
'READPROG': 'reading_progress',
'WRITPROG': 'writing_progress',
'MATPROG': 'maths_progress',
'PTREAD_EXP': 'reading_expected_pct',
'PTWRITTA_EXP': 'writing_expected_pct',
'PTMAT_EXP': 'maths_expected_pct',
'READ_AVERAGE': 'reading_avg_score',
'MAT_AVERAGE': 'maths_avg_score',
'PTREAD_HIGH': 'reading_high_pct',
'PTWRITTA_HIGH': 'writing_high_pct',
'PTMAT_HIGH': 'maths_high_pct',
# GPS (Grammar, Punctuation & Spelling)
'PTGPS_EXP': 'gps_expected_pct',
'PTGPS_HIGH': 'gps_high_pct',
'GPS_AVERAGE': 'gps_avg_score',
# Science
'PTSCITA_EXP': 'science_expected_pct',
# School context
'PTFSM6CLA1A': 'disadvantaged_pct',
'PTEALGRP2': 'eal_pct',
'PSENELK': 'sen_support_pct',
'PSENELE': 'sen_ehcp_pct',
'PTMOBN': 'stability_pct',
# Gender breakdown
'PTRWM_EXP_B': 'rwm_expected_boys_pct',
'PTRWM_EXP_G': 'rwm_expected_girls_pct',
'PTRWM_HIGH_B': 'rwm_high_boys_pct',
'PTRWM_HIGH_G': 'rwm_high_girls_pct',
# Disadvantaged performance
'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
'DIFFN_RWM_EXP': 'disadvantaged_gap',
# 3-year averages
'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
'READ_AVERAGE_3YR': 'reading_avg_3yr',
'MAT_AVERAGE_3YR': 'maths_avg_3yr',
})
# Create address field
def make_address(row):
parts = []
if pd.notna(row.get('address1')) and row.get('address1'):
parts.append(str(row['address1']))
if pd.notna(row.get('town')) and row.get('town'):
parts.append(str(row['town']))
if pd.notna(row.get('postcode')) and row.get('postcode'):
parts.append(str(row['postcode']))
return ', '.join(parts) if parts else ''
df['address'] = df.apply(make_address, axis=1)
# Map school type codes to names
school_type_map = {
'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
}
df['school_type'] = df['school_type_code'].map(school_type_map).fillna('Other')
# Parse numeric columns
numeric_cols = [
# Core metrics
'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
'writing_progress', 'maths_progress', 'reading_expected_pct',
'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
# GPS & Science
'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
# School context
'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
# Gender breakdown
'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
'rwm_high_boys_pct', 'rwm_high_girls_pct',
# Disadvantaged performance
'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
# 3-year averages
'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
]
for col in numeric_cols:
if col in df.columns:
df[col] = df[col].apply(parse_numeric)
all_data.append(df)
print(f" Loaded {len(df)} schools for year {year}")
except Exception as e:
print(f"Error loading {ks2_file}: {e}")
if all_data:
_data_cache = pd.concat(all_data, ignore_index=True)
print(f"\nTotal records loaded: {len(_data_cache)}")
print(f"Unique schools: {_data_cache['urn'].nunique()}")
print(f"Years: {sorted(_data_cache['year'].unique())}")
else:
print("No data files found. Creating empty DataFrame.")
_data_cache = pd.DataFrame()
return _data_cache
@app.get("/")
async def root():
"""Serve the frontend."""
return FileResponse(FRONTEND_DIR / "index.html")
return FileResponse(settings.frontend_dir / "index.html")
@app.get("/api/schools")
async def get_schools(
search: Optional[str] = Query(None, description="Search by school name"),
local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
school_type: Optional[str] = Query(None, description="Filter by school type"),
page: int = Query(1, ge=1, description="Page number"),
page_size: int = Query(None, ge=1, le=100, description="Results per page"),
):
"""Get list of unique primary schools in Wandsworth and Merton."""
"""
Get list of unique primary schools with pagination.
Returns paginated results with total count for efficient loading.
"""
df = load_school_data()
if df.empty:
return {"schools": []}
return {"schools": [], "total": 0, "page": page, "page_size": 0}
# Use configured default if not specified
if page_size is None:
page_size = settings.default_page_size
# Get unique schools (latest year data for each)
latest_year = df.groupby('urn')['year'].max().reset_index()
df_latest = df.merge(latest_year, on=['urn', 'year'])
school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
available_cols = [c for c in school_cols if c in df_latest.columns]
available_cols = [c for c in SCHOOL_COLUMNS if c in df_latest.columns]
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
# Apply filters
@@ -298,7 +97,19 @@ async def get_schools(
if school_type:
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
return {"schools": clean_for_json(schools_df)}
# Pagination
total = len(schools_df)
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
schools_df = schools_df.iloc[start_idx:end_idx]
return {
"schools": clean_for_json(schools_df),
"total": total,
"page": page,
"page_size": page_size,
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
}
@app.get("/api/schools/{urn}")
@@ -390,56 +201,18 @@ async def get_filter_options():
@app.get("/api/metrics")
async def get_available_metrics():
"""Get list of available KS2 performance metrics for primary schools."""
"""
Get list of available KS2 performance metrics for primary schools.
This is the single source of truth for metric definitions.
Frontend should consume this to avoid duplication.
"""
df = load_school_data()
# Define KS2 metric metadata organized by category
metric_info = {
# Expected Standard
"rwm_expected_pct": {"name": "RWM Combined %", "description": "% meeting expected standard in reading, writing and maths", "type": "percentage", "category": "expected"},
"reading_expected_pct": {"name": "Reading Expected %", "description": "% meeting expected standard in reading", "type": "percentage", "category": "expected"},
"writing_expected_pct": {"name": "Writing Expected %", "description": "% meeting expected standard in writing", "type": "percentage", "category": "expected"},
"maths_expected_pct": {"name": "Maths Expected %", "description": "% meeting expected standard in maths", "type": "percentage", "category": "expected"},
"gps_expected_pct": {"name": "GPS Expected %", "description": "% meeting expected standard in grammar, punctuation & spelling", "type": "percentage", "category": "expected"},
"science_expected_pct": {"name": "Science Expected %", "description": "% meeting expected standard in science", "type": "percentage", "category": "expected"},
# Higher Standard
"rwm_high_pct": {"name": "RWM Combined Higher %", "description": "% achieving higher standard in RWM combined", "type": "percentage", "category": "higher"},
"reading_high_pct": {"name": "Reading Higher %", "description": "% achieving higher standard in reading", "type": "percentage", "category": "higher"},
"writing_high_pct": {"name": "Writing Higher %", "description": "% achieving greater depth in writing", "type": "percentage", "category": "higher"},
"maths_high_pct": {"name": "Maths Higher %", "description": "% achieving higher standard in maths", "type": "percentage", "category": "higher"},
"gps_high_pct": {"name": "GPS Higher %", "description": "% achieving higher standard in GPS", "type": "percentage", "category": "higher"},
# Progress Scores
"reading_progress": {"name": "Reading Progress", "description": "Progress in reading from KS1 to KS2", "type": "score", "category": "progress"},
"writing_progress": {"name": "Writing Progress", "description": "Progress in writing from KS1 to KS2", "type": "score", "category": "progress"},
"maths_progress": {"name": "Maths Progress", "description": "Progress in maths from KS1 to KS2", "type": "score", "category": "progress"},
# Average Scores
"reading_avg_score": {"name": "Reading Avg Score", "description": "Average scaled score in reading", "type": "score", "category": "average"},
"maths_avg_score": {"name": "Maths Avg Score", "description": "Average scaled score in maths", "type": "score", "category": "average"},
"gps_avg_score": {"name": "GPS Avg Score", "description": "Average scaled score in GPS", "type": "score", "category": "average"},
# Gender Performance
"rwm_expected_boys_pct": {"name": "RWM Expected % (Boys)", "description": "% of boys meeting expected standard", "type": "percentage", "category": "gender"},
"rwm_expected_girls_pct": {"name": "RWM Expected % (Girls)", "description": "% of girls meeting expected standard", "type": "percentage", "category": "gender"},
"rwm_high_boys_pct": {"name": "RWM Higher % (Boys)", "description": "% of boys at higher standard", "type": "percentage", "category": "gender"},
"rwm_high_girls_pct": {"name": "RWM Higher % (Girls)", "description": "% of girls at higher standard", "type": "percentage", "category": "gender"},
# Disadvantaged Performance
"rwm_expected_disadvantaged_pct": {"name": "RWM Expected % (Disadvantaged)", "description": "% of disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
"rwm_expected_non_disadvantaged_pct": {"name": "RWM Expected % (Non-Disadvantaged)", "description": "% of non-disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
"disadvantaged_gap": {"name": "Disadvantaged Gap", "description": "Gap between disadvantaged and national non-disadvantaged", "type": "score", "category": "equity"},
# School Context
"disadvantaged_pct": {"name": "% Disadvantaged Pupils", "description": "% of pupils eligible for free school meals or looked after", "type": "percentage", "category": "context"},
"eal_pct": {"name": "% EAL Pupils", "description": "% of pupils with English as additional language", "type": "percentage", "category": "context"},
"sen_support_pct": {"name": "% SEN Support", "description": "% of pupils with SEN support", "type": "percentage", "category": "context"},
"stability_pct": {"name": "% Pupil Stability", "description": "% of non-mobile pupils (stayed at school)", "type": "percentage", "category": "context"},
# 3-Year Averages
"rwm_expected_3yr_pct": {"name": "RWM Expected % (3-Year Avg)", "description": "3-year average % meeting expected", "type": "percentage", "category": "trends"},
"reading_avg_3yr": {"name": "Reading Score (3-Year Avg)", "description": "3-year average reading score", "type": "score", "category": "trends"},
"maths_avg_3yr": {"name": "Maths Score (3-Year Avg)", "description": "3-year average maths score", "type": "score", "category": "trends"},
}
available = []
for col, info in metric_info.items():
if df.empty or col in df.columns:
available.append({"key": col, **info})
for key, info in METRIC_DEFINITIONS.items():
if df.empty or key in df.columns:
available.append({"key": key, **info})
return {"metrics": available}
@@ -448,13 +221,14 @@ async def get_available_metrics():
async def get_rankings(
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
limit: int = Query(20, description="Number of schools to return"),
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
):
"""Get primary school rankings by a specific KS2 metric."""
df = load_school_data()
if df.empty:
return {"metric": metric, "year": None, "rankings": []}
return {"metric": metric, "year": None, "rankings": [], "total": 0}
if metric not in df.columns:
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
@@ -467,39 +241,26 @@ async def get_rankings(
max_year = df["year"].max()
df = df[df["year"] == max_year]
# Filter by local authority if specified
if local_authority:
df = df[df["local_authority"].str.lower() == local_authority.lower()]
# Sort and rank (exclude rows with no data for this metric)
df = df.dropna(subset=[metric])
total = len(df)
# For progress scores, higher is better. For percentages, higher is also better.
df = df.sort_values(metric, ascending=False).head(limit)
# Return only relevant fields for rankings
ranking_cols = [
"urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
# Core expected
"rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
"gps_expected_pct", "science_expected_pct",
# Core higher
"rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
# Progress & averages
"reading_progress", "writing_progress", "maths_progress",
"reading_avg_score", "maths_avg_score", "gps_avg_score",
# Gender
"rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
# Equity
"rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
# Context
"disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
# 3-year
"rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]
available_cols = [c for c in ranking_cols if c in df.columns]
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
df = df[available_cols]
return {
"metric": metric,
"year": int(df["year"].iloc[0]) if not df.empty else None,
"rankings": clean_for_json(df)
"rankings": clean_for_json(df),
"total": total,
}
@@ -512,7 +273,7 @@ async def get_data_info():
return {
"status": "no_data",
"message": "No data files found in data folder. Please download KS2 data from the government website.",
"data_folder": str(DATA_DIR),
"data_folder": str(settings.data_dir),
}
years = [int(y) for y in sorted(df["year"].unique())]
@@ -529,17 +290,22 @@ async def get_data_info():
}
# Mount static files
@app.on_event("startup")
async def startup():
"""Setup static file serving and load data on startup."""
if FRONTEND_DIR.exists():
app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")
# Pre-load data
@app.post("/api/admin/reload")
async def reload_data():
"""Admin endpoint to force data reload (useful after data updates)."""
clear_cache()
load_school_data()
return {"status": "reloaded"}
# Mount static files after all routes are defined
@app.on_event("startup")
async def mount_static():
"""Mount static file serving."""
if settings.frontend_dir.exists():
app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")
if __name__ == "__main__":
import uvicorn
uvicorn.run(app, host="0.0.0.0", port=8000)
uvicorn.run(app, host=settings.host, port=settings.port)

38
backend/config.py Normal file
View File

@@ -0,0 +1,38 @@
"""
Application configuration using pydantic-settings.
Loads from environment variables and .env file.
"""
from pathlib import Path
from typing import List
from pydantic_settings import BaseSettings
import os
class Settings(BaseSettings):
"""Application settings loaded from environment."""
# Paths
data_dir: Path = Path(__file__).parent.parent / "data"
frontend_dir: Path = Path(__file__).parent.parent / "frontend"
# Server
host: str = "0.0.0.0"
port: int = 80
# CORS
allowed_origins: List[str] = ["https://schoolcompare.co.uk", "http://localhost:8000", "http://localhost:3000"]
# API
default_page_size: int = 50
max_page_size: int = 100
class Config:
env_file = ".env"
env_file_encoding = "utf-8"
extra = "ignore"
# Singleton instance
settings = Settings()

196
backend/data_loader.py Normal file
View File

@@ -0,0 +1,196 @@
"""
Data loading module with optimized pandas operations.
Uses vectorized operations instead of .apply() for performance.
"""
import pandas as pd
import numpy as np
from pathlib import Path
from functools import lru_cache
import re
from typing import Optional
from .config import settings
from .schemas import (
COLUMN_MAPPINGS,
NUMERIC_COLUMNS,
SCHOOL_TYPE_MAP,
NULL_VALUES,
LA_CODE_TO_NAME,
)
def extract_year_from_folder(folder_name: str) -> Optional[int]:
"""Extract the end year from folder name like '2023-2024' -> 2024."""
match = re.search(r'(\d{4})-(\d{4})', folder_name)
if match:
return int(match.group(2))
return None
def parse_numeric_vectorized(series: pd.Series) -> pd.Series:
"""
Vectorized numeric parsing - much faster than .apply().
Handles SUPP, NE, NA, NP, %, etc.
"""
# Convert to string first
str_series = series.astype(str)
# Replace null values with NaN
for null_val in NULL_VALUES:
str_series = str_series.replace(null_val, np.nan)
# Remove % signs
str_series = str_series.str.rstrip('%')
# Convert to numeric
return pd.to_numeric(str_series, errors='coerce')
def create_address_vectorized(df: pd.DataFrame) -> pd.Series:
"""
Vectorized address creation - much faster than .apply().
"""
parts = []
if 'address1' in df.columns:
parts.append(df['address1'].fillna('').astype(str))
if 'town' in df.columns:
parts.append(df['town'].fillna('').astype(str))
if 'postcode' in df.columns:
parts.append(df['postcode'].fillna('').astype(str))
if not parts:
return pd.Series([''] * len(df), index=df.index)
# Combine parts with comma separator, filtering empty strings
result = pd.Series([''] * len(df), index=df.index)
for i, row_idx in enumerate(df.index):
row_parts = [p.iloc[i] if hasattr(p, 'iloc') else p[i] for p in parts]
row_parts = [p for p in row_parts if p and p.strip()]
result.iloc[i] = ', '.join(row_parts)
return result
def create_address_fast(df: pd.DataFrame) -> pd.Series:
"""
Fast vectorized address creation using string concatenation.
"""
addr1 = df.get('address1', pd.Series([''] * len(df))).fillna('').astype(str)
town = df.get('town', pd.Series([''] * len(df))).fillna('').astype(str)
postcode = df.get('postcode', pd.Series([''] * len(df))).fillna('').astype(str)
# Build address with proper separators
result = addr1.str.strip()
# Add town if not empty
town_mask = town.str.strip() != ''
result = result.where(~town_mask, result + ', ' + town.str.strip())
# Add postcode if not empty
postcode_mask = postcode.str.strip() != ''
result = result.where(~postcode_mask, result + ', ' + postcode.str.strip())
# Clean up leading commas
result = result.str.lstrip(', ')
return result
def load_year_data(year_folder: Path, year: int) -> Optional[pd.DataFrame]:
"""Load and process data for a single year."""
ks2_file = year_folder / "england_ks2final.csv"
if not ks2_file.exists():
return None
try:
print(f"Loading data from {ks2_file}")
df = pd.read_csv(ks2_file, low_memory=False)
# Handle column types
if 'LEA' in df.columns and df['LEA'].dtype == 'object':
df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
if 'URN' in df.columns and df['URN'].dtype == 'object':
df['URN'] = pd.to_numeric(df['URN'], errors='coerce')
# Filter to schools only (RECTYPE == 1 means school level data)
if 'RECTYPE' in df.columns:
df = df[df['RECTYPE'] == 1].copy()
# Add year and local authority name
df['year'] = year
# Try different column names for LA name
la_name_cols = ['LANAME', 'LA (name)', 'LA_NAME', 'LA NAME']
la_col_found = None
for col in la_name_cols:
if col in df.columns:
la_col_found = col
break
if la_col_found:
df['local_authority'] = df[la_col_found]
elif 'LEA' in df.columns:
# Map LEA codes to names using our mapping
df['local_authority'] = df['LEA'].map(LA_CODE_TO_NAME).fillna(df['LEA'].astype(str))
# Rename columns using mapping
rename_dict = {k: v for k, v in COLUMN_MAPPINGS.items() if k in df.columns}
df = df.rename(columns=rename_dict)
# Create address field (vectorized)
df['address'] = create_address_fast(df)
# Map school type codes to names (vectorized)
if 'school_type_code' in df.columns:
df['school_type'] = df['school_type_code'].map(SCHOOL_TYPE_MAP).fillna('Other')
# Parse numeric columns (vectorized - much faster than .apply())
for col in NUMERIC_COLUMNS:
if col in df.columns:
df[col] = parse_numeric_vectorized(df[col])
print(f" Loaded {len(df)} schools for year {year}")
return df
except Exception as e:
print(f"Error loading {ks2_file}: {e}")
return None
@lru_cache(maxsize=1)
def load_school_data() -> pd.DataFrame:
"""
Load and combine all school data from CSV files in year folders.
Uses lru_cache for singleton-like behavior.
"""
all_data = []
data_dir = settings.data_dir
if data_dir.exists():
for year_folder in data_dir.iterdir():
if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
year = extract_year_from_folder(year_folder.name)
if year is None:
continue
df = load_year_data(year_folder, year)
if df is not None:
all_data.append(df)
if all_data:
result = pd.concat(all_data, ignore_index=True)
print(f"\nTotal records loaded: {len(result)}")
print(f"Unique schools: {result['urn'].nunique()}")
print(f"Years: {sorted(result['year'].unique())}")
return result
else:
print("No data files found. Creating empty DataFrame.")
return pd.DataFrame()
def clear_cache():
"""Clear the data cache to force reload."""
load_school_data.cache_clear()

398
backend/schemas.py Normal file
View File

@@ -0,0 +1,398 @@
"""
Schema definitions: column mappings, metric definitions, school type mappings.
Single source of truth for all data transformations.
"""
# Column name mappings from DfE CSV to API field names
COLUMN_MAPPINGS = {
'URN': 'urn',
'SCHNAME': 'school_name',
'ADDRESS1': 'address1',
'ADDRESS2': 'address2',
'TOWN': 'town',
'PCODE': 'postcode',
'NFTYPE': 'school_type_code',
'RELDENOM': 'religious_denomination',
'AGERANGE': 'age_range',
'TOTPUPS': 'total_pupils',
'TELIG': 'eligible_pupils',
# Core KS2 metrics
'PTRWM_EXP': 'rwm_expected_pct',
'PTRWM_HIGH': 'rwm_high_pct',
'READPROG': 'reading_progress',
'WRITPROG': 'writing_progress',
'MATPROG': 'maths_progress',
'PTREAD_EXP': 'reading_expected_pct',
'PTWRITTA_EXP': 'writing_expected_pct',
'PTMAT_EXP': 'maths_expected_pct',
'READ_AVERAGE': 'reading_avg_score',
'MAT_AVERAGE': 'maths_avg_score',
'PTREAD_HIGH': 'reading_high_pct',
'PTWRITTA_HIGH': 'writing_high_pct',
'PTMAT_HIGH': 'maths_high_pct',
# GPS (Grammar, Punctuation & Spelling)
'PTGPS_EXP': 'gps_expected_pct',
'PTGPS_HIGH': 'gps_high_pct',
'GPS_AVERAGE': 'gps_avg_score',
# Science
'PTSCITA_EXP': 'science_expected_pct',
# School context
'PTFSM6CLA1A': 'disadvantaged_pct',
'PTEALGRP2': 'eal_pct',
'PSENELK': 'sen_support_pct',
'PSENELE': 'sen_ehcp_pct',
'PTMOBN': 'stability_pct',
# Gender breakdown
'PTRWM_EXP_B': 'rwm_expected_boys_pct',
'PTRWM_EXP_G': 'rwm_expected_girls_pct',
'PTRWM_HIGH_B': 'rwm_high_boys_pct',
'PTRWM_HIGH_G': 'rwm_high_girls_pct',
# Disadvantaged performance
'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
'DIFFN_RWM_EXP': 'disadvantaged_gap',
# 3-year averages
'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
'READ_AVERAGE_3YR': 'reading_avg_3yr',
'MAT_AVERAGE_3YR': 'maths_avg_3yr',
}
# Numeric columns that need parsing
NUMERIC_COLUMNS = [
# Core metrics
'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
'writing_progress', 'maths_progress', 'reading_expected_pct',
'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
# GPS & Science
'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
# School context
'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
# Gender breakdown
'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
'rwm_high_boys_pct', 'rwm_high_girls_pct',
# Disadvantaged performance
'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
# 3-year averages
'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
]
# School type code to name mapping
SCHOOL_TYPE_MAP = {
'AC': 'Academy',
'ACC': 'Academy Converter',
'ACS': 'Academy Sponsor Led',
'CY': 'Community School',
'VA': 'Voluntary Aided',
'VC': 'Voluntary Controlled',
'FD': 'Foundation',
'F': 'Foundation',
'FS': 'Free School',
}
# Special values to treat as null
NULL_VALUES = ['SUPP', 'NE', 'NA', 'NP', 'NEW', 'LOW', '']
# KS2 Metric definitions - single source of truth
# Used by both backend API and frontend
METRIC_DEFINITIONS = {
# Expected Standard
"rwm_expected_pct": {
"name": "RWM Combined %",
"short_name": "RWM %",
"description": "% meeting expected standard in reading, writing and maths",
"type": "percentage",
"category": "expected"
},
"reading_expected_pct": {
"name": "Reading Expected %",
"short_name": "Reading %",
"description": "% meeting expected standard in reading",
"type": "percentage",
"category": "expected"
},
"writing_expected_pct": {
"name": "Writing Expected %",
"short_name": "Writing %",
"description": "% meeting expected standard in writing",
"type": "percentage",
"category": "expected"
},
"maths_expected_pct": {
"name": "Maths Expected %",
"short_name": "Maths %",
"description": "% meeting expected standard in maths",
"type": "percentage",
"category": "expected"
},
"gps_expected_pct": {
"name": "GPS Expected %",
"short_name": "GPS %",
"description": "% meeting expected standard in grammar, punctuation & spelling",
"type": "percentage",
"category": "expected"
},
"science_expected_pct": {
"name": "Science Expected %",
"short_name": "Science %",
"description": "% meeting expected standard in science",
"type": "percentage",
"category": "expected"
},
# Higher Standard
"rwm_high_pct": {
"name": "RWM Combined Higher %",
"short_name": "RWM Higher %",
"description": "% achieving higher standard in RWM combined",
"type": "percentage",
"category": "higher"
},
"reading_high_pct": {
"name": "Reading Higher %",
"short_name": "Reading Higher %",
"description": "% achieving higher standard in reading",
"type": "percentage",
"category": "higher"
},
"writing_high_pct": {
"name": "Writing Higher %",
"short_name": "Writing Higher %",
"description": "% achieving greater depth in writing",
"type": "percentage",
"category": "higher"
},
"maths_high_pct": {
"name": "Maths Higher %",
"short_name": "Maths Higher %",
"description": "% achieving higher standard in maths",
"type": "percentage",
"category": "higher"
},
"gps_high_pct": {
"name": "GPS Higher %",
"short_name": "GPS Higher %",
"description": "% achieving higher standard in GPS",
"type": "percentage",
"category": "higher"
},
# Progress Scores
"reading_progress": {
"name": "Reading Progress",
"short_name": "Reading Progress",
"description": "Progress in reading from KS1 to KS2",
"type": "score",
"category": "progress"
},
"writing_progress": {
"name": "Writing Progress",
"short_name": "Writing Progress",
"description": "Progress in writing from KS1 to KS2",
"type": "score",
"category": "progress"
},
"maths_progress": {
"name": "Maths Progress",
"short_name": "Maths Progress",
"description": "Progress in maths from KS1 to KS2",
"type": "score",
"category": "progress"
},
# Average Scores
"reading_avg_score": {
"name": "Reading Average Score",
"short_name": "Reading Avg",
"description": "Average scaled score in reading",
"type": "score",
"category": "average"
},
"maths_avg_score": {
"name": "Maths Average Score",
"short_name": "Maths Avg",
"description": "Average scaled score in maths",
"type": "score",
"category": "average"
},
"gps_avg_score": {
"name": "GPS Average Score",
"short_name": "GPS Avg",
"description": "Average scaled score in GPS",
"type": "score",
"category": "average"
},
# Gender Performance
"rwm_expected_boys_pct": {
"name": "RWM Expected % (Boys)",
"short_name": "Boys RWM %",
"description": "% of boys meeting expected standard",
"type": "percentage",
"category": "gender"
},
"rwm_expected_girls_pct": {
"name": "RWM Expected % (Girls)",
"short_name": "Girls RWM %",
"description": "% of girls meeting expected standard",
"type": "percentage",
"category": "gender"
},
"rwm_high_boys_pct": {
"name": "RWM Higher % (Boys)",
"short_name": "Boys Higher %",
"description": "% of boys at higher standard",
"type": "percentage",
"category": "gender"
},
"rwm_high_girls_pct": {
"name": "RWM Higher % (Girls)",
"short_name": "Girls Higher %",
"description": "% of girls at higher standard",
"type": "percentage",
"category": "gender"
},
# Disadvantaged Performance
"rwm_expected_disadvantaged_pct": {
"name": "RWM Expected % (Disadvantaged)",
"short_name": "Disadvantaged %",
"description": "% of disadvantaged pupils meeting expected",
"type": "percentage",
"category": "equity"
},
"rwm_expected_non_disadvantaged_pct": {
"name": "RWM Expected % (Non-Disadvantaged)",
"short_name": "Non-Disadv %",
"description": "% of non-disadvantaged pupils meeting expected",
"type": "percentage",
"category": "equity"
},
"disadvantaged_gap": {
"name": "Disadvantaged Gap",
"short_name": "Disadv Gap",
"description": "Gap between disadvantaged and national non-disadvantaged",
"type": "score",
"category": "equity"
},
# School Context
"disadvantaged_pct": {
"name": "% Disadvantaged Pupils",
"short_name": "% Disadvantaged",
"description": "% of pupils eligible for free school meals or looked after",
"type": "percentage",
"category": "context"
},
"eal_pct": {
"name": "% EAL Pupils",
"short_name": "% EAL",
"description": "% of pupils with English as additional language",
"type": "percentage",
"category": "context"
},
"sen_support_pct": {
"name": "% SEN Support",
"short_name": "% SEN",
"description": "% of pupils with SEN support",
"type": "percentage",
"category": "context"
},
"stability_pct": {
"name": "% Pupil Stability",
"short_name": "% Stable",
"description": "% of non-mobile pupils (stayed at school)",
"type": "percentage",
"category": "context"
},
# 3-Year Averages
"rwm_expected_3yr_pct": {
"name": "RWM Expected % (3-Year Avg)",
"short_name": "RWM 3yr %",
"description": "3-year average % meeting expected",
"type": "percentage",
"category": "trends"
},
"reading_avg_3yr": {
"name": "Reading Score (3-Year Avg)",
"short_name": "Reading 3yr",
"description": "3-year average reading score",
"type": "score",
"category": "trends"
},
"maths_avg_3yr": {
"name": "Maths Score (3-Year Avg)",
"short_name": "Maths 3yr",
"description": "3-year average maths score",
"type": "score",
"category": "trends"
},
}
# Ranking columns to include in rankings response
RANKING_COLUMNS = [
"urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
# Core expected
"rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
"gps_expected_pct", "science_expected_pct",
# Core higher
"rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
# Progress & averages
"reading_progress", "writing_progress", "maths_progress",
"reading_avg_score", "maths_avg_score", "gps_avg_score",
# Gender
"rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
# Equity
"rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
# Context
"disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
# 3-year
"rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]
# School listing columns
SCHOOL_COLUMNS = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
# Local Authority code to name mapping (for fallback when LANAME column missing)
# Source: https://www.gov.uk/government/publications/local-authority-codes
LA_CODE_TO_NAME = {
201: "City of London", 202: "Camden", 203: "Greenwich", 204: "Hackney",
205: "Hammersmith and Fulham", 206: "Islington", 207: "Kensington and Chelsea",
208: "Lambeth", 209: "Lewisham", 210: "Southwark", 211: "Tower Hamlets",
212: "Wandsworth", 213: "Westminster", 301: "Barking and Dagenham", 302: "Barnet",
303: "Bexley", 304: "Brent", 305: "Bromley", 306: "Croydon", 307: "Ealing",
308: "Enfield", 309: "Haringey", 310: "Harrow", 311: "Havering", 312: "Hillingdon",
313: "Hounslow", 314: "Kingston upon Thames", 315: "Merton", 316: "Newham",
317: "Redbridge", 318: "Richmond upon Thames", 319: "Sutton", 320: "Waltham Forest",
330: "Birmingham", 331: "Coventry", 332: "Dudley", 333: "Sandwell", 334: "Solihull",
335: "Walsall", 336: "Wolverhampton", 340: "Knowsley", 341: "Liverpool",
342: "St. Helens", 343: "Sefton", 344: "Wirral", 350: "Bolton", 351: "Bury",
352: "Manchester", 353: "Oldham", 354: "Rochdale", 355: "Salford", 356: "Stockport",
357: "Tameside", 358: "Trafford", 359: "Wigan", 370: "Barnsley", 371: "Doncaster",
372: "Rotherham", 373: "Sheffield", 380: "Bradford", 381: "Calderdale",
382: "Kirklees", 383: "Leeds", 384: "Wakefield", 390: "Gateshead",
391: "Newcastle upon Tyne", 392: "North Tyneside", 393: "South Tyneside",
394: "Sunderland", 420: "Isles of Scilly", 800: "Bath and North East Somerset",
801: "Bristol, City of", 802: "North Somerset", 803: "South Gloucestershire",
805: "Hartlepool", 806: "Middlesbrough", 807: "Redcar and Cleveland",
808: "Stockton-on-Tees", 810: "Kingston Upon Hull, City of", 811: "East Riding of Yorkshire",
812: "North East Lincolnshire", 813: "North Lincolnshire", 815: "North Yorkshire",
816: "York", 820: "Bedford", 821: "Central Bedfordshire", 822: "Luton",
825: "Buckinghamshire", 826: "Milton Keynes", 830: "Derbyshire", 831: "Derby",
835: "Dorset", 836: "Bournemouth, Christchurch and Poole", 837: "Poole",
838: "Bournemouth", 839: "Durham", 840: "Darlington", 841: "East Sussex",
845: "Brighton and Hove", 846: "Hampshire", 850: "Portsmouth", 851: "Southampton",
852: "Isle of Wight", 855: "Leicestershire", 856: "Leicester", 857: "Rutland",
860: "Staffordshire", 861: "Stoke-on-Trent", 865: "Wiltshire", 866: "Swindon",
867: "Bracknell Forest", 868: "Windsor and Maidenhead", 869: "West Berkshire",
870: "Reading", 871: "Slough", 872: "Wokingham", 873: "Cambridgeshire",
874: "Peterborough", 876: "Halton", 877: "Warrington", 878: "Devon",
879: "Plymouth", 880: "Torbay", 881: "Essex", 882: "Southend-on-Sea",
883: "Thurrock", 884: "Herefordshire", 885: "Worcestershire", 886: "Kent",
887: "Medway", 888: "Lancashire", 889: "Blackburn with Darwen", 890: "Blackpool",
891: "Nottinghamshire", 892: "Nottingham", 893: "Shropshire", 894: "Telford and Wrekin",
895: "Cheshire East", 896: "Cheshire West and Chester", 908: "Cornwall",
909: "Cumbria", 916: "Gloucestershire", 919: "Hertfordshire", 921: "Norfolk",
925: "Lincolnshire", 926: "Northamptonshire", 928: "Northumberland",
929: "Oxfordshire", 931: "Somerset", 933: "Suffolk", 935: "Surrey",
936: "Warwickshire", 937: "West Sussex", 938: "Westmorland and Furness",
940: "Cumberland",
# Additional codes
420: "Isles of Scilly",
}

37
backend/utils.py Normal file
View File

@@ -0,0 +1,37 @@
"""
Utility functions for data conversion and JSON serialization.
"""
import pandas as pd
import numpy as np
from typing import Any, List
def convert_to_native(value: Any) -> Any:
"""Convert numpy types to native Python types for JSON serialization."""
if pd.isna(value):
return None
if isinstance(value, (np.integer,)):
return int(value)
if isinstance(value, (np.floating,)):
if np.isnan(value) or np.isinf(value):
return None
return float(value)
if isinstance(value, np.ndarray):
return value.tolist()
if value == "SUPP" or value == "NE" or value == "NA" or value == "NP":
return None
return value
def clean_for_json(df: pd.DataFrame) -> List[dict]:
"""Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
records = df.to_dict(orient="records")
cleaned = []
for record in records:
clean_record = {}
for key, value in record.items():
clean_record[key] = convert_to_native(value)
cleaned.append(clean_record)
return cleaned