Refactoring and bug fixes
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 1m7s
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 1m7s
This commit is contained in:
2
backend/__init__.py
Normal file
2
backend/__init__.py
Normal file
@@ -0,0 +1,2 @@
|
||||
# Backend package
|
||||
|
||||
408
backend/app.py
408
backend/app.py
@@ -4,284 +4,83 @@ Serves primary school (KS2) performance data for comparing schools.
|
||||
Uses real data from UK Government Compare School Performance downloads.
|
||||
"""
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
from fastapi import FastAPI, HTTPException, Query
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from typing import Optional
|
||||
import os
|
||||
import re
|
||||
|
||||
# No longer filtering by specific LA codes - load all available schools
|
||||
from .config import settings
|
||||
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
|
||||
from .data_loader import load_school_data, clear_cache
|
||||
from .utils import clean_for_json
|
||||
|
||||
|
||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events.

    Passed to FastAPI(lifespan=...): everything before `yield` runs once at
    startup, everything after it once at shutdown.
    """
    # Startup: pre-load data so the first request doesn't pay the CSV-load cost
    print("Starting up: Loading school data...")
    load_school_data()
    print("Data loaded successfully.")

    yield  # Application runs here

    # Shutdown: cleanup if needed
    print("Shutting down...")
|
||||
|
||||
|
||||
# Application instance. The rendered diff had both version="1.0.0" (without a
# trailing comma) and version="2.0.0" in the same call — a duplicate-keyword /
# syntax error; the newer value and the lifespan handler are kept.
app = FastAPI(
    title="SchoolCompare API",
    description="API for comparing primary school (KS2) performance data - schoolcompare.co.uk",
    version="2.0.0",
    lifespan=lifespan,
)
|
||||
|
||||
# CORS middleware with configurable origins.
# The diff juxtaposed allow_origins=["*"] (old) with the settings-based list
# (new), a duplicate keyword argument; keep the configured origins — a "*"
# origin combined with allow_credentials=True is also rejected by browsers
# and unsafe.
app.add_middleware(
    CORSMiddleware,
    allow_origins=settings.allowed_origins,
    allow_credentials=True,
    allow_methods=["*"],
    allow_headers=["*"],
)
|
||||
|
||||
# Data directory
|
||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||
FRONTEND_DIR = Path(__file__).parent.parent / "frontend"
|
||||
|
||||
# Cache for loaded data - cleared on reload (updated for 2016-2017 data)
|
||||
_data_cache: Optional[pd.DataFrame] = None
|
||||
|
||||
|
||||
def convert_to_native(value):
    """Convert a single cell value to a JSON-serializable native Python value.

    Returns None for missing data (NaN/inf/NaT) and for DfE suppression
    markers ("SUPP", "NE", "NA", "NP"); unwraps numpy scalars and arrays.
    """
    # Arrays must be checked before pd.isna(): pd.isna(ndarray) returns an
    # elementwise boolean array, which is ambiguous in `if` and raises — the
    # original ndarray branch below pd.isna() was unreachable for arrays.
    if isinstance(value, np.ndarray):
        return value.tolist()
    if pd.isna(value):
        return None
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, np.floating):
        # NaN is already handled by pd.isna above; still guard infinities.
        return None if np.isinf(value) else float(value)
    # Plain Python float infinities previously slipped through and broke
    # JSON encoding downstream.
    if isinstance(value, float) and np.isinf(value):
        return None
    if value in ("SUPP", "NE", "NA", "NP"):
        return None
    return value
|
||||
|
||||
|
||||
def clean_for_json(df: pd.DataFrame) -> list:
    """Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization."""
    # Each record is rebuilt field-by-field through convert_to_native, which
    # handles numpy scalars, NaN/inf and DfE suppression markers.
    return [
        {field: convert_to_native(raw) for field, raw in record.items()}
        for record in df.to_dict(orient="records")
    ]
|
||||
|
||||
|
||||
def parse_numeric(value):
    """Parse a value to numeric, handling SUPP, NE, NA, %, etc.

    Returns the numeric value, or None for missing data, suppression
    markers, and anything unparseable.
    """
    if pd.isna(value):
        return None
    if isinstance(value, (int, float)):
        # Already numeric — just reject NaN/inf.
        return None if np.isnan(value) or np.isinf(value) else value
    if not isinstance(value, str):
        return None

    text = value.strip()
    # DfE suppression / not-applicable markers count as missing.
    if text in {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}:
        return None
    # Percentages arrive as e.g. "72%"; drop the suffix before parsing.
    if text.endswith('%'):
        text = text[:-1]
    try:
        return float(text)
    except ValueError:
        return None
|
||||
|
||||
|
||||
def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract the end year from folder name like '2023-2024' -> 2024."""
    found = re.search(r'(\d{4})-(\d{4})', folder_name)
    # The academic-year folder is named start-end; the end year labels the data.
    return int(found.group(2)) if found else None
|
||||
|
||||
|
||||
def load_school_data() -> pd.DataFrame:
    """Load and combine all school data from CSV files in year folders.

    Scans DATA_DIR for folders named like '2023-2024', reads each year's
    england_ks2final.csv, normalizes DfE column names to API field names,
    and caches the concatenated result in the module-level _data_cache.

    Returns:
        pd.DataFrame: one row per school per year; empty if no files found.
    """
    global _data_cache

    # Serve from cache after the first load; reset _data_cache to force a reload.
    if _data_cache is not None:
        return _data_cache

    all_data = []

    # Look for year folders in data directory
    if DATA_DIR.exists():
        for year_folder in DATA_DIR.iterdir():
            # Only directories named like '2016-2017' are treated as year folders.
            if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
                year = extract_year_from_folder(year_folder.name)
                if year is None:
                    continue

                # Look for KS2 data file
                ks2_file = year_folder / "england_ks2final.csv"
                if ks2_file.exists():
                    try:
                        print(f"Loading data from {ks2_file}")
                        df = pd.read_csv(ks2_file, low_memory=False)

                        # Handle both string and integer columns
                        # (some release years ship LEA/URN as text).
                        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
                            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
                        if 'URN' in df.columns and df['URN'].dtype == 'object':
                            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')

                        # Filter to schools only (RECTYPE == 1 means school level data)
                        if 'RECTYPE' in df.columns:
                            df = df[df['RECTYPE'] == 1]

                        # Add year and local authority name from LANAME column
                        # (fall back to the numeric LEA code as a string).
                        df['year'] = year
                        if 'LANAME' in df.columns:
                            df['local_authority'] = df['LANAME']
                        elif 'LEA' in df.columns:
                            df['local_authority'] = df['LEA'].astype(str)

                        # Standardize column names for our API
                        df = df.rename(columns={
                            'URN': 'urn',
                            'SCHNAME': 'school_name',
                            'ADDRESS1': 'address1',
                            'ADDRESS2': 'address2',
                            'TOWN': 'town',
                            'PCODE': 'postcode',
                            'NFTYPE': 'school_type_code',
                            'RELDENOM': 'religious_denomination',
                            'AGERANGE': 'age_range',
                            'TOTPUPS': 'total_pupils',
                            'TELIG': 'eligible_pupils',
                            # Core KS2 metrics
                            'PTRWM_EXP': 'rwm_expected_pct',
                            'PTRWM_HIGH': 'rwm_high_pct',
                            'READPROG': 'reading_progress',
                            'WRITPROG': 'writing_progress',
                            'MATPROG': 'maths_progress',
                            'PTREAD_EXP': 'reading_expected_pct',
                            'PTWRITTA_EXP': 'writing_expected_pct',
                            'PTMAT_EXP': 'maths_expected_pct',
                            'READ_AVERAGE': 'reading_avg_score',
                            'MAT_AVERAGE': 'maths_avg_score',
                            'PTREAD_HIGH': 'reading_high_pct',
                            'PTWRITTA_HIGH': 'writing_high_pct',
                            'PTMAT_HIGH': 'maths_high_pct',
                            # GPS (Grammar, Punctuation & Spelling)
                            'PTGPS_EXP': 'gps_expected_pct',
                            'PTGPS_HIGH': 'gps_high_pct',
                            'GPS_AVERAGE': 'gps_avg_score',
                            # Science
                            'PTSCITA_EXP': 'science_expected_pct',
                            # School context
                            'PTFSM6CLA1A': 'disadvantaged_pct',
                            'PTEALGRP2': 'eal_pct',
                            'PSENELK': 'sen_support_pct',
                            'PSENELE': 'sen_ehcp_pct',
                            'PTMOBN': 'stability_pct',
                            # Gender breakdown
                            'PTRWM_EXP_B': 'rwm_expected_boys_pct',
                            'PTRWM_EXP_G': 'rwm_expected_girls_pct',
                            'PTRWM_HIGH_B': 'rwm_high_boys_pct',
                            'PTRWM_HIGH_G': 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
                            'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
                            'DIFFN_RWM_EXP': 'disadvantaged_gap',
                            # 3-year averages
                            'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
                            'READ_AVERAGE_3YR': 'reading_avg_3yr',
                            'MAT_AVERAGE_3YR': 'maths_avg_3yr',
                        })

                        # Create address field
                        def make_address(row):
                            # Join the non-empty address parts with ", ".
                            parts = []
                            if pd.notna(row.get('address1')) and row.get('address1'):
                                parts.append(str(row['address1']))
                            if pd.notna(row.get('town')) and row.get('town'):
                                parts.append(str(row['town']))
                            if pd.notna(row.get('postcode')) and row.get('postcode'):
                                parts.append(str(row['postcode']))
                            return ', '.join(parts) if parts else ''

                        df['address'] = df.apply(make_address, axis=1)

                        # Map school type codes to names
                        school_type_map = {
                            'AC': 'Academy', 'ACC': 'Academy Converter', 'ACS': 'Academy Sponsor Led',
                            'CY': 'Community School', 'VA': 'Voluntary Aided', 'VC': 'Voluntary Controlled',
                            'FD': 'Foundation', 'F': 'Foundation', 'FS': 'Free School',
                        }
                        df['school_type'] = df['school_type_code'].map(school_type_map).fillna('Other')

                        # Parse numeric columns (handles SUPP/NE/NA markers and % suffixes)
                        numeric_cols = [
                            # Core metrics
                            'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
                            'writing_progress', 'maths_progress', 'reading_expected_pct',
                            'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
                            'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
                            # GPS & Science
                            'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
                            # School context
                            'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
                            'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
                            # Gender breakdown
                            'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
                            'rwm_high_boys_pct', 'rwm_high_girls_pct',
                            # Disadvantaged performance
                            'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
                            # 3-year averages
                            'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
                        ]

                        for col in numeric_cols:
                            if col in df.columns:
                                df[col] = df[col].apply(parse_numeric)

                        all_data.append(df)
                        print(f" Loaded {len(df)} schools for year {year}")

                    except Exception as e:
                        # Best-effort: a bad file is reported and skipped, not fatal.
                        print(f"Error loading {ks2_file}: {e}")

    if all_data:
        _data_cache = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(_data_cache)}")
        print(f"Unique schools: {_data_cache['urn'].nunique()}")
        print(f"Years: {sorted(_data_cache['year'].unique())}")
    else:
        print("No data files found. Creating empty DataFrame.")
        _data_cache = pd.DataFrame()

    return _data_cache
|
||||
|
||||
|
||||
@app.get("/")
|
||||
async def root():
|
||||
"""Serve the frontend."""
|
||||
return FileResponse(FRONTEND_DIR / "index.html")
|
||||
return FileResponse(settings.frontend_dir / "index.html")
|
||||
|
||||
|
||||
@app.get("/api/schools")
|
||||
async def get_schools(
|
||||
search: Optional[str] = Query(None, description="Search by school name"),
|
||||
local_authority: Optional[str] = Query(None, description="Filter by local authority (Wandsworth or Merton)"),
|
||||
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
|
||||
school_type: Optional[str] = Query(None, description="Filter by school type"),
|
||||
page: int = Query(1, ge=1, description="Page number"),
|
||||
page_size: int = Query(None, ge=1, le=100, description="Results per page"),
|
||||
):
|
||||
"""Get list of unique primary schools in Wandsworth and Merton."""
|
||||
"""
|
||||
Get list of unique primary schools with pagination.
|
||||
|
||||
Returns paginated results with total count for efficient loading.
|
||||
"""
|
||||
df = load_school_data()
|
||||
|
||||
if df.empty:
|
||||
return {"schools": []}
|
||||
return {"schools": [], "total": 0, "page": page, "page_size": 0}
|
||||
|
||||
# Use configured default if not specified
|
||||
if page_size is None:
|
||||
page_size = settings.default_page_size
|
||||
|
||||
# Get unique schools (latest year data for each)
|
||||
latest_year = df.groupby('urn')['year'].max().reset_index()
|
||||
df_latest = df.merge(latest_year, on=['urn', 'year'])
|
||||
|
||||
school_cols = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
|
||||
available_cols = [c for c in school_cols if c in df_latest.columns]
|
||||
available_cols = [c for c in SCHOOL_COLUMNS if c in df_latest.columns]
|
||||
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
|
||||
|
||||
# Apply filters
|
||||
@@ -298,7 +97,19 @@ async def get_schools(
|
||||
if school_type:
|
||||
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
|
||||
|
||||
return {"schools": clean_for_json(schools_df)}
|
||||
# Pagination
|
||||
total = len(schools_df)
|
||||
start_idx = (page - 1) * page_size
|
||||
end_idx = start_idx + page_size
|
||||
schools_df = schools_df.iloc[start_idx:end_idx]
|
||||
|
||||
return {
|
||||
"schools": clean_for_json(schools_df),
|
||||
"total": total,
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
|
||||
}
|
||||
|
||||
|
||||
@app.get("/api/schools/{urn}")
|
||||
@@ -390,56 +201,18 @@ async def get_filter_options():
|
||||
|
||||
@app.get("/api/metrics")
|
||||
async def get_available_metrics():
|
||||
"""Get list of available KS2 performance metrics for primary schools."""
|
||||
"""
|
||||
Get list of available KS2 performance metrics for primary schools.
|
||||
|
||||
This is the single source of truth for metric definitions.
|
||||
Frontend should consume this to avoid duplication.
|
||||
"""
|
||||
df = load_school_data()
|
||||
|
||||
# Define KS2 metric metadata organized by category
|
||||
metric_info = {
|
||||
# Expected Standard
|
||||
"rwm_expected_pct": {"name": "RWM Combined %", "description": "% meeting expected standard in reading, writing and maths", "type": "percentage", "category": "expected"},
|
||||
"reading_expected_pct": {"name": "Reading Expected %", "description": "% meeting expected standard in reading", "type": "percentage", "category": "expected"},
|
||||
"writing_expected_pct": {"name": "Writing Expected %", "description": "% meeting expected standard in writing", "type": "percentage", "category": "expected"},
|
||||
"maths_expected_pct": {"name": "Maths Expected %", "description": "% meeting expected standard in maths", "type": "percentage", "category": "expected"},
|
||||
"gps_expected_pct": {"name": "GPS Expected %", "description": "% meeting expected standard in grammar, punctuation & spelling", "type": "percentage", "category": "expected"},
|
||||
"science_expected_pct": {"name": "Science Expected %", "description": "% meeting expected standard in science", "type": "percentage", "category": "expected"},
|
||||
# Higher Standard
|
||||
"rwm_high_pct": {"name": "RWM Combined Higher %", "description": "% achieving higher standard in RWM combined", "type": "percentage", "category": "higher"},
|
||||
"reading_high_pct": {"name": "Reading Higher %", "description": "% achieving higher standard in reading", "type": "percentage", "category": "higher"},
|
||||
"writing_high_pct": {"name": "Writing Higher %", "description": "% achieving greater depth in writing", "type": "percentage", "category": "higher"},
|
||||
"maths_high_pct": {"name": "Maths Higher %", "description": "% achieving higher standard in maths", "type": "percentage", "category": "higher"},
|
||||
"gps_high_pct": {"name": "GPS Higher %", "description": "% achieving higher standard in GPS", "type": "percentage", "category": "higher"},
|
||||
# Progress Scores
|
||||
"reading_progress": {"name": "Reading Progress", "description": "Progress in reading from KS1 to KS2", "type": "score", "category": "progress"},
|
||||
"writing_progress": {"name": "Writing Progress", "description": "Progress in writing from KS1 to KS2", "type": "score", "category": "progress"},
|
||||
"maths_progress": {"name": "Maths Progress", "description": "Progress in maths from KS1 to KS2", "type": "score", "category": "progress"},
|
||||
# Average Scores
|
||||
"reading_avg_score": {"name": "Reading Avg Score", "description": "Average scaled score in reading", "type": "score", "category": "average"},
|
||||
"maths_avg_score": {"name": "Maths Avg Score", "description": "Average scaled score in maths", "type": "score", "category": "average"},
|
||||
"gps_avg_score": {"name": "GPS Avg Score", "description": "Average scaled score in GPS", "type": "score", "category": "average"},
|
||||
# Gender Performance
|
||||
"rwm_expected_boys_pct": {"name": "RWM Expected % (Boys)", "description": "% of boys meeting expected standard", "type": "percentage", "category": "gender"},
|
||||
"rwm_expected_girls_pct": {"name": "RWM Expected % (Girls)", "description": "% of girls meeting expected standard", "type": "percentage", "category": "gender"},
|
||||
"rwm_high_boys_pct": {"name": "RWM Higher % (Boys)", "description": "% of boys at higher standard", "type": "percentage", "category": "gender"},
|
||||
"rwm_high_girls_pct": {"name": "RWM Higher % (Girls)", "description": "% of girls at higher standard", "type": "percentage", "category": "gender"},
|
||||
# Disadvantaged Performance
|
||||
"rwm_expected_disadvantaged_pct": {"name": "RWM Expected % (Disadvantaged)", "description": "% of disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
|
||||
"rwm_expected_non_disadvantaged_pct": {"name": "RWM Expected % (Non-Disadvantaged)", "description": "% of non-disadvantaged pupils meeting expected", "type": "percentage", "category": "equity"},
|
||||
"disadvantaged_gap": {"name": "Disadvantaged Gap", "description": "Gap between disadvantaged and national non-disadvantaged", "type": "score", "category": "equity"},
|
||||
# School Context
|
||||
"disadvantaged_pct": {"name": "% Disadvantaged Pupils", "description": "% of pupils eligible for free school meals or looked after", "type": "percentage", "category": "context"},
|
||||
"eal_pct": {"name": "% EAL Pupils", "description": "% of pupils with English as additional language", "type": "percentage", "category": "context"},
|
||||
"sen_support_pct": {"name": "% SEN Support", "description": "% of pupils with SEN support", "type": "percentage", "category": "context"},
|
||||
"stability_pct": {"name": "% Pupil Stability", "description": "% of non-mobile pupils (stayed at school)", "type": "percentage", "category": "context"},
|
||||
# 3-Year Averages
|
||||
"rwm_expected_3yr_pct": {"name": "RWM Expected % (3-Year Avg)", "description": "3-year average % meeting expected", "type": "percentage", "category": "trends"},
|
||||
"reading_avg_3yr": {"name": "Reading Score (3-Year Avg)", "description": "3-year average reading score", "type": "score", "category": "trends"},
|
||||
"maths_avg_3yr": {"name": "Maths Score (3-Year Avg)", "description": "3-year average maths score", "type": "score", "category": "trends"},
|
||||
}
|
||||
|
||||
available = []
|
||||
for col, info in metric_info.items():
|
||||
if df.empty or col in df.columns:
|
||||
available.append({"key": col, **info})
|
||||
for key, info in METRIC_DEFINITIONS.items():
|
||||
if df.empty or key in df.columns:
|
||||
available.append({"key": key, **info})
|
||||
|
||||
return {"metrics": available}
|
||||
|
||||
@@ -448,13 +221,14 @@ async def get_available_metrics():
|
||||
async def get_rankings(
|
||||
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
|
||||
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
|
||||
limit: int = Query(20, description="Number of schools to return"),
|
||||
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
|
||||
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
|
||||
):
|
||||
"""Get primary school rankings by a specific KS2 metric."""
|
||||
df = load_school_data()
|
||||
|
||||
if df.empty:
|
||||
return {"metric": metric, "year": None, "rankings": []}
|
||||
return {"metric": metric, "year": None, "rankings": [], "total": 0}
|
||||
|
||||
if metric not in df.columns:
|
||||
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
|
||||
@@ -467,39 +241,26 @@ async def get_rankings(
|
||||
max_year = df["year"].max()
|
||||
df = df[df["year"] == max_year]
|
||||
|
||||
# Filter by local authority if specified
|
||||
if local_authority:
|
||||
df = df[df["local_authority"].str.lower() == local_authority.lower()]
|
||||
|
||||
# Sort and rank (exclude rows with no data for this metric)
|
||||
df = df.dropna(subset=[metric])
|
||||
total = len(df)
|
||||
|
||||
# For progress scores, higher is better. For percentages, higher is also better.
|
||||
df = df.sort_values(metric, ascending=False).head(limit)
|
||||
|
||||
# Return only relevant fields for rankings
|
||||
ranking_cols = [
|
||||
"urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
|
||||
# Core expected
|
||||
"rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
|
||||
"gps_expected_pct", "science_expected_pct",
|
||||
# Core higher
|
||||
"rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
|
||||
# Progress & averages
|
||||
"reading_progress", "writing_progress", "maths_progress",
|
||||
"reading_avg_score", "maths_avg_score", "gps_avg_score",
|
||||
# Gender
|
||||
"rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
|
||||
# Equity
|
||||
"rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
|
||||
# Context
|
||||
"disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
|
||||
# 3-year
|
||||
"rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
|
||||
]
|
||||
available_cols = [c for c in ranking_cols if c in df.columns]
|
||||
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
|
||||
df = df[available_cols]
|
||||
|
||||
return {
|
||||
"metric": metric,
|
||||
"year": int(df["year"].iloc[0]) if not df.empty else None,
|
||||
"rankings": clean_for_json(df)
|
||||
"rankings": clean_for_json(df),
|
||||
"total": total,
|
||||
}
|
||||
|
||||
|
||||
@@ -512,7 +273,7 @@ async def get_data_info():
|
||||
return {
|
||||
"status": "no_data",
|
||||
"message": "No data files found in data folder. Please download KS2 data from the government website.",
|
||||
"data_folder": str(DATA_DIR),
|
||||
"data_folder": str(settings.data_dir),
|
||||
}
|
||||
|
||||
years = [int(y) for y in sorted(df["year"].unique())]
|
||||
@@ -529,17 +290,22 @@ async def get_data_info():
|
||||
}
|
||||
|
||||
|
||||
# Mount static files
|
||||
@app.on_event("startup")
|
||||
async def startup():
|
||||
"""Setup static file serving and load data on startup."""
|
||||
if FRONTEND_DIR.exists():
|
||||
app.mount("/static", StaticFiles(directory=FRONTEND_DIR), name="static")
|
||||
|
||||
# Pre-load data
|
||||
@app.post("/api/admin/reload")
|
||||
async def reload_data():
|
||||
"""Admin endpoint to force data reload (useful after data updates)."""
|
||||
clear_cache()
|
||||
load_school_data()
|
||||
return {"status": "reloaded"}
|
||||
|
||||
|
||||
# Mount static files after all routes are defined
@app.on_event("startup")
async def mount_static():
    """Mount static file serving."""
    # Guard clause: nothing to serve when the frontend directory is absent.
    if not settings.frontend_dir.exists():
        return
    app.mount("/static", StaticFiles(directory=settings.frontend_dir), name="static")
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
import uvicorn
|
||||
uvicorn.run(app, host="0.0.0.0", port=8000)
|
||||
uvicorn.run(app, host=settings.host, port=settings.port)
|
||||
|
||||
38
backend/config.py
Normal file
38
backend/config.py
Normal file
@@ -0,0 +1,38 @@
|
||||
"""
|
||||
Application configuration using pydantic-settings.
|
||||
Loads from environment variables and .env file.
|
||||
"""
|
||||
|
||||
from pathlib import Path
|
||||
from typing import List
|
||||
from pydantic_settings import BaseSettings
|
||||
import os
|
||||
|
||||
|
||||
class Settings(BaseSettings):
    """Application settings loaded from environment.

    Field defaults below can be overridden via environment variables or a
    local .env file (pydantic-settings).
    """

    # Paths — resolved relative to the repository root (one level above this package)
    data_dir: Path = Path(__file__).parent.parent / "data"
    frontend_dir: Path = Path(__file__).parent.parent / "frontend"

    # Server bind address/port
    host: str = "0.0.0.0"
    port: int = 80

    # CORS — origins permitted by the CORS middleware
    allowed_origins: List[str] = ["https://schoolcompare.co.uk", "http://localhost:8000", "http://localhost:3000"]

    # API pagination defaults/limits
    default_page_size: int = 50
    max_page_size: int = 100

    class Config:
        # Read overrides from .env; ignore unknown keys rather than failing.
        env_file = ".env"
        env_file_encoding = "utf-8"
        extra = "ignore"


# Singleton instance
settings = Settings()
|
||||
|
||||
196
backend/data_loader.py
Normal file
196
backend/data_loader.py
Normal file
@@ -0,0 +1,196 @@
|
||||
"""
|
||||
Data loading module with optimized pandas operations.
|
||||
Uses vectorized operations instead of .apply() for performance.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from pathlib import Path
|
||||
from functools import lru_cache
|
||||
import re
|
||||
from typing import Optional
|
||||
|
||||
from .config import settings
|
||||
from .schemas import (
|
||||
COLUMN_MAPPINGS,
|
||||
NUMERIC_COLUMNS,
|
||||
SCHOOL_TYPE_MAP,
|
||||
NULL_VALUES,
|
||||
LA_CODE_TO_NAME,
|
||||
)
|
||||
|
||||
|
||||
def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract the end year from folder name like '2023-2024' -> 2024."""
    # Academic-year folders are named start-end; the end year labels the data.
    if (m := re.search(r'(\d{4})-(\d{4})', folder_name)) is not None:
        return int(m.group(2))
    return None
|
||||
|
||||
|
||||
def parse_numeric_vectorized(series: pd.Series) -> pd.Series:
    """
    Vectorized numeric parsing - much faster than .apply().
    Handles SUPP, NE, NA, NP, %, etc.

    Returns a numeric Series; null markers and unparseable values become NaN.
    """
    # Strip whitespace up front so padded markers like " SUPP" are matched —
    # consistent with the legacy scalar parse_numeric, which stripped first.
    str_series = series.astype(str).str.strip()

    # Blank out every null marker in a single vectorized pass instead of one
    # full-Series .replace() call per marker.
    str_series = str_series.mask(str_series.isin(NULL_VALUES))

    # Remove trailing % signs (e.g. "72%")
    str_series = str_series.str.rstrip('%')

    # Masked entries and anything else unparseable coerce to NaN.
    return pd.to_numeric(str_series, errors='coerce')
|
||||
|
||||
|
||||
def create_address_vectorized(df: pd.DataFrame) -> pd.Series:
    """
    Build a display address per row from address1/town/postcode.

    Non-empty parts are joined with ", "; missing columns are skipped.
    """
    present = [c for c in ('address1', 'town', 'postcode') if c in df.columns]
    if not present:
        # No address components available at all — all-empty result.
        return pd.Series([''] * len(df), index=df.index)

    cleaned = [df[c].fillna('').astype(str) for c in present]

    # Row-wise join, keeping only parts that contain non-whitespace text.
    joined = [
        ', '.join(part for part in row_parts if part and part.strip())
        for row_parts in zip(*cleaned)
    ]
    return pd.Series(joined, index=df.index)
|
||||
|
||||
|
||||
def create_address_fast(df: pd.DataFrame) -> pd.Series:
    """
    Fast vectorized address creation using string concatenation.

    Joins address1, town and postcode with ", ", skipping blank/missing
    parts; missing columns are treated as empty.
    """
    def _col(name: str) -> pd.Series:
        # The fallback MUST share df's index: the previous
        # df.get(name, pd.Series([''] * len(df))) default carried a fresh
        # RangeIndex, which silently misaligned (producing NaN addresses)
        # whenever df had a filtered, non-contiguous index (e.g. after the
        # RECTYPE == 1 filter).
        if name in df.columns:
            return df[name].fillna('').astype(str).str.strip()
        return pd.Series('', index=df.index)

    result = _col('address1')

    # Append each non-empty part with a ", " separator.
    for part in (_col('town'), _col('postcode')):
        mask = part != ''
        result = result.where(~mask, result + ', ' + part)

    # Rows whose leading part was empty start with ", " — trim it.
    return result.str.lstrip(', ')
|
||||
|
||||
|
||||
def load_year_data(year_folder: Path, year: int) -> Optional[pd.DataFrame]:
    """Load and process data for a single year.

    Reads england_ks2final.csv from *year_folder*, normalizes it to API
    field names, and returns the processed DataFrame, or None when the file
    is missing or fails to load.
    """
    ks2_file = year_folder / "england_ks2final.csv"
    if not ks2_file.exists():
        return None

    try:
        print(f"Loading data from {ks2_file}")
        df = pd.read_csv(ks2_file, low_memory=False)

        # Handle column types — some release years ship LEA/URN as text.
        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
        if 'URN' in df.columns and df['URN'].dtype == 'object':
            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')

        # Filter to schools only (RECTYPE == 1 means school level data);
        # .copy() avoids SettingWithCopy warnings on the writes below.
        if 'RECTYPE' in df.columns:
            df = df[df['RECTYPE'] == 1].copy()

        # Add year and local authority name
        df['year'] = year

        # Try different column names for LA name (varies across release years)
        la_name_cols = ['LANAME', 'LA (name)', 'LA_NAME', 'LA NAME']
        la_col_found = None
        for col in la_name_cols:
            if col in df.columns:
                la_col_found = col
                break

        if la_col_found:
            df['local_authority'] = df[la_col_found]
        elif 'LEA' in df.columns:
            # Map LEA codes to names using our mapping; unmapped codes fall
            # back to the raw code as a string.
            df['local_authority'] = df['LEA'].map(LA_CODE_TO_NAME).fillna(df['LEA'].astype(str))

        # Rename columns using mapping (only keys actually present)
        rename_dict = {k: v for k, v in COLUMN_MAPPINGS.items() if k in df.columns}
        df = df.rename(columns=rename_dict)

        # Create address field (vectorized)
        df['address'] = create_address_fast(df)

        # Map school type codes to names (vectorized)
        if 'school_type_code' in df.columns:
            df['school_type'] = df['school_type_code'].map(SCHOOL_TYPE_MAP).fillna('Other')

        # Parse numeric columns (vectorized - much faster than .apply())
        for col in NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = parse_numeric_vectorized(df[col])

        print(f" Loaded {len(df)} schools for year {year}")
        return df

    except Exception as e:
        # Best-effort: a bad file is reported and skipped, not fatal.
        print(f"Error loading {ks2_file}: {e}")
        return None
|
||||
|
||||
|
||||
@lru_cache(maxsize=1)
def load_school_data() -> pd.DataFrame:
    """
    Load and combine all school data from CSV files in year folders.
    Uses lru_cache for singleton-like behavior.
    """
    frames = []

    data_dir = settings.data_dir
    if data_dir.exists():
        # Only directories named like '2016-2017' count as year folders.
        candidates = (
            entry for entry in data_dir.iterdir()
            if entry.is_dir() and re.match(r'\d{4}-\d{4}', entry.name)
        )
        for folder in candidates:
            end_year = extract_year_from_folder(folder.name)
            if end_year is None:
                continue
            frame = load_year_data(folder, end_year)
            if frame is not None:
                frames.append(frame)

    if not frames:
        print("No data files found. Creating empty DataFrame.")
        return pd.DataFrame()

    combined = pd.concat(frames, ignore_index=True)
    print(f"\nTotal records loaded: {len(combined)}")
    print(f"Unique schools: {combined['urn'].nunique()}")
    print(f"Years: {sorted(combined['year'].unique())}")
    return combined
|
||||
|
||||
|
||||
def clear_cache():
    """Clear the data cache to force reload.

    Drops the lru_cache entry on load_school_data so the next call re-reads
    the CSV files from disk.
    """
    load_school_data.cache_clear()
|
||||
|
||||
398
backend/schemas.py
Normal file
398
backend/schemas.py
Normal file
@@ -0,0 +1,398 @@
|
||||
"""
|
||||
Schema definitions: column mappings, metric definitions, school type mappings.
|
||||
Single source of truth for all data transformations.
|
||||
"""
|
||||
|
||||
# Column name mappings from DfE CSV to API field names.
# Keys are the raw column headers in the DfE Compare School Performance
# KS2 download CSVs; values are the snake_case field names this API exposes.
# Only columns listed here survive the rename step in the loader.
COLUMN_MAPPINGS = {
    'URN': 'urn',
    'SCHNAME': 'school_name',
    'ADDRESS1': 'address1',
    'ADDRESS2': 'address2',
    'TOWN': 'town',
    'PCODE': 'postcode',
    'NFTYPE': 'school_type_code',  # decoded via SCHOOL_TYPE_MAP
    'RELDENOM': 'religious_denomination',
    'AGERANGE': 'age_range',
    'TOTPUPS': 'total_pupils',
    'TELIG': 'eligible_pupils',
    # Core KS2 metrics
    'PTRWM_EXP': 'rwm_expected_pct',
    'PTRWM_HIGH': 'rwm_high_pct',
    'READPROG': 'reading_progress',
    'WRITPROG': 'writing_progress',
    'MATPROG': 'maths_progress',
    'PTREAD_EXP': 'reading_expected_pct',
    'PTWRITTA_EXP': 'writing_expected_pct',
    'PTMAT_EXP': 'maths_expected_pct',
    'READ_AVERAGE': 'reading_avg_score',
    'MAT_AVERAGE': 'maths_avg_score',
    'PTREAD_HIGH': 'reading_high_pct',
    'PTWRITTA_HIGH': 'writing_high_pct',
    'PTMAT_HIGH': 'maths_high_pct',
    # GPS (Grammar, Punctuation & Spelling)
    'PTGPS_EXP': 'gps_expected_pct',
    'PTGPS_HIGH': 'gps_high_pct',
    'GPS_AVERAGE': 'gps_avg_score',
    # Science
    'PTSCITA_EXP': 'science_expected_pct',
    # School context
    'PTFSM6CLA1A': 'disadvantaged_pct',
    'PTEALGRP2': 'eal_pct',
    'PSENELK': 'sen_support_pct',
    'PSENELE': 'sen_ehcp_pct',
    'PTMOBN': 'stability_pct',
    # Gender breakdown
    'PTRWM_EXP_B': 'rwm_expected_boys_pct',
    'PTRWM_EXP_G': 'rwm_expected_girls_pct',
    'PTRWM_HIGH_B': 'rwm_high_boys_pct',
    'PTRWM_HIGH_G': 'rwm_high_girls_pct',
    # Disadvantaged performance
    'PTRWM_EXP_FSM6CLA1A': 'rwm_expected_disadvantaged_pct',
    'PTRWM_EXP_NotFSM6CLA1A': 'rwm_expected_non_disadvantaged_pct',
    'DIFFN_RWM_EXP': 'disadvantaged_gap',
    # 3-year averages
    'PTRWM_EXP_3YR': 'rwm_expected_3yr_pct',
    'READ_AVERAGE_3YR': 'reading_avg_3yr',
    'MAT_AVERAGE_3YR': 'maths_avg_3yr',
}
|
||||
|
||||
# Numeric columns that need parsing.
# These are the post-rename API field names (i.e. values of COLUMN_MAPPINGS)
# that the loader passes through parse_numeric_vectorized; keep this list
# in sync with COLUMN_MAPPINGS when adding a new numeric metric.
NUMERIC_COLUMNS = [
    # Core metrics
    'rwm_expected_pct', 'rwm_high_pct', 'reading_progress',
    'writing_progress', 'maths_progress', 'reading_expected_pct',
    'writing_expected_pct', 'maths_expected_pct', 'reading_avg_score',
    'maths_avg_score', 'reading_high_pct', 'writing_high_pct', 'maths_high_pct',
    # GPS & Science
    'gps_expected_pct', 'gps_high_pct', 'gps_avg_score', 'science_expected_pct',
    # School context
    'total_pupils', 'eligible_pupils', 'disadvantaged_pct', 'eal_pct',
    'sen_support_pct', 'sen_ehcp_pct', 'stability_pct',
    # Gender breakdown
    'rwm_expected_boys_pct', 'rwm_expected_girls_pct',
    'rwm_high_boys_pct', 'rwm_high_girls_pct',
    # Disadvantaged performance
    'rwm_expected_disadvantaged_pct', 'rwm_expected_non_disadvantaged_pct', 'disadvantaged_gap',
    # 3-year averages
    'rwm_expected_3yr_pct', 'reading_avg_3yr', 'maths_avg_3yr',
]
|
||||
|
||||
# School type code to name mapping.
# Keys are NFTYPE codes from the DfE CSV (renamed to 'school_type_code' by
# COLUMN_MAPPINGS); unmapped codes fall back to 'Other' in the loader.
SCHOOL_TYPE_MAP = {
    'AC': 'Academy',
    'ACC': 'Academy Converter',
    'ACS': 'Academy Sponsor Led',
    'CY': 'Community School',
    'VA': 'Voluntary Aided',
    'VC': 'Voluntary Controlled',
    'FD': 'Foundation',
    'F': 'Foundation',  # both FD and F appear in the data for Foundation schools
    'FS': 'Free School',
}

# Special values to treat as null.
# NOTE(review): these look like DfE suppression/sentinel codes (e.g. SUPP =
# suppressed small cohort) — confirm against the DfE download metadata.
NULL_VALUES = ['SUPP', 'NE', 'NA', 'NP', 'NEW', 'LOW', '']
|
||||
|
||||
# KS2 Metric definitions - single source of truth
# Used by both backend API and frontend
#
# Each metric is described by a compact row:
#   (key, name, short_name, description, type, category)
# and expanded into the dict-of-dicts shape below, avoiding 31 repeated
# five-field literals.
_METRIC_ROWS = [
    # Expected Standard
    ("rwm_expected_pct", "RWM Combined %", "RWM %",
     "% meeting expected standard in reading, writing and maths", "percentage", "expected"),
    ("reading_expected_pct", "Reading Expected %", "Reading %",
     "% meeting expected standard in reading", "percentage", "expected"),
    ("writing_expected_pct", "Writing Expected %", "Writing %",
     "% meeting expected standard in writing", "percentage", "expected"),
    ("maths_expected_pct", "Maths Expected %", "Maths %",
     "% meeting expected standard in maths", "percentage", "expected"),
    ("gps_expected_pct", "GPS Expected %", "GPS %",
     "% meeting expected standard in grammar, punctuation & spelling", "percentage", "expected"),
    ("science_expected_pct", "Science Expected %", "Science %",
     "% meeting expected standard in science", "percentage", "expected"),
    # Higher Standard
    ("rwm_high_pct", "RWM Combined Higher %", "RWM Higher %",
     "% achieving higher standard in RWM combined", "percentage", "higher"),
    ("reading_high_pct", "Reading Higher %", "Reading Higher %",
     "% achieving higher standard in reading", "percentage", "higher"),
    ("writing_high_pct", "Writing Higher %", "Writing Higher %",
     "% achieving greater depth in writing", "percentage", "higher"),
    ("maths_high_pct", "Maths Higher %", "Maths Higher %",
     "% achieving higher standard in maths", "percentage", "higher"),
    ("gps_high_pct", "GPS Higher %", "GPS Higher %",
     "% achieving higher standard in GPS", "percentage", "higher"),
    # Progress Scores
    ("reading_progress", "Reading Progress", "Reading Progress",
     "Progress in reading from KS1 to KS2", "score", "progress"),
    ("writing_progress", "Writing Progress", "Writing Progress",
     "Progress in writing from KS1 to KS2", "score", "progress"),
    ("maths_progress", "Maths Progress", "Maths Progress",
     "Progress in maths from KS1 to KS2", "score", "progress"),
    # Average Scores
    ("reading_avg_score", "Reading Average Score", "Reading Avg",
     "Average scaled score in reading", "score", "average"),
    ("maths_avg_score", "Maths Average Score", "Maths Avg",
     "Average scaled score in maths", "score", "average"),
    ("gps_avg_score", "GPS Average Score", "GPS Avg",
     "Average scaled score in GPS", "score", "average"),
    # Gender Performance
    ("rwm_expected_boys_pct", "RWM Expected % (Boys)", "Boys RWM %",
     "% of boys meeting expected standard", "percentage", "gender"),
    ("rwm_expected_girls_pct", "RWM Expected % (Girls)", "Girls RWM %",
     "% of girls meeting expected standard", "percentage", "gender"),
    ("rwm_high_boys_pct", "RWM Higher % (Boys)", "Boys Higher %",
     "% of boys at higher standard", "percentage", "gender"),
    ("rwm_high_girls_pct", "RWM Higher % (Girls)", "Girls Higher %",
     "% of girls at higher standard", "percentage", "gender"),
    # Disadvantaged Performance
    ("rwm_expected_disadvantaged_pct", "RWM Expected % (Disadvantaged)", "Disadvantaged %",
     "% of disadvantaged pupils meeting expected", "percentage", "equity"),
    ("rwm_expected_non_disadvantaged_pct", "RWM Expected % (Non-Disadvantaged)", "Non-Disadv %",
     "% of non-disadvantaged pupils meeting expected", "percentage", "equity"),
    ("disadvantaged_gap", "Disadvantaged Gap", "Disadv Gap",
     "Gap between disadvantaged and national non-disadvantaged", "score", "equity"),
    # School Context
    ("disadvantaged_pct", "% Disadvantaged Pupils", "% Disadvantaged",
     "% of pupils eligible for free school meals or looked after", "percentage", "context"),
    ("eal_pct", "% EAL Pupils", "% EAL",
     "% of pupils with English as additional language", "percentage", "context"),
    ("sen_support_pct", "% SEN Support", "% SEN",
     "% of pupils with SEN support", "percentage", "context"),
    ("stability_pct", "% Pupil Stability", "% Stable",
     "% of non-mobile pupils (stayed at school)", "percentage", "context"),
    # 3-Year Averages
    ("rwm_expected_3yr_pct", "RWM Expected % (3-Year Avg)", "RWM 3yr %",
     "3-year average % meeting expected", "percentage", "trends"),
    ("reading_avg_3yr", "Reading Score (3-Year Avg)", "Reading 3yr",
     "3-year average reading score", "score", "trends"),
    ("maths_avg_3yr", "Maths Score (3-Year Avg)", "Maths 3yr",
     "3-year average maths score", "score", "trends"),
]

METRIC_DEFINITIONS = {
    key: {
        "name": name,
        "short_name": short_name,
        "description": description,
        "type": metric_type,
        "category": category,
    }
    for key, name, short_name, description, metric_type, category in _METRIC_ROWS
}
|
||||
|
||||
# Ranking columns to include in rankings response.
# Metric names must match METRIC_DEFINITIONS keys; 'address' and 'year' are
# derived columns added during loading (see data_loader).
RANKING_COLUMNS = [
    "urn", "school_name", "local_authority", "school_type", "address", "year", "total_pupils",
    # Core expected
    "rwm_expected_pct", "reading_expected_pct", "writing_expected_pct", "maths_expected_pct",
    "gps_expected_pct", "science_expected_pct",
    # Core higher
    "rwm_high_pct", "reading_high_pct", "writing_high_pct", "maths_high_pct", "gps_high_pct",
    # Progress & averages
    "reading_progress", "writing_progress", "maths_progress",
    "reading_avg_score", "maths_avg_score", "gps_avg_score",
    # Gender
    "rwm_expected_boys_pct", "rwm_expected_girls_pct", "rwm_high_boys_pct", "rwm_high_girls_pct",
    # Equity
    "rwm_expected_disadvantaged_pct", "rwm_expected_non_disadvantaged_pct", "disadvantaged_gap",
    # Context
    "disadvantaged_pct", "eal_pct", "sen_support_pct", "stability_pct",
    # 3-year
    "rwm_expected_3yr_pct", "reading_avg_3yr", "maths_avg_3yr",
]


# School listing columns (lightweight subset used for school search/listing)
SCHOOL_COLUMNS = ["urn", "school_name", "local_authority", "school_type", "address", "town", "postcode"]
|
||||
|
||||
# Local Authority code to name mapping (for fallback when LANAME column missing)
# Source: https://www.gov.uk/government/publications/local-authority-codes
# NOTE: 420 (Isles of Scilly) was previously listed twice; duplicate dict keys
# silently overwrite each other, so the redundant entry has been removed.
LA_CODE_TO_NAME = {
    201: "City of London", 202: "Camden", 203: "Greenwich", 204: "Hackney",
    205: "Hammersmith and Fulham", 206: "Islington", 207: "Kensington and Chelsea",
    208: "Lambeth", 209: "Lewisham", 210: "Southwark", 211: "Tower Hamlets",
    212: "Wandsworth", 213: "Westminster", 301: "Barking and Dagenham", 302: "Barnet",
    303: "Bexley", 304: "Brent", 305: "Bromley", 306: "Croydon", 307: "Ealing",
    308: "Enfield", 309: "Haringey", 310: "Harrow", 311: "Havering", 312: "Hillingdon",
    313: "Hounslow", 314: "Kingston upon Thames", 315: "Merton", 316: "Newham",
    317: "Redbridge", 318: "Richmond upon Thames", 319: "Sutton", 320: "Waltham Forest",
    330: "Birmingham", 331: "Coventry", 332: "Dudley", 333: "Sandwell", 334: "Solihull",
    335: "Walsall", 336: "Wolverhampton", 340: "Knowsley", 341: "Liverpool",
    342: "St. Helens", 343: "Sefton", 344: "Wirral", 350: "Bolton", 351: "Bury",
    352: "Manchester", 353: "Oldham", 354: "Rochdale", 355: "Salford", 356: "Stockport",
    357: "Tameside", 358: "Trafford", 359: "Wigan", 370: "Barnsley", 371: "Doncaster",
    372: "Rotherham", 373: "Sheffield", 380: "Bradford", 381: "Calderdale",
    382: "Kirklees", 383: "Leeds", 384: "Wakefield", 390: "Gateshead",
    391: "Newcastle upon Tyne", 392: "North Tyneside", 393: "South Tyneside",
    394: "Sunderland", 420: "Isles of Scilly", 800: "Bath and North East Somerset",
    801: "Bristol, City of", 802: "North Somerset", 803: "South Gloucestershire",
    805: "Hartlepool", 806: "Middlesbrough", 807: "Redcar and Cleveland",
    808: "Stockton-on-Tees", 810: "Kingston Upon Hull, City of", 811: "East Riding of Yorkshire",
    812: "North East Lincolnshire", 813: "North Lincolnshire", 815: "North Yorkshire",
    816: "York", 820: "Bedford", 821: "Central Bedfordshire", 822: "Luton",
    825: "Buckinghamshire", 826: "Milton Keynes", 830: "Derbyshire", 831: "Derby",
    # Historic (836/837/838) and current Dorset-area authorities both kept so
    # older year files still resolve.
    835: "Dorset", 836: "Bournemouth, Christchurch and Poole", 837: "Poole",
    838: "Bournemouth", 839: "Durham", 840: "Darlington", 841: "East Sussex",
    845: "Brighton and Hove", 846: "Hampshire", 850: "Portsmouth", 851: "Southampton",
    852: "Isle of Wight", 855: "Leicestershire", 856: "Leicester", 857: "Rutland",
    860: "Staffordshire", 861: "Stoke-on-Trent", 865: "Wiltshire", 866: "Swindon",
    867: "Bracknell Forest", 868: "Windsor and Maidenhead", 869: "West Berkshire",
    870: "Reading", 871: "Slough", 872: "Wokingham", 873: "Cambridgeshire",
    874: "Peterborough", 876: "Halton", 877: "Warrington", 878: "Devon",
    879: "Plymouth", 880: "Torbay", 881: "Essex", 882: "Southend-on-Sea",
    883: "Thurrock", 884: "Herefordshire", 885: "Worcestershire", 886: "Kent",
    887: "Medway", 888: "Lancashire", 889: "Blackburn with Darwen", 890: "Blackpool",
    891: "Nottinghamshire", 892: "Nottingham", 893: "Shropshire", 894: "Telford and Wrekin",
    895: "Cheshire East", 896: "Cheshire West and Chester", 908: "Cornwall",
    909: "Cumbria", 916: "Gloucestershire", 919: "Hertfordshire", 921: "Norfolk",
    925: "Lincolnshire", 926: "Northamptonshire", 928: "Northumberland",
    929: "Oxfordshire", 931: "Somerset", 933: "Suffolk", 935: "Surrey",
    936: "Warwickshire", 937: "West Sussex", 938: "Westmorland and Furness",
    940: "Cumberland",
}
|
||||
|
||||
37
backend/utils.py
Normal file
37
backend/utils.py
Normal file
@@ -0,0 +1,37 @@
|
||||
"""
|
||||
Utility functions for data conversion and JSON serialization.
|
||||
"""
|
||||
|
||||
import pandas as pd
|
||||
import numpy as np
|
||||
from typing import Any, List
|
||||
|
||||
|
||||
def convert_to_native(value: Any) -> Any:
    """Convert numpy/pandas scalar types to native Python types for JSON serialization.

    Returns None for missing values (None/NaN/NaT), non-finite floats, and
    DfE suppression sentinels, so the output is valid strict JSON.

    Args:
        value: A single cell value from a DataFrame record.

    Returns:
        A JSON-serializable native Python value, or None.
    """
    # Arrays must be handled BEFORE pd.isna(): pd.isna() on an array returns
    # an element-wise boolean array, which raises ValueError in the `if`
    # below (the original ordering made this branch unreachable).
    if isinstance(value, np.ndarray):
        return value.tolist()
    if pd.isna(value):
        return None
    if isinstance(value, np.integer):
        return int(value)
    if isinstance(value, (float, np.floating)):
        # NaN is already caught by pd.isna above; +/-inf is not valid JSON,
        # so treat it as missing too (this now also covers plain floats,
        # which previously let inf leak through to the JSON encoder).
        if not np.isfinite(value):
            return None
        return float(value)
    # DfE suppression/sentinel codes.
    # NOTE(review): schemas.NULL_VALUES additionally lists 'NEW', 'LOW' and
    # '' — confirm whether those should be nulled here as well.
    if value in ("SUPP", "NE", "NA", "NP"):
        return None
    return value
|
||||
|
||||
|
||||
def clean_for_json(df: pd.DataFrame) -> List[dict]:
    """Convert DataFrame to list of dicts, replacing NaN/inf with None for JSON serialization.

    Every cell is passed through convert_to_native so numpy scalars become
    native Python values and missing/sentinel values become None.
    """
    return [
        {field: convert_to_native(cell) for field, cell in row.items()}
        for row in df.to_dict(orient="records")
    ]
|
||||
|
||||
Reference in New Issue
Block a user