fixing data load
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 57s

This commit is contained in:
Tudor Sitaru
2026-01-06 22:06:59 +00:00
parent e601c499b6
commit 1a8ec670b9
3 changed files with 465 additions and 261 deletions

View File

@@ -5,20 +5,25 @@ Uses real data from UK Government Compare School Performance downloads.
"""
from contextlib import asynccontextmanager
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from fastapi.staticfiles import StaticFiles
from fastapi.responses import FileResponse
from fastapi.middleware.cors import CORSMiddleware
from typing import Optional
import pandas as pd
from fastapi import FastAPI, HTTPException, Query
from fastapi.middleware.cors import CORSMiddleware
from fastapi.responses import FileResponse
from fastapi.staticfiles import StaticFiles
from .config import settings
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .data_loader import (
load_school_data, clear_cache, geocode_single_postcode,
geocode_postcodes_bulk, haversine_distance, get_data_info as get_db_info
clear_cache,
geocode_postcodes_bulk,
geocode_single_postcode,
haversine_distance,
load_school_data,
)
from .data_loader import get_data_info as get_db_info
from .database import init_db
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
from .utils import clean_for_json
@@ -28,16 +33,16 @@ async def lifespan(app: FastAPI):
# Startup: initialize database and pre-load data
print("Starting up: Initializing database...")
init_db() # Ensure tables exist
print("Loading school data from database...")
df = load_school_data()
if df.empty:
print("Warning: No data in database. Run the migration script to import data.")
else:
print("Data loaded successfully.")
yield # Application runs here
# Shutdown: cleanup if needed
print("Shutting down...")
@@ -80,7 +85,9 @@ async def serve_rankings():
@app.get("/api/schools")
async def get_schools(
search: Optional[str] = Query(None, description="Search by school name"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
local_authority: Optional[str] = Query(
None, description="Filter by local authority"
),
school_type: Optional[str] = Query(None, description="Filter by school type"),
postcode: Optional[str] = Query(None, description="Search near postcode"),
radius: float = Query(5.0, ge=0.1, le=50, description="Search radius in miles"),
@@ -89,28 +96,40 @@ async def get_schools(
):
"""
Get list of unique primary schools with pagination.
Returns paginated results with total count for efficient loading.
Supports location-based search using postcode.
"""
df = load_school_data()
if df.empty:
return {"schools": [], "total": 0, "page": page, "page_size": 0}
# Use configured default if not specified
if page_size is None:
page_size = settings.default_page_size
# Get unique schools (latest year data for each)
latest_year = df.groupby('urn')['year'].max().reset_index()
df_latest = df.merge(latest_year, on=['urn', 'year'])
# Include lat/long in columns for location search
location_cols = ['latitude', 'longitude']
available_cols = [c for c in SCHOOL_COLUMNS + location_cols if c in df_latest.columns]
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
latest_year = df.groupby("urn")["year"].max().reset_index()
df_latest = df.merge(latest_year, on=["urn", "year"])
# Include key result metrics for display on cards
location_cols = ["latitude", "longitude"]
result_cols = [
"year",
"rwm_expected_pct",
"reading_expected_pct",
"writing_expected_pct",
"maths_expected_pct",
"total_pupils",
]
available_cols = [
c
for c in SCHOOL_COLUMNS + location_cols + result_cols
if c in df_latest.columns
]
schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"])
# Location-based search
search_coords = None
if postcode:
@@ -118,65 +137,81 @@ async def get_schools(
if coords:
search_coords = coords
schools_df = schools_df.copy()
# Geocode school postcodes on-demand if not already cached
if 'postcode' in schools_df.columns:
unique_postcodes = schools_df['postcode'].dropna().unique().tolist()
if "postcode" in schools_df.columns:
unique_postcodes = schools_df["postcode"].dropna().unique().tolist()
geocoded = geocode_postcodes_bulk(unique_postcodes)
# Add lat/long from geocoded data
schools_df['latitude'] = schools_df['postcode'].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0] if pd.notna(pc) else None
schools_df["latitude"] = schools_df["postcode"].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0]
if pd.notna(pc)
else None
)
schools_df['longitude'] = schools_df['postcode'].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1] if pd.notna(pc) else None
schools_df["longitude"] = schools_df["postcode"].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1]
if pd.notna(pc)
else None
)
# Filter by distance
def calc_distance(row):
if pd.isna(row.get('latitude')) or pd.isna(row.get('longitude')):
return float('inf')
if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")):
return float("inf")
return haversine_distance(
search_coords[0], search_coords[1],
row['latitude'], row['longitude']
search_coords[0],
search_coords[1],
row["latitude"],
row["longitude"],
)
schools_df['distance'] = schools_df.apply(calc_distance, axis=1)
schools_df = schools_df[schools_df['distance'] <= radius]
schools_df = schools_df.sort_values('distance')
schools_df["distance"] = schools_df.apply(calc_distance, axis=1)
schools_df = schools_df[schools_df["distance"] <= radius]
schools_df = schools_df.sort_values("distance")
# Apply filters
if search:
search_lower = search.lower()
mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
mask = (
schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
)
if "address" in schools_df.columns:
mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False)
mask = mask | schools_df["address"].str.lower().str.contains(
search_lower, na=False
)
schools_df = schools_df[mask]
if local_authority:
schools_df = schools_df[schools_df["local_authority"].str.lower() == local_authority.lower()]
schools_df = schools_df[
schools_df["local_authority"].str.lower() == local_authority.lower()
]
if school_type:
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
schools_df = schools_df[
schools_df["school_type"].str.lower() == school_type.lower()
]
# Pagination
total = len(schools_df)
start_idx = (page - 1) * page_size
end_idx = start_idx + page_size
schools_df = schools_df.iloc[start_idx:end_idx]
# Remove internal columns before sending
output_cols = [c for c in schools_df.columns if c not in ['latitude', 'longitude']]
if 'distance' in schools_df.columns:
output_cols.append('distance')
output_cols = [c for c in schools_df.columns if c not in ["latitude", "longitude"]]
if "distance" in schools_df.columns:
output_cols.append("distance")
return {
"schools": clean_for_json(schools_df[output_cols]),
"total": total,
"page": page,
"page_size": page_size,
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
"search_location": {"postcode": postcode, "radius": radius} if search_coords else None,
"search_location": {"postcode": postcode, "radius": radius}
if search_coords
else None,
}
@@ -184,21 +219,21 @@ async def get_schools(
async def get_school_details(urn: int):
"""Get detailed KS2 data for a specific primary school across all years."""
df = load_school_data()
if df.empty:
raise HTTPException(status_code=404, detail="No data available")
school_data = df[df["urn"] == urn]
if school_data.empty:
raise HTTPException(status_code=404, detail="School not found")
# Sort by year
school_data = school_data.sort_values("year")
# Get latest info for the school
latest = school_data.iloc[-1]
return {
"school_info": {
"urn": urn,
@@ -208,7 +243,7 @@ async def get_school_details(urn: int):
"address": latest.get("address", ""),
"phase": "Primary",
},
"yearly_data": clean_for_json(school_data)
"yearly_data": clean_for_json(school_data),
}
@@ -216,20 +251,20 @@ async def get_school_details(urn: int):
async def compare_schools(urns: str = Query(..., description="Comma-separated URNs")):
"""Compare multiple primary schools side by side."""
df = load_school_data()
if df.empty:
raise HTTPException(status_code=404, detail="No data available")
try:
urn_list = [int(u.strip()) for u in urns.split(",")]
except ValueError:
raise HTTPException(status_code=400, detail="Invalid URN format")
comparison_data = df[df["urn"].isin(urn_list)]
if comparison_data.empty:
raise HTTPException(status_code=404, detail="No schools found")
result = {}
for urn in urn_list:
school_data = comparison_data[comparison_data["urn"] == urn].sort_values("year")
@@ -242,9 +277,9 @@ async def compare_schools(urns: str = Query(..., description="Comma-separated UR
"local_authority": latest.get("local_authority", ""),
"address": latest.get("address", ""),
},
"yearly_data": clean_for_json(school_data)
"yearly_data": clean_for_json(school_data),
}
return {"comparison": result}
@@ -252,14 +287,14 @@ async def compare_schools(urns: str = Query(..., description="Comma-separated UR
async def get_filter_options():
"""Get available filter options (local authorities, school types, years)."""
df = load_school_data()
if df.empty:
return {
"local_authorities": [],
"school_types": [],
"years": [],
}
return {
"local_authorities": sorted(df["local_authority"].dropna().unique().tolist()),
"school_types": sorted(df["school_type"].dropna().unique().tolist()),
@@ -271,36 +306,40 @@ async def get_filter_options():
async def get_available_metrics():
    """
    List the KS2 performance metrics that can be queried for primary schools.

    The definitions come from METRIC_DEFINITIONS, so this endpoint is the
    single source of truth; the frontend should consume it rather than
    keeping its own copy.

    Returns:
        A dict with one key, "metrics", holding a list of metric entries.
        Each entry is the metric's definition plus its "key".
    """
    df = load_school_data()

    # When no data is loaded every defined metric is reported; otherwise only
    # metrics that actually have a matching column in the loaded data.
    metrics = [
        {"key": name, **details}
        for name, details in METRIC_DEFINITIONS.items()
        if df.empty or name in df.columns
    ]
    return {"metrics": metrics}
@app.get("/api/rankings")
async def get_rankings(
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
year: Optional[int] = Query(
None, description="Specific year (defaults to most recent)"
),
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
local_authority: Optional[str] = Query(
None, description="Filter by local authority"
),
):
"""Get primary school rankings by a specific KS2 metric."""
df = load_school_data()
if df.empty:
return {"metric": metric, "year": None, "rankings": [], "total": 0}
if metric not in df.columns:
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
# Filter by year
if year:
df = df[df["year"] == year]
@@ -308,22 +347,22 @@ async def get_rankings(
# Use most recent year
max_year = df["year"].max()
df = df[df["year"] == max_year]
# Filter by local authority if specified
if local_authority:
df = df[df["local_authority"].str.lower() == local_authority.lower()]
# Sort and rank (exclude rows with no data for this metric)
df = df.dropna(subset=[metric])
total = len(df)
# For progress scores, higher is better. For percentages, higher is also better.
df = df.sort_values(metric, ascending=False).head(limit)
# Return only relevant fields for rankings
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
df = df[available_cols]
return {
"metric": metric,
"year": int(df["year"].iloc[0]) if not df.empty else None,
@@ -337,28 +376,34 @@ async def get_data_info():
"""Get information about loaded data."""
# Get info directly from database
db_info = get_db_info()
if db_info["total_schools"] == 0:
return {
"status": "no_data",
"message": "No data in database. Run the migration script: python scripts/migrate_csv_to_db.py",
"data_source": "PostgreSQL",
}
# Also get DataFrame-based stats for backwards compatibility
df = load_school_data()
if df.empty:
return {
"status": "no_data",
"message": "No data available",
"data_source": "PostgreSQL",
}
years = [int(y) for y in sorted(df["year"].unique())]
schools_per_year = {str(int(k)): int(v) for k, v in df.groupby("year")["urn"].nunique().to_dict().items()}
la_counts = {str(k): int(v) for k, v in df["local_authority"].value_counts().to_dict().items()}
schools_per_year = {
str(int(k)): int(v)
for k, v in df.groupby("year")["urn"].nunique().to_dict().items()
}
la_counts = {
str(k): int(v)
for k, v in df["local_authority"].value_counts().to_dict().items()
}
return {
"status": "loaded",
"data_source": "PostgreSQL",
@@ -385,4 +430,5 @@ if settings.frontend_dir.exists():
if __name__ == "__main__":
    # Script entry point: serve the app with uvicorn using the configured
    # host and port. The import stays local so uvicorn is only needed when
    # the module is executed directly.
    from uvicorn import run as serve

    serve(app, host=settings.host, port=settings.port)