fixing data load
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 57s
This commit is contained in:
228
backend/app.py
@@ -5,20 +5,25 @@ Uses real data from UK Government Compare School Performance downloads.
|
||||
"""
|
||||
|
||||
from contextlib import asynccontextmanager
|
||||
import pandas as pd
|
||||
from fastapi import FastAPI, HTTPException, Query
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from typing import Optional
|
||||
|
||||
import pandas as pd
|
||||
from fastapi import FastAPI, HTTPException, Query
|
||||
from fastapi.middleware.cors import CORSMiddleware
|
||||
from fastapi.responses import FileResponse
|
||||
from fastapi.staticfiles import StaticFiles
|
||||
|
||||
from .config import settings
|
||||
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
|
||||
from .data_loader import (
|
||||
load_school_data, clear_cache, geocode_single_postcode,
|
||||
geocode_postcodes_bulk, haversine_distance, get_data_info as get_db_info
|
||||
clear_cache,
|
||||
geocode_postcodes_bulk,
|
||||
geocode_single_postcode,
|
||||
haversine_distance,
|
||||
load_school_data,
|
||||
)
|
||||
from .data_loader import get_data_info as get_db_info
|
||||
from .database import init_db
|
||||
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
|
||||
from .utils import clean_for_json
|
||||
|
||||
|
||||
@@ -28,16 +33,16 @@ async def lifespan(app: FastAPI):
|
||||
# Startup: initialize database and pre-load data
|
||||
print("Starting up: Initializing database...")
|
||||
init_db() # Ensure tables exist
|
||||
|
||||
|
||||
print("Loading school data from database...")
|
||||
df = load_school_data()
|
||||
if df.empty:
|
||||
print("Warning: No data in database. Run the migration script to import data.")
|
||||
else:
|
||||
print("Data loaded successfully.")
|
||||
|
||||
|
||||
yield # Application runs here
|
||||
|
||||
|
||||
# Shutdown: cleanup if needed
|
||||
print("Shutting down...")
|
||||
|
||||
@@ -80,7 +85,9 @@ async def serve_rankings():
|
||||
@app.get("/api/schools")
|
||||
async def get_schools(
|
||||
search: Optional[str] = Query(None, description="Search by school name"),
|
||||
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
|
||||
local_authority: Optional[str] = Query(
|
||||
None, description="Filter by local authority"
|
||||
),
|
||||
school_type: Optional[str] = Query(None, description="Filter by school type"),
|
||||
postcode: Optional[str] = Query(None, description="Search near postcode"),
|
||||
radius: float = Query(5.0, ge=0.1, le=50, description="Search radius in miles"),
|
||||
@@ -89,28 +96,40 @@ async def get_schools(
|
||||
):
|
||||
"""
|
||||
Get list of unique primary schools with pagination.
|
||||
|
||||
|
||||
Returns paginated results with total count for efficient loading.
|
||||
Supports location-based search using postcode.
|
||||
"""
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
return {"schools": [], "total": 0, "page": page, "page_size": 0}
|
||||
|
||||
|
||||
# Use configured default if not specified
|
||||
if page_size is None:
|
||||
page_size = settings.default_page_size
|
||||
|
||||
|
||||
# Get unique schools (latest year data for each)
|
||||
latest_year = df.groupby('urn')['year'].max().reset_index()
|
||||
df_latest = df.merge(latest_year, on=['urn', 'year'])
|
||||
|
||||
# Include lat/long in columns for location search
|
||||
location_cols = ['latitude', 'longitude']
|
||||
available_cols = [c for c in SCHOOL_COLUMNS + location_cols if c in df_latest.columns]
|
||||
schools_df = df_latest[available_cols].drop_duplicates(subset=['urn'])
|
||||
|
||||
latest_year = df.groupby("urn")["year"].max().reset_index()
|
||||
df_latest = df.merge(latest_year, on=["urn", "year"])
|
||||
|
||||
# Include key result metrics for display on cards
|
||||
location_cols = ["latitude", "longitude"]
|
||||
result_cols = [
|
||||
"year",
|
||||
"rwm_expected_pct",
|
||||
"reading_expected_pct",
|
||||
"writing_expected_pct",
|
||||
"maths_expected_pct",
|
||||
"total_pupils",
|
||||
]
|
||||
available_cols = [
|
||||
c
|
||||
for c in SCHOOL_COLUMNS + location_cols + result_cols
|
||||
if c in df_latest.columns
|
||||
]
|
||||
schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"])
|
||||
|
||||
# Location-based search
|
||||
search_coords = None
|
||||
if postcode:
|
||||
@@ -118,65 +137,81 @@ async def get_schools(
|
||||
if coords:
|
||||
search_coords = coords
|
||||
schools_df = schools_df.copy()
|
||||
|
||||
|
||||
# Geocode school postcodes on-demand if not already cached
|
||||
if 'postcode' in schools_df.columns:
|
||||
unique_postcodes = schools_df['postcode'].dropna().unique().tolist()
|
||||
if "postcode" in schools_df.columns:
|
||||
unique_postcodes = schools_df["postcode"].dropna().unique().tolist()
|
||||
geocoded = geocode_postcodes_bulk(unique_postcodes)
|
||||
|
||||
|
||||
# Add lat/long from geocoded data
|
||||
schools_df['latitude'] = schools_df['postcode'].apply(
|
||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0] if pd.notna(pc) else None
|
||||
schools_df["latitude"] = schools_df["postcode"].apply(
|
||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0]
|
||||
if pd.notna(pc)
|
||||
else None
|
||||
)
|
||||
schools_df['longitude'] = schools_df['postcode'].apply(
|
||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1] if pd.notna(pc) else None
|
||||
schools_df["longitude"] = schools_df["postcode"].apply(
|
||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1]
|
||||
if pd.notna(pc)
|
||||
else None
|
||||
)
|
||||
|
||||
|
||||
# Filter by distance
|
||||
def calc_distance(row):
|
||||
if pd.isna(row.get('latitude')) or pd.isna(row.get('longitude')):
|
||||
return float('inf')
|
||||
if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")):
|
||||
return float("inf")
|
||||
return haversine_distance(
|
||||
search_coords[0], search_coords[1],
|
||||
row['latitude'], row['longitude']
|
||||
search_coords[0],
|
||||
search_coords[1],
|
||||
row["latitude"],
|
||||
row["longitude"],
|
||||
)
|
||||
|
||||
schools_df['distance'] = schools_df.apply(calc_distance, axis=1)
|
||||
schools_df = schools_df[schools_df['distance'] <= radius]
|
||||
schools_df = schools_df.sort_values('distance')
|
||||
|
||||
|
||||
schools_df["distance"] = schools_df.apply(calc_distance, axis=1)
|
||||
schools_df = schools_df[schools_df["distance"] <= radius]
|
||||
schools_df = schools_df.sort_values("distance")
|
||||
|
||||
# Apply filters
|
||||
if search:
|
||||
search_lower = search.lower()
|
||||
mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
|
||||
mask = (
|
||||
schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
|
||||
)
|
||||
if "address" in schools_df.columns:
|
||||
mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False)
|
||||
mask = mask | schools_df["address"].str.lower().str.contains(
|
||||
search_lower, na=False
|
||||
)
|
||||
schools_df = schools_df[mask]
|
||||
|
||||
|
||||
if local_authority:
|
||||
schools_df = schools_df[schools_df["local_authority"].str.lower() == local_authority.lower()]
|
||||
|
||||
schools_df = schools_df[
|
||||
schools_df["local_authority"].str.lower() == local_authority.lower()
|
||||
]
|
||||
|
||||
if school_type:
|
||||
schools_df = schools_df[schools_df["school_type"].str.lower() == school_type.lower()]
|
||||
|
||||
schools_df = schools_df[
|
||||
schools_df["school_type"].str.lower() == school_type.lower()
|
||||
]
|
||||
|
||||
# Pagination
|
||||
total = len(schools_df)
|
||||
start_idx = (page - 1) * page_size
|
||||
end_idx = start_idx + page_size
|
||||
schools_df = schools_df.iloc[start_idx:end_idx]
|
||||
|
||||
|
||||
# Remove internal columns before sending
|
||||
output_cols = [c for c in schools_df.columns if c not in ['latitude', 'longitude']]
|
||||
if 'distance' in schools_df.columns:
|
||||
output_cols.append('distance')
|
||||
|
||||
output_cols = [c for c in schools_df.columns if c not in ["latitude", "longitude"]]
|
||||
if "distance" in schools_df.columns:
|
||||
output_cols.append("distance")
|
||||
|
||||
return {
|
||||
"schools": clean_for_json(schools_df[output_cols]),
|
||||
"total": total,
|
||||
"page": page,
|
||||
"page_size": page_size,
|
||||
"total_pages": (total + page_size - 1) // page_size if page_size > 0 else 0,
|
||||
"search_location": {"postcode": postcode, "radius": radius} if search_coords else None,
|
||||
"search_location": {"postcode": postcode, "radius": radius}
|
||||
if search_coords
|
||||
else None,
|
||||
}
|
||||
|
||||
|
||||
@@ -184,21 +219,21 @@ async def get_schools(
|
||||
async def get_school_details(urn: int):
|
||||
"""Get detailed KS2 data for a specific primary school across all years."""
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
raise HTTPException(status_code=404, detail="No data available")
|
||||
|
||||
|
||||
school_data = df[df["urn"] == urn]
|
||||
|
||||
|
||||
if school_data.empty:
|
||||
raise HTTPException(status_code=404, detail="School not found")
|
||||
|
||||
|
||||
# Sort by year
|
||||
school_data = school_data.sort_values("year")
|
||||
|
||||
|
||||
# Get latest info for the school
|
||||
latest = school_data.iloc[-1]
|
||||
|
||||
|
||||
return {
|
||||
"school_info": {
|
||||
"urn": urn,
|
||||
@@ -208,7 +243,7 @@ async def get_school_details(urn: int):
|
||||
"address": latest.get("address", ""),
|
||||
"phase": "Primary",
|
||||
},
|
||||
"yearly_data": clean_for_json(school_data)
|
||||
"yearly_data": clean_for_json(school_data),
|
||||
}
|
||||
|
||||
|
||||
@@ -216,20 +251,20 @@ async def get_school_details(urn: int):
|
||||
async def compare_schools(urns: str = Query(..., description="Comma-separated URNs")):
|
||||
"""Compare multiple primary schools side by side."""
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
raise HTTPException(status_code=404, detail="No data available")
|
||||
|
||||
|
||||
try:
|
||||
urn_list = [int(u.strip()) for u in urns.split(",")]
|
||||
except ValueError:
|
||||
raise HTTPException(status_code=400, detail="Invalid URN format")
|
||||
|
||||
|
||||
comparison_data = df[df["urn"].isin(urn_list)]
|
||||
|
||||
|
||||
if comparison_data.empty:
|
||||
raise HTTPException(status_code=404, detail="No schools found")
|
||||
|
||||
|
||||
result = {}
|
||||
for urn in urn_list:
|
||||
school_data = comparison_data[comparison_data["urn"] == urn].sort_values("year")
|
||||
@@ -242,9 +277,9 @@ async def compare_schools(urns: str = Query(..., description="Comma-separated UR
|
||||
"local_authority": latest.get("local_authority", ""),
|
||||
"address": latest.get("address", ""),
|
||||
},
|
||||
"yearly_data": clean_for_json(school_data)
|
||||
"yearly_data": clean_for_json(school_data),
|
||||
}
|
||||
|
||||
|
||||
return {"comparison": result}
|
||||
|
||||
|
||||
@@ -252,14 +287,14 @@ async def compare_schools(urns: str = Query(..., description="Comma-separated UR
|
||||
async def get_filter_options():
|
||||
"""Get available filter options (local authorities, school types, years)."""
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
return {
|
||||
"local_authorities": [],
|
||||
"school_types": [],
|
||||
"years": [],
|
||||
}
|
||||
|
||||
|
||||
return {
|
||||
"local_authorities": sorted(df["local_authority"].dropna().unique().tolist()),
|
||||
"school_types": sorted(df["school_type"].dropna().unique().tolist()),
|
||||
@@ -271,36 +306,40 @@ async def get_filter_options():
|
||||
async def get_available_metrics():
    """
    List the KS2 performance metrics available for primary schools.

    Single source of truth for metric definitions; the frontend should
    consume this endpoint rather than duplicating the definitions.
    """
    df = load_school_data()

    # With no data loaded, advertise every defined metric; otherwise only
    # those actually present as columns in the loaded DataFrame.
    available = [
        {"key": key, **info}
        for key, info in METRIC_DEFINITIONS.items()
        if df.empty or key in df.columns
    ]

    return {"metrics": available}
|
||||
|
||||
|
||||
@app.get("/api/rankings")
|
||||
async def get_rankings(
|
||||
metric: str = Query("rwm_expected_pct", description="KS2 metric to rank by"),
|
||||
year: Optional[int] = Query(None, description="Specific year (defaults to most recent)"),
|
||||
year: Optional[int] = Query(
|
||||
None, description="Specific year (defaults to most recent)"
|
||||
),
|
||||
limit: int = Query(20, ge=1, le=100, description="Number of schools to return"),
|
||||
local_authority: Optional[str] = Query(None, description="Filter by local authority"),
|
||||
local_authority: Optional[str] = Query(
|
||||
None, description="Filter by local authority"
|
||||
),
|
||||
):
|
||||
"""Get primary school rankings by a specific KS2 metric."""
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
return {"metric": metric, "year": None, "rankings": [], "total": 0}
|
||||
|
||||
|
||||
if metric not in df.columns:
|
||||
raise HTTPException(status_code=400, detail=f"Metric '{metric}' not available")
|
||||
|
||||
|
||||
# Filter by year
|
||||
if year:
|
||||
df = df[df["year"] == year]
|
||||
@@ -308,22 +347,22 @@ async def get_rankings(
|
||||
# Use most recent year
|
||||
max_year = df["year"].max()
|
||||
df = df[df["year"] == max_year]
|
||||
|
||||
|
||||
# Filter by local authority if specified
|
||||
if local_authority:
|
||||
df = df[df["local_authority"].str.lower() == local_authority.lower()]
|
||||
|
||||
|
||||
# Sort and rank (exclude rows with no data for this metric)
|
||||
df = df.dropna(subset=[metric])
|
||||
total = len(df)
|
||||
|
||||
|
||||
# For progress scores, higher is better. For percentages, higher is also better.
|
||||
df = df.sort_values(metric, ascending=False).head(limit)
|
||||
|
||||
|
||||
# Return only relevant fields for rankings
|
||||
available_cols = [c for c in RANKING_COLUMNS if c in df.columns]
|
||||
df = df[available_cols]
|
||||
|
||||
|
||||
return {
|
||||
"metric": metric,
|
||||
"year": int(df["year"].iloc[0]) if not df.empty else None,
|
||||
@@ -337,28 +376,34 @@ async def get_data_info():
|
||||
"""Get information about loaded data."""
|
||||
# Get info directly from database
|
||||
db_info = get_db_info()
|
||||
|
||||
|
||||
if db_info["total_schools"] == 0:
|
||||
return {
|
||||
"status": "no_data",
|
||||
"message": "No data in database. Run the migration script: python scripts/migrate_csv_to_db.py",
|
||||
"data_source": "PostgreSQL",
|
||||
}
|
||||
|
||||
|
||||
# Also get DataFrame-based stats for backwards compatibility
|
||||
df = load_school_data()
|
||||
|
||||
|
||||
if df.empty:
|
||||
return {
|
||||
"status": "no_data",
|
||||
"message": "No data available",
|
||||
"data_source": "PostgreSQL",
|
||||
}
|
||||
|
||||
|
||||
years = [int(y) for y in sorted(df["year"].unique())]
|
||||
schools_per_year = {str(int(k)): int(v) for k, v in df.groupby("year")["urn"].nunique().to_dict().items()}
|
||||
la_counts = {str(k): int(v) for k, v in df["local_authority"].value_counts().to_dict().items()}
|
||||
|
||||
schools_per_year = {
|
||||
str(int(k)): int(v)
|
||||
for k, v in df.groupby("year")["urn"].nunique().to_dict().items()
|
||||
}
|
||||
la_counts = {
|
||||
str(k): int(v)
|
||||
for k, v in df["local_authority"].value_counts().to_dict().items()
|
||||
}
|
||||
|
||||
return {
|
||||
"status": "loaded",
|
||||
"data_source": "PostgreSQL",
|
||||
@@ -385,4 +430,5 @@ if settings.frontend_dir.exists():
|
||||
|
||||
# Allow running the API directly as a script; uvicorn is imported lazily
# so it is only required when the module is executed, not when imported.
if __name__ == "__main__":
    import uvicorn

    uvicorn.run(app, port=settings.port, host=settings.host)
|
||||
|
||||
Reference in New Issue
Block a user