perf: resolve all P1–P5 performance issues from code review

P1 (backend/data_loader.py): Add load_latest_school_data() which pre-computes the one-row-per-school latest-year snapshot (groupby, prev-year trend merge) once at startup instead of on every /api/schools request. get_schools route now starts from the cached snapshot rather than rebuilding it. S3 (backend/app.py): Wrap synchronous geocode_single_postcode() call in asyncio.to_thread() so postcode lookups no longer block the uvicorn event loop. Admin reload endpoint also uses to_thread for both cache primes. P2 (nextjs-app/components/HomeView.tsx): Add mapParamsRef guard so switching back to map view does not re-fetch 500 schools when search params haven't changed. Reset ref on new searches so fresh data is always fetched. P3 (nextjs-app/lib/chartSetup.ts): Extract Chart.js registration into a shared side-effect module. ComparisonChart and PerformanceChart now import it instead of each calling ChartJS.register() independently. P4 (backend/database.py): Remove unnecessary db.commit() from the read-only get_db_session() context manager — saves a DB round-trip on every request. P5 (backend/database.py): Add pool_recycle=1800 to SQLAlchemy engine to prevent stale TCP connections from accumulating in long-running processes. Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-04-15 22:45:46 +01:00
parent f6b9d650f8
commit f05bbba613
7 changed files with 106 additions and 77 deletions
@@ -24,6 +24,7 @@ from .config import settings
 from .data_loader import (
    clear_cache,
    load_school_data,
+    load_latest_school_data,
    geocode_single_postcode,
    get_supplementary_data,
    search_schools_typesense,
@@ -223,6 +224,8 @@ async def lifespan(app: FastAPI):
        print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
    else:
        print(f"Data loaded successfully: {len(df)} records.")
+        # Pre-compute the latest-year snapshot so the first search request is fast
+        await asyncio.to_thread(load_latest_school_data)
    try:
        _sitemap_xml = build_sitemap()
        n = _sitemap_xml.count("<url>")
@@ -321,44 +324,17 @@ async def get_schools(
    phase = sanitize_search_input(phase)
    postcode = validate_postcode(postcode)

-    df = load_school_data()
+    # Load the pre-computed latest-year snapshot (cached after first request / startup).
+    # This avoids rebuilding the expensive groupby + prev-year merge on every search.
+    df_latest = load_latest_school_data()

-    if df.empty:
+    if df_latest.empty:
        return {"schools": [], "total": 0, "page": page, "page_size": 0}

    # Use configured default if not specified
    if page_size is None:
        page_size = settings.default_page_size

-    # Schools with no performance data (special schools, PRUs, newly opened, etc.)
-    # have NULL year from the LEFT JOIN — keep them but skip the groupby/trend logic.
-    df_no_perf = df[df["year"].isna()].drop_duplicates(subset=["urn"])
-    df = df[df["year"].notna()]
-
-    # Get unique schools (latest year data for each)
-    latest_year = df.groupby("urn")["year"].max().reset_index()
-    df_latest = df.merge(latest_year, on=["urn", "year"])
-
-    # Calculate trend by comparing to previous year
-    # Get second-latest year for each school
-    df_sorted = df.sort_values(["urn", "year"], ascending=[True, False])
-    df_prev = df_sorted.groupby("urn").nth(1).reset_index()
-    if not df_prev.empty and "rwm_expected_pct" in df_prev.columns:
-        prev_rwm = df_prev[["urn", "rwm_expected_pct"]].rename(
-            columns={"rwm_expected_pct": "prev_rwm_expected_pct"}
-        )
-        if "attainment_8_score" in df_prev.columns:
-            prev_rwm = prev_rwm.merge(
-                df_prev[["urn", "attainment_8_score"]].rename(
-                    columns={"attainment_8_score": "prev_attainment_8_score"}
-                ),
-                on="urn", how="outer"
-            )
-        df_latest = df_latest.merge(prev_rwm, on="urn", how="left")
-
-    # Merge back schools with no performance data
-    df_latest = pd.concat([df_latest, df_no_perf], ignore_index=True)
-
    # Phase filter — uses PHASE_GROUPS so all-through/middle schools appear
    # in the correct phase(s) rather than being invisible to both filters.
    if phase:
@@ -404,7 +380,8 @@ async def get_schools(
    # Location-based search (uses pre-geocoded data from database)
    search_coords = None
    if postcode:
-        coords = geocode_single_postcode(postcode)
+        # Offload the synchronous HTTP call to a thread so the event loop stays free
+        coords = await asyncio.to_thread(geocode_single_postcode, postcode)
        if coords:
            search_coords = coords
            schools_df = schools_df.copy()
@@ -907,7 +884,8 @@ async def reload_data(
    Requires X-API-Key header with valid admin API key.
    """
    clear_cache()
-    load_school_data()
+    await asyncio.to_thread(load_school_data)
+    await asyncio.to_thread(load_latest_school_data)
    return {"status": "reloaded"}


@@ -242,6 +242,8 @@ def load_school_data_as_dataframe() -> pd.DataFrame:

 # Cache for DataFrame
 _df_cache: Optional[pd.DataFrame] = None
+# Pre-computed latest-year snapshot (one row per school, with prev-year trend columns)
+_df_latest_cache: Optional[pd.DataFrame] = None


 def load_school_data() -> pd.DataFrame:
@@ -260,10 +262,60 @@ def load_school_data() -> pd.DataFrame:
    return _df_cache


+def load_latest_school_data() -> pd.DataFrame:
+    """Return a cached one-row-per-school DataFrame at the latest available year.
+
+    The expensive groupby / merge / prev-year trend computation runs once at
+    startup (or after a cache clear) rather than on every search request.
+    Per-request filters (phase, gender, LA …) should be applied to the returned
+    DataFrame's copy; they must NOT modify the cached object.
+    """
+    global _df_latest_cache
+    if _df_latest_cache is not None:
+        return _df_latest_cache
+
+    df = load_school_data()
+    if df.empty:
+        return df
+
+    # Schools that have no performance rows (PRUs, new schools, etc.)
+    df_no_perf = df[df["year"].isna()].drop_duplicates(subset=["urn"])
+    df_with_perf = df[df["year"].notna()]
+
+    # Reduce to the latest year per school
+    latest_year = df_with_perf.groupby("urn")["year"].max().reset_index()
+    df_latest = df_with_perf.merge(latest_year, on=["urn", "year"])
+
+    # Attach previous-year metrics for trend arrows (second-latest year per school)
+    df_sorted = df_with_perf.sort_values(["urn", "year"], ascending=[True, False])
+    df_prev = df_sorted.groupby("urn").nth(1).reset_index()
+    if not df_prev.empty and "rwm_expected_pct" in df_prev.columns:
+        prev_rwm = df_prev[["urn", "rwm_expected_pct"]].rename(
+            columns={"rwm_expected_pct": "prev_rwm_expected_pct"}
+        )
+        if "attainment_8_score" in df_prev.columns:
+            prev_rwm = prev_rwm.merge(
+                df_prev[["urn", "attainment_8_score"]].rename(
+                    columns={"attainment_8_score": "prev_attainment_8_score"}
+                ),
+                on="urn",
+                how="outer",
+            )
+        df_latest = df_latest.merge(prev_rwm, on="urn", how="left")
+
+    # Merge back schools with no performance data
+    df_latest = pd.concat([df_latest, df_no_perf], ignore_index=True)
+
+    print(f"Latest-snapshot cache built: {len(df_latest)} schools")
+    _df_latest_cache = df_latest
+    return _df_latest_cache
+
+
 def clear_cache():
    """Clear all caches."""
-    global _df_cache
+    global _df_cache, _df_latest_cache
    _df_cache = None
+    _df_latest_cache = None


 # =============================================================================
@@ -15,6 +15,7 @@ engine = create_engine(
    pool_size=10,
    max_overflow=20,
    pool_pre_ping=True,
+    pool_recycle=1800,  # recycle connections every 30 min to avoid stale TCP
    echo=False,
 )

@@ -34,13 +35,9 @@ def get_db():

@contextmanager
 def get_db_session():
-    """Context manager for non-FastAPI contexts."""
+    """Context manager for non-FastAPI contexts (read-only)."""
    db = SessionLocal()
    try:
        yield db
-        db.commit()
-    except Exception:
-        db.rollback()
-        raise
    finally:
        db.close()