From 4b02ab3d8a311c941e6448351a47bd6903e5d560 Mon Sep 17 00:00:00 2001
From: Tudor
Date: Fri, 27 Mar 2026 13:23:32 +0000
Subject: [PATCH] feat: wire Typesense search into backend, fix sync
 performance data bug
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

sync_typesense.py:
- Fix query string replacement: was matching 'ST_X(l.geom) as lng' but
  QUERY_BASE uses 'l.longitude as lng' — KS2/KS4 lateral joins were
  silently dropped on every sync run

backend:
- Add typesense_url/typesense_api_key settings to config.py
- Add search_schools_typesense() to data_loader.py — queries Typesense
  'schools' alias, returns URNs in relevance order with typo tolerance;
  falls back to empty list if Typesense is unavailable
- The Typesense client honours the scheme of typesense_url (http/https)
  and failures are logged as warnings instead of being dropped silently
- /api/schools: replace pandas str.contains with Typesense search;
  results are filtered from the DataFrame and returned in relevance
  order; graceful fallback to substring match if Typesense is down or
  returns no hits

requirements.txt: add typesense==0.21.0, numpy==1.26.4

Co-Authored-By: Claude Sonnet 4.6
---
Reviewer notes (not part of the commit message):
- Line breaks and indentation were restored from a whitespace-collapsed
  copy of this patch. Context-line whitespace is inferred; if a hunk is
  rejected, retry with `git apply --ignore-whitespace`.
- The backend/data_loader.py additions were revised during review. The
  diffstat and that hunk header are updated; the blob hashes on the
  "index" lines were not regenerated.

 backend/app.py                     | 23 +++++----
 backend/config.py                  |  4 ++
 backend/data_loader.py             | 75 ++++++++++++++++++++++++++++++
 pipeline/scripts/sync_typesense.py |  2 +-
 requirements.txt                   |  2 +
 5 files changed, 96 insertions(+), 10 deletions(-)

diff --git a/backend/app.py b/backend/app.py
index 541de79..6211fd1 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -26,6 +26,7 @@ from .data_loader import (
     load_school_data,
     geocode_single_postcode,
     get_supplementary_data,
+    search_schools_typesense,
 )
 from .data_loader import get_data_info as get_db_info
 from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
@@ -314,15 +315,19 @@ async def get_schools(
 
     # Apply filters
     if search:
-        search_lower = search.lower()
-        mask = (
-            schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
-        )
-        if "address" in schools_df.columns:
-            mask = mask | schools_df["address"].str.lower().str.contains(
-                search_lower, na=False
-            )
-        schools_df = schools_df[mask]
+        ts_urns = search_schools_typesense(search)
+        if ts_urns:
+            urn_order = {urn: i for i, urn in enumerate(ts_urns)}
+            schools_df = schools_df[schools_df["urn"].isin(set(ts_urns))].copy()
+            schools_df["_ts_rank"] = schools_df["urn"].map(urn_order)
+            schools_df = schools_df.sort_values("_ts_rank").drop(columns=["_ts_rank"])
+        else:
+            # Fallback: Typesense unavailable or returned no hits, use substring match
+            search_lower = search.lower()
+            mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
+            if "address" in schools_df.columns:
+                mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False)
+            schools_df = schools_df[mask]
 
     if local_authority:
         schools_df = schools_df[
diff --git a/backend/config.py b/backend/config.py
index ce0b8dc..bdaa9dc 100644
--- a/backend/config.py
+++ b/backend/config.py
@@ -38,6 +38,10 @@ class Settings(BaseSettings):
     rate_limit_burst: int = 10  # Allow burst of requests
     max_request_size: int = 1024 * 1024  # 1MB max request size
 
+    # Typesense
+    typesense_url: str = "http://localhost:8108"
+    typesense_api_key: str = ""
+
     # Analytics
     ga_measurement_id: Optional[str] = "G-J0PCVT14NY"  # Google Analytics 4 Measurement ID
 
diff --git a/backend/data_loader.py b/backend/data_loader.py
index 35b8617..f0e767d 100644
--- a/backend/data_loader.py
+++ b/backend/data_loader.py
@@ -20,6 +20,81 @@ from .models import (
 from .schemas import SCHOOL_TYPE_MAP
 
 _postcode_cache: Dict[str, Tuple[float, float]] = {}
+_typesense_client = None
+
+
+def _get_typesense_client():
+    """Return the shared Typesense client, building it on first use.
+
+    Returns None when typesense_url or typesense_api_key is unset, or when
+    the client cannot be built, so callers can fall back gracefully.
+    """
+    global _typesense_client
+    if _typesense_client is not None:
+        return _typesense_client
+    url = settings.typesense_url
+    key = settings.typesense_api_key
+    if not url or not key:
+        return None
+    try:
+        import typesense
+        from urllib.parse import urlsplit
+
+        # urlsplit only recognises the host when "//" is present, so accept
+        # bare "host:port" values too.
+        parts = urlsplit(url if "//" in url else f"//{url}")
+        if not parts.hostname:
+            raise ValueError(f"no host in typesense_url: {url!r}")
+        # Honour the configured scheme rather than forcing plain HTTP.
+        protocol = parts.scheme or "http"
+        # No explicit port: 443 for https, else Typesense's default 8108.
+        port = parts.port or (443 if protocol == "https" else 8108)
+        _typesense_client = typesense.Client({
+            "nodes": [{
+                "host": parts.hostname,
+                "port": str(port),
+                "protocol": protocol,
+            }],
+            "api_key": key,
+            "connection_timeout_seconds": 2,
+        })
+        return _typesense_client
+    except Exception:
+        # Best effort: a broken search backend must not break the API, but
+        # leave a trace so the misconfiguration can be diagnosed.
+        import logging
+
+        logging.getLogger(__name__).warning(
+            "Typesense client could not be created", exc_info=True
+        )
+        return None
+
+
+def search_schools_typesense(query: str, limit: int = 250) -> List[int]:
+    """Search the Typesense 'schools' alias for *query*.
+
+    Returns matching URNs in relevance order. Returns [] when nothing
+    matches, when *limit* is not positive, or when Typesense is unavailable.
+    """
+    if limit <= 0:
+        return []
+    client = _get_typesense_client()
+    if client is None:
+        return []
+    try:
+        result = client.collections["schools"].documents.search({
+            "q": query,
+            "query_by": "school_name,local_authority,postcode",
+            "per_page": min(limit, 250),
+            "typo_tokens_threshold": 1,
+        })
+        return [int(h["document"]["urn"]) for h in result.get("hits", [])]
+    except Exception:
+        # Best effort: log, then report no results so the caller can fall back.
+        import logging
+
+        logging.getLogger(__name__).warning("Typesense search failed", exc_info=True)
+        return []
 
 
 def normalize_school_type(school_type: Optional[str]) -> Optional[str]:
diff --git a/pipeline/scripts/sync_typesense.py b/pipeline/scripts/sync_typesense.py
index 30194c5..e843ecb 100644
--- a/pipeline/scripts/sync_typesense.py
+++ b/pipeline/scripts/sync_typesense.py
@@ -158,7 +158,7 @@ def sync(typesense_url: str, api_key: str):
     query = QUERY_BASE
     if select_extra:
         # Insert extra select columns before FROM
-        query = query.replace("ST_X(l.geom) as lng", "ST_X(l.geom) as lng,\n " + ",\n ".join(select_extra))
+        query = query.replace("l.longitude as lng", "l.longitude as lng,\n " + ",\n ".join(select_extra))
     query += joins
 
     cur.execute(query)
diff --git a/requirements.txt b/requirements.txt
index fe87c8c..f5ab98a 100644
--- a/requirements.txt
+++ b/requirements.txt
@@ -10,4 +10,6 @@ psycopg2-binary==2.9.9
 alembic==1.13.1
 slowapi==0.1.9
 secure==0.3.0
+typesense==0.21.0
+numpy==1.26.4
 