feat: wire Typesense search into backend, fix sync performance data bug
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 1m1s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m7s
Build and Push Docker Images / Build Integrator (push) Successful in 55s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m25s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

sync_typesense.py:
- Fix query string replacement: was matching 'ST_X(l.geom) as lng' but
  QUERY_BASE uses 'l.longitude as lng' — KS2/KS4 lateral joins were
  silently dropped on every sync run

backend:
- Add typesense_url/typesense_api_key settings to config.py
- Add search_schools_typesense() to data_loader.py — queries Typesense
  'schools' alias, returns URNs in relevance order with typo tolerance;
  falls back to empty list if Typesense is unavailable
- /api/schools: replace pandas str.contains with Typesense search;
  results are filtered from the DataFrame and returned in relevance order;
  graceful fallback to substring match if Typesense is down

requirements.txt: add typesense==0.21.0, numpy==1.26.4

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-27 13:23:32 +00:00
parent 5d8b319451
commit 4b02ab3d8a
5 changed files with 62 additions and 10 deletions

View File

@@ -26,6 +26,7 @@ from .data_loader import (
load_school_data,
geocode_single_postcode,
get_supplementary_data,
search_schools_typesense,
)
from .data_loader import get_data_info as get_db_info
from .schemas import METRIC_DEFINITIONS, RANKING_COLUMNS, SCHOOL_COLUMNS
@@ -314,14 +315,18 @@ async def get_schools(
# Apply filters
if search:
ts_urns = search_schools_typesense(search)
if ts_urns:
urn_order = {urn: i for i, urn in enumerate(ts_urns)}
schools_df = schools_df[schools_df["urn"].isin(set(ts_urns))].copy()
schools_df["_ts_rank"] = schools_df["urn"].map(urn_order)
schools_df = schools_df.sort_values("_ts_rank").drop(columns=["_ts_rank"])
else:
# Fallback: Typesense unavailable, use substring match
search_lower = search.lower()
mask = (
schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
)
mask = schools_df["school_name"].str.lower().str.contains(search_lower, na=False)
if "address" in schools_df.columns:
mask = mask | schools_df["address"].str.lower().str.contains(
search_lower, na=False
)
mask = mask | schools_df["address"].str.lower().str.contains(search_lower, na=False)
schools_df = schools_df[mask]
if local_authority:

View File

@@ -38,6 +38,10 @@ class Settings(BaseSettings):
rate_limit_burst: int = 10 # Allow burst of requests
max_request_size: int = 1024 * 1024 # 1MB max request size
# Typesense
typesense_url: str = "http://localhost:8108"
typesense_api_key: str = ""
# Analytics
ga_measurement_id: Optional[str] = "G-J0PCVT14NY" # Google Analytics 4 Measurement ID

View File

@@ -20,6 +20,47 @@ from .models import (
from .schemas import SCHOOL_TYPE_MAP
_postcode_cache: Dict[str, Tuple[float, float]] = {}
_typesense_client = None
def _get_typesense_client():
    """Return a lazily-created, module-cached Typesense client.

    Returns None when Typesense is not configured (missing URL or API key)
    or when client construction fails for any reason — callers treat None
    as "search unavailable" and fall back to substring matching.
    """
    global _typesense_client
    if _typesense_client is not None:
        return _typesense_client
    url = settings.typesense_url
    key = settings.typesense_api_key
    if not url or not key:
        return None
    try:
        import typesense
        from urllib.parse import urlsplit

        # Parse the configured URL properly instead of string-splitting:
        # the previous approach hard-coded protocol "http" and broke for
        # https:// endpoints or URLs carrying a path component.
        parts = urlsplit(url)
        protocol = parts.scheme or "http"
        host = parts.hostname or "localhost"
        # Default to 443 for https, otherwise Typesense's standard 8108.
        port = parts.port or (443 if protocol == "https" else 8108)
        _typesense_client = typesense.Client({
            "nodes": [{"host": host, "port": str(port), "protocol": protocol}],
            "api_key": key,
            "connection_timeout_seconds": 2,
        })
        return _typesense_client
    except Exception:
        # Typesense is an optional dependency at runtime; any failure here
        # degrades search rather than crashing the request path.
        return None
def search_schools_typesense(query: str, limit: int = 250) -> List[int]:
    """Query the Typesense 'schools' collection for *query*.

    Returns matching school URNs ordered by relevance. Returns an empty
    list when Typesense is unconfigured, unreachable, or the response
    cannot be parsed — the caller is expected to fall back in that case.
    """
    client = _get_typesense_client()
    if client is None:
        return []
    search_params = {
        "q": query,
        "query_by": "school_name,local_authority,postcode",
        "per_page": min(limit, 250),
        "typo_tokens_threshold": 1,
    }
    try:
        response = client.collections["schools"].documents.search(search_params)
        urns: List[int] = []
        for hit in response.get("hits", []):
            urns.append(int(hit["document"]["urn"]))
        return urns
    except Exception:
        # Any search/parse failure degrades to "no results" so the API
        # endpoint can fall back to substring matching.
        return []
def normalize_school_type(school_type: Optional[str]) -> Optional[str]:

View File

@@ -158,7 +158,7 @@ def sync(typesense_url: str, api_key: str):
query = QUERY_BASE
if select_extra:
# Insert extra select columns before FROM
query = query.replace("ST_X(l.geom) as lng", "ST_X(l.geom) as lng,\n " + ",\n ".join(select_extra))
query = query.replace("l.longitude as lng", "l.longitude as lng,\n " + ",\n ".join(select_extra))
query += joins
cur.execute(query)

View File

@@ -10,4 +10,6 @@ psycopg2-binary==2.9.9
alembic==1.13.1
slowapi==0.1.9
secure==0.3.0
typesense==0.21.0
numpy==1.26.4