From 40348cb1bd4bb1937835fa266a377376150fc7fa Mon Sep 17 00:00:00 2001 From: Tudor Date: Thu, 8 Jan 2026 15:30:33 +0000 Subject: [PATCH] moving geocoding to a background task --- README.md | 25 +++++ backend/app.py | 24 +---- backend/data_loader.py | 53 +---------- scripts/geocode_schools.py | 184 +++++++++++++++++++++++++++++++++++++ 4 files changed, 216 insertions(+), 70 deletions(-) create mode 100755 scripts/geocode_schools.py diff --git a/README.md b/README.md index de27348..4e0859e 100644 --- a/README.md +++ b/README.md @@ -179,6 +179,31 @@ Data is sourced from the UK Government's [Compare School Performance](https://ww **Important**: When using real data, please comply with the [terms of use](https://www.compare-school-performance.service.gov.uk/download-data) and data protection regulations. +## Scheduled Jobs + +### Geocoding Schools (Cron Job) + +School postcodes are geocoded by a scheduled job, not on-demand. This improves performance and reduces API calls. + +**Set up the cron job** (runs weekly on Sunday at 2am): + +```bash +# Edit crontab +crontab -e + +# Add this line (adjust paths as needed): +0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1 +``` + +**Manual run:** +```bash +# Geocode only schools missing coordinates +python scripts/geocode_schools.py + +# Force re-geocode all schools +python scripts/geocode_schools.py --force +``` + ## License MIT License - feel free to use this project for educational purposes.
diff --git a/backend/app.py b/backend/app.py index 64ddb5b..0321e09 100644 --- a/backend/app.py +++ b/backend/app.py @@ -21,10 +21,9 @@ from starlette.middleware.base import BaseHTTPMiddleware from .config import settings from .data_loader import ( clear_cache, - geocode_postcodes_bulk, - geocode_single_postcode, haversine_distance, load_school_data, + geocode_single_postcode, ) from .data_loader import get_data_info as get_db_info from .database import init_db @@ -256,7 +255,7 @@ async def get_schools( ] schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"]) - # Location-based search + # Location-based search (uses pre-geocoded data from database) search_coords = None if postcode: coords = geocode_single_postcode(postcode) @@ -264,24 +263,7 @@ async def get_schools( search_coords = coords schools_df = schools_df.copy() - # Geocode school postcodes on-demand if not already cached - if "postcode" in schools_df.columns: - unique_postcodes = schools_df["postcode"].dropna().unique().tolist() - geocoded = geocode_postcodes_bulk(unique_postcodes) - - # Add lat/long from geocoded data - schools_df["latitude"] = schools_df["postcode"].apply( - lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0] - if pd.notna(pc) - else None - ) - schools_df["longitude"] = schools_df["postcode"].apply( - lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1] - if pd.notna(pc) - else None - ) - - # Filter by distance + # Filter by distance using pre-geocoded lat/long from database def calc_distance(row): if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")): return float("inf") diff --git a/backend/data_loader.py b/backend/data_loader.py index 85aa4c0..86dcb46 100644 --- a/backend/data_loader.py +++ b/backend/data_loader.py @@ -1,6 +1,9 @@ """ Data loading module that queries from PostgreSQL database. Provides efficient queries with caching and lazy loading. 
+ +Note: School geocoding is handled by a separate cron job (scripts/geocode_schools.py). +Only user search postcodes are geocoded on-demand via geocode_single_postcode(). """ import pandas as pd @@ -15,58 +18,10 @@ from .config import settings from .database import SessionLocal, get_db_session from .models import School, SchoolResult -# Cache for postcode geocoding +# Cache for user search postcode geocoding (not for school data) _postcode_cache: Dict[str, Tuple[float, float]] = {} -def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]: - """ - Geocode postcodes in bulk using postcodes.io API. - Returns dict of postcode -> (latitude, longitude). - """ - results = {} - - # Check cache first - uncached = [] - for pc in postcodes: - if pc and isinstance(pc, str): - pc_upper = pc.strip().upper() - if pc_upper in _postcode_cache: - results[pc_upper] = _postcode_cache[pc_upper] - elif len(pc_upper) >= 5: - uncached.append(pc_upper) - - if not uncached: - return results - - uncached = list(set(uncached)) - - # postcodes.io allows max 100 postcodes per request - batch_size = 100 - for i in range(0, len(uncached), batch_size): - batch = uncached[i:i + batch_size] - try: - response = requests.post( - 'https://api.postcodes.io/postcodes', - json={'postcodes': batch}, - timeout=30 - ) - if response.status_code == 200: - data = response.json() - for item in data.get('result', []): - if item and item.get('result'): - pc = item['query'].upper() - lat = item['result'].get('latitude') - lon = item['result'].get('longitude') - if lat and lon: - results[pc] = (lat, lon) - _postcode_cache[pc] = (lat, lon) - except Exception as e: - print(f" Warning: Geocoding batch failed: {e}") - - return results - - def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]: """Geocode a single postcode using postcodes.io API.""" if not postcode: diff --git a/scripts/geocode_schools.py b/scripts/geocode_schools.py new file mode 100755 index 0000000..9468bab 
--- /dev/null
+++ b/scripts/geocode_schools.py
@@ -0,0 +1,216 @@
+#!/usr/bin/env python3
+"""
+Geocode all school postcodes and update the database.
+
+This script should be run as a weekly cron job to ensure all schools
+have up-to-date latitude/longitude coordinates.
+
+Usage:
+    python scripts/geocode_schools.py [--force]
+
+Options:
+    --force    Re-geocode all postcodes, even if already geocoded
+
+Crontab example (run every Sunday at 2am):
+    0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
+"""
+
+import argparse
+import sys
+from datetime import datetime
+from pathlib import Path
+from typing import Dict, Tuple
+
+import requests
+
+# Add parent directory to path for imports
+sys.path.insert(0, str(Path(__file__).parent.parent))
+
+from backend.database import SessionLocal
+from backend.models import School
+
+# postcodes.io bulk lookup endpoint.
+POSTCODES_IO_BULK_URL = "https://api.postcodes.io/postcodes"
+# postcodes.io allows max 100 postcodes per request.
+BATCH_SIZE = 100
+# Seconds to wait for each bulk request before skipping that batch.
+REQUEST_TIMEOUT = 30
+# Postcodes shorter than this (after stripping) are not sent to the API.
+MIN_POSTCODE_LENGTH = 5
+
+
+def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
+    """
+    Geocode postcodes in bulk using postcodes.io API.
+
+    Postcodes are stripped, upper-cased and de-duplicated, then sent in
+    batches. A batch that fails (request error, non-200 status, malformed
+    response) is reported and skipped so the remaining batches still run.
+
+    Args:
+        postcodes: Postcode strings. Falsy values, non-strings and entries
+            shorter than MIN_POSTCODE_LENGTH characters are ignored.
+
+    Returns:
+        Dict of normalised (stripped, upper-case) postcode ->
+        (latitude, longitude) for each postcode the API resolved.
+    """
+    results: Dict[str, Tuple[float, float]] = {}
+    # Sorted so batch contents are reproducible from run to run.
+    valid_postcodes = sorted(
+        {
+            p.strip().upper()
+            for p in postcodes
+            if p and isinstance(p, str) and len(p.strip()) >= MIN_POSTCODE_LENGTH
+        }
+    )
+
+    if not valid_postcodes:
+        return results
+
+    total_batches = (len(valid_postcodes) + BATCH_SIZE - 1) // BATCH_SIZE
+
+    for i, batch_start in enumerate(range(0, len(valid_postcodes), BATCH_SIZE)):
+        batch = valid_postcodes[batch_start : batch_start + BATCH_SIZE]
+        print(f" Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)...")
+
+        try:
+            response = requests.post(
+                POSTCODES_IO_BULK_URL,
+                json={"postcodes": batch},
+                timeout=REQUEST_TIMEOUT,
+            )
+            if response.status_code != 200:
+                print(f" Warning: API returned status {response.status_code}")
+                continue
+
+            for item in response.json().get("result") or []:
+                match = item.get("result") if item else None
+                if not match:
+                    continue
+                lat = match.get("latitude")
+                lon = match.get("longitude")
+                # Compare against None, not truthiness: 0.0 is a valid
+                # coordinate and must not be dropped.
+                if lat is not None and lon is not None:
+                    results[item["query"].upper()] = (lat, lon)
+        except Exception as e:
+            # Best effort: one bad batch must not abort the whole job.
+            print(f" Warning: Geocoding batch failed: {e}")
+
+    return results
+
+
+def geocode_schools(force: bool = False) -> None:
+    """
+    Geocode all schools in the database.
+
+    Resolves school postcodes with geocode_postcodes_bulk() and writes the
+    coordinates back to the School rows in a single commit. Schools whose
+    postcode could not be resolved are left unchanged.
+
+    Args:
+        force: If True, re-geocode all postcodes even if already geocoded
+
+    Raises:
+        Exception: Any error raised inside the job is re-raised after the
+            session has been rolled back.
+    """
+    print(f"\n{'='*60}")
+    print(f"School Geocoding Job - {datetime.now().isoformat()}")
+    print(f"{'='*60}\n")
+
+    db = SessionLocal()
+    # Reported in the closing banner; flipped to "failed" on error.
+    outcome = "completed"
+
+    try:
+        # Get schools that need geocoding
+        if force:
+            schools = db.query(School).filter(School.postcode.isnot(None)).all()
+            print(f"Force mode: Processing all {len(schools)} schools with postcodes")
+        else:
+            schools = db.query(School).filter(
+                School.postcode.isnot(None),
+                (School.latitude.is_(None)) | (School.longitude.is_(None)),
+            ).all()
+            print(f"Found {len(schools)} schools without coordinates")
+
+        if not schools:
+            print("No schools to geocode. Exiting.")
+            return
+
+        # Extract unique postcodes
+        postcodes = list({s.postcode.strip().upper() for s in schools if s.postcode})
+        print(f"Unique postcodes to geocode: {len(postcodes)}")
+
+        # Geocode in bulk
+        print("\nGeocoding postcodes...")
+        geocoded = geocode_postcodes_bulk(postcodes)
+        print(f"Successfully geocoded: {len(geocoded)} postcodes")
+
+        # Update database
+        print("\nUpdating database...")
+        updated_count = 0
+        failed_count = 0
+
+        for school in schools:
+            if not school.postcode:
+                continue
+
+            coords = geocoded.get(school.postcode.strip().upper())
+            if coords:
+                school.latitude, school.longitude = coords
+                updated_count += 1
+            else:
+                failed_count += 1
+
+        db.commit()
+
+        print("\nResults:")
+        print(f" - Updated: {updated_count} schools")
+        # failed_count is incremented once per school, so report schools.
+        print(f" - Failed (invalid/not found): {failed_count} schools")
+
+        # Summary stats
+        total_with_coords = db.query(School).filter(
+            School.latitude.isnot(None),
+            School.longitude.isnot(None),
+        ).count()
+        total_schools = db.query(School).count()
+        coverage = 100 * total_with_coords / total_schools if total_schools else 0.0
+
+        print("\nDatabase summary:")
+        print(f" - Total schools: {total_schools}")
+        print(f" - Schools with coordinates: {total_with_coords}")
+        print(f" - Coverage: {coverage:.1f}%")
+
+    except Exception as e:
+        outcome = "failed"
+        print(f"Error during geocoding: {e}")
+        db.rollback()
+        raise
+    finally:
+        db.close()
+        print(f"\n{'='*60}")
+        print(f"Geocoding job {outcome} - {datetime.now().isoformat()}")
+        print(f"{'='*60}\n")
+
+
+def main():
+    """Parse command-line arguments and run the geocoding job."""
+    parser = argparse.ArgumentParser(
+        description="Geocode school postcodes and update database"
+    )
+    parser.add_argument(
+        "--force",
+        action="store_true",
+        help="Re-geocode all postcodes, even if already geocoded",
+    )
+    args = parser.parse_args()
+
+    geocode_schools(force=args.force)
+
+
+if __name__ == "__main__":
+    main()