#!/usr/bin/env python3
"""
Geocode all school postcodes and update the database.

This script should be run as a weekly cron job to ensure all schools
have up-to-date latitude/longitude coordinates.

Usage:
    python scripts/geocode_schools.py [--force]

Options:
    --force    Re-geocode all postcodes, even if already geocoded

Crontab example (run every Sunday at 2am):
    0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
"""

import argparse
import sys
from datetime import datetime
from pathlib import Path
from typing import Any, Dict, Iterable, Tuple

import requests

# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))

from backend.database import SessionLocal
from backend.models import School

# Endpoint used for bulk postcode lookups.
POSTCODES_IO_BULK_URL = "https://api.postcodes.io/postcodes"
# Postcodes sent per request (postcodes.io documents a limit of 100 per
# bulk call -- see the API reference before raising this).
BULK_BATCH_SIZE = 100
# Seconds to wait for each bulk request before giving up on that batch.
REQUEST_TIMEOUT_SECONDS = 30
# Strings shorter than this (after stripping) are not sent to the API.
MIN_POSTCODE_LENGTH = 5


def _normalize_postcode(postcode: str) -> str:
    """Return *postcode* stripped of surrounding whitespace and upper-cased."""
    return postcode.strip().upper()


def _extract_coordinates(payload: Any) -> Dict[str, Tuple[float, float]]:
    """Pull postcode -> (latitude, longitude) pairs out of a bulk response body.

    Items with no match, no query string, or a missing latitude/longitude
    are skipped. Coordinates are compared against ``None`` rather than
    tested for truthiness so that a legitimate value of 0 is kept.
    """
    coordinates: Dict[str, Tuple[float, float]] = {}
    for item in payload.get("result") or []:
        if not item:
            continue
        query = item.get("query")
        match = item.get("result")
        if not query or not match:
            continue
        lat = match.get("latitude")
        lon = match.get("longitude")
        if lat is None or lon is None:
            continue
        coordinates[query.upper()] = (lat, lon)
    return coordinates


def geocode_postcodes_bulk(
    postcodes: Iterable[str],
    *,
    batch_size: int = BULK_BATCH_SIZE,
    timeout: float = REQUEST_TIMEOUT_SECONDS,
) -> Dict[str, Tuple[float, float]]:
    """
    Geocode postcodes in bulk using postcodes.io API.

    Args:
        postcodes: Postcode strings. Falsy values, non-strings and strings
            shorter than ``MIN_POSTCODE_LENGTH`` after stripping are ignored;
            the rest are stripped, upper-cased and de-duplicated.
        batch_size: Number of postcodes sent per HTTP request.
        timeout: Per-request timeout in seconds.

    Returns:
        Dict of normalised postcode -> (latitude, longitude). Postcodes the
        API could not resolve, and postcodes in batches that failed, are
        simply absent from the result.

    Raises:
        ValueError: If ``batch_size`` is less than 1.
    """
    if batch_size < 1:
        raise ValueError("batch_size must be at least 1")

    results: Dict[str, Tuple[float, float]] = {}

    # Sorted so that batch contents (and therefore the logs) are the same
    # from run to run for the same input.
    valid_postcodes = sorted(
        {
            _normalize_postcode(p)
            for p in postcodes
            if p and isinstance(p, str) and len(p.strip()) >= MIN_POSTCODE_LENGTH
        }
    )
    if not valid_postcodes:
        return results

    total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size

    for batch_number, batch_start in enumerate(
        range(0, len(valid_postcodes), batch_size), start=1
    ):
        batch = valid_postcodes[batch_start : batch_start + batch_size]
        print(
            f" Geocoding batch {batch_number}/{total_batches} "
            f"({len(batch)} postcodes)..."
        )

        try:
            response = requests.post(
                POSTCODES_IO_BULK_URL,
                json={"postcodes": batch},
                timeout=timeout,
            )
            if response.status_code != 200:
                print(f" Warning: API returned status {response.status_code}")
                continue
            results.update(_extract_coordinates(response.json()))
        except Exception as e:
            # Deliberately broad: one bad batch (network error, malformed
            # body) is logged and skipped so the remaining batches still run.
            print(f" Warning: Geocoding batch failed: {e}")

    return results


def geocode_schools(force: bool = False) -> None:
    """
    Geocode all schools in the database.

    Opens a session with ``SessionLocal()``, selects the schools to process,
    looks their postcodes up in bulk, writes latitude/longitude back onto the
    ``School`` rows and commits once at the end. Progress and a summary are
    printed to stdout.

    Args:
        force: If True, re-geocode all postcodes even if already geocoded

    Raises:
        Exception: Any error raised while querying or committing is printed,
            the session is rolled back, and the error is re-raised.
    """
    banner = "=" * 60
    print(f"\n{banner}")
    print(f"School Geocoding Job - {datetime.now().isoformat()}")
    print(f"{banner}\n")

    db = SessionLocal()

    try:
        # Get schools that need geocoding
        if force:
            schools = db.query(School).filter(School.postcode.isnot(None)).all()
            print(f"Force mode: Processing all {len(schools)} schools with postcodes")
        else:
            schools = db.query(School).filter(
                School.postcode.isnot(None),
                (School.latitude.is_(None)) | (School.longitude.is_(None)),
            ).all()
            print(f"Found {len(schools)} schools without coordinates")

        if not schools:
            print("No schools to geocode. Exiting.")
            return

        # Extract unique postcodes
        postcodes = list(
            {_normalize_postcode(s.postcode) for s in schools if s.postcode}
        )
        print(f"Unique postcodes to geocode: {len(postcodes)}")

        # Geocode in bulk
        print("\nGeocoding postcodes...")
        geocoded = geocode_postcodes_bulk(postcodes)
        print(f"Successfully geocoded: {len(geocoded)} postcodes")

        # Update database
        print("\nUpdating database...")
        updated_count = 0
        failed_count = 0

        for school in schools:
            if not school.postcode:
                continue

            coords = geocoded.get(_normalize_postcode(school.postcode))
            if coords:
                school.latitude, school.longitude = coords
                updated_count += 1
            else:
                failed_count += 1

        db.commit()

        print("\nResults:")
        print(f" - Updated: {updated_count} schools")
        # failed_count is incremented once per school, so report it as such.
        print(f" - Failed (invalid/not found): {failed_count} schools")

        # Summary stats
        total_with_coords = db.query(School).filter(
            School.latitude.isnot(None),
            School.longitude.isnot(None),
        ).count()
        total_schools = db.query(School).count()
        # Guard the percentage against an empty table.
        coverage = 100 * total_with_coords / total_schools if total_schools else 0.0

        print("\nDatabase summary:")
        print(f" - Total schools: {total_schools}")
        print(f" - Schools with coordinates: {total_with_coords}")
        print(f" - Coverage: {coverage:.1f}%")

    except Exception as e:
        print(f"Error during geocoding: {e}")
        db.rollback()
        raise
    finally:
        db.close()

    print(f"\n{banner}")
    print(f"Geocoding job completed - {datetime.now().isoformat()}")
    print(f"{banner}\n")


def main() -> None:
    """Parse command-line arguments and run the geocoding job."""
    parser = argparse.ArgumentParser(
        description="Geocode school postcodes and update database"
    )
    parser.add_argument(
        "--force",
        action="store_true",
        help="Re-geocode all postcodes, even if already geocoded",
    )
    args = parser.parse_args()

    geocode_schools(force=args.force)


if __name__ == "__main__":
    main()