moving geocoding to a background task
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 57s

This commit is contained in:
Tudor
2026-01-08 15:30:33 +00:00
parent 73971a43f0
commit 40348cb1bd
4 changed files with 216 additions and 70 deletions

View File

@@ -179,6 +179,31 @@ Data is sourced from the UK Government's [Compare School Performance](https://ww
**Important**: When using real data, please comply with the [terms of use](https://www.compare-school-performance.service.gov.uk/download-data) and data protection regulations. **Important**: When using real data, please comply with the [terms of use](https://www.compare-school-performance.service.gov.uk/download-data) and data protection regulations.
## Scheduled Jobs
### Geocoding Schools (Cron Job)
School postcodes are geocoded by a scheduled job, not on-demand. This improves performance and reduces API calls.
**Set up the cron job** (runs weekly on Sundays at 2am):
```bash
# Edit crontab
crontab -e
# Add this line (adjust paths as needed):
0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
```
**Manual run:**
```bash
# Geocode only schools missing coordinates
python scripts/geocode_schools.py
# Force re-geocode all schools
python scripts/geocode_schools.py --force
```
## License ## License
MIT License - feel free to use this project for educational purposes. MIT License - feel free to use this project for educational purposes.

View File

@@ -21,10 +21,9 @@ from starlette.middleware.base import BaseHTTPMiddleware
from .config import settings from .config import settings
from .data_loader import ( from .data_loader import (
clear_cache, clear_cache,
geocode_postcodes_bulk,
geocode_single_postcode,
haversine_distance, haversine_distance,
load_school_data, load_school_data,
geocode_single_postcode,
) )
from .data_loader import get_data_info as get_db_info from .data_loader import get_data_info as get_db_info
from .database import init_db from .database import init_db
@@ -256,7 +255,7 @@ async def get_schools(
] ]
schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"]) schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"])
# Location-based search # Location-based search (uses pre-geocoded data from database)
search_coords = None search_coords = None
if postcode: if postcode:
coords = geocode_single_postcode(postcode) coords = geocode_single_postcode(postcode)
@@ -264,24 +263,7 @@ async def get_schools(
search_coords = coords search_coords = coords
schools_df = schools_df.copy() schools_df = schools_df.copy()
# Geocode school postcodes on-demand if not already cached # Filter by distance using pre-geocoded lat/long from database
if "postcode" in schools_df.columns:
unique_postcodes = schools_df["postcode"].dropna().unique().tolist()
geocoded = geocode_postcodes_bulk(unique_postcodes)
# Add lat/long from geocoded data
schools_df["latitude"] = schools_df["postcode"].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0]
if pd.notna(pc)
else None
)
schools_df["longitude"] = schools_df["postcode"].apply(
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1]
if pd.notna(pc)
else None
)
# Filter by distance
def calc_distance(row): def calc_distance(row):
if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")): if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")):
return float("inf") return float("inf")

View File

@@ -1,6 +1,9 @@
""" """
Data loading module that queries from PostgreSQL database. Data loading module that queries from PostgreSQL database.
Provides efficient queries with caching and lazy loading. Provides efficient queries with caching and lazy loading.
Note: School geocoding is handled by a separate cron job (scripts/geocode_schools.py).
Only user search postcodes are geocoded on-demand via geocode_single_postcode().
""" """
import pandas as pd import pandas as pd
@@ -15,58 +18,10 @@ from .config import settings
from .database import SessionLocal, get_db_session from .database import SessionLocal, get_db_session
from .models import School, SchoolResult from .models import School, SchoolResult
# Cache for postcode geocoding # Cache for user search postcode geocoding (not for school data)
_postcode_cache: Dict[str, Tuple[float, float]] = {} _postcode_cache: Dict[str, Tuple[float, float]] = {}
def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
    """
    Geocode postcodes in bulk using postcodes.io API.

    Each postcode is stripped and upper-cased before use. Entries already
    present in the module-level ``_postcode_cache`` are served from it;
    the rest (if at least 5 characters long) are sent to the API in
    batches, and every successful lookup is written back to the cache.

    Args:
        postcodes: Iterable of candidate postcodes. Falsy and non-string
            entries are ignored.

    Returns:
        Dict of normalised postcode -> (latitude, longitude). Postcodes
        that could not be resolved are simply absent from the dict.
    """
    results = {}

    # Check cache first
    uncached = []
    for pc in postcodes:
        if pc and isinstance(pc, str):
            pc_upper = pc.strip().upper()
            if pc_upper in _postcode_cache:
                results[pc_upper] = _postcode_cache[pc_upper]
            elif len(pc_upper) >= 5:
                uncached.append(pc_upper)

    if not uncached:
        return results

    # De-duplicate while keeping first-seen order
    uncached = list(dict.fromkeys(uncached))

    # postcodes.io allows max 100 postcodes per request
    batch_size = 100
    for start in range(0, len(uncached), batch_size):
        batch = uncached[start:start + batch_size]
        try:
            response = requests.post(
                'https://api.postcodes.io/postcodes',
                json={'postcodes': batch},
                timeout=30
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get('result', []):
                    if item and item.get('result'):
                        pc = item['query'].upper()
                        lat = item['result'].get('latitude')
                        lon = item['result'].get('longitude')
                        # Compare against None rather than truthiness:
                        # a coordinate of exactly 0 is valid and must
                        # not be discarded.
                        if lat is not None and lon is not None:
                            results[pc] = (lat, lon)
                            _postcode_cache[pc] = (lat, lon)
        except Exception as e:
            # Best effort: a failed batch is reported and skipped so the
            # remaining batches are still attempted.
            print(f" Warning: Geocoding batch failed: {e}")

    return results
def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]: def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]:
"""Geocode a single postcode using postcodes.io API.""" """Geocode a single postcode using postcodes.io API."""
if not postcode: if not postcode:

184
scripts/geocode_schools.py Executable file
View File

@@ -0,0 +1,184 @@
#!/usr/bin/env python3
"""
Geocode all school postcodes and update the database.
This script should be run as a weekly cron job to ensure all schools
have up-to-date latitude/longitude coordinates.
Usage:
python scripts/geocode_schools.py [--force]
Options:
--force Re-geocode all postcodes, even if already geocoded
Crontab example (run every Sunday at 2am):
0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
"""
import argparse
import sys
from datetime import datetime
from pathlib import Path
from typing import Dict, Tuple
import requests
# Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent))
from backend.database import SessionLocal
from backend.models import School
def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
    """
    Geocode postcodes in bulk using postcodes.io API.

    Postcodes are stripped, upper-cased and de-duplicated; entries that are
    falsy, not strings, or shorter than 5 characters after stripping are
    ignored. The remainder are posted to the API in batches of 100, with a
    progress line printed per batch.

    Args:
        postcodes: Iterable of candidate postcodes.

    Returns:
        Dict of normalised postcode -> (latitude, longitude). Postcodes the
        API could not resolve, and postcodes in a batch that failed, are
        absent from the dict.
    """
    results = {}

    # Sorted so that batch contents (and therefore the log output) are
    # reproducible between runs.
    valid_postcodes = sorted({
        p.strip().upper()
        for p in postcodes
        if p and isinstance(p, str) and len(p.strip()) >= 5
    })

    if not valid_postcodes:
        return results

    batch_size = 100
    total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size

    for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)):
        batch = valid_postcodes[batch_start : batch_start + batch_size]
        print(f" Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)...")

        try:
            response = requests.post(
                "https://api.postcodes.io/postcodes",
                json={"postcodes": batch},
                timeout=30,
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get("result", []):
                    if not item or not item.get("result"):
                        continue
                    query = item.get("query")
                    if not query:
                        continue
                    lat = item["result"].get("latitude")
                    lon = item["result"].get("longitude")
                    # Compare against None rather than truthiness: a
                    # coordinate of exactly 0 is valid and must not be
                    # discarded.
                    if lat is not None and lon is not None:
                        results[query.upper()] = (lat, lon)
            else:
                print(f" Warning: API returned status {response.status_code}")
        except Exception as e:
            # Best effort: a failed batch is reported and skipped so the
            # remaining batches are still attempted.
            print(f" Warning: Geocoding batch failed: {e}")

    return results
def geocode_schools(force: bool = False) -> None:
    """
    Geocode all schools in the database.

    Selects the schools to process, geocodes their unique postcodes via
    geocode_postcodes_bulk(), writes the resulting latitude/longitude onto
    each school, commits once, and prints a summary.

    Args:
        force: If True, re-geocode all postcodes even if already geocoded

    Raises:
        Exception: Any error raised while querying, geocoding or committing
            is printed, the session is rolled back, and the error is
            re-raised. The session is closed in every case.
    """
    print(f"\n{'='*60}")
    print(f"School Geocoding Job - {datetime.now().isoformat()}")
    print(f"{'='*60}\n")

    db = SessionLocal()
    try:
        # Get schools that need geocoding
        if force:
            schools = db.query(School).filter(School.postcode.isnot(None)).all()
            print(f"Force mode: Processing all {len(schools)} schools with postcodes")
        else:
            schools = db.query(School).filter(
                School.postcode.isnot(None),
                (School.latitude.is_(None)) | (School.longitude.is_(None))
            ).all()
            print(f"Found {len(schools)} schools without coordinates")

        if not schools:
            print("No schools to geocode. Exiting.")
            return

        # Extract unique postcodes
        postcodes = list({
            s.postcode.strip().upper()
            for s in schools
            if s.postcode
        })
        print(f"Unique postcodes to geocode: {len(postcodes)}")

        # Geocode in bulk
        print("\nGeocoding postcodes...")
        geocoded = geocode_postcodes_bulk(postcodes)
        print(f"Successfully geocoded: {len(geocoded)} postcodes")

        # Update database
        print("\nUpdating database...")
        updated_count = 0
        failed_count = 0

        for school in schools:
            if not school.postcode:
                continue

            coords = geocoded.get(school.postcode.strip().upper())
            if coords:
                school.latitude, school.longitude = coords
                updated_count += 1
            else:
                # Existing coordinates (force mode) are left untouched when
                # the lookup fails.
                failed_count += 1

        db.commit()

        print("\nResults:")
        print(f" - Updated: {updated_count} schools")
        # failed_count is incremented once per school, so report it as
        # schools rather than postcodes.
        print(f" - Failed (invalid/not found): {failed_count} schools")

        # Summary stats
        total_with_coords = db.query(School).filter(
            School.latitude.isnot(None),
            School.longitude.isnot(None)
        ).count()
        total_schools = db.query(School).count()

        print("\nDatabase summary:")
        print(f" - Total schools: {total_schools}")
        print(f" - Schools with coordinates: {total_with_coords}")
        # Guard the division so an empty table can never raise here.
        if total_schools:
            print(f" - Coverage: {100*total_with_coords/total_schools:.1f}%")

    except Exception as e:
        print(f"Error during geocoding: {e}")
        db.rollback()
        raise
    finally:
        db.close()

    print(f"\n{'='*60}")
    print(f"Geocoding job completed - {datetime.now().isoformat()}")
    print(f"{'='*60}\n")
def main():
    """Command-line entry point: read the --force flag and run the job."""
    cli = argparse.ArgumentParser(
        description="Geocode school postcodes and update database",
    )
    # Without --force only schools lacking coordinates are processed.
    cli.add_argument(
        "--force",
        help="Re-geocode all postcodes, even if already geocoded",
        action="store_true",
    )
    options = cli.parse_args()
    geocode_schools(force=options.force)


if __name__ == "__main__":
    main()