moving geocoding to a background task
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 57s
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 57s
This commit is contained in:
25
README.md
25
README.md
@@ -179,6 +179,31 @@ Data is sourced from the UK Government's [Compare School Performance](https://ww
|
|||||||
|
|
||||||
**Important**: When using real data, please comply with the [terms of use](https://www.compare-school-performance.service.gov.uk/download-data) and data protection regulations.
|
**Important**: When using real data, please comply with the [terms of use](https://www.compare-school-performance.service.gov.uk/download-data) and data protection regulations.
|
||||||
|
|
||||||
|
## Scheduled Jobs
|
||||||
|
|
||||||
|
### Geocoding Schools (Cron Job)
|
||||||
|
|
||||||
|
School postcodes are geocoded by a scheduled job, not on-demand. This improves performance and reduces API calls.
|
||||||
|
|
||||||
|
**Set up the cron job** (runs weekly on Sundays at 2am):
|
||||||
|
|
||||||
|
```bash
|
||||||
|
# Edit crontab
|
||||||
|
crontab -e
|
||||||
|
|
||||||
|
# Add this line (adjust paths as needed):
|
||||||
|
0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
|
||||||
|
```
|
||||||
|
|
||||||
|
**Manual run:**
|
||||||
|
```bash
|
||||||
|
# Geocode only schools missing coordinates
|
||||||
|
python scripts/geocode_schools.py
|
||||||
|
|
||||||
|
# Force re-geocode all schools
|
||||||
|
python scripts/geocode_schools.py --force
|
||||||
|
```
|
||||||
|
|
||||||
## License
|
## License
|
||||||
|
|
||||||
MIT License - feel free to use this project for educational purposes.
|
MIT License - feel free to use this project for educational purposes.
|
||||||
|
|||||||
@@ -21,10 +21,9 @@ from starlette.middleware.base import BaseHTTPMiddleware
|
|||||||
from .config import settings
|
from .config import settings
|
||||||
from .data_loader import (
|
from .data_loader import (
|
||||||
clear_cache,
|
clear_cache,
|
||||||
geocode_postcodes_bulk,
|
|
||||||
geocode_single_postcode,
|
|
||||||
haversine_distance,
|
haversine_distance,
|
||||||
load_school_data,
|
load_school_data,
|
||||||
|
geocode_single_postcode,
|
||||||
)
|
)
|
||||||
from .data_loader import get_data_info as get_db_info
|
from .data_loader import get_data_info as get_db_info
|
||||||
from .database import init_db
|
from .database import init_db
|
||||||
@@ -256,7 +255,7 @@ async def get_schools(
|
|||||||
]
|
]
|
||||||
schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"])
|
schools_df = df_latest[available_cols].drop_duplicates(subset=["urn"])
|
||||||
|
|
||||||
# Location-based search
|
# Location-based search (uses pre-geocoded data from database)
|
||||||
search_coords = None
|
search_coords = None
|
||||||
if postcode:
|
if postcode:
|
||||||
coords = geocode_single_postcode(postcode)
|
coords = geocode_single_postcode(postcode)
|
||||||
@@ -264,24 +263,7 @@ async def get_schools(
|
|||||||
search_coords = coords
|
search_coords = coords
|
||||||
schools_df = schools_df.copy()
|
schools_df = schools_df.copy()
|
||||||
|
|
||||||
# Geocode school postcodes on-demand if not already cached
|
# Filter by distance using pre-geocoded lat/long from database
|
||||||
if "postcode" in schools_df.columns:
|
|
||||||
unique_postcodes = schools_df["postcode"].dropna().unique().tolist()
|
|
||||||
geocoded = geocode_postcodes_bulk(unique_postcodes)
|
|
||||||
|
|
||||||
# Add lat/long from geocoded data
|
|
||||||
schools_df["latitude"] = schools_df["postcode"].apply(
|
|
||||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[0]
|
|
||||||
if pd.notna(pc)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
schools_df["longitude"] = schools_df["postcode"].apply(
|
|
||||||
lambda pc: geocoded.get(str(pc).strip().upper(), (None, None))[1]
|
|
||||||
if pd.notna(pc)
|
|
||||||
else None
|
|
||||||
)
|
|
||||||
|
|
||||||
# Filter by distance
|
|
||||||
def calc_distance(row):
|
def calc_distance(row):
|
||||||
if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")):
|
if pd.isna(row.get("latitude")) or pd.isna(row.get("longitude")):
|
||||||
return float("inf")
|
return float("inf")
|
||||||
|
|||||||
@@ -1,6 +1,9 @@
|
|||||||
"""
|
"""
|
||||||
Data loading module that queries from PostgreSQL database.
|
Data loading module that queries from PostgreSQL database.
|
||||||
Provides efficient queries with caching and lazy loading.
|
Provides efficient queries with caching and lazy loading.
|
||||||
|
|
||||||
|
Note: School geocoding is handled by a separate cron job (scripts/geocode_schools.py).
|
||||||
|
Only user search postcodes are geocoded on-demand via geocode_single_postcode().
|
||||||
"""
|
"""
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
@@ -15,58 +18,10 @@ from .config import settings
|
|||||||
from .database import SessionLocal, get_db_session
|
from .database import SessionLocal, get_db_session
|
||||||
from .models import School, SchoolResult
|
from .models import School, SchoolResult
|
||||||
|
|
||||||
# Cache for postcode geocoding
|
# Cache for user search postcode geocoding (not for school data)
|
||||||
_postcode_cache: Dict[str, Tuple[float, float]] = {}
|
_postcode_cache: Dict[str, Tuple[float, float]] = {}
|
||||||
|
|
||||||
|
|
||||||
def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
    """
    Geocode postcodes in bulk using postcodes.io API.

    Each postcode is stripped and upper-cased. Values already present in the
    module-level ``_postcode_cache`` are answered from the cache; the rest
    (when at least 5 characters long) are sent to the API in batches, and
    every successful lookup is written back to the cache.

    Args:
        postcodes: Raw postcode values. Falsy and non-string entries are ignored.

    Returns:
        Dict of normalised postcode -> (latitude, longitude). Postcodes the
        API could not resolve, and batches that failed, are simply absent.
    """
    results = {}

    # Check cache first
    uncached = set()
    for pc in postcodes:
        if not (pc and isinstance(pc, str)):
            continue
        pc_upper = pc.strip().upper()
        if pc_upper in _postcode_cache:
            results[pc_upper] = _postcode_cache[pc_upper]
        elif len(pc_upper) >= 5:
            uncached.add(pc_upper)

    if not uncached:
        return results

    # Sorted so that batch composition is reproducible between runs.
    pending = sorted(uncached)

    # postcodes.io allows max 100 postcodes per request
    batch_size = 100
    for start in range(0, len(pending), batch_size):
        batch = pending[start:start + batch_size]
        try:
            response = requests.post(
                'https://api.postcodes.io/postcodes',
                json={'postcodes': batch},
                timeout=30
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get('result', []):
                    if item and item.get('result'):
                        pc = item['query'].upper()
                        lat = item['result'].get('latitude')
                        lon = item['result'].get('longitude')
                        # Compare against None rather than truthiness:
                        # 0.0 is a legitimate coordinate value.
                        if lat is not None and lon is not None:
                            results[pc] = (lat, lon)
                            _postcode_cache[pc] = (lat, lon)
        except Exception as e:
            # Best effort: a failed batch is reported and the rest continue.
            print(f" Warning: Geocoding batch failed: {e}")

    return results
|
|
||||||
|
|
||||||
|
|
||||||
def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]:
|
def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]:
|
||||||
"""Geocode a single postcode using postcodes.io API."""
|
"""Geocode a single postcode using postcodes.io API."""
|
||||||
if not postcode:
|
if not postcode:
|
||||||
|
|||||||
184
scripts/geocode_schools.py
Executable file
184
scripts/geocode_schools.py
Executable file
@@ -0,0 +1,184 @@
|
|||||||
|
#!/usr/bin/env python3
|
||||||
|
"""
|
||||||
|
Geocode all school postcodes and update the database.
|
||||||
|
|
||||||
|
This script should be run as a weekly cron job to ensure all schools
|
||||||
|
have up-to-date latitude/longitude coordinates.
|
||||||
|
|
||||||
|
Usage:
|
||||||
|
python scripts/geocode_schools.py [--force]
|
||||||
|
|
||||||
|
Options:
|
||||||
|
--force Re-geocode all postcodes, even if already geocoded
|
||||||
|
|
||||||
|
Crontab example (run every Sunday at 2am):
|
||||||
|
0 2 * * 0 cd /path/to/school_compare && /path/to/venv/bin/python scripts/geocode_schools.py >> /var/log/geocode_schools.log 2>&1
|
||||||
|
"""
|
||||||
|
|
||||||
|
import argparse
|
||||||
|
import sys
|
||||||
|
from datetime import datetime
|
||||||
|
from pathlib import Path
|
||||||
|
from typing import Dict, Tuple
|
||||||
|
|
||||||
|
import requests
|
||||||
|
|
||||||
|
# Add parent directory to path for imports
|
||||||
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||||
|
|
||||||
|
from backend.database import SessionLocal
|
||||||
|
from backend.models import School
|
||||||
|
|
||||||
|
|
||||||
|
def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
    """
    Geocode postcodes in bulk using postcodes.io API.

    Postcodes are stripped and upper-cased before use. Entries that are
    falsy, not strings, or shorter than 5 characters after stripping are
    skipped, and duplicates are removed before any request is made.

    Args:
        postcodes: Raw postcode values to look up.

    Returns:
        Dict of normalised postcode -> (latitude, longitude) for every
        postcode the API resolved. Batches that fail or return a non-200
        status are reported on stdout and contribute nothing.
    """
    results: Dict[str, Tuple[float, float]] = {}

    # Sorted so batch composition and progress output are reproducible.
    valid_postcodes = sorted({
        p.strip().upper()
        for p in postcodes
        if p and isinstance(p, str) and len(p.strip()) >= 5
    })

    if not valid_postcodes:
        return results

    batch_size = 100
    total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size

    for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)):
        batch = valid_postcodes[batch_start : batch_start + batch_size]
        print(f" Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)...")

        try:
            response = requests.post(
                "https://api.postcodes.io/postcodes",
                json={"postcodes": batch},
                timeout=30,
            )
            if response.status_code == 200:
                data = response.json()
                for item in data.get("result", []):
                    # Entries with an empty "result" are unresolved postcodes.
                    if item and item.get("result"):
                        pc = item["query"].upper()
                        lat = item["result"].get("latitude")
                        lon = item["result"].get("longitude")
                        # Compare against None rather than truthiness:
                        # 0.0 is a legitimate coordinate value.
                        if lat is not None and lon is not None:
                            results[pc] = (lat, lon)
            else:
                print(f" Warning: API returned status {response.status_code}")
        except Exception as e:
            # Best effort: a failed batch is reported and the rest continue.
            print(f" Warning: Geocoding batch failed: {e}")

    return results
|
||||||
|
|
||||||
|
|
||||||
|
def geocode_schools(force: bool = False) -> None:
    """
    Geocode all schools in the database.

    Selects schools that have a postcode (all of them in force mode,
    otherwise only those missing a latitude or longitude), geocodes their
    unique postcodes in bulk, writes the coordinates back onto each school
    and commits once. Progress and a coverage summary are printed to stdout.

    Args:
        force: If True, re-geocode all postcodes even if already geocoded

    Raises:
        Exception: Any error raised while querying or updating is re-raised
            after the session has been rolled back.
    """
    print(f"\n{'='*60}")
    print(f"School Geocoding Job - {datetime.now().isoformat()}")
    print(f"{'='*60}\n")

    db = SessionLocal()

    try:
        # Get schools that need geocoding
        if force:
            schools = db.query(School).filter(School.postcode.isnot(None)).all()
            print(f"Force mode: Processing all {len(schools)} schools with postcodes")
        else:
            schools = db.query(School).filter(
                School.postcode.isnot(None),
                (School.latitude.is_(None)) | (School.longitude.is_(None))
            ).all()
            print(f"Found {len(schools)} schools without coordinates")

        if not schools:
            print("No schools to geocode. Exiting.")
            return

        # Extract unique postcodes
        postcodes = list({
            s.postcode.strip().upper()
            for s in schools
            if s.postcode
        })
        print(f"Unique postcodes to geocode: {len(postcodes)}")

        # Geocode in bulk
        print("\nGeocoding postcodes...")
        geocoded = geocode_postcodes_bulk(postcodes)
        print(f"Successfully geocoded: {len(geocoded)} postcodes")

        # Update database
        print("\nUpdating database...")
        updated_count = 0
        failed_count = 0

        for school in schools:
            if not school.postcode:
                continue

            # Same normalisation as used to build the lookup keys above.
            pc_upper = school.postcode.strip().upper()
            coords = geocoded.get(pc_upper)

            if coords:
                school.latitude = coords[0]
                school.longitude = coords[1]
                updated_count += 1
            else:
                failed_count += 1

        db.commit()

        print("\nResults:")
        print(f" - Updated: {updated_count} schools")
        # The counter is incremented once per school, so report it as schools.
        print(f" - Failed (invalid/not found): {failed_count} schools")

        # Summary stats
        total_with_coords = db.query(School).filter(
            School.latitude.isnot(None),
            School.longitude.isnot(None)
        ).count()
        total_schools = db.query(School).count()

        # Guard the percentage so an empty table cannot raise ZeroDivisionError.
        coverage = 100 * total_with_coords / total_schools if total_schools else 0.0

        print("\nDatabase summary:")
        print(f" - Total schools: {total_schools}")
        print(f" - Schools with coordinates: {total_with_coords}")
        print(f" - Coverage: {coverage:.1f}%")

    except Exception as e:
        print(f"Error during geocoding: {e}")
        db.rollback()
        raise
    finally:
        db.close()
        # NOTE(review): the closing banner sits in the finally block, so it
        # also prints after an early return or a failure - confirm intended.
        print(f"\n{'='*60}")
        print(f"Geocoding job completed - {datetime.now().isoformat()}")
        print(f"{'='*60}\n")
|
||||||
|
|
||||||
|
|
||||||
|
def main():
    """Parse the command-line options and run the school geocoding job."""
    cli = argparse.ArgumentParser(description="Geocode school postcodes and update database")
    # --force is a plain on/off switch; it is off unless given.
    cli.add_argument(
        "--force",
        help="Re-geocode all postcodes, even if already geocoded",
        action="store_true",
    )
    options = cli.parse_args()

    geocode_schools(force=options.force)
|
||||||
|
|
||||||
|
|
||||||
|
if __name__ == "__main__":
|
||||||
|
main()
|
||||||
Reference in New Issue
Block a user