fix(migration): preserve geocoding across reimports
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m4s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Before dropping the tables, save all existing lat/lon coordinates keyed by URN.
After the reimport, merge the cached coordinates with any newly geocoded ones, so
that schools which already have coordinates skip the postcodes.io API call.
This makes repeated reimports fast and avoids re-geocoding roughly 15,000 schools.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in: (branch name not captured in this export)
Committed: 2026-03-24 22:09:51 +00:00
Parent: a478068d5a
Commit: 00dca39fbd

View File

@@ -185,9 +185,12 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame:
return pd.DataFrame()
def migrate_data(df: pd.DataFrame, geocode: bool = False):
def migrate_data(df: pd.DataFrame, geocode: bool = False, geocode_cache: dict = None):
"""Migrate DataFrame data to database."""
if geocode_cache is None:
geocode_cache = {}
# Clean URN column - convert to integer, drop invalid values
df = df.copy()
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
@@ -200,13 +203,25 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
)
print(f"\nMigrating {len(school_data)} unique schools...")
# Geocode if requested
geocoded = {}
# Geocode postcodes that aren't already in the cache
geocoded = dict(geocode_cache) # start with preserved coordinates
if geocode and "postcode" in df.columns:
print("\nGeocoding postcodes...")
postcodes = df["postcode"].dropna().unique().tolist()
geocoded = geocode_postcodes_bulk(postcodes)
print(f" Successfully geocoded {len(geocoded)} postcodes")
cached_postcodes = {
str(row.get("postcode", "")).strip().upper()
for _, row in school_data.iterrows()
if int(float(str(row.get("urn", 0) or 0))) in geocode_cache
}
postcodes_needed = [
p for p in df["postcode"].dropna().unique()
if str(p).strip().upper() not in cached_postcodes
]
if postcodes_needed:
print(f"\nGeocoding {len(postcodes_needed)} postcodes ({len(geocode_cache)} restored from cache)...")
fresh = geocode_postcodes_bulk(postcodes_needed)
geocoded.update(fresh)
print(f" Successfully geocoded {len(fresh)} new postcodes")
else:
print(f"\nAll {len(geocode_cache)} postcodes restored from cache, skipping geocoding.")
with get_db_session() as db:
# Create schools
@@ -396,6 +411,24 @@ def run_full_migration(geocode: bool = False) -> bool:
Returns True if successful, False if no data found.
Raises exception on error.
"""
# Preserve existing geocoding so a reimport doesn't throw away coordinates
# that took a long time to compute.
geocode_cache: dict[int, tuple[float, float]] = {}
inspector = __import__("sqlalchemy").inspect(engine)
if "schools" in inspector.get_table_names():
try:
with get_db_session() as db:
rows = db.execute(
__import__("sqlalchemy").text(
"SELECT urn, latitude, longitude FROM schools "
"WHERE latitude IS NOT NULL AND longitude IS NOT NULL"
)
).fetchall()
geocode_cache = {r.urn: (r.latitude, r.longitude) for r in rows}
print(f" Saved {len(geocode_cache)} existing geocoded coordinates.")
except Exception as e:
print(f" Warning: could not save geocode cache: {e}")
print("Dropping existing tables...")
Base.metadata.drop_all(bind=engine)
@@ -409,5 +442,5 @@ def run_full_migration(geocode: bool = False) -> bool:
print("Warning: No CSV data found to migrate!")
return False
migrate_data(df, geocode=geocode)
migrate_data(df, geocode=geocode, geocode_cache=geocode_cache)
return True