From 00dca39fbda54ee63209bb816460d1b68cf763ae Mon Sep 17 00:00:00 2001 From: Tudor Date: Tue, 24 Mar 2026 22:09:51 +0000 Subject: [PATCH] fix(migration): preserve geocoding across reimports Before dropping tables, save all existing lat/lon coordinates keyed by URN. After reimport, merge cached coordinates with any newly geocoded ones so schools that already have coordinates skip the postcodes.io API call. This makes repeated reimports fast and avoids re-geocoding ~15k schools. Co-Authored-By: Claude Sonnet 4.6 --- backend/migration.py | 49 ++++++++++++++++++++++++++++++++++++-------- 1 file changed, 41 insertions(+), 8 deletions(-) diff --git a/backend/migration.py b/backend/migration.py index f92ac22..071d38d 100644 --- a/backend/migration.py +++ b/backend/migration.py @@ -185,9 +185,12 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame: return pd.DataFrame() -def migrate_data(df: pd.DataFrame, geocode: bool = False): +def migrate_data(df: pd.DataFrame, geocode: bool = False, geocode_cache: dict = None): """Migrate DataFrame data to database.""" + if geocode_cache is None: + geocode_cache = {} + # Clean URN column - convert to integer, drop invalid values df = df.copy() df["urn"] = pd.to_numeric(df["urn"], errors="coerce") @@ -200,13 +203,25 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False): ) print(f"\nMigrating {len(school_data)} unique schools...") - # Geocode if requested - geocoded = {} + # Geocode postcodes that aren't already in the cache + geocoded = dict(geocode_cache) # start with preserved coordinates if geocode and "postcode" in df.columns: - print("\nGeocoding postcodes...") - postcodes = df["postcode"].dropna().unique().tolist() - geocoded = geocode_postcodes_bulk(postcodes) - print(f" Successfully geocoded {len(geocoded)} postcodes") + cached_postcodes = { + str(row.get("postcode", "")).strip().upper() + for _, row in school_data.iterrows() + if int(float(str(row.get("urn", 0) or 0))) in geocode_cache + } + postcodes_needed = [ + p for p in df["postcode"].dropna().unique() + if str(p).strip().upper() not in cached_postcodes + ] + if postcodes_needed: + print(f"\nGeocoding {len(postcodes_needed)} postcodes ({len(geocode_cache)} restored from cache)...") + fresh = geocode_postcodes_bulk(postcodes_needed) + geocoded.update(fresh) + print(f" Successfully geocoded {len(fresh)} new postcodes") + else: + print(f"\nAll {len(geocode_cache)} postcodes restored from cache, skipping geocoding.") with get_db_session() as db: # Create schools @@ -396,6 +411,24 @@ def run_full_migration(geocode: bool = False) -> bool: Returns True if successful, False if no data found. Raises exception on error. """ + # Preserve existing geocoding so a reimport doesn't throw away coordinates + # that took a long time to compute. + geocode_cache: dict[int, tuple[float, float]] = {} + inspector = __import__("sqlalchemy").inspect(engine) + if "schools" in inspector.get_table_names(): + try: + with get_db_session() as db: + rows = db.execute( + __import__("sqlalchemy").text( + "SELECT urn, latitude, longitude FROM schools " + "WHERE latitude IS NOT NULL AND longitude IS NOT NULL" + ) + ).fetchall() + geocode_cache = {r.urn: (r.latitude, r.longitude) for r in rows} + print(f" Saved {len(geocode_cache)} existing geocoded coordinates.") + except Exception as e: + print(f" Warning: could not save geocode cache: {e}") + print("Dropping existing tables...") Base.metadata.drop_all(bind=engine) @@ -409,5 +442,5 @@ def run_full_migration(geocode: bool = False) -> bool: print("Warning: No CSV data found to migrate!") return False - migrate_data(df, geocode=geocode) + migrate_data(df, geocode=geocode, geocode_cache=geocode_cache) return True