fix(migration): preserve geocoding across reimports
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m4s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Before dropping the tables, save all existing lat/lon coordinates keyed by URN.
After the reimport, merge the cached coordinates with any newly geocoded ones, so
that schools which already have coordinates skip the postcodes.io API call.
This makes repeated reimports fast and avoids re-geocoding roughly 15,000 schools.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in: (branch name not captured in this export)
Committed: 2026-03-24 22:09:51 +00:00
Parent: a478068d5a
Commit: 00dca39fbd

View File

@@ -185,9 +185,12 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame:
return pd.DataFrame()
def migrate_data(df: pd.DataFrame, geocode: bool = False):
def migrate_data(df: pd.DataFrame, geocode: bool = False, geocode_cache: dict = None):
"""Migrate DataFrame data to database."""
if geocode_cache is None:
geocode_cache = {}
# Clean URN column - convert to integer, drop invalid values
df = df.copy()
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
@@ -200,13 +203,25 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
)
print(f"\nMigrating {len(school_data)} unique schools...")
# Geocode if requested
geocoded = {}
# Geocode postcodes that aren't already in the cache
geocoded = dict(geocode_cache) # start with preserved coordinates
if geocode and "postcode" in df.columns:
print("\nGeocoding postcodes...")
postcodes = df["postcode"].dropna().unique().tolist()
geocoded = geocode_postcodes_bulk(postcodes)
print(f" Successfully geocoded {len(geocoded)} postcodes")
cached_postcodes = {
str(row.get("postcode", "")).strip().upper()
for _, row in school_data.iterrows()
if int(float(str(row.get("urn", 0) or 0))) in geocode_cache
}
postcodes_needed = [
p for p in df["postcode"].dropna().unique()
if str(p).strip().upper() not in cached_postcodes
]
if postcodes_needed:
print(f"\nGeocoding {len(postcodes_needed)} postcodes ({len(geocode_cache)} restored from cache)...")
fresh = geocode_postcodes_bulk(postcodes_needed)
geocoded.update(fresh)
print(f" Successfully geocoded {len(fresh)} new postcodes")
else:
print(f"\nAll {len(geocode_cache)} postcodes restored from cache, skipping geocoding.")
with get_db_session() as db:
# Create schools
@@ -396,6 +411,24 @@ def run_full_migration(geocode: bool = False) -> bool:
Returns True if successful, False if no data found.
Raises exception on error.
"""
# Preserve existing geocoding so a reimport doesn't throw away coordinates
# that took a long time to compute.
geocode_cache: dict[int, tuple[float, float]] = {}
inspector = __import__("sqlalchemy").inspect(engine)
if "schools" in inspector.get_table_names():
try:
with get_db_session() as db:
rows = db.execute(
__import__("sqlalchemy").text(
"SELECT urn, latitude, longitude FROM schools "
"WHERE latitude IS NOT NULL AND longitude IS NOT NULL"
)
).fetchall()
geocode_cache = {r.urn: (r.latitude, r.longitude) for r in rows}
print(f" Saved {len(geocode_cache)} existing geocoded coordinates.")
except Exception as e:
print(f" Warning: could not save geocode cache: {e}")
print("Dropping existing tables...")
Base.metadata.drop_all(bind=engine)
@@ -409,5 +442,5 @@ def run_full_migration(geocode: bool = False) -> bool:
print("Warning: No CSV data found to migrate!")
return False
migrate_data(df, geocode=geocode)
migrate_data(df, geocode=geocode, geocode_cache=geocode_cache)
return True