diff --git a/backend/migration.py b/backend/migration.py index f92ac22..071d38d 100644 --- a/backend/migration.py +++ b/backend/migration.py @@ -185,9 +185,12 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame: return pd.DataFrame() -def migrate_data(df: pd.DataFrame, geocode: bool = False): +def migrate_data(df: pd.DataFrame, geocode: bool = False, geocode_cache: dict = None): """Migrate DataFrame data to database.""" + if geocode_cache is None: + geocode_cache = {} + # Clean URN column - convert to integer, drop invalid values df = df.copy() df["urn"] = pd.to_numeric(df["urn"], errors="coerce") @@ -200,13 +203,25 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False): ) print(f"\nMigrating {len(school_data)} unique schools...") - # Geocode if requested - geocoded = {} + # Geocode postcodes that aren't already in the cache + geocoded = dict(geocode_cache) # start with preserved coordinates if geocode and "postcode" in df.columns: - print("\nGeocoding postcodes...") - postcodes = df["postcode"].dropna().unique().tolist() - geocoded = geocode_postcodes_bulk(postcodes) - print(f" Successfully geocoded {len(geocoded)} postcodes") + cached_postcodes = { + str(pc).strip().upper() + for pc in geocode_cache + if str(pc).strip() + } + postcodes_needed = [ + p for p in df["postcode"].dropna().unique() + if str(p).strip().upper() not in cached_postcodes + ] + if postcodes_needed: + print(f"\nGeocoding {len(postcodes_needed)} postcodes ({len(geocode_cache)} restored from cache)...") + fresh = geocode_postcodes_bulk(postcodes_needed) + geocoded.update(fresh) + print(f" Successfully geocoded {len(fresh)} new postcodes") + else: + print(f"\nAll {len(geocode_cache)} postcodes restored from cache, skipping geocoding.") with get_db_session() as db: # Create schools @@ -396,6 +411,24 @@ def run_full_migration(geocode: bool = False) -> bool: Returns True if successful, False if no data found.
Raises exception on error. """ + # Preserve existing geocoding so a reimport doesn't throw away coordinates + # that took a long time to compute. + geocode_cache: dict[str, tuple[float, float]] = {} + inspector = __import__("sqlalchemy").inspect(engine) + if "schools" in inspector.get_table_names(): + try: + with get_db_session() as db: + rows = db.execute( + __import__("sqlalchemy").text( + "SELECT postcode, latitude, longitude FROM schools " + "WHERE latitude IS NOT NULL AND longitude IS NOT NULL AND postcode IS NOT NULL" + ) + ).fetchall() + geocode_cache = {r.postcode: (r.latitude, r.longitude) for r in rows} + print(f" Saved {len(geocode_cache)} existing geocoded coordinates.") + except Exception as e: + print(f" Warning: could not save geocode cache: {e}") + print("Dropping existing tables...") Base.metadata.drop_all(bind=engine) @@ -409,5 +442,5 @@ def run_full_migration(geocode: bool = False) -> bool: print("Warning: No CSV data found to migrate!") return False - migrate_data(df, geocode=geocode) + migrate_data(df, geocode=geocode, geocode_cache=geocode_cache) return True