fix(migration): preserve geocoding across reimports
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m4s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m4s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Before dropping tables, save all existing lat/lon coordinates keyed by URN. During the reimport, merge these cached coordinates with any newly geocoded ones, so that postcodes belonging to schools that already have coordinates are left out of the postcodes.io API call. This makes repeated reimports fast and avoids re-geocoding ~15k schools.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -185,9 +185,12 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame:
|
||||
return pd.DataFrame()
|
||||
|
||||
|
||||
def migrate_data(df: pd.DataFrame, geocode: bool = False):
|
||||
def migrate_data(df: pd.DataFrame, geocode: bool = False, geocode_cache: dict = None):
|
||||
"""Migrate DataFrame data to database."""
|
||||
|
||||
if geocode_cache is None:
|
||||
geocode_cache = {}
|
||||
|
||||
# Clean URN column - convert to integer, drop invalid values
|
||||
df = df.copy()
|
||||
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
|
||||
@@ -200,13 +203,25 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
|
||||
)
|
||||
print(f"\nMigrating {len(school_data)} unique schools...")
|
||||
|
||||
# Geocode if requested
|
||||
geocoded = {}
|
||||
# Geocode postcodes that aren't already in the cache
|
||||
geocoded = dict(geocode_cache) # start with preserved coordinates
|
||||
if geocode and "postcode" in df.columns:
|
||||
print("\nGeocoding postcodes...")
|
||||
postcodes = df["postcode"].dropna().unique().tolist()
|
||||
geocoded = geocode_postcodes_bulk(postcodes)
|
||||
print(f" Successfully geocoded {len(geocoded)} postcodes")
|
||||
cached_postcodes = {
|
||||
str(row.get("postcode", "")).strip().upper()
|
||||
for _, row in school_data.iterrows()
|
||||
if int(float(str(row.get("urn", 0) or 0))) in geocode_cache
|
||||
}
|
||||
postcodes_needed = [
|
||||
p for p in df["postcode"].dropna().unique()
|
||||
if str(p).strip().upper() not in cached_postcodes
|
||||
]
|
||||
if postcodes_needed:
|
||||
print(f"\nGeocoding {len(postcodes_needed)} postcodes ({len(geocode_cache)} restored from cache)...")
|
||||
fresh = geocode_postcodes_bulk(postcodes_needed)
|
||||
geocoded.update(fresh)
|
||||
print(f" Successfully geocoded {len(fresh)} new postcodes")
|
||||
else:
|
||||
print(f"\nAll {len(geocode_cache)} postcodes restored from cache, skipping geocoding.")
|
||||
|
||||
with get_db_session() as db:
|
||||
# Create schools
|
||||
@@ -396,6 +411,24 @@ def run_full_migration(geocode: bool = False) -> bool:
|
||||
Returns True if successful, False if no data found.
|
||||
Raises exception on error.
|
||||
"""
|
||||
# Preserve existing geocoding so a reimport doesn't throw away coordinates
|
||||
# that took a long time to compute.
|
||||
geocode_cache: dict[int, tuple[float, float]] = {}
|
||||
inspector = __import__("sqlalchemy").inspect(engine)
|
||||
if "schools" in inspector.get_table_names():
|
||||
try:
|
||||
with get_db_session() as db:
|
||||
rows = db.execute(
|
||||
__import__("sqlalchemy").text(
|
||||
"SELECT urn, latitude, longitude FROM schools "
|
||||
"WHERE latitude IS NOT NULL AND longitude IS NOT NULL"
|
||||
)
|
||||
).fetchall()
|
||||
geocode_cache = {r.urn: (r.latitude, r.longitude) for r in rows}
|
||||
print(f" Saved {len(geocode_cache)} existing geocoded coordinates.")
|
||||
except Exception as e:
|
||||
print(f" Warning: could not save geocode cache: {e}")
|
||||
|
||||
print("Dropping existing tables...")
|
||||
Base.metadata.drop_all(bind=engine)
|
||||
|
||||
@@ -409,5 +442,5 @@ def run_full_migration(geocode: bool = False) -> bool:
|
||||
print("Warning: No CSV data found to migrate!")
|
||||
return False
|
||||
|
||||
migrate_data(df, geocode=geocode)
|
||||
migrate_data(df, geocode=geocode, geocode_cache=geocode_cache)
|
||||
return True
|
||||
|
||||
Reference in New Issue
Block a user