diff --git a/scripts/migrate_csv_to_db.py b/scripts/migrate_csv_to_db.py index bd1f3b1..921ef66 100644 --- a/scripts/migrate_csv_to_db.py +++ b/scripts/migrate_csv_to_db.py @@ -177,8 +177,14 @@ def load_csv_data(data_dir: Path) -> pd.DataFrame: def migrate_data(df: pd.DataFrame, geocode: bool = False): """Migrate DataFrame data to database.""" - # Group by URN to get unique schools - school_data = df.groupby('urn').first().reset_index() + # Clean URN column - convert to integer, drop invalid values + df = df.copy() + df['urn'] = pd.to_numeric(df['urn'], errors='coerce') + df = df.dropna(subset=['urn']) + df['urn'] = df['urn'].astype(int) + + # Group by URN to get unique schools (use latest year's data) + school_data = df.sort_values('year', ascending=False).groupby('urn').first().reset_index() print(f"\nMigrating {len(school_data)} unique schools...") # Geocode if requested @@ -208,6 +214,10 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False): if not urn: continue + # Skip if we've already added this URN (handles duplicates in source data) + if urn in urn_to_school_id: + continue + # Get geocoding data postcode = row.get('postcode') lat, lon = None, None