Fix migration script to handle percentage signs in CSV data
All checks were successful
Build and Push Docker Image / build-and-push (push) Successful in 55s

- Updated parse_numeric() to strip percentage signs before parsing
- This fixes the issue where all percentage metrics were showing as NULL/empty
- School cards will now display actual performance data after re-running migration
This commit is contained in:
Tudor Sitaru
2026-01-06 22:22:33 +00:00
parent 1a8ec670b9
commit e2b2ddfb66

View File

@@ -10,23 +10,28 @@ Options:
--geocode Geocode postcodes (requires network access) --geocode Geocode postcodes (requires network access)
""" """
import sys
import os import os
import sys
from pathlib import Path from pathlib import Path
# Add parent directory to path for imports # Add parent directory to path for imports
sys.path.insert(0, str(Path(__file__).parent.parent)) sys.path.insert(0, str(Path(__file__).parent.parent))
import argparse import argparse
import pandas as pd
import numpy as np
import re import re
from typing import Optional, Dict from typing import Dict, Optional
import numpy as np
import pandas as pd
import requests import requests
from backend.config import settings from backend.config import settings
from backend.database import engine, Base, get_db_session from backend.models import (
from backend.models import School, SchoolResult, SCHOOL_FIELD_MAPPING, RESULT_FIELD_MAPPING School,
SchoolResult,
SCHOOL_FIELD_MAPPING,
RESULT_FIELD_MAPPING,
)
from backend.schemas import ( from backend.schemas import (
COLUMN_MAPPINGS, COLUMN_MAPPINGS,
NUMERIC_COLUMNS, NUMERIC_COLUMNS,
@@ -43,8 +48,10 @@ def parse_numeric(value) -> Optional[float]:
if isinstance(value, (int, float)): if isinstance(value, (int, float)):
return float(value) if not np.isnan(value) else None return float(value) if not np.isnan(value) else None
str_val = str(value).strip().upper() str_val = str(value).strip().upper()
if str_val in NULL_VALUES or str_val == '': if str_val in NULL_VALUES or str_val == "":
return None return None
# Remove percentage signs if present
str_val = str_val.replace("%", "")
try: try:
return float(str_val) return float(str_val)
except ValueError: except ValueError:
@@ -53,10 +60,10 @@ def parse_numeric(value) -> Optional[float]:
def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract year from folder name like '2023-2024'.

    A ``YYYY-YYYY`` range is tried first and its second (ending) year is
    returned. If no range is present, the first standalone run of four
    digits is returned instead.

    Args:
        folder_name: Name of a data folder, e.g. ``"2023-2024"``.

    Returns:
        The year as an int, or None when the name contains no four-digit
        group.
    """
    # Prefer the explicit range form so "2023-2024" maps to 2024, not 2023.
    match = re.search(r"(\d{4})-(\d{4})", folder_name)
    if match:
        return int(match.group(2))

    # Fall back to the first four-digit group anywhere in the name.
    match = re.search(r"(\d{4})", folder_name)
    if match:
        return int(match.group(1))

    return None
@@ -68,141 +75,166 @@ def geocode_postcodes_bulk(postcodes: list) -> Dict[str, tuple]:
Returns dict of postcode -> (latitude, longitude). Returns dict of postcode -> (latitude, longitude).
""" """
results = {} results = {}
valid_postcodes = [p.strip().upper() for p in postcodes if p and isinstance(p, str) and len(p.strip()) >= 5] valid_postcodes = [
p.strip().upper()
for p in postcodes
if p and isinstance(p, str) and len(p.strip()) >= 5
]
valid_postcodes = list(set(valid_postcodes)) valid_postcodes = list(set(valid_postcodes))
if not valid_postcodes: if not valid_postcodes:
return results return results
batch_size = 100 batch_size = 100
total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size total_batches = (len(valid_postcodes) + batch_size - 1) // batch_size
for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)): for i, batch_start in enumerate(range(0, len(valid_postcodes), batch_size)):
batch = valid_postcodes[batch_start:batch_start + batch_size] batch = valid_postcodes[batch_start : batch_start + batch_size]
print(f" Geocoding batch {i+1}/{total_batches} ({len(batch)} postcodes)...") print(
f" Geocoding batch {i + 1}/{total_batches} ({len(batch)} postcodes)..."
)
try: try:
response = requests.post( response = requests.post(
'https://api.postcodes.io/postcodes', "https://api.postcodes.io/postcodes",
json={'postcodes': batch}, json={"postcodes": batch},
timeout=30 timeout=30,
) )
if response.status_code == 200: if response.status_code == 200:
data = response.json() data = response.json()
for item in data.get('result', []): for item in data.get("result", []):
if item and item.get('result'): if item and item.get("result"):
pc = item['query'].upper() pc = item["query"].upper()
lat = item['result'].get('latitude') lat = item["result"].get("latitude")
lon = item['result'].get('longitude') lon = item["result"].get("longitude")
if lat and lon: if lat and lon:
results[pc] = (lat, lon) results[pc] = (lat, lon)
except Exception as e: except Exception as e:
print(f" Warning: Geocoding batch failed: {e}") print(f" Warning: Geocoding batch failed: {e}")
return results return results
def load_csv_data(data_dir: Path) -> pd.DataFrame:
    """Load all CSV data from data directory.

    Walks the immediate sub-directories of ``data_dir`` in sorted order.
    A folder is used only if a year can be extracted from its name and it
    contains ``england_ks2final.csv``. Each file is read as latin-1, its
    columns are renamed via ``COLUMN_MAPPINGS``, and ``year``,
    ``local_authority``, ``local_authority_code``, ``school_type`` and a
    combined ``address`` column are derived where the inputs exist.

    Args:
        data_dir: Directory whose sub-folders hold the yearly CSV files.

    Returns:
        One DataFrame with all loaded files concatenated, or an empty
        DataFrame when nothing could be loaded.
    """
    all_data = []

    for folder in sorted(data_dir.iterdir()):
        if not folder.is_dir():
            continue

        year = extract_year_from_folder(folder.name)
        if not year:
            continue

        # Specifically look for the KS2 results file
        ks2_file = folder / "england_ks2final.csv"
        if not ks2_file.exists():
            continue

        csv_file = ks2_file
        print(f" Loading {csv_file.name} (year {year})...")

        try:
            df = pd.read_csv(csv_file, encoding="latin-1", low_memory=False)
        except Exception as e:
            # Best effort: a single unreadable file must not abort the load.
            print(f" Error loading {csv_file}: {e}")
            continue

        # Rename columns
        df.rename(columns=COLUMN_MAPPINGS, inplace=True)
        df["year"] = year

        # Handle local authority name: use the first known name column that
        # is present, otherwise derive the name from the numeric LEA code.
        la_name_cols = ["LANAME", "LA (name)", "LA_NAME", "LA NAME"]
        la_name_col = next((c for c in la_name_cols if c in df.columns), None)
        if la_name_col and la_name_col != "local_authority":
            df["local_authority"] = df[la_name_col]
        elif "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")
            # Unmapped codes fall back to the raw LEA value as text.
            df["local_authority"] = (
                df["local_authority_code"]
                .map(LA_CODE_TO_NAME)
                .fillna(df["LEA"].astype(str))
            )

        # Store LEA code
        if "LEA" in df.columns:
            df["local_authority_code"] = pd.to_numeric(df["LEA"], errors="coerce")

        # Map school type; codes without a mapping are kept as-is.
        if "school_type_code" in df.columns:
            df["school_type"] = (
                df["school_type_code"]
                .map(SCHOOL_TYPE_MAP)
                .fillna(df["school_type_code"])
            )

        # Create combined address; missing part columns are added as None so
        # the row-wise join below can read all four of them.
        addr_parts = ["address1", "address2", "town", "postcode"]
        for col in addr_parts:
            if col not in df.columns:
                df[col] = None

        df["address"] = df.apply(
            lambda r: ", ".join(
                str(v)
                for v in [
                    r.get("address1"),
                    r.get("address2"),
                    r.get("town"),
                    r.get("postcode"),
                ]
                if pd.notna(v) and str(v).strip()
            ),
            axis=1,
        )

        all_data.append(df)
        print(f" Loaded {len(df)} records")

    if all_data:
        result = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(result)}")
        # NOTE(review): assumes a 'urn' column exists after renaming - confirm
        # against COLUMN_MAPPINGS.
        print(f"Unique schools: {result['urn'].nunique()}")
        print(f"Years: {sorted(result['year'].unique())}")
        return result

    return pd.DataFrame()
def migrate_data(df: pd.DataFrame, geocode: bool = False): def migrate_data(df: pd.DataFrame, geocode: bool = False):
"""Migrate DataFrame data to database.""" """Migrate DataFrame data to database."""
# Clean URN column - convert to integer, drop invalid values # Clean URN column - convert to integer, drop invalid values
df = df.copy() df = df.copy()
df['urn'] = pd.to_numeric(df['urn'], errors='coerce') df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=['urn']) df = df.dropna(subset=["urn"])
df['urn'] = df['urn'].astype(int) df["urn"] = df["urn"].astype(int)
# Group by URN to get unique schools (use latest year's data) # Group by URN to get unique schools (use latest year's data)
school_data = df.sort_values('year', ascending=False).groupby('urn').first().reset_index() school_data = (
df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
)
print(f"\nMigrating {len(school_data)} unique schools...") print(f"\nMigrating {len(school_data)} unique schools...")
# Geocode if requested # Geocode if requested
geocoded = {} geocoded = {}
if geocode and 'postcode' in df.columns: if geocode and "postcode" in df.columns:
print("\nGeocoding postcodes...") print("\nGeocoding postcodes...")
postcodes = df['postcode'].dropna().unique().tolist() postcodes = df["postcode"].dropna().unique().tolist()
geocoded = geocode_postcodes_bulk(postcodes) geocoded = geocode_postcodes_bulk(postcodes)
print(f" Successfully geocoded {len(geocoded)} postcodes") print(f" Successfully geocoded {len(geocoded)} postcodes")
with get_db_session() as db: with get_db_session() as db:
# Create schools # Create schools
urn_to_school_id = {} urn_to_school_id = {}
schools_created = 0 schools_created = 0
for _, row in school_data.iterrows(): for _, row in school_data.iterrows():
# Safely parse URN - handle None, NaN, whitespace, and invalid values # Safely parse URN - handle None, NaN, whitespace, and invalid values
urn_val = row.get('urn') urn_val = row.get("urn")
urn = None urn = None
if pd.notna(urn_val): if pd.notna(urn_val):
try: try:
@@ -213,22 +245,22 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
pass pass
if not urn: if not urn:
continue continue
# Skip if we've already added this URN (handles duplicates in source data) # Skip if we've already added this URN (handles duplicates in source data)
if urn in urn_to_school_id: if urn in urn_to_school_id:
continue continue
# Get geocoding data # Get geocoding data
postcode = row.get('postcode') postcode = row.get("postcode")
lat, lon = None, None lat, lon = None, None
if postcode and pd.notna(postcode): if postcode and pd.notna(postcode):
coords = geocoded.get(str(postcode).strip().upper()) coords = geocoded.get(str(postcode).strip().upper())
if coords: if coords:
lat, lon = coords lat, lon = coords
# Safely parse local_authority_code # Safely parse local_authority_code
la_code = None la_code = None
la_code_val = row.get('local_authority_code') la_code_val = row.get("local_authority_code")
if pd.notna(la_code_val): if pd.notna(la_code_val):
try: try:
la_code_str = str(la_code_val).strip() la_code_str = str(la_code_val).strip()
@@ -236,20 +268,32 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
la_code = int(float(la_code_str)) la_code = int(float(la_code_str))
except (ValueError, TypeError): except (ValueError, TypeError):
pass pass
school = School( school = School(
urn=urn, urn=urn,
school_name=row.get('school_name') if pd.notna(row.get('school_name')) else 'Unknown', school_name=row.get("school_name")
local_authority=row.get('local_authority') if pd.notna(row.get('local_authority')) else None, if pd.notna(row.get("school_name"))
else "Unknown",
local_authority=row.get("local_authority")
if pd.notna(row.get("local_authority"))
else None,
local_authority_code=la_code, local_authority_code=la_code,
school_type=row.get('school_type') if pd.notna(row.get('school_type')) else None, school_type=row.get("school_type")
school_type_code=row.get('school_type_code') if pd.notna(row.get('school_type_code')) else None, if pd.notna(row.get("school_type"))
religious_denomination=row.get('religious_denomination') if pd.notna(row.get('religious_denomination')) else None, else None,
age_range=row.get('age_range') if pd.notna(row.get('age_range')) else None, school_type_code=row.get("school_type_code")
address1=row.get('address1') if pd.notna(row.get('address1')) else None, if pd.notna(row.get("school_type_code"))
address2=row.get('address2') if pd.notna(row.get('address2')) else None, else None,
town=row.get('town') if pd.notna(row.get('town')) else None, religious_denomination=row.get("religious_denomination")
postcode=row.get('postcode') if pd.notna(row.get('postcode')) else None, if pd.notna(row.get("religious_denomination"))
else None,
age_range=row.get("age_range")
if pd.notna(row.get("age_range"))
else None,
address1=row.get("address1") if pd.notna(row.get("address1")) else None,
address2=row.get("address2") if pd.notna(row.get("address2")) else None,
town=row.get("town") if pd.notna(row.get("town")) else None,
postcode=row.get("postcode") if pd.notna(row.get("postcode")) else None,
latitude=lat, latitude=lat,
longitude=lon, longitude=lon,
) )
@@ -257,19 +301,19 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
db.flush() # Get the ID db.flush() # Get the ID
urn_to_school_id[urn] = school.id urn_to_school_id[urn] = school.id
schools_created += 1 schools_created += 1
if schools_created % 1000 == 0: if schools_created % 1000 == 0:
print(f" Created {schools_created} schools...") print(f" Created {schools_created} schools...")
print(f" Created {schools_created} schools") print(f" Created {schools_created} schools")
# Create results # Create results
print(f"\nMigrating {len(df)} yearly results...") print(f"\nMigrating {len(df)} yearly results...")
results_created = 0 results_created = 0
for _, row in df.iterrows(): for _, row in df.iterrows():
# Safely parse URN # Safely parse URN
urn_val = row.get('urn') urn_val = row.get("urn")
urn = None urn = None
if pd.notna(urn_val): if pd.notna(urn_val):
try: try:
@@ -280,11 +324,11 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
pass pass
if not urn or urn not in urn_to_school_id: if not urn or urn not in urn_to_school_id:
continue continue
school_id = urn_to_school_id[urn] school_id = urn_to_school_id[urn]
# Safely parse year # Safely parse year
year_val = row.get('year') year_val = row.get("year")
year = None year = None
if pd.notna(year_val): if pd.notna(year_val):
try: try:
@@ -293,98 +337,111 @@ def migrate_data(df: pd.DataFrame, geocode: bool = False):
pass pass
if not year: if not year:
continue continue
result = SchoolResult( result = SchoolResult(
school_id=school_id, school_id=school_id,
year=year, year=year,
total_pupils=parse_numeric(row.get('total_pupils')), total_pupils=parse_numeric(row.get("total_pupils")),
eligible_pupils=parse_numeric(row.get('eligible_pupils')), eligible_pupils=parse_numeric(row.get("eligible_pupils")),
# Expected Standard # Expected Standard
rwm_expected_pct=parse_numeric(row.get('rwm_expected_pct')), rwm_expected_pct=parse_numeric(row.get("rwm_expected_pct")),
reading_expected_pct=parse_numeric(row.get('reading_expected_pct')), reading_expected_pct=parse_numeric(row.get("reading_expected_pct")),
writing_expected_pct=parse_numeric(row.get('writing_expected_pct')), writing_expected_pct=parse_numeric(row.get("writing_expected_pct")),
maths_expected_pct=parse_numeric(row.get('maths_expected_pct')), maths_expected_pct=parse_numeric(row.get("maths_expected_pct")),
gps_expected_pct=parse_numeric(row.get('gps_expected_pct')), gps_expected_pct=parse_numeric(row.get("gps_expected_pct")),
science_expected_pct=parse_numeric(row.get('science_expected_pct')), science_expected_pct=parse_numeric(row.get("science_expected_pct")),
# Higher Standard # Higher Standard
rwm_high_pct=parse_numeric(row.get('rwm_high_pct')), rwm_high_pct=parse_numeric(row.get("rwm_high_pct")),
reading_high_pct=parse_numeric(row.get('reading_high_pct')), reading_high_pct=parse_numeric(row.get("reading_high_pct")),
writing_high_pct=parse_numeric(row.get('writing_high_pct')), writing_high_pct=parse_numeric(row.get("writing_high_pct")),
maths_high_pct=parse_numeric(row.get('maths_high_pct')), maths_high_pct=parse_numeric(row.get("maths_high_pct")),
gps_high_pct=parse_numeric(row.get('gps_high_pct')), gps_high_pct=parse_numeric(row.get("gps_high_pct")),
# Progress # Progress
reading_progress=parse_numeric(row.get('reading_progress')), reading_progress=parse_numeric(row.get("reading_progress")),
writing_progress=parse_numeric(row.get('writing_progress')), writing_progress=parse_numeric(row.get("writing_progress")),
maths_progress=parse_numeric(row.get('maths_progress')), maths_progress=parse_numeric(row.get("maths_progress")),
# Averages # Averages
reading_avg_score=parse_numeric(row.get('reading_avg_score')), reading_avg_score=parse_numeric(row.get("reading_avg_score")),
maths_avg_score=parse_numeric(row.get('maths_avg_score')), maths_avg_score=parse_numeric(row.get("maths_avg_score")),
gps_avg_score=parse_numeric(row.get('gps_avg_score')), gps_avg_score=parse_numeric(row.get("gps_avg_score")),
# Context # Context
disadvantaged_pct=parse_numeric(row.get('disadvantaged_pct')), disadvantaged_pct=parse_numeric(row.get("disadvantaged_pct")),
eal_pct=parse_numeric(row.get('eal_pct')), eal_pct=parse_numeric(row.get("eal_pct")),
sen_support_pct=parse_numeric(row.get('sen_support_pct')), sen_support_pct=parse_numeric(row.get("sen_support_pct")),
sen_ehcp_pct=parse_numeric(row.get('sen_ehcp_pct')), sen_ehcp_pct=parse_numeric(row.get("sen_ehcp_pct")),
stability_pct=parse_numeric(row.get('stability_pct')), stability_pct=parse_numeric(row.get("stability_pct")),
# Gender # Gender
rwm_expected_boys_pct=parse_numeric(row.get('rwm_expected_boys_pct')), rwm_expected_boys_pct=parse_numeric(row.get("rwm_expected_boys_pct")),
rwm_expected_girls_pct=parse_numeric(row.get('rwm_expected_girls_pct')), rwm_expected_girls_pct=parse_numeric(row.get("rwm_expected_girls_pct")),
rwm_high_boys_pct=parse_numeric(row.get('rwm_high_boys_pct')), rwm_high_boys_pct=parse_numeric(row.get("rwm_high_boys_pct")),
rwm_high_girls_pct=parse_numeric(row.get('rwm_high_girls_pct')), rwm_high_girls_pct=parse_numeric(row.get("rwm_high_girls_pct")),
# Disadvantaged # Disadvantaged
rwm_expected_disadvantaged_pct=parse_numeric(row.get('rwm_expected_disadvantaged_pct')), rwm_expected_disadvantaged_pct=parse_numeric(
rwm_expected_non_disadvantaged_pct=parse_numeric(row.get('rwm_expected_non_disadvantaged_pct')), row.get("rwm_expected_disadvantaged_pct")
disadvantaged_gap=parse_numeric(row.get('disadvantaged_gap')), ),
rwm_expected_non_disadvantaged_pct=parse_numeric(
row.get("rwm_expected_non_disadvantaged_pct")
),
disadvantaged_gap=parse_numeric(row.get("disadvantaged_gap")),
# 3-Year # 3-Year
rwm_expected_3yr_pct=parse_numeric(row.get('rwm_expected_3yr_pct')), rwm_expected_3yr_pct=parse_numeric(row.get("rwm_expected_3yr_pct")),
reading_avg_3yr=parse_numeric(row.get('reading_avg_3yr')), reading_avg_3yr=parse_numeric(row.get("reading_avg_3yr")),
maths_avg_3yr=parse_numeric(row.get('maths_avg_3yr')), maths_avg_3yr=parse_numeric(row.get("maths_avg_3yr")),
) )
db.add(result) db.add(result)
results_created += 1 results_created += 1
if results_created % 10000 == 0: if results_created % 10000 == 0:
print(f" Created {results_created} results...") print(f" Created {results_created} results...")
db.flush() db.flush()
print(f" Created {results_created} results") print(f" Created {results_created} results")
# Commit all changes # Commit all changes
db.commit() db.commit()
print("\nMigration complete!") print("\nMigration complete!")
def main():
    """Run the CSV -> PostgreSQL migration from the command line.

    Parses ``--drop`` and ``--geocode``, optionally drops the existing
    tables, creates the tables, loads the CSV data from
    ``settings.data_dir`` and migrates it.

    Returns:
        0 after a migration run, 1 when no CSV data was found.
    """
    # NOTE(review): Base and engine are expected to come from
    # backend.database; confirm that import is still present at file top.
    parser = argparse.ArgumentParser(
        description="Migrate CSV data to PostgreSQL database"
    )
    parser.add_argument(
        "--drop", action="store_true", help="Drop existing tables before migration"
    )
    parser.add_argument("--geocode", action="store_true", help="Geocode postcodes")
    args = parser.parse_args()

    print("=" * 60)
    print("School Data Migration: CSV -> PostgreSQL")
    print("=" * 60)
    # Only the part of the URL after the last '@' is printed.
    print(f"\nDatabase: {settings.database_url.split('@')[-1]}")
    print(f"Data directory: {settings.data_dir}")

    if args.drop:
        print("\n⚠️ Dropping existing tables...")
        Base.metadata.drop_all(bind=engine)

    print("\nCreating tables...")
    Base.metadata.create_all(bind=engine)

    print("\nLoading CSV data...")
    df = load_csv_data(settings.data_dir)

    if df.empty:
        print("No data found to migrate!")
        return 1

    migrate_data(df, geocode=args.geocode)
    return 0


if __name__ == "__main__":
    sys.exit(main())