"""
GIAS (Get Information About Schools) bulk CSV downloader and loader.

Source: https://get-information-schools.service.gov.uk/Downloads
Update: Daily; we refresh weekly.
Adds:   website, headteacher_name, capacity, trust_name, trust_uid, gender,
        nursery_provision
"""
import argparse
import sys
from datetime import date, timedelta
from pathlib import Path

import pandas as pd
import requests

# Make the parent directory importable so the project-local modules resolve
# when this file is run directly as a script.
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

DEST_DIR = SUPPLEMENTARY_DIR / "gias"

# GIAS bulk download URL — date is injected at runtime
GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv"

# Maps GIAS CSV headers to the internal column names used by load().
COLUMN_MAP = {
    "URN": "urn",
    "SchoolWebsite": "website",
    "SchoolCapacity": "capacity",
    "TrustName": "trust_name",
    "TrustUID": "trust_uid",
    "Gender (name)": "gender",
    "NurseryProvision (name)": "nursery_provision_raw",
    "HeadTitle": "head_title",
    "HeadFirstName": "head_first",
    "HeadLastName": "head_last",
}

# Internal column names that together form the head teacher's display name.
_HEAD_NAME_COLUMNS = ("head_title", "head_first", "head_last")


def _resolve_dest(data_dir: Path | None) -> Path:
    """Return the GIAS directory under *data_dir*, or DEST_DIR if not given."""
    return (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR


def _stream_to_file(resp, dest_file: Path) -> None:
    """Write the streamed response body to *dest_file* via a temporary file.

    The body goes to a sibling ``.part`` file that is renamed only once the
    transfer completes, so an interrupted download never leaves a truncated
    ``gias_*.csv`` that a later run would treat as already downloaded.
    """
    # The ".part" suffix keeps the temp file out of the "gias_*.csv" glob
    # used by load().
    tmp_file = dest_file.with_name(dest_file.name + ".part")
    try:
        with open(tmp_file, "wb") as f:
            for chunk in resp.iter_content(chunk_size=65536):
                if chunk:
                    f.write(chunk)
        tmp_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the partial file on error.
        tmp_file.unlink(missing_ok=True)


def download(data_dir: Path | None = None) -> Path:
    """Download the GIAS bulk CSV and return the path of the local file.

    Tries today's file first; if the server answers 404 for it, falls back
    to yesterday's file. A date whose ``gias_<YYYYMMDD>.csv`` already exists
    locally is not downloaded again.

    Args:
        data_dir: Optional data root; files go to
            ``<data_dir>/supplementary/gias``. Defaults to DEST_DIR.

    Returns:
        Path of the existing or newly downloaded CSV.

    Raises:
        requests.HTTPError: If the server returns an error status (other
            than the tolerated 404 for today's file).
    """
    dest = _resolve_dest(data_dir)
    dest.mkdir(parents=True, exist_ok=True)

    today = date.today()
    # GIAS may not have today's file yet — fall back to yesterday
    candidates = (today, today - timedelta(days=1))
    last_index = len(candidates) - 1

    for index, day in enumerate(candidates):
        stamp = day.strftime("%Y%m%d")
        url = GIAS_URL_TEMPLATE.format(date=stamp)
        filename = f"gias_{stamp}.csv"
        dest_file = dest / filename

        if dest_file.exists():
            print(f" GIAS: {filename} already exists, skipping download.")
            return dest_file

        print(f" GIAS: downloading {url} ...")
        # The context manager releases the connection on every exit path,
        # including the 404 fallback.
        with requests.get(url, timeout=300, stream=True) as resp:
            if resp.status_code == 404 and index < last_index:
                continue
            resp.raise_for_status()
            _stream_to_file(resp, dest_file)

        print(f" GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
        return dest_file

    # Defensive: the final candidate always returns or raises above.
    raise RuntimeError("GIAS: no download candidate was attempted")


def _clean_str(val) -> str | None:
    """Return *val* as a stripped string, or None for missing/placeholder values."""
    if pd.isna(val):
        return None
    s = str(val).strip()
    return s if s and s.lower() not in ("nan", "none", "") else None


def _build_head_name(row) -> str | None:
    """Join the available title/first/last name parts; None if all are missing."""
    parts = (_clean_str(row.get(col)) for col in _HEAD_NAME_COLUMNS)
    return " ".join(p for p in parts if p) or None


def _parse_capacity(val) -> int | None:
    """Return the capacity as an int, or None if missing or not a whole number."""
    s = _clean_str(val)
    return int(s) if s is not None and s.isdecimal() else None


def _parse_nursery(val) -> bool | None:
    """Return True if the value starts with "has", False otherwise, None if missing."""
    if pd.isna(val):
        return None
    return str(val).strip().lower().startswith("has")


def _row_params(row) -> dict:
    """Build the bound parameters of the UPDATE statement for one CSV row."""
    return {
        "urn": int(row["urn"]),
        "website": _clean_str(row.get("website")),
        "headteacher_name": _build_head_name(row),
        "capacity": _parse_capacity(row.get("capacity")),
        "trust_name": _clean_str(row.get("trust_name")),
        "trust_uid": _clean_str(row.get("trust_uid")),
        "gender": _clean_str(row.get("gender")),
        "nursery_provision": _parse_nursery(row.get("nursery_provision_raw")),
    }


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a GIAS CSV and update the matching rows of the ``schools`` table.

    Args:
        path: CSV to load. If None, the last ``gias_*.csv`` (sorted by name)
            in the GIAS directory is used.
        data_dir: Optional data root used to locate the GIAS directory when
            *path* is None. Defaults to DEST_DIR.

    Returns:
        A dict ``{"inserted": 0, "updated": <n>, "skipped": 0}`` where ``n``
        is the number of UPDATE statements issued (one per CSV row with a
        numeric URN).

    Raises:
        FileNotFoundError: If *path* is None and no GIAS CSV is found.
        ValueError: If the CSV has no URN column.
    """
    if path is None:
        dest = _resolve_dest(data_dir)
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        path = files[-1]

    print(f" GIAS: loading {path} ...")
    # Read everything as text: otherwise integer columns containing blanks
    # are inferred as float, turning "1200" into "1200.0" and breaking the
    # capacity and trust_uid handling.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False, dtype=str)
    df = df.rename(columns=COLUMN_MAP)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # .copy() so the assignment below targets an independent frame.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Only the mapped columns are needed from here on.
    wanted = [col for col in COLUMN_MAP.values() if col in df.columns]
    df = df[wanted]

    updated = 0
    with get_session() as session:
        from sqlalchemy import text

        # Built once; the statement is identical for every row.
        stmt = text("""
            UPDATE schools SET
                website = :website,
                headteacher_name = :headteacher_name,
                capacity = :capacity,
                trust_name = :trust_name,
                trust_uid = :trust_uid,
                gender = :gender,
                nursery_provision = :nursery_provision
            WHERE urn = :urn
        """)

        for _, row in df.iterrows():
            session.execute(stmt, _row_params(row))
            # NOTE(review): counts statements issued, not rows matched; a
            # URN absent from `schools` is still counted — confirm intent.
            updated += 1
            if updated % 5000 == 0:
                session.flush()
                print(f" Updated {updated} schools...")

    print(f" GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": 0}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # Load exactly the file that was just downloaded, when there is one.
        load(path=downloaded, data_dir=args.data_dir)