"""
School Admissions data downloader and loader.

Source: EES publication "primary-and-secondary-school-applications-and-offers"
        Content API release ZIP → supporting-files/AppsandOffers_*_SchoolLevel*.csv
Update: Annual (June/July post-offer round)
"""

import argparse
import math
import re
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import download_release_zip_csv

DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
PUBLICATION_SLUG = "primary-and-secondary-school-applications-and-offers"

# Placeholder tokens that stand for "no value" in the source CSV cells.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", "Z", ""}

# Maps actual CSV column names → internal field names
COLUMN_MAP = {
    # School identifier
    "school_urn": "urn",
    # Year — e.g. 202526 → 2025
    "time_period": "time_period_raw",
    # PAN (places offered)
    "total_number_places_offered": "pan",
    # Applications (total times put as any preference)
    "times_put_as_any_preferred_school": "total_applications",
    # 1st-preference applications
    "times_put_as_1st_preference": "times_1st_pref",
    # 1st-preference offers
    "number_1st_preference_offers": "offers_1st_pref",
}

# Six-digit academic-year code (e.g. "202526"); the first four digits are the year.
_TIME_PERIOD_RE = re.compile(r"(\d{4})\d{2}")
# Fallback: any "20xx" found anywhere in the text.
_ANY_YEAR_RE = re.compile(r"20(\d{2})")

_UPSERT_SQL = """
    INSERT INTO school_admissions
        (urn, year, published_admission_number, total_applications,
         first_preference_offers_pct, oversubscribed)
    VALUES (:urn, :year, :pan, :total_apps, :pct_1st, :oversubscribed)
    ON CONFLICT (urn, year) DO UPDATE SET
        published_admission_number = EXCLUDED.published_admission_number,
        total_applications = EXCLUDED.total_applications,
        first_preference_offers_pct = EXCLUDED.first_preference_offers_pct,
        oversubscribed = EXCLUDED.oversubscribed
"""


def _resolve_dest(data_dir: Path | None) -> Path:
    """Return the admissions directory under *data_dir*, or the default one."""
    return (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR


def download(data_dir: Path | None = None) -> Path:
    """Download the school-level admissions CSV from the latest release ZIP.

    Args:
        data_dir: Optional data root; the file is placed in
            ``<data_dir>/supplementary/admissions``. Defaults to ``DEST_DIR``.

    Returns:
        Whatever ``download_release_zip_csv`` returns (annotated as a path).
    """
    dest = _resolve_dest(data_dir)
    dest.mkdir(parents=True, exist_ok=True)
    dest_file = dest / "admissions_school_level_latest.csv"
    return download_release_zip_csv(
        PUBLICATION_SLUG,
        dest_file,
        zip_member_keyword="schoollevel",
    )


def _parse_int(val) -> int | None:
    """Parse a CSV cell into an int, or ``None`` when it holds no usable number.

    Thousands separators are removed and fractional values are truncated.
    Placeholder tokens from ``NULL_VALUES``, unparseable text and non-finite
    values ("inf", "nan") all yield ``None``.
    """
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace(",", "")
    if s in NULL_VALUES:
        return None
    try:
        number = float(s)
    except ValueError:
        return None
    # int(inf) raises OverflowError and int(nan) raises ValueError; reject both.
    if not math.isfinite(number):
        return None
    return int(number)


def _parse_pct(val) -> float | None:
    """Parse a percentage cell (optionally ending in ``%``) into a float.

    Placeholder tokens from ``NULL_VALUES``, unparseable text and non-finite
    values yield ``None``.
    """
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace("%", "")
    if s in NULL_VALUES:
        return None
    try:
        number = float(s)
    except ValueError:
        return None
    return number if math.isfinite(number) else None


def _extract_year(val) -> int | None:
    """Derive the starting year from a time-period value (e.g. 202526 → 2025)."""
    s = str(val).strip()
    m = _TIME_PERIOD_RE.match(s)
    if m:
        return int(m.group(1))
    m2 = _ANY_YEAR_RE.search(s)
    if m2:
        return int("20" + m2.group(1))
    return None


def _first_pref_offer_pct(times_1st: int | None, offers_1st: int | None) -> float | None:
    """Return the % of 1st-preference applicants who received an offer."""
    if times_1st is None or times_1st <= 0 or offers_1st is None:
        return None
    return round(offers_1st / times_1st * 100, 1)


def _is_oversubscribed(pan: int | None, times_1st: int | None) -> bool | None:
    """Return whether 1st-preference applications exceed the places offered.

    ``None`` means "unknown": the places figure is missing or not positive, or
    the application count is missing. A known count of zero applications is a
    definite ``False``, not unknown.
    """
    if pan is None or pan <= 0 or times_1st is None:
        return None
    return times_1st > pan


def _prepare_frame(df: pd.DataFrame, path: Path) -> pd.DataFrame:
    """Normalise the raw CSV frame to one row per primary school (latest year).

    Raises:
        ValueError: if no URN column is present after renaming.
    """
    # Rename columns we care about
    df = df.rename(columns=COLUMN_MAP)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Filter to primary schools only. Cast to str so a non-string column
    # (e.g. all blank) cannot break the .str accessor.
    if "school_phase" in df.columns:
        phase = df["school_phase"].astype(str).str.strip().str.lower()
        df = df[phase == "primary"]

    # Work on an independent copy so the assignments below never write to a
    # view of the filtered frame.
    df = df.copy()

    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Derive year from time_period (e.g. 202526 → 2025), else from the file name
    if "time_period_raw" in df.columns:
        df["year"] = df["time_period_raw"].apply(_extract_year)
    else:
        year_m = _ANY_YEAR_RE.search(path.stem)
        df["year"] = int("20" + year_m.group(1)) if year_m else None

    df = df.dropna(subset=["year"])
    df["year"] = df["year"].astype(int)

    # Keep most recent year per school (file may contain multiple years).
    # drop_duplicates keeps whole rows; groupby().first() would instead take the
    # first non-null value per column and could blend values from older years
    # into the latest year's record.
    df = df.sort_values("year", ascending=False, kind="stable")
    return df.drop_duplicates(subset="urn", keep="first").reset_index(drop=True)


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load the admissions CSV and upsert one row per school into the database.

    Args:
        path: CSV file to load. When omitted, the alphabetically last ``*.csv``
            in the admissions directory is used.
        data_dir: Optional data root used to locate the admissions directory
            when *path* is not given.

    Returns:
        A dict with ``inserted`` (number of rows upserted) plus ``updated`` and
        ``skipped``, which are always 0.

    Raises:
        FileNotFoundError: if no CSV is found in the admissions directory.
        ValueError: if the CSV has no URN column.
    """
    from sqlalchemy import text

    if path is None:
        dest = _resolve_dest(data_dir)
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No admissions CSV found in {dest}")
        path = files[-1]
    path = Path(path)

    print(f" Admissions: loading {path} ...")
    df = pd.read_csv(path, encoding="utf-8-sig", low_memory=False)
    df = _prepare_frame(df, path)

    upsert = text(_UPSERT_SQL)
    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            pan = _parse_int(row.get("pan"))
            times_1st = _parse_int(row.get("times_1st_pref"))
            offers_1st = _parse_int(row.get("offers_1st_pref"))

            session.execute(
                upsert,
                {
                    "urn": int(row["urn"]),
                    "year": int(row["year"]),
                    "pan": pan,
                    "total_apps": _parse_int(row.get("total_applications")),
                    "pct_1st": _first_pref_offer_pct(times_1st, offers_1st),
                    "oversubscribed": _is_oversubscribed(pan, times_1st),
                },
            )
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()
                print(f" Processed {inserted} records...")

    print(f" Admissions: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()
    if args.action in ("download", "all"):
        download(args.data_dir)
    if args.action in ("load", "all"):
        load(data_dir=args.data_dir)