# school_compare/integrator/scripts/sources/admissions.py

"""
School Admissions data downloader and loader.
Source: EES publication "secondary-and-primary-school-applications-and-offers"
Update: Annual (June/July post-offer round)
"""
import argparse
import math
import re
import sys
from pathlib import Path

import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
# Default directory searched for (and written with) admissions CSV files
# when no explicit data directory is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
# Publication slug handed to get_latest_csv_url() to locate the data set.
PUBLICATION_SLUG = "secondary-and-primary-school-applications-and-offers"
# Cell values (compared after strip + upper-casing) that the parsers treat as
# "no value". Presumably the publisher's suppression / not-applicable
# markers -- TODO confirm against the publication's metadata.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
# Canonical column name paired with every source-header spelling that should
# be renamed to it. Order matters only for readability; each header is unique.
_COLUMN_ALIASES = (
    ("urn", ("URN", "urn")),
    ("year", ("YEAR", "Year")),
    # Published admission number
    ("pan", ("PAN", "published_admission_number", "admissions_number")),
    # Applications received
    (
        "total_applications",
        ("total_applications", "TAPP", "applications_received"),
    ),
    # Share of offers that were a first preference
    (
        "first_preference_offers_pct",
        ("first_preference_offers_pct", "pct_1st_preference", "PT1PREF"),
    ),
    # Oversubscription flag
    ("oversubscribed", ("oversubscribed",)),
)

# Flat "source header -> canonical name" mapping, suitable for
# DataFrame.rename(columns=...).
COLUMN_MAP = {
    header: canonical
    for canonical, headers in _COLUMN_ALIASES
    for header in headers
}
def download(data_dir: Path | None = None) -> Path:
    """Fetch the latest admissions CSV into the local admissions directory.

    Args:
        data_dir: Optional root data directory. When given, the file is
            stored under ``<data_dir>/supplementary/admissions``; otherwise
            the module-level ``DEST_DIR`` is used.

    Returns:
        Whatever ``download_csv`` returns for the chosen URL and destination
        (annotated as a ``Path``).

    Raises:
        RuntimeError: If no CSV URL can be resolved for the publication.
    """
    if data_dir:
        target_dir = data_dir / "supplementary" / "admissions"
    else:
        target_dir = DEST_DIR
    target_dir.mkdir(parents=True, exist_ok=True)

    # Prefer a file matching the "primary" keyword; fall back to an
    # unfiltered lookup when that finds nothing.
    csv_url = (
        get_latest_csv_url(PUBLICATION_SLUG, keyword="primary")
        or get_latest_csv_url(PUBLICATION_SLUG)
    )
    if not csv_url:
        raise RuntimeError("Could not find CSV URL for admissions publication")

    # Last path segment of the URL, minus any query string.
    last_segment = csv_url.rsplit("/", 1)[-1]
    basename = last_segment.partition("?")[0]
    if not basename:
        basename = "admissions_latest.csv"
    return download_csv(csv_url, target_dir / basename)
def _parse_int(val) -> int | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace(",", "")
if s in NULL_VALUES:
return None
try:
return int(float(s))
except ValueError:
return None
def _parse_pct(val) -> float | None:
if pd.isna(val):
return None
s = str(val).strip().upper().replace("%", "")
if s in NULL_VALUES:
return None
try:
return float(s)
except ValueError:
return None
# Spellings accepted for the optional "oversubscribed" column, compared after
# strip + upper-casing.
_TRUE_FLAGS = frozenset({"TRUE", "T", "YES", "Y", "1"})
_FALSE_FLAGS = frozenset({"FALSE", "F", "NO", "N", "0"})


def _parse_flag(val) -> bool | None:
    """Interpret a raw CSV cell as a boolean flag; ``None`` when unknown.

    Text is matched against known true/false spellings instead of being
    passed to ``bool()``, because ``bool("No")`` (any non-empty string) is
    ``True``. Unrecognised text yields ``None``. Non-string values (bools,
    numbers) keep their ordinary truthiness.
    """
    if pd.isna(val):
        return None
    if isinstance(val, str):
        token = val.strip().upper()
        if token in _TRUE_FLAGS:
            return True
        if token in _FALSE_FLAGS:
            return False
        return None
    return bool(val)


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load an admissions CSV and upsert one row per (urn, year) into the database.

    Args:
        path: CSV file to load. When ``None``, the lexicographically last
            ``*.csv`` in the admissions directory is used.
        data_dir: Optional root data directory; the admissions directory is
            ``<data_dir>/supplementary/admissions`` when given, otherwise the
            module-level ``DEST_DIR``. Only consulted when ``path`` is ``None``.

    Returns:
        A dict with keys ``inserted`` (rows sent to the upsert statement),
        ``updated`` (always 0: inserts and updates are not distinguished) and
        ``skipped`` (rows with a URN but no usable year).

    Raises:
        FileNotFoundError: If ``path`` is ``None`` and no CSV is found.
        ValueError: If the file has no recognisable URN column.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No admissions CSV found in {dest}")
        path = files[-1]
    path = Path(path)  # tolerate a plain string; ``.stem`` is needed below

    print(f" Admissions: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # A UTF-8 byte-order mark read as latin-1 ends up glued to the first
    # header as "\xef\xbb\xbf", which would stop it matching COLUMN_MAP.
    # Strip that (and a real U+FEFF, and stray whitespace) before renaming.
    df.columns = [
        str(col).removeprefix("\ufeff").removeprefix("\xef\xbb\xbf").strip()
        for col in df.columns
    ]
    df.rename(columns=COLUMN_MAP, inplace=True)
    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Keep only rows whose URN is numeric. Chaining astype() onto the
    # dropna() result avoids assigning into a possibly-derived frame.
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).astype({"urn": int})

    # Fallback year: the first "20xx" found in the file name, if any.
    year_match = re.search(r"20\d{2}", path.stem)
    file_year = int(year_match.group(0)) if year_match else None

    # Imported locally, as the original code did.
    from sqlalchemy import text

    # Built once: the statement is identical for every row.
    upsert = text("""
        INSERT INTO school_admissions
        (urn, year, published_admission_number, total_applications,
        first_preference_offers_pct, oversubscribed)
        VALUES (:urn, :year, :pan, :total_apps, :pct_1st, :oversubscribed)
        ON CONFLICT (urn, year) DO UPDATE SET
        published_admission_number = EXCLUDED.published_admission_number,
        total_applications = EXCLUDED.total_applications,
        first_preference_offers_pct = EXCLUDED.first_preference_offers_pct,
        oversubscribed = EXCLUDED.oversubscribed
    """)

    inserted = 0
    skipped = 0
    with get_session() as session:
        for _, row in df.iterrows():
            urn = int(row["urn"])

            # Per-row year wins; a missing or unparseable cell falls back to
            # the year taken from the file name instead of raising.
            row_year = _parse_int(row.get("year"))
            if row_year is None:
                row_year = file_year
            if not row_year:
                skipped += 1
                continue

            pan = _parse_int(row.get("pan"))
            total_apps = _parse_int(row.get("total_applications"))
            pct_1st = _parse_pct(row.get("first_preference_offers_pct"))

            oversubscribed = _parse_flag(row.get("oversubscribed"))
            if oversubscribed is None and pan and total_apps and total_apps > pan:
                # NOTE(review): the derived value is only ever True or None,
                # never False -- kept as in the original; confirm intent.
                oversubscribed = True

            session.execute(
                upsert,
                {
                    "urn": urn, "year": row_year, "pan": pan,
                    "total_apps": total_apps, "pct_1st": pct_1st,
                    "oversubscribed": oversubscribed,
                },
            )
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" Admissions: upserted {inserted} records")
    if skipped:
        print(f" Admissions: skipped {skipped} rows with no usable year")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}
if __name__ == "__main__":
    # Command-line entry point: optionally download the latest CSV, then
    # optionally load it, depending on --action (default: do both).
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    options = cli.parse_args()

    requested = options.action
    if requested == "download" or requested == "all":
        download(options.data_dir)
    if requested == "load" or requested == "all":
        load(data_dir=options.data_dir)