fix(admissions): switch to EES content API + correct publication slug and columns
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 50s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m12s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

The EES statistics API only exposes ~13 publications; admissions data is not
among them. Switch to the EES content API (content.explore-education-statistics.
service.gov.uk) which covers all publications.

- ees.py: add get_content_release_id() and download_release_zip_csv(); the
  latter fetches the release ZIP and extracts a named CSV member from it
- admissions.py: use corrected slug (primary-and-secondary-school-applications-
  and-offers), correct column names from actual CSV (school_urn,
  total_number_places_offered, times_put_as_1st_preference, etc.), derive
  first_preference_offers_pct from the ratio of 1st-preference offers to
  1st-preference applications, filter to primary schools only, keep most
  recent year per URN

Also includes SchoolDetailView UX redesign: parent-first section ordering,
plain-English labels, national average benchmarks, progress score colour
coding, expanded header, quick summary strip, and CSS consolidation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-25 10:06:36 +00:00
parent 00dca39fbd
commit b68063c9b9
5 changed files with 951 additions and 652 deletions

View File

@@ -1,7 +1,8 @@
"""
School Admissions data downloader and loader.
Source: EES publication "secondary-and-primary-school-applications-and-offers"
Source: EES publication "primary-and-secondary-school-applications-and-offers"
Content API release ZIP → supporting-files/AppsandOffers_*_SchoolLevel*.csv
Update: Annual (June/July post-offer round)
"""
import argparse
@@ -14,47 +15,39 @@ import pandas as pd
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv
from sources.ees import download_release_zip_csv
DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
PUBLICATION_SLUG = "secondary-and-primary-school-applications-and-offers"
PUBLICATION_SLUG = "primary-and-secondary-school-applications-and-offers"
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", "Z", ""}
# Maps actual CSV column names → internal field names
COLUMN_MAP = {
"URN": "urn",
"urn": "urn",
"YEAR": "year",
"Year": "year",
# PAN
"PAN": "pan",
"published_admission_number": "pan",
"admissions_number": "pan",
# Applications
"total_applications": "total_applications",
"TAPP": "total_applications",
"applications_received": "total_applications",
# 1st preference offers
"first_preference_offers_pct": "first_preference_offers_pct",
"pct_1st_preference": "first_preference_offers_pct",
"PT1PREF": "first_preference_offers_pct",
# Oversubscription
"oversubscribed": "oversubscribed",
# School identifier
"school_urn": "urn",
# Year — e.g. 202526 → 2025
"time_period": "time_period_raw",
# PAN (places offered)
"total_number_places_offered": "pan",
# Applications (total times put as any preference)
"times_put_as_any_preferred_school": "total_applications",
# 1st-preference applications
"times_put_as_1st_preference": "times_1st_pref",
# 1st-preference offers
"number_1st_preference_offers": "offers_1st_pref",
}
def download(data_dir: Path | None = None) -> Path:
dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
dest.mkdir(parents=True, exist_ok=True)
url = get_latest_csv_url(PUBLICATION_SLUG, keyword="primary")
if not url:
url = get_latest_csv_url(PUBLICATION_SLUG)
if not url:
raise RuntimeError("Could not find CSV URL for admissions publication")
filename = url.split("/")[-1].split("?")[0] or "admissions_latest.csv"
return download_csv(url, dest / filename)
dest_file = dest / "admissions_school_level_latest.csv"
return download_release_zip_csv(
PUBLICATION_SLUG,
dest_file,
zip_member_keyword="schoollevel",
)
def _parse_int(val) -> int | None:
@@ -90,35 +83,67 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
path = files[-1]
print(f" Admissions: loading {path} ...")
df = pd.read_csv(path, encoding="latin-1", low_memory=False)
df = pd.read_csv(path, encoding="utf-8-sig", low_memory=False)
# Rename columns we care about
df.rename(columns=COLUMN_MAP, inplace=True)
if "urn" not in df.columns:
raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
# Filter to primary schools only
if "school_phase" in df.columns:
df = df[df["school_phase"].str.lower() == "primary"]
df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
df = df.dropna(subset=["urn"])
df["urn"] = df["urn"].astype(int)
year = None
m = re.search(r"20(\d{2})", path.stem)
if m:
year = int("20" + m.group(1))
# Derive year from time_period (e.g. 202526 → 2025)
def _extract_year(val) -> int | None:
s = str(val).strip()
m = re.match(r"(\d{4})\d{2}", s)
if m:
return int(m.group(1))
m2 = re.search(r"20(\d{2})", s)
if m2:
return int("20" + m2.group(1))
return None
if "time_period_raw" in df.columns:
df["year"] = df["time_period_raw"].apply(_extract_year)
else:
year_m = re.search(r"20(\d{2})", path.stem)
df["year"] = int("20" + year_m.group(1)) if year_m else None
df = df.dropna(subset=["year"])
df["year"] = df["year"].astype(int)
# Keep most recent year per school (file may contain multiple years)
df = df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
inserted = 0
with get_session() as session:
from sqlalchemy import text
for _, row in df.iterrows():
urn = int(row["urn"])
row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
if not row_year:
continue
year = int(row["year"])
pan = _parse_int(row.get("pan"))
total_apps = _parse_int(row.get("total_applications"))
pct_1st = _parse_pct(row.get("first_preference_offers_pct"))
oversubscribed = bool(row.get("oversubscribed")) if pd.notna(row.get("oversubscribed")) else (
True if (pan and total_apps and total_apps > pan) else None
times_1st = _parse_int(row.get("times_1st_pref"))
offers_1st = _parse_int(row.get("offers_1st_pref"))
# % of 1st-preference applicants who received an offer
if times_1st and times_1st > 0 and offers_1st is not None:
pct_1st = round(offers_1st / times_1st * 100, 1)
else:
pct_1st = None
oversubscribed = (
True if (pan and times_1st and times_1st > pan) else
False if (pan and times_1st and times_1st <= pan) else
None
)
session.execute(
@@ -134,7 +159,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
oversubscribed = EXCLUDED.oversubscribed
"""),
{
"urn": urn, "year": row_year, "pan": pan,
"urn": urn, "year": year, "pan": pan,
"total_apps": total_apps, "pct_1st": pct_1st,
"oversubscribed": oversubscribed,
},
@@ -142,6 +167,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
inserted += 1
if inserted % 5000 == 0:
session.flush()
print(f" Processed {inserted} records...")
print(f" Admissions: upserted {inserted} records")
return {"inserted": inserted, "updated": 0, "skipped": 0}