fix(admissions): switch to EES content API + correct publication slug and columns
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 50s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m12s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 33s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
The EES statistics API only exposes ~13 publications; admissions data is not among them. Switch to the EES content API (content.explore-education-statistics.service.gov.uk), which covers all publications.

- ees.py: add get_content_release_id() and download_release_zip_csv(), which fetch the release ZIP and extract a named CSV member from it
- admissions.py: use the corrected slug (primary-and-secondary-school-applications-and-offers) and the column names from the actual CSV (school_urn, total_number_places_offered, times_put_as_1st_preference, etc.); derive first_preference_offers_pct from the offer/application ratio; filter to primary schools only; keep the most recent year per URN

Also includes a SchoolDetailView UX redesign: parent-first section ordering, plain-English labels, national average benchmarks, progress score colour coding, expanded header, quick summary strip, and CSS consolidation.

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
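As a reading aid for the diff below, this is roughly how the new download path is driven end to end; the destination path here is illustrative, not the repo's real config:

    from pathlib import Path
    from sources.ees import get_content_release_id, download_release_zip_csv

    slug = "primary-and-secondary-school-applications-and-offers"

    # Resolve the latest release for the publication via the content API ...
    release_id = get_content_release_id(slug)
    print(release_id)

    # ... then fetch the full-release ZIP and extract the school-level CSV.
    csv_path = download_release_zip_csv(
        slug,
        Path("/tmp/admissions_school_level_latest.csv"),  # illustrative path
        zip_member_keyword="schoollevel",
    )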
admissions.py
@@ -1,7 +1,8 @@
 """
 School Admissions data downloader and loader.
 
-Source: EES publication "secondary-and-primary-school-applications-and-offers"
+Source: EES publication "primary-and-secondary-school-applications-and-offers"
+Content API release ZIP → supporting-files/AppsandOffers_*_SchoolLevel*.csv
 Update: Annual (June/July post-offer round)
 """
 import argparse
@@ -14,47 +15,39 @@ import pandas as pd
 sys.path.insert(0, str(Path(__file__).parent.parent))
 from config import SUPPLEMENTARY_DIR
 from db import get_session
-from sources.ees import get_latest_csv_url, download_csv
+from sources.ees import download_release_zip_csv
 
 DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
-PUBLICATION_SLUG = "secondary-and-primary-school-applications-and-offers"
+PUBLICATION_SLUG = "primary-and-secondary-school-applications-and-offers"
 
-NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
+NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", "Z", ""}
 
+# Maps actual CSV column names → internal field names
 COLUMN_MAP = {
-    "URN": "urn",
-    "urn": "urn",
-    "YEAR": "year",
-    "Year": "year",
-    # PAN
-    "PAN": "pan",
-    "published_admission_number": "pan",
-    "admissions_number": "pan",
-    # Applications
-    "total_applications": "total_applications",
-    "TAPP": "total_applications",
-    "applications_received": "total_applications",
-    # 1st preference offers
-    "first_preference_offers_pct": "first_preference_offers_pct",
-    "pct_1st_preference": "first_preference_offers_pct",
-    "PT1PREF": "first_preference_offers_pct",
-    # Oversubscription
-    "oversubscribed": "oversubscribed",
+    # School identifier
+    "school_urn": "urn",
+    # Year — e.g. 202526 → 2025
+    "time_period": "time_period_raw",
+    # PAN (places offered)
+    "total_number_places_offered": "pan",
+    # Applications (total times put as any preference)
+    "times_put_as_any_preferred_school": "total_applications",
+    # 1st-preference applications
+    "times_put_as_1st_preference": "times_1st_pref",
+    # 1st-preference offers
+    "number_1st_preference_offers": "offers_1st_pref",
 }
 
 
 def download(data_dir: Path | None = None) -> Path:
     dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
     dest.mkdir(parents=True, exist_ok=True)
 
-    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="primary")
-    if not url:
-        url = get_latest_csv_url(PUBLICATION_SLUG)
-    if not url:
-        raise RuntimeError("Could not find CSV URL for admissions publication")
-
-    filename = url.split("/")[-1].split("?")[0] or "admissions_latest.csv"
-    return download_csv(url, dest / filename)
+    dest_file = dest / "admissions_school_level_latest.csv"
+    return download_release_zip_csv(
+        PUBLICATION_SLUG,
+        dest_file,
+        zip_member_keyword="schoollevel",
+    )
 
 
 def _parse_int(val) -> int | None:
@@ -90,35 +83,67 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
         path = files[-1]
 
     print(f" Admissions: loading {path} ...")
-    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
+    df = pd.read_csv(path, encoding="utf-8-sig", low_memory=False)
 
     # Rename columns we care about
     df.rename(columns=COLUMN_MAP, inplace=True)
 
     if "urn" not in df.columns:
         raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
 
+    # Filter to primary schools only
+    if "school_phase" in df.columns:
+        df = df[df["school_phase"].str.lower() == "primary"]
+
     df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
     df = df.dropna(subset=["urn"])
     df["urn"] = df["urn"].astype(int)
 
-    year = None
-    m = re.search(r"20(\d{2})", path.stem)
-    if m:
-        year = int("20" + m.group(1))
+    # Derive year from time_period (e.g. 202526 → 2025)
+    def _extract_year(val) -> int | None:
+        s = str(val).strip()
+        m = re.match(r"(\d{4})\d{2}", s)
+        if m:
+            return int(m.group(1))
+        m2 = re.search(r"20(\d{2})", s)
+        if m2:
+            return int("20" + m2.group(1))
+        return None
+
+    if "time_period_raw" in df.columns:
+        df["year"] = df["time_period_raw"].apply(_extract_year)
+    else:
+        year_m = re.search(r"20(\d{2})", path.stem)
+        df["year"] = int("20" + year_m.group(1)) if year_m else None
+
+    df = df.dropna(subset=["year"])
+    df["year"] = df["year"].astype(int)
+
+    # Keep most recent year per school (file may contain multiple years)
+    df = df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
 
     inserted = 0
     with get_session() as session:
         from sqlalchemy import text
         for _, row in df.iterrows():
             urn = int(row["urn"])
-            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
-            if not row_year:
-                continue
+            year = int(row["year"])
 
             pan = _parse_int(row.get("pan"))
             total_apps = _parse_int(row.get("total_applications"))
-            pct_1st = _parse_pct(row.get("first_preference_offers_pct"))
-            oversubscribed = bool(row.get("oversubscribed")) if pd.notna(row.get("oversubscribed")) else (
-                True if (pan and total_apps and total_apps > pan) else None
-            )
+            times_1st = _parse_int(row.get("times_1st_pref"))
+            offers_1st = _parse_int(row.get("offers_1st_pref"))
+
+            # % of 1st-preference applicants who received an offer
+            if times_1st and times_1st > 0 and offers_1st is not None:
+                pct_1st = round(offers_1st / times_1st * 100, 1)
+            else:
+                pct_1st = None
+
+            oversubscribed = (
+                True if (pan and times_1st and times_1st > pan) else
+                False if (pan and times_1st and times_1st <= pan) else
+                None
+            )
 
             session.execute(
@@ -134,7 +159,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
                     oversubscribed = EXCLUDED.oversubscribed
                 """),
                 {
-                    "urn": urn, "year": row_year, "pan": pan,
+                    "urn": urn, "year": year, "pan": pan,
                     "total_apps": total_apps, "pct_1st": pct_1st,
                     "oversubscribed": oversubscribed,
                 },
@@ -142,6 +167,7 @@ def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
             inserted += 1
             if inserted % 5000 == 0:
+                session.flush()
                 print(f" Processed {inserted} records...")
 
     print(f" Admissions: upserted {inserted} records")
     return {"inserted": inserted, "updated": 0, "skipped": 0}
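To make the two new derivations in load() concrete, a tiny worked example with made-up numbers:

    import re

    # time_period 202526 → academic year 2025 (first four digits)
    m = re.match(r"(\d{4})\d{2}", "202526")
    assert m and int(m.group(1)) == 2025

    # first_preference_offers_pct is the share of 1st-preference applicants
    # who got an offer, e.g. 52 offers against 60 1st-preference applications:
    times_1st, offers_1st = 60, 52  # hypothetical counts for one school
    pct_1st = round(offers_1st / times_1st * 100, 1)
    assert pct_1st == 86.7

    # And with pan = 55 places against 60 1st-preference applications,
    # times_1st > pan, so the school is flagged oversubscribed.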
ees.py
@@ -1,21 +1,27 @@
 """
 Shared EES (Explore Education Statistics) API client.
 
-Base URL: https://api.education.gov.uk/statistics/v1
+Two APIs are available:
+- Statistics API: https://api.education.gov.uk/statistics/v1 (only ~13 publications)
+- Content API: https://content.explore-education-statistics.service.gov.uk/api
+  Covers all publications; use this for admissions and other data not in the stats API.
+  Download all files for a release as a ZIP from /api/releases/{id}/files.
 """
 import sys
+import io
+import zipfile
 from pathlib import Path
 from typing import Optional
 
 import requests
 
-API_BASE = "https://api.education.gov.uk/statistics/v1"
+STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"
+CONTENT_API_BASE = "https://content.explore-education-statistics.service.gov.uk/api"
 TIMEOUT = 60
 
 
 def get_publication_files(publication_slug: str) -> list[dict]:
-    """Return list of data-set file descriptors for a publication."""
-    url = f"{API_BASE}/publications/{publication_slug}/data-set-files"
+    """Return list of data-set file descriptors for a publication (statistics API)."""
+    url = f"{STATS_API_BASE}/publications/{publication_slug}/data-set-files"
     resp = requests.get(url, timeout=TIMEOUT)
     resp.raise_for_status()
     return resp.json().get("results", [])
@@ -23,7 +29,7 @@ def get_publication_files(publication_slug: str) -> list[dict]:
 
 def get_latest_csv_url(publication_slug: str, keyword: str = "") -> Optional[str]:
     """
-    Find the most recent CSV download URL for a publication.
+    Find the most recent CSV download URL for a publication (statistics API).
     Optionally filter by a keyword in the file name.
     """
     files = get_publication_files(publication_slug)
@@ -37,6 +43,58 @@ def get_latest_csv_url(publication_slug: str, keyword: str = "") -> Optional[str
     return None
 
 
+def get_content_release_id(publication_slug: str) -> str:
+    """Return the latest release ID for a publication via the content API."""
+    url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest"
+    resp = requests.get(url, timeout=TIMEOUT)
+    resp.raise_for_status()
+    return resp.json()["id"]
+
+
+def download_release_zip_csv(
+    publication_slug: str,
+    dest_path: Path,
+    zip_member_keyword: str = "",
+) -> Path:
+    """
+    Download the full-release ZIP from the EES content API and extract one CSV.
+
+    If zip_member_keyword is given, the first member whose path contains that
+    keyword (case-insensitive) is extracted; otherwise the first .csv found is used.
+    Returns dest_path (the extracted CSV file).
+    """
+    if dest_path.exists():
+        print(f" EES: {dest_path.name} already exists, skipping.")
+        return dest_path
+
+    release_id = get_content_release_id(publication_slug)
+    zip_url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
+    print(f" EES: downloading release ZIP for '{publication_slug}' ...")
+    resp = requests.get(zip_url, timeout=300, stream=True)
+    resp.raise_for_status()
+
+    data = b"".join(resp.iter_content(chunk_size=65536))
+    with zipfile.ZipFile(io.BytesIO(data)) as z:
+        members = z.namelist()
+        target = None
+        kw = zip_member_keyword.lower()
+        for m in members:
+            if m.endswith(".csv") and (not kw or kw in m.lower()):
+                target = m
+                break
+        if not target:
+            raise ValueError(
+                f"No CSV matching '{zip_member_keyword}' in ZIP. Members: {members}"
+            )
+        print(f" EES: extracting '{target}' ...")
+        dest_path.parent.mkdir(parents=True, exist_ok=True)
+        with z.open(target) as src, open(dest_path, "wb") as dst:
+            dst.write(src.read())
+
+    print(f" EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
+    return dest_path
+
+
 def download_csv(url: str, dest_path: Path) -> Path:
     """Download a CSV from EES to dest_path."""
     if dest_path.exists():
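And a quick smoke test for the extracted CSV, assuming the column names seen in the current release (the same keys COLUMN_MAP expects; the file name is whatever download() wrote):

    import pandas as pd

    df = pd.read_csv("admissions_school_level_latest.csv",
                     encoding="utf-8-sig", low_memory=False)
    expected = {"school_urn", "time_period", "total_number_places_offered",
                "times_put_as_any_preferred_school",
                "times_put_as_1st_preference", "number_1st_preference_offers"}
    missing = expected - set(df.columns)
    assert not missing, f"release schema changed; missing columns: {missing}"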