133 lines
4.3 KiB
Python
133 lines
4.3 KiB
Python
|
|
"""
|
||
|
|
Phonics Screening Check downloader and loader.
|
||
|
|
|
||
|
|
Source: EES publication "phonics-screening-check-and-key-stage-1-assessments-england"
|
||
|
|
Update: Annual (September/October)
|
||
|
|
"""
|
||
|
|
import argparse
|
||
|
|
import sys
|
||
|
|
from pathlib import Path
|
||
|
|
|
||
|
|
import pandas as pd
|
||
|
|
|
||
|
|
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
|
|
from config import SUPPLEMENTARY_DIR
|
||
|
|
from db import get_session
|
||
|
|
from sources.ees import get_latest_csv_url, download_csv
|
||
|
|
|
||
|
|
# Slug of the EES publication that the phonics CSV is looked up under.
PUBLICATION_SLUG = "phonics-screening-check-and-key-stage-1-assessments-england"

# Default folder for phonics CSVs, used when the caller supplies no data_dir.
DEST_DIR = SUPPLEMENTARY_DIR / "phonics"
|
||
|
|
|
||
|
|
# Column names seen in the phonics CSV (they vary by year), grouped by the
# canonical column each one is renamed to.
_COLUMN_ALIASES = {
    "urn": ("URN", "urn"),
    # Year 1 pass rate: % meeting the expected standard in Y1
    "year1_phonics_pct": (
        "PPTA1",
        "PPTA1B",
        "PT_MET_PHON_Y1",
        "Y1_MET_EXPECTED_PCT",
    ),
    # Year 2 (re-takers)
    "year2_phonics_pct": (
        "PPTA2",
        "PT_MET_PHON_Y2",
        "Y2_MET_EXPECTED_PCT",
    ),
    # Year label
    "year": ("YEAR", "Year"),
}

# Flat lookup: CSV column name -> canonical column name.
COLUMN_MAP = {
    alias: canonical
    for canonical, aliases in _COLUMN_ALIASES.items()
    for alias in aliases
}

# Cell markers that mean "no value"; _parse_pct compares against these after
# stripping and upper-casing the cell.
NULL_VALUES = {"", "LOW", "NA", "NE", "NEW", "NP", "SUPP"}
|
||
|
|
|
||
|
|
|
||
|
|
def download(data_dir: Path | None = None) -> Path:
    """Download the latest school-level phonics CSV.

    Args:
        data_dir: Optional data root. When given, the file is saved under
            ``data_dir / "supplementary" / "phonics"``; otherwise under
            ``DEST_DIR``. The directory is created if it does not exist.

    Returns:
        The value returned by ``download_csv`` for the saved file.

    Raises:
        RuntimeError: if no CSV URL is found for ``PUBLICATION_SLUG``.
    """
    from urllib.parse import urlsplit

    dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        raise RuntimeError("Could not find CSV URL for phonics publication")

    # Use the last segment of the URL *path* only: urlsplit separates the
    # "?query" and "#fragment" parts, so a "/" inside them cannot leak into
    # the filename.
    filename = urlsplit(url).path.rsplit("/", 1)[-1] or "phonics_latest.csv"

    # load() discovers files with glob("*.csv"); guarantee that suffix so a
    # download URL without a file extension is still picked up afterwards.
    if not filename.endswith(".csv"):
        filename += ".csv"

    return download_csv(url, dest / filename)
|
||
|
|
|
||
|
|
|
||
|
|
def _parse_pct(val) -> float | None:
|
||
|
|
if pd.isna(val):
|
||
|
|
return None
|
||
|
|
s = str(val).strip().upper().replace("%", "")
|
||
|
|
if s in NULL_VALUES:
|
||
|
|
return None
|
||
|
|
try:
|
||
|
|
return float(s)
|
||
|
|
except ValueError:
|
||
|
|
return None
|
||
|
|
|
||
|
|
|
||
|
|
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert school-level phonics results from a CSV into the ``phonics`` table.

    Args:
        path: CSV file to load. When omitted, the last file (sorted by name)
            matching ``*.csv`` in the phonics directory is used.
        data_dir: Optional data root; the phonics directory is then
            ``data_dir / "supplementary" / "phonics"`` instead of ``DEST_DIR``.

    Returns:
        A dict with ``inserted`` (rows upserted), ``updated`` (always 0, the
        upsert does not tell inserts from updates) and ``skipped`` (rows
        without a numeric URN, or without a year in the data or filename).

    Raises:
        FileNotFoundError: ``path`` was not given and no CSV is available.
        ValueError: the CSV has no recognised URN column. ``int()`` also
            raises it if a row's year value is not an integer.
    """
    import re

    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No phonics CSV found in {dest}")
        path = files[-1]
    path = Path(path)

    print(f" Phonics: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # latin-1 never fails to decode, but it turns a UTF-8 byte-order mark into
    # "\xef\xbb\xbf" glued to the first header, which would hide that column
    # from COLUMN_MAP. Drop that residue and stray whitespace.
    df.columns = [str(col).removeprefix("\xef\xbb\xbf").strip() for col in df.columns]

    # Rename at most one source column per canonical name. Renaming with the
    # whole COLUMN_MAP would create duplicate columns when two aliases of the
    # same target coexist (e.g. "URN" and "urn"), making df["urn"] a frame.
    canonical_names = set(COLUMN_MAP.values())
    claimed = {col for col in df.columns if col in canonical_names}
    rename = {}
    for col in df.columns:
        target = COLUMN_MAP.get(col)
        if target is not None and target not in claimed:
            rename[col] = target
            claimed.add(target)
    df = df.rename(columns=rename)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    total_rows = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # .copy() so the assignment below writes to an independent frame.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = total_rows - len(df)

    # Infer year from filename if not in data
    match = re.search(r"20(\d{2})", path.stem)
    year = int(match.group(0)) if match else None

    has_year_column = "year" in df.columns
    # Built once: the statement is identical for every row.
    upsert = text(
        """
        INSERT INTO phonics (urn, year, year1_phonics_pct, year2_phonics_pct)
        VALUES (:urn, :year, :y1, :y2)
        ON CONFLICT (urn, year) DO UPDATE SET
            year1_phonics_pct = EXCLUDED.year1_phonics_pct,
            year2_phonics_pct = EXCLUDED.year2_phonics_pct
        """
    )

    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            if has_year_column and pd.notna(row["year"]):
                row_year = int(row["year"])
            else:
                row_year = year
            if not row_year:
                # No year in the row and none in the filename: nothing to key on.
                skipped += 1
                continue

            session.execute(
                upsert,
                {
                    "urn": int(row["urn"]),
                    "year": row_year,
                    "y1": _parse_pct(row.get("year1_phonics_pct")),
                    "y2": _parse_pct(row.get("year2_phonics_pct")),
                },
            )
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" Phonics: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}
|
||
|
|
|
||
|
|
|
||
|
|
def main(argv: list[str] | None = None) -> None:
    """Command-line entry point: download and/or load the phonics data.

    Args:
        argv: Arguments to parse; ``None`` lets argparse read ``sys.argv``.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args(argv)

    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # After a download, load exactly that file rather than whichever CSV
        # sorts last by name. With no download, path stays None and load()
        # picks the file itself.
        load(
            path=Path(downloaded) if downloaded else None,
            data_dir=args.data_dir,
        )


if __name__ == "__main__":
    main()
|