feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.
Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
ofsted_parent_view, school_census, admissions, sen_detail, phonics,
school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date
Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry
Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
159
integrator/scripts/sources/gias.py
Normal file
159
integrator/scripts/sources/gias.py
Normal file
@@ -0,0 +1,159 @@
|
||||
"""
|
||||
GIAS (Get Information About Schools) bulk CSV downloader and loader.
|
||||
|
||||
Source: https://get-information-schools.service.gov.uk/Downloads
|
||||
Update: Daily; we refresh weekly.
|
||||
Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
|
||||
"""
|
||||
import argparse
|
||||
import sys
|
||||
from datetime import date
|
||||
from pathlib import Path
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
|
||||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||||
from config import SUPPLEMENTARY_DIR
|
||||
from db import get_session
|
||||
|
||||
# Where downloaded GIAS CSVs land by default (under the shared supplementary data root).
DEST_DIR = SUPPLEMENTARY_DIR / "gias"
||||
# GIAS bulk download URL — date is injected at runtime
GIAS_URL_TEMPLATE = (
    "https://ea-edubase-api-prod.azurewebsites.net"
    "/edubase/downloads/public/edubasealldata{date}.csv"
)

# Maps the raw GIAS CSV headers to our snake_case column names.
# Head* parts are later combined into a single headteacher_name;
# nursery_provision_raw is later collapsed to a boolean.
COLUMN_MAP = {
    "URN": "urn",                                       # join key against schools
    "SchoolWebsite": "website",
    "SchoolCapacity": "capacity",
    "TrustName": "trust_name",
    "TrustUID": "trust_uid",
    "Gender (name)": "gender",
    "NurseryProvision (name)": "nursery_provision_raw",
    "HeadTitle": "head_title",
    "HeadFirstName": "head_first",
    "HeadLastName": "head_last",
}
|
||||
|
||||
|
||||
def download(data_dir: Path | None = None) -> Path:
    """Download the GIAS bulk CSV for today, falling back to yesterday.

    GIAS publishes one file per day; today's file may not be up yet when we
    run, so a 404 on today's URL triggers one retry with yesterday's date.
    Re-runs are idempotent: an already-downloaded file is reused.

    Args:
        data_dir: Optional root data directory; when given, files go under
            ``<data_dir>/supplementary/gias``, otherwise under ``DEST_DIR``.

    Returns:
        Path to the downloaded (or already-present) CSV file.

    Raises:
        requests.HTTPError: if neither today's nor yesterday's file can be fetched.
    """
    from datetime import timedelta

    dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    candidates = [
        date.today().strftime("%Y%m%d"),
        (date.today() - timedelta(days=1)).strftime("%Y%m%d"),
    ]

    resp = None
    dest_file = None
    for stamp in candidates:
        url = GIAS_URL_TEMPLATE.format(date=stamp)
        dest_file = dest / f"gias_{stamp}.csv"

        if dest_file.exists():
            print(f" GIAS: {dest_file.name} already exists, skipping download.")
            return dest_file

        print(f" GIAS: downloading {url} ...")
        resp = requests.get(url, timeout=300, stream=True)
        if resp.status_code != 404:
            break
        # Release the 404 connection before retrying with the previous day.
        resp.close()

    resp.raise_for_status()
    with open(dest_file, "wb") as f:
        for chunk in resp.iter_content(chunk_size=65536):
            f.write(chunk)

    print(f" GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
    return dest_file
|
||||
|
||||
|
||||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a GIAS CSV and update matching rows in the ``schools`` table.

    GIAS only *enriches* existing school rows (keyed on URN); it never
    inserts new ones.

    Args:
        path: Explicit CSV to load; when None, the newest ``gias_*.csv``
            in the destination directory is used.
        data_dir: Optional root data directory (same meaning as in ``download``).

    Returns:
        Counts dict ``{"inserted": 0, "updated": N, "skipped": 0}``.

    Raises:
        FileNotFoundError: if ``path`` is None and no GIAS CSV is present.
        ValueError: if the CSV has no URN column after renaming.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
        # gias_YYYYMMDD.csv: lexicographic sort == chronological order.
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        path = files[-1]

    print(f" GIAS: loading {path} ...")
    # GIAS ships latin-1; low_memory=False avoids chunked dtype guessing.
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # URN is the join key against schools; drop rows without a usable one.
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Capacity may arrive as "850", as 850.0 (pandas float-ifies numeric
    # columns containing blanks), or empty. The previous str.isdigit()
    # check silently rejected float-formatted values; coerce numerically.
    if "capacity" in df.columns:
        df["capacity"] = pd.to_numeric(df["capacity"], errors="coerce")
    else:
        df["capacity"] = None

    # Build headteacher_name from parts
    def build_name(row):
        parts = [
            str(row.get("head_title", "") or "").strip(),
            str(row.get("head_first", "") or "").strip(),
            str(row.get("head_last", "") or "").strip(),
        ]
        return " ".join(p for p in parts if p) or None

    df["headteacher_name"] = df.apply(build_name, axis=1)

    # "Has ..." -> True, other non-null values -> False, missing -> None.
    if "nursery_provision_raw" in df.columns:
        df["nursery_provision"] = df["nursery_provision_raw"].apply(
            lambda v: True if str(v).strip().lower().startswith("has") else False if pd.notna(v) else None
        )
    else:
        df["nursery_provision"] = None

    def clean_str(val):
        # Normalise pandas NaN / "nan" / "none" / empty strings to SQL NULL.
        s = str(val).strip() if pd.notna(val) else None
        return s if s and s.lower() not in ("nan", "none", "") else None

    updated = 0
    with get_session() as session:
        from sqlalchemy import text

        # Statement is loop-invariant: build it once, reuse per row.
        stmt = text("""
            UPDATE schools SET
                website = :website,
                headteacher_name = :headteacher_name,
                capacity = :capacity,
                trust_name = :trust_name,
                trust_uid = :trust_uid,
                gender = :gender,
                nursery_provision = :nursery_provision
            WHERE urn = :urn
        """)

        for _, row in df.iterrows():
            session.execute(
                stmt,
                {
                    "urn": int(row["urn"]),
                    "website": clean_str(row.get("website")),
                    "headteacher_name": row.get("headteacher_name"),
                    "capacity": int(row["capacity"]) if pd.notna(row["capacity"]) else None,
                    "trust_name": clean_str(row.get("trust_name")),
                    "trust_uid": clean_str(row.get("trust_uid")),
                    "gender": clean_str(row.get("gender")),
                    "nursery_provision": row.get("nursery_provision"),
                },
            )
            updated += 1
            if updated % 5000 == 0:
                session.flush()
                print(f" Updated {updated} schools...")

    print(f" GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": 0}
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # CLI entry point: run the download step, the load step, or both.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    wants_download = args.action in ("download", "all")
    wants_load = args.action in ("load", "all")

    if wants_download:
        path = download(args.data_dir)
    if wants_load:
        load(data_dir=args.data_dir)
|
||||
Reference in New Issue
Block a user