feat(data): integrate 9 UK government data sources via Kestra

Adds a full data integration pipeline for enriching school profiles with supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections, ofsted_parent_view, school_census, admissions, sen_detail, phonics, school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions, sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00
parent c49593d4d6
commit dd49ef28b2
36 changed files with 2849 additions and 8 deletions
--- a/integrator/scripts/sources/ees.py
+++ b/integrator/scripts/sources/ees.py
@@ -0,0 +1,53 @@
+"""
+Shared EES (Explore Education Statistics) API client.
+
+Base URL: https://api.education.gov.uk/statistics/v1
+"""
+import sys
+from pathlib import Path
+from typing import Optional
+
+import requests
+
+# Root of the EES statistics API; endpoint paths are appended to this.
+API_BASE = "https://api.education.gov.uk/statistics/v1"
+# Passed as ``timeout=`` to requests.get for the publication listing call;
+# download_csv uses its own, longer value.
+TIMEOUT = 60
+
+
+def get_publication_files(publication_slug: str) -> list[dict]:
+    """Return list of data-set file descriptors for a publication."""
+    url = f"{API_BASE}/publications/{publication_slug}/data-set-files"
+    resp = requests.get(url, timeout=TIMEOUT)
+    resp.raise_for_status()
+    return resp.json().get("results", [])
+
+
+def get_latest_csv_url(publication_slug: str, keyword: str = "") -> Optional[str]:
+    """
+    Find the most recent CSV download URL for a publication.
+    Optionally filter by a keyword in the file name.
+    """
+    files = get_publication_files(publication_slug)
+    for entry in files:
+        name = entry.get("name", "").lower()
+        if keyword and keyword.lower() not in name:
+            continue
+        csv_url = entry.get("csvDownloadUrl") or entry.get("file", {}).get("url")
+        if csv_url:
+            return csv_url
+    return None
+
+
+def download_csv(url: str, dest_path: Path) -> Path:
+    """Download a CSV from EES to dest_path."""
+    if dest_path.exists():
+        print(f"    EES: {dest_path.name} already exists, skipping.")
+        return dest_path
+    print(f"    EES: downloading {url} ...")
+    resp = requests.get(url, timeout=300, stream=True)
+    resp.raise_for_status()
+    dest_path.parent.mkdir(parents=True, exist_ok=True)
+    with open(dest_path, "wb") as f:
+        for chunk in resp.iter_content(chunk_size=65536):
+            f.write(chunk)
+    print(f"    EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
+    return dest_path