Files
school_compare/integrator/scripts/sources/ks2.py
Tudor 0e5b71d4a0
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m12s
Build and Push Docker Images / Build Integrator (push) Successful in 58s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
fix(ks2): make reimport async with polling to avoid HTTP timeout
The geocoding pass over ~15k schools takes longer than any reasonable
HTTP timeout. New approach:
- POST /api/admin/reimport-ks2 starts migration in background thread,
  returns {"status":"started"} immediately
- GET /api/admin/reimport-ks2/status returns {running, done}
- ks2.py polls status every 30s (max 2h) before returning
- Kestra flow timeout bumped to PT2H

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 21:08:06 +00:00

67 lines
2.2 KiB
Python

"""
KS2 attainment data re-importer.
Triggers a full re-import of the KS2 CSV data by calling the backend's
admin endpoint. The backend owns the migration logic and CSV column mappings;
this module is a thin trigger so the re-import can be orchestrated via Kestra
like all other data sources.
The CSV files must already be present in the data volume under
/data/{year}/england_ks2final.csv
(populated at deploy time from the repo's data/ directory).
"""
import time
import requests
from config import BACKEND_URL, ADMIN_API_KEY
# Auth header attached to every request against the backend admin API.
HEADERS = {"X-API-Key": ADMIN_API_KEY}
POLL_INTERVAL = 30  # seconds between status checks
MAX_WAIT = 7200  # 2 hours
def download():
    """No-op download step — the KS2 CSVs ship with the repo's data volume.

    Returns:
        dict: ``{"skipped": True}`` so orchestration logs show the step ran.
    """
    print("KS2 CSVs are bundled in the data volume; no download needed.")
    return {"skipped": True}
def load():
    """Trigger a full KS2 re-import on the backend and poll until it finishes.

    POSTs the reimport endpoint (which returns immediately and runs the
    migration in a background thread), then polls the status endpoint until
    the backend reports ``done``.

    Returns:
        dict: the final status payload from the backend.

    Raises:
        requests.HTTPError: if the initial start request fails.
        RuntimeError: if the backend reports the re-import stopped without
            completing (neither ``running`` nor ``done``).
        TimeoutError: if the re-import does not finish within MAX_WAIT seconds.
    """
    start_url = f"{BACKEND_URL}/api/admin/reimport-ks2?geocode=true"
    status_url = f"{BACKEND_URL}/api/admin/reimport-ks2/status"

    print(f"POST {start_url}")
    resp = requests.post(start_url, headers=HEADERS, timeout=30)
    resp.raise_for_status()
    print(f"Started: {resp.json()}")

    print(f"Polling {status_url} every {POLL_INTERVAL}s (max {MAX_WAIT // 60} min)...")
    # Use a monotonic deadline instead of summing POLL_INTERVAL: the old
    # `elapsed += POLL_INTERVAL` accounting ignored HTTP round-trip time, so
    # the loop could overrun MAX_WAIT in wall-clock terms.
    started = time.monotonic()
    deadline = started + MAX_WAIT
    consecutive_failures = 0
    while time.monotonic() < deadline:
        time.sleep(POLL_INTERVAL)
        try:
            sr = requests.get(status_url, headers=HEADERS, timeout=15)
            sr.raise_for_status()
        except requests.RequestException as exc:
            # Tolerate a few transient poll failures (network blip, backend
            # restart mid-migration) rather than aborting a multi-hour job.
            consecutive_failures += 1
            print(f"  poll failed ({consecutive_failures}/3): {exc}")
            if consecutive_failures >= 3:
                raise
            continue
        consecutive_failures = 0
        state = sr.json()
        minutes = int(time.monotonic() - started) // 60
        print(f" [{minutes}m] {state}")
        if state.get("done"):
            print("Re-import complete.")
            return state
        if not state.get("running"):
            raise RuntimeError(f"Re-import stopped unexpectedly: {state}")
    raise TimeoutError(f"KS2 re-import did not complete within {MAX_WAIT // 60} minutes")
if __name__ == "__main__":
    import argparse

    # CLI entry point so Kestra (or a human) can run a single step.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    opts = cli.parse_args()

    if opts.action in ("download", "all"):
        download()
    if opts.action in ("load", "all"):
        load()