diff --git a/backend/app.py b/backend/app.py index 2f6536c..c2e3de4 100644 --- a/backend/app.py +++ b/backend/app.py @@ -635,6 +635,9 @@ async def reload_data( return {"status": "reloaded"} +_reimport_status: dict = {"running": False, "done": False, "error": None} + + @app.post("/api/admin/reimport-ks2") @limiter.limit("2/minute") async def reimport_ks2( @@ -643,19 +646,45 @@ async def reimport_ks2( _: bool = Depends(verify_admin_api_key) ): """ - Re-run the full KS2 CSV migration (drop + reimport schools and results). - Use when the database has been wiped and needs repopulating from the CSV files. + Start a full KS2 CSV migration in the background and return immediately. + Poll GET /api/admin/reimport-ks2/status to check progress. Pass ?geocode=false to skip postcode → lat/lng resolution. - Runs synchronously — expect this to take several minutes. Requires X-API-Key header with valid admin API key. """ - loop = asyncio.get_event_loop() - success = await loop.run_in_executor(None, lambda: run_full_migration(geocode=geocode)) - if not success: - raise HTTPException(status_code=500, detail="Migration failed: no CSV data found in data directory") - clear_cache() - load_school_data() - return {"status": "reimported"} + global _reimport_status + if _reimport_status["running"]: + return {"status": "already_running"} + + _reimport_status = {"running": True, "done": False, "error": None} + + def _run(): + global _reimport_status + try: + success = run_full_migration(geocode=geocode) + if not success: + _reimport_status = {"running": False, "done": False, "error": "No CSV data found"} + return + clear_cache() + load_school_data() + _reimport_status = {"running": False, "done": True, "error": None} + except Exception as exc: + _reimport_status = {"running": False, "done": False, "error": str(exc)} + + import threading + threading.Thread(target=_run, daemon=True).start() + return {"status": "started"} + + +@app.get("/api/admin/reimport-ks2/status") +async def reimport_ks2_status( + request: Request, + _: bool = Depends(verify_admin_api_key) +): + """Poll this endpoint to check reimport progress.""" + s = _reimport_status + if s["error"]: + raise HTTPException(status_code=500, detail=s["error"]) + return {"running": s["running"], "done": s["done"]} # ============================================================================= diff --git a/integrator/flows/ks2.yml b/integrator/flows/ks2.yml index e8c72c1..646c785 100644 --- a/integrator/flows/ks2.yml +++ b/integrator/flows/ks2.yml @@ -10,7 +10,7 @@ tasks: uri: http://integrator:8001/run/ks2?action=load method: POST allowFailed: false - timeout: PT20M # full migration takes ~10 min; give headroom + timeout: PT2H # polls backend every 30s; geocoding 15k schools takes up to 1h errors: - id: notify-failure diff --git a/integrator/scripts/sources/ks2.py b/integrator/scripts/sources/ks2.py index 4961585..21a9f7a 100644 --- a/integrator/scripts/sources/ks2.py +++ b/integrator/scripts/sources/ks2.py @@ -10,10 +10,14 @@ The CSV files must already be present in the data volume under /data/{year}/england_ks2final.csv (populated at deploy time from the repo's data/ directory). """ -import sys +import time import requests from config import BACKEND_URL, ADMIN_API_KEY +HEADERS = {"X-API-Key": ADMIN_API_KEY} +POLL_INTERVAL = 30 # seconds between status checks +MAX_WAIT = 7200 # 2 hours + def download(): """No download step — CSVs are shipped with the repo.""" @@ -22,18 +26,33 @@ def download(): def load(): - """Trigger full KS2 re-import via the backend admin endpoint (with geocoding).""" - url = f"{BACKEND_URL}/api/admin/reimport-ks2?geocode=true" - print(f"POST {url}") - resp = requests.post( - url, - headers={"X-API-Key": ADMIN_API_KEY}, - timeout=900, # migration can take ~10 minutes - ) + """Trigger full KS2 re-import and poll until complete.""" + start_url = f"{BACKEND_URL}/api/admin/reimport-ks2?geocode=true" + status_url = f"{BACKEND_URL}/api/admin/reimport-ks2/status" + + print(f"POST {start_url}") + resp = requests.post(start_url, headers=HEADERS, timeout=30) resp.raise_for_status() - result = resp.json() - print(f"Result: {result}") - return result + print(f"Started: {resp.json()}") + + print(f"Polling {status_url} every {POLL_INTERVAL}s (max {MAX_WAIT // 60} min)...") + elapsed = 0 + while elapsed < MAX_WAIT: + time.sleep(POLL_INTERVAL) + elapsed += POLL_INTERVAL + + sr = requests.get(status_url, headers=HEADERS, timeout=15) + sr.raise_for_status() + state = sr.json() + print(f" [{elapsed // 60}m] {state}") + + if state.get("done"): + print("Re-import complete.") + return state + if not state.get("running"): + raise RuntimeError(f"Re-import stopped unexpectedly: {state}") + + raise TimeoutError(f"KS2 re-import did not complete within {MAX_WAIT // 60} minutes") if __name__ == "__main__":