chore: remove Kestra and integrator legacy services

Migration to the Airflow + Meltano pipeline is complete. Remove:

- kestra, kestra-init, and integrator services from docker-compose.portainer.yml
- kestra_storage and supplementary_data volumes
- KESTRA_USER/KESTRA_PASSWORD env var references
- integrator/ directory (Kestra flows, scripts, Dockerfiles)

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-27 15:03:34 +00:00
parent 668e234eb2
commit 62284e7a94
30 changed files with 1 additions and 2453 deletions
@@ -8,8 +8,6 @@
 #   TYPESENSE_API_KEY      — Typesense admin API key
 #   TYPESENSE_SEARCH_KEY   — Typesense search-only key (exposed to frontend)
 #   AIRFLOW_ADMIN_USER     — Airflow admin username (password auto-generated, see api-server logs)
-#   KESTRA_USER            — Kestra UI username (optional)
-#   KESTRA_PASSWORD        — Kestra UI password (optional)

 services:

@@ -103,87 +101,6 @@ services:
      retries: 5
      start_period: 10s

-  # ── Kestra — workflow orchestrator (legacy, kept during migration) ────
-  kestra:
-    image: kestra/kestra:latest
-    container_name: schoolcompare_kestra
-    command: server standalone
-    ports:
-      - "8090:8080"
-    volumes:
-      - kestra_storage:/app/storage
-    environment:
-      KESTRA_CONFIGURATION: |
-        datasources:
-          postgres:
-            url: jdbc:postgresql://sc_database:5432/kestra
-            driverClassName: org.postgresql.Driver
-            username: ${DB_USERNAME}
-            password: ${DB_PASSWORD}
-        kestra:
-          repository:
-            type: postgres
-          queue:
-            type: postgres
-          storage:
-            type: local
-            local:
-              base-path: /app/storage
-    depends_on:
-      sc_database:
-        condition: service_healthy
-    networks:
-      - backend
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD-SHELL", "curl -sf http://localhost:8081/health | grep -q '\"status\":\"UP\"'"]
-      interval: 15s
-      timeout: 10s
-      retries: 10
-      start_period: 60s
-
-  # ── Kestra init (legacy, kept during migration) ──────────────────────
-  kestra-init:
-    image: privaterepo.sitaru.org/tudor/school_compare-kestra-init:latest
-    container_name: schoolcompare_kestra_init
-    environment:
-      KESTRA_URL: http://kestra:8080
-      KESTRA_USER: ${KESTRA_USER:-}
-      KESTRA_PASSWORD: ${KESTRA_PASSWORD:-}
-    depends_on:
-      kestra:
-        condition: service_healthy
-    networks:
-      - backend
-    restart: "no"
-
-  # ── Data integrator (legacy, kept during migration) ──────────────────
-  integrator:
-    image: privaterepo.sitaru.org/tudor/school_compare-integrator:latest
-    container_name: schoolcompare_integrator
-    ports:
-      - "8001:8001"
-    environment:
-      DATABASE_URL: postgresql://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
-      DATA_DIR: /data
-      BACKEND_URL: http://backend:80
-      ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
-      PYTHONUNBUFFERED: 1
-    volumes:
-      - supplementary_data:/data
-    depends_on:
-      sc_database:
-        condition: service_healthy
-    networks:
-      - backend
-    restart: unless-stopped
-    healthcheck:
-      test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
-      interval: 30s
-      timeout: 10s
-      retries: 3
-      start_period: 15s
-
  # ── Airflow API Server + UI ───────────────────────────────────────────
  airflow-api-server:
    image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
@@ -282,7 +199,5 @@ networks:

 volumes:
  postgres_data:
-  kestra_storage:
-  supplementary_data:
  typesense_data:
  airflow_logs:
@@ -1,15 +0,0 @@
-FROM python:3.12-slim
-
-WORKDIR /app
-
-# Install dependencies
-COPY requirements.txt .
-RUN apt-get update && apt-get install -y --no-install-recommends curl && rm -rf /var/lib/apt/lists/*
-
-RUN pip install --no-cache-dir -r requirements.txt
-
-# Copy application code
-COPY scripts/ ./scripts/
-COPY server.py .
-
-CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"]
@@ -1,6 +0,0 @@
-FROM alpine:3.19
-RUN apk add --no-cache curl
-COPY flows/ /flows/
-COPY docker/kestra-init.sh /kestra-init.sh
-RUN chmod +x /kestra-init.sh
-CMD ["/kestra-init.sh"]
@@ -1,59 +0,0 @@
-#!/bin/sh
-set -e
-
-KESTRA_URL="${KESTRA_URL:-http://kestra:8080}"
-MAX_WAIT=120
-
-# Basic auth — set KESTRA_USER / KESTRA_PASSWORD if authentication is enabled
-AUTH=""
-if [ -n "$KESTRA_USER" ] && [ -n "$KESTRA_PASSWORD" ]; then
-  AUTH="-u ${KESTRA_USER}:${KESTRA_PASSWORD}"
-fi
-
-echo "Waiting for Kestra API at ${KESTRA_URL}..."
-elapsed=0
-until curl -sf $AUTH "${KESTRA_URL}/api/v1/flows/search" > /dev/null 2>&1; do
-  if [ "$elapsed" -ge "$MAX_WAIT" ]; then
-    echo "ERROR: Kestra API not reachable after ${MAX_WAIT}s"
-    exit 1
-  fi
-  sleep 5
-  elapsed=$((elapsed + 5))
-done
-echo "Kestra API is ready."
-
-echo "Importing flows..."
-
-for f in /flows/*.yml; do
-  name="$(basename "$f")"
-  echo "  -> $name"
-
-  http_code=$(curl -s $AUTH -o /tmp/kestra_resp -w "%{http_code}" \
-    -X POST "${KESTRA_URL}/api/v1/flows" \
-    -H "Content-Type: application/x-yaml" \
-    --data-binary "@${f}")
-
-  if [ "$http_code" = "200" ] || [ "$http_code" = "201" ]; then
-    echo "     created"
-  elif [ "$http_code" = "409" ]; then
-    ns=$(grep '^namespace:' "$f" | awk '{print $2}')
-    id=$(grep '^id:'        "$f" | awk '{print $2}')
-    http_code2=$(curl -s $AUTH -o /tmp/kestra_resp -w "%{http_code}" \
-      -X PUT "${KESTRA_URL}/api/v1/flows/${ns}/${id}" \
-      -H "Content-Type: application/x-yaml" \
-      --data-binary "@${f}")
-    if [ "$http_code2" = "200" ] || [ "$http_code2" = "201" ]; then
-      echo "     updated"
-    else
-      echo "     ERROR updating $name: HTTP $http_code2"
-      cat /tmp/kestra_resp; echo
-      exit 1
-    fi
-  else
-    echo "     ERROR importing $name: HTTP $http_code"
-    cat /tmp/kestra_resp; echo
-    exit 1
-  fi
-done
-
-echo "All flows imported."
@@ -1,26 +0,0 @@
-id: admissions-annual-update
-namespace: schoolcompare.data
-description: Download and load school admissions data via EES API
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 4 1 7 *"   # 1 July annually at 04:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/admissions?action=download
-    method: POST
-    timeout: PT20M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/admissions?action=load
-    method: POST
-    timeout: PT30M
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT15M
@@ -1,26 +0,0 @@
-id: census-annual-update
-namespace: schoolcompare.data
-description: Download and load School Census (SPC) data via EES API
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 4 1 9 *"   # 1 September annually at 04:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/census?action=download
-    method: POST
-    timeout: PT20M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/census?action=load
-    method: POST
-    timeout: PT30M
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT15M
@@ -1,26 +0,0 @@
-id: finance-annual-update
-namespace: schoolcompare.data
-description: Fetch FBIT financial benchmarking data from DfE API for all schools
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 4 1 12 *"   # 1 December annually at 04:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/finance?action=download
-    method: POST
-    timeout: PT120M   # Fetches per-school from API — ~20k schools
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/finance?action=load
-    method: POST
-    timeout: PT30M
-
-retry:
-  type: constant
-  maxAttempts: 2
-  interval: PT30M
@@ -1,31 +0,0 @@
-id: gias-weekly-update
-namespace: schoolcompare.data
-description: Download and load GIAS (Get Information About Schools) bulk CSV
-
-triggers:
-  - id: weekly-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 3 * * 0"   # Every Sunday at 03:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/gias?action=download
-    method: POST
-    timeout: PT30M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/gias?action=load
-    method: POST
-    timeout: PT30M
-
-errors:
-  - id: notify-failure
-    type: io.kestra.plugin.core.log.Log
-    message: "GIAS update FAILED: {{ error.message }}"
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT10M
@@ -1,26 +0,0 @@
-id: idaci-annual-check
-namespace: schoolcompare.data
-description: Download IoD2019 IDACI file and compute deprivation scores for all schools
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 5 1 1 *"   # 1 January annually at 05:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/idaci?action=download
-    method: POST
-    timeout: PT10M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/idaci?action=load
-    method: POST
-    timeout: PT60M
-
-retry:
-  type: constant
-  maxAttempts: 2
-  interval: PT30M
@@ -1,23 +0,0 @@
-id: ks2-reimport
-namespace: schoolcompare.data
-description: Re-import KS2 attainment data from bundled CSV files (use after DB wipe)
-
-# No scheduled trigger — run manually from the Kestra UI when needed.
-
-tasks:
-  - id: reimport
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/ks2?action=load
-    method: POST
-    allowFailed: false
-    timeout: PT30S   # fire-and-forget; backend runs migration in background
-
-errors:
-  - id: notify-failure
-    type: io.kestra.plugin.core.log.Log
-    message: "KS2 re-import FAILED: {{ error.message }}"
-
-retry:
-  type: constant
-  maxAttempts: 2
-  interval: PT5M
@@ -1,33 +0,0 @@
-id: ofsted-monthly-update
-namespace: schoolcompare.data
-description: Download and load Ofsted Monthly Management Information CSV
-
-triggers:
-  - id: monthly-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 2 1 * *"   # 1st of each month at 02:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/ofsted?action=download
-    method: POST
-    allowFailed: false
-    timeout: PT10M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/ofsted?action=load
-    method: POST
-    allowFailed: false
-    timeout: PT30M
-
-errors:
-  - id: notify-failure
-    type: io.kestra.plugin.core.log.Log
-    message: "Ofsted update FAILED: {{ error.message }}"
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT10M
@@ -1,31 +0,0 @@
-id: parent-view-monthly-check
-namespace: schoolcompare.data
-description: Download and load Ofsted Parent View open data (released ~3x/year)
-
-triggers:
-  - id: monthly-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 3 1 * *"   # 1st of each month at 03:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/parent_view?action=download
-    method: POST
-    timeout: PT10M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/parent_view?action=load
-    method: POST
-    timeout: PT20M
-
-errors:
-  - id: notify-failure
-    type: io.kestra.plugin.core.log.Log
-    message: "Parent View update FAILED: {{ error.message }}"
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT10M
@@ -1,26 +0,0 @@
-id: phonics-annual-update
-namespace: schoolcompare.data
-description: Download and load Phonics Screening Check data via EES API
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 5 1 9 *"   # 1 September annually at 05:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/phonics?action=download
-    method: POST
-    timeout: PT20M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/phonics?action=load
-    method: POST
-    timeout: PT30M
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT15M
@@ -1,26 +0,0 @@
-id: sen-detail-annual-update
-namespace: schoolcompare.data
-description: Download and load SEN primary need breakdown via EES API
-
-triggers:
-  - id: annual-schedule
-    type: io.kestra.plugin.core.trigger.Schedule
-    cron: "0 4 15 9 *"   # 15 September annually at 04:00
-
-tasks:
-  - id: download
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/sen_detail?action=download
-    method: POST
-    timeout: PT20M
-
-  - id: load
-    type: io.kestra.plugin.core.http.Request
-    uri: http://integrator:8001/run/sen_detail?action=load
-    method: POST
-    timeout: PT30M
-
-retry:
-  type: constant
-  maxAttempts: 3
-  interval: PT15M
@@ -1,7 +0,0 @@
-fastapi==0.115.0
-uvicorn[standard]==0.30.6
-requests==2.32.3
-pandas==2.2.3
-openpyxl==3.1.5
-psycopg2-binary==2.9.9
-sqlalchemy==2.0.35
@@ -1,14 +0,0 @@
-"""Configuration for the data integrator."""
-import os
-from pathlib import Path
-
-DATABASE_URL = os.environ.get(
-    "DATABASE_URL",
-    "postgresql://schoolcompare:schoolcompare@db:5432/schoolcompare",
-)
-
-DATA_DIR = Path(os.environ.get("DATA_DIR", "/data"))
-SUPPLEMENTARY_DIR = DATA_DIR / "supplementary"
-
-BACKEND_URL = os.environ.get("BACKEND_URL", "http://backend:80")
-ADMIN_API_KEY = os.environ.get("ADMIN_API_KEY", "changeme")
@@ -1,23 +0,0 @@
-"""Database connection for the integrator."""
-from contextlib import contextmanager
-
-from sqlalchemy import create_engine
-from sqlalchemy.orm import sessionmaker
-
-from config import DATABASE_URL
-
-engine = create_engine(DATABASE_URL, pool_pre_ping=True)
-SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine)
-
-
-@contextmanager
-def get_session():
-    session = SessionLocal()
-    try:
-        yield session
-        session.commit()
-    except Exception:
-        session.rollback()
-        raise
-    finally:
-        session.close()
@@ -1,184 +0,0 @@
-"""
-School Admissions data downloader and loader.
-
-Source: EES publication "primary-and-secondary-school-applications-and-offers"
-        Content API release ZIP → supporting-files/AppsandOffers_*_SchoolLevel*.csv
-Update: Annual (June/July post-offer round)
-"""
-import argparse
-import re
-import sys
-from pathlib import Path
-
-import pandas as pd
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-from sources.ees import download_release_zip_csv
-
-DEST_DIR = SUPPLEMENTARY_DIR / "admissions"
-PUBLICATION_SLUG = "primary-and-secondary-school-applications-and-offers"
-
-NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", "Z", ""}
-
-# Maps actual CSV column names → internal field names
-COLUMN_MAP = {
-    # School identifier
-    "school_urn": "urn",
-    # Year — e.g. 202526 → 2025
-    "time_period": "time_period_raw",
-    # PAN (places offered)
-    "total_number_places_offered": "pan",
-    # Applications (total times put as any preference)
-    "times_put_as_any_preferred_school": "total_applications",
-    # 1st-preference applications
-    "times_put_as_1st_preference": "times_1st_pref",
-    # 1st-preference offers
-    "number_1st_preference_offers": "offers_1st_pref",
-}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-    dest_file = dest / "admissions_school_level_latest.csv"
-    return download_release_zip_csv(
-        PUBLICATION_SLUG,
-        dest_file,
-        zip_member_keyword="schoollevel",
-    )
-
-
-def _parse_int(val) -> int | None:
-    if pd.isna(val):
-        return None
-    s = str(val).strip().upper().replace(",", "")
-    if s in NULL_VALUES:
-        return None
-    try:
-        return int(float(s))
-    except ValueError:
-        return None
-
-
-def _parse_pct(val) -> float | None:
-    if pd.isna(val):
-        return None
-    s = str(val).strip().upper().replace("%", "")
-    if s in NULL_VALUES:
-        return None
-    try:
-        return float(s)
-    except ValueError:
-        return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    if path is None:
-        dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No admissions CSV found in {dest}")
-        path = files[-1]
-
-    print(f"  Admissions: loading {path} ...")
-    df = pd.read_csv(path, encoding="utf-8-sig", low_memory=False)
-
-    # Rename columns we care about
-    df.rename(columns=COLUMN_MAP, inplace=True)
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    # Filter to primary schools only
-    if "school_phase" in df.columns:
-        df = df[df["school_phase"].str.lower() == "primary"]
-
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    # Derive year from time_period (e.g. 202526 → 2025)
-    def _extract_year(val) -> int | None:
-        s = str(val).strip()
-        m = re.match(r"(\d{4})\d{2}", s)
-        if m:
-            return int(m.group(1))
-        m2 = re.search(r"20(\d{2})", s)
-        if m2:
-            return int("20" + m2.group(1))
-        return None
-
-    if "time_period_raw" in df.columns:
-        df["year"] = df["time_period_raw"].apply(_extract_year)
-    else:
-        year_m = re.search(r"20(\d{2})", path.stem)
-        df["year"] = int("20" + year_m.group(1)) if year_m else None
-
-    df = df.dropna(subset=["year"])
-    df["year"] = df["year"].astype(int)
-
-    # Keep most recent year per school (file may contain multiple years)
-    df = df.sort_values("year", ascending=False).groupby("urn").first().reset_index()
-
-    inserted = 0
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            year = int(row["year"])
-
-            pan = _parse_int(row.get("pan"))
-            total_apps = _parse_int(row.get("total_applications"))
-            times_1st = _parse_int(row.get("times_1st_pref"))
-            offers_1st = _parse_int(row.get("offers_1st_pref"))
-
-            # % of 1st-preference applicants who received an offer
-            if times_1st and times_1st > 0 and offers_1st is not None:
-                pct_1st = round(offers_1st / times_1st * 100, 1)
-            else:
-                pct_1st = None
-
-            oversubscribed = (
-                True if (pan and times_1st and times_1st > pan) else
-                False if (pan and times_1st and times_1st <= pan) else
-                None
-            )
-
-            session.execute(
-                text("""
-                    INSERT INTO school_admissions
-                        (urn, year, published_admission_number, total_applications,
-                         first_preference_offers_pct, oversubscribed)
-                    VALUES (:urn, :year, :pan, :total_apps, :pct_1st, :oversubscribed)
-                    ON CONFLICT (urn, year) DO UPDATE SET
-                        published_admission_number  = EXCLUDED.published_admission_number,
-                        total_applications          = EXCLUDED.total_applications,
-                        first_preference_offers_pct = EXCLUDED.first_preference_offers_pct,
-                        oversubscribed              = EXCLUDED.oversubscribed
-                """),
-                {
-                    "urn": urn, "year": year, "pan": pan,
-                    "total_apps": total_apps, "pct_1st": pct_1st,
-                    "oversubscribed": oversubscribed,
-                },
-            )
-            inserted += 1
-            if inserted % 5000 == 0:
-                session.flush()
-                print(f"    Processed {inserted} records...")
-
-    print(f"  Admissions: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
-    parser.add_argument("--data-dir", type=Path, default=None)
-    args = parser.parse_args()
-    if args.action in ("download", "all"):
-        download(args.data_dir)
-    if args.action in ("load", "all"):
-        load(data_dir=args.data_dir)
@@ -1,148 +0,0 @@
-"""
-School Census (SPC) downloader and loader.
-
-Source: EES publication "schools-pupils-and-their-characteristics"
-Update: Annual (June)
-Adds: class_size_avg, ethnicity breakdown by school
-"""
-import argparse
-import re
-import sys
-from pathlib import Path
-
-import pandas as pd
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-from sources.ees import get_latest_csv_url, download_csv
-
-DEST_DIR = SUPPLEMENTARY_DIR / "census"
-PUBLICATION_SLUG = "schools-pupils-and-their-characteristics"
-
-NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
-
-COLUMN_MAP = {
-    "URN": "urn",
-    "urn": "urn",
-    "YEAR": "year",
-    "Year": "year",
-    # Class size
-    "average_class_size": "class_size_avg",
-    "AVCLAS": "class_size_avg",
-    "avg_class_size": "class_size_avg",
-    # Ethnicity — DfE uses ethnicity major group percentages
-    "perc_white": "ethnicity_white_pct",
-    "perc_asian": "ethnicity_asian_pct",
-    "perc_black": "ethnicity_black_pct",
-    "perc_mixed": "ethnicity_mixed_pct",
-    "perc_other_ethnic": "ethnicity_other_pct",
-    "PTWHITE": "ethnicity_white_pct",
-    "PTASIAN": "ethnicity_asian_pct",
-    "PTBLACK": "ethnicity_black_pct",
-    "PTMIXED": "ethnicity_mixed_pct",
-    "PTOTHER": "ethnicity_other_pct",
-}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
-    if not url:
-        raise RuntimeError(f"Could not find CSV URL for census publication")
-
-    filename = url.split("/")[-1].split("?")[0] or "census_latest.csv"
-    return download_csv(url, dest / filename)
-
-
-def _parse_pct(val) -> float | None:
-    if pd.isna(val):
-        return None
-    s = str(val).strip().upper().replace("%", "")
-    if s in NULL_VALUES:
-        return None
-    try:
-        return float(s)
-    except ValueError:
-        return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    if path is None:
-        dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No census CSV found in {dest}")
-        path = files[-1]
-
-    print(f"  Census: loading {path} ...")
-    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
-    df.rename(columns=COLUMN_MAP, inplace=True)
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    year = None
-    m = re.search(r"20(\d{2})", path.stem)
-    if m:
-        year = int("20" + m.group(1))
-
-    inserted = 0
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
-            if not row_year:
-                continue
-
-            session.execute(
-                text("""
-                    INSERT INTO school_census
-                        (urn, year, class_size_avg,
-                         ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct,
-                         ethnicity_mixed_pct, ethnicity_other_pct)
-                    VALUES (:urn, :year, :class_size_avg,
-                            :white, :asian, :black, :mixed, :other)
-                    ON CONFLICT (urn, year) DO UPDATE SET
-                        class_size_avg       = EXCLUDED.class_size_avg,
-                        ethnicity_white_pct  = EXCLUDED.ethnicity_white_pct,
-                        ethnicity_asian_pct  = EXCLUDED.ethnicity_asian_pct,
-                        ethnicity_black_pct  = EXCLUDED.ethnicity_black_pct,
-                        ethnicity_mixed_pct  = EXCLUDED.ethnicity_mixed_pct,
-                        ethnicity_other_pct  = EXCLUDED.ethnicity_other_pct
-                """),
-                {
-                    "urn": urn,
-                    "year": row_year,
-                    "class_size_avg": _parse_pct(row.get("class_size_avg")),
-                    "white": _parse_pct(row.get("ethnicity_white_pct")),
-                    "asian": _parse_pct(row.get("ethnicity_asian_pct")),
-                    "black": _parse_pct(row.get("ethnicity_black_pct")),
-                    "mixed": _parse_pct(row.get("ethnicity_mixed_pct")),
-                    "other": _parse_pct(row.get("ethnicity_other_pct")),
-                },
-            )
-            inserted += 1
-            if inserted % 5000 == 0:
-                session.flush()
-
-    print(f"  Census: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
-    parser.add_argument("--data-dir", type=Path, default=None)
-    args = parser.parse_args()
-    if args.action in ("download", "all"):
-        download(args.data_dir)
-    if args.action in ("load", "all"):
-        load(data_dir=args.data_dir)
@@ -1,111 +0,0 @@
-"""
-Shared EES (Explore Education Statistics) API client.
-
-Two APIs are available:
-  - Statistics API: https://api.education.gov.uk/statistics/v1  (only ~13 publications)
-  - Content API:    https://content.explore-education-statistics.service.gov.uk/api
-    Covers all publications; use this for admissions and other data not in the stats API.
-    Download all files for a release as a ZIP from /api/releases/{id}/files.
-"""
-import io
-import zipfile
-from pathlib import Path
-from typing import Optional
-
-import requests
-
-STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"
-CONTENT_API_BASE = "https://content.explore-education-statistics.service.gov.uk/api"
-TIMEOUT = 60
-
-
-def get_publication_files(publication_slug: str) -> list[dict]:
-    """Return list of data-set file descriptors for a publication (statistics API)."""
-    url = f"{STATS_API_BASE}/publications/{publication_slug}/data-set-files"
-    resp = requests.get(url, timeout=TIMEOUT)
-    resp.raise_for_status()
-    return resp.json().get("results", [])
-
-
-def get_latest_csv_url(publication_slug: str, keyword: str = "") -> Optional[str]:
-    """
-    Find the most recent CSV download URL for a publication (statistics API).
-    Optionally filter by a keyword in the file name.
-    """
-    files = get_publication_files(publication_slug)
-    for entry in files:
-        name = entry.get("name", "").lower()
-        if keyword and keyword.lower() not in name:
-            continue
-        csv_url = entry.get("csvDownloadUrl") or entry.get("file", {}).get("url")
-        if csv_url:
-            return csv_url
-    return None
-
-
-def get_content_release_id(publication_slug: str) -> str:
-    """Return the latest release ID for a publication via the content API."""
-    url = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest"
-    resp = requests.get(url, timeout=TIMEOUT)
-    resp.raise_for_status()
-    return resp.json()["id"]
-
-
-def download_release_zip_csv(
-    publication_slug: str,
-    dest_path: Path,
-    zip_member_keyword: str = "",
-) -> Path:
-    """
-    Download the full-release ZIP from the EES content API and extract one CSV.
-
-    If zip_member_keyword is given, the first member whose path contains that
-    keyword (case-insensitive) is extracted; otherwise the first .csv found is used.
-    Returns dest_path (the extracted CSV file).
-    """
-    if dest_path.exists():
-        print(f"    EES: {dest_path.name} already exists, skipping.")
-        return dest_path
-
-    release_id = get_content_release_id(publication_slug)
-    zip_url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
-    print(f"    EES: downloading release ZIP for '{publication_slug}' ...")
-    resp = requests.get(zip_url, timeout=300, stream=True)
-    resp.raise_for_status()
-
-    data = b"".join(resp.iter_content(chunk_size=65536))
-    with zipfile.ZipFile(io.BytesIO(data)) as z:
-        members = z.namelist()
-        target = None
-        kw = zip_member_keyword.lower()
-        for m in members:
-            if m.endswith(".csv") and (not kw or kw in m.lower()):
-                target = m
-                break
-        if not target:
-            raise ValueError(
-                f"No CSV matching '{zip_member_keyword}' in ZIP. Members: {members}"
-            )
-        print(f"    EES: extracting '{target}' ...")
-        dest_path.parent.mkdir(parents=True, exist_ok=True)
-        with z.open(target) as src, open(dest_path, "wb") as dst:
-            dst.write(src.read())
-
-    print(f"    EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
-    return dest_path
-
-
-def download_csv(url: str, dest_path: Path) -> Path:
-    """Download a CSV from EES to dest_path."""
-    if dest_path.exists():
-        print(f"    EES: {dest_path.name} already exists, skipping.")
-        return dest_path
-    print(f"    EES: downloading {url} ...")
-    resp = requests.get(url, timeout=300, stream=True)
-    resp.raise_for_status()
-    dest_path.parent.mkdir(parents=True, exist_ok=True)
-    with open(dest_path, "wb") as f:
-        for chunk in resp.iter_content(chunk_size=65536):
-            f.write(chunk)
-    print(f"    EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
-    return dest_path
@@ -1,143 +0,0 @@
-"""
-FBIT (Financial Benchmarking and Insights Tool) financial data loader.
-
-Source: https://schools-financial-benchmarking.service.gov.uk/api/
-Update: Annual (December — data for the prior financial year)
-"""
-import argparse
-import sys
-import time
-from pathlib import Path
-
-import pandas as pd
-import requests
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-
-DEST_DIR = SUPPLEMENTARY_DIR / "finance"
-API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
-RATE_LIMIT_DELAY = 0.1   # seconds between requests
-
-
-# Column order of the generated CSV; also guarantees a header row when no
-# records were fetched, so the file stays readable by pd.read_csv in load().
-_FBIT_COLUMNS = [
-    "urn",
-    "year",
-    "per_pupil_spend",
-    "staff_cost_pct",
-    "teacher_cost_pct",
-    "support_staff_cost_pct",
-    "premises_cost_pct",
-]
-
-
-def _per_pupil_spend(data: dict) -> float | None:
-    """Return totalExpenditure / numberOfPupils rounded to 2 dp, or None if either is missing or zero."""
-    total = data.get("totalExpenditure")
-    pupils = data.get("numberOfPupils")
-    if not total or not pupils:
-        return None
-    return round(total / pupils, 2)
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """
-    Fetch per-URN financial data from the FBIT API and save it as one CSV.
-
-    Every URN in the ``schools`` table is requested individually, with
-    RATE_LIMIT_DELAY seconds between requests. If the target file
-    ``fbit_<year>.csv`` already exists nothing is fetched.
-
-    Args:
-        data_dir: optional base directory; the CSV goes to
-            ``<data_dir>/supplementary/finance``. Defaults to DEST_DIR.
-
-    Returns:
-        Path of the CSV file (existing or newly written).
-    """
-    dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    # The API is not asked which year it holds; the previous calendar year is
-    # assumed to be the latest completed financial year.
-    from datetime import date
-    year = date.today().year - 1
-    dest_file = dest / f"fbit_{year}.csv"
-
-    if dest_file.exists():
-        print(f"  Finance: {dest_file.name} already exists, skipping download.")
-        return dest_file
-
-    # Get all URNs from the database
-    with get_session() as session:
-        from sqlalchemy import text
-        rows = session.execute(text("SELECT urn FROM schools")).fetchall()
-    urns = [r[0] for r in rows]
-    print(f"  Finance: fetching FBIT data for {len(urns)} schools (year {year}) ...")
-
-    records = []
-    errors = 0
-    # One pooled HTTP session instead of a fresh connection per school.
-    with requests.Session() as http:
-        for i, urn in enumerate(urns):
-            if i % 500 == 0:
-                print(f"    {i}/{len(urns)} ...")
-            try:
-                resp = http.get(
-                    f"{API_BASE}/schoolFinancialDataObject/{urn}",
-                    timeout=10,
-                )
-                if resp.status_code == 200:
-                    data = resp.json()
-                    if data:
-                        records.append({
-                            "urn": urn,
-                            "year": year,
-                            "per_pupil_spend": _per_pupil_spend(data),
-                            "staff_cost_pct": data.get("staffCostPercent"),
-                            "teacher_cost_pct": data.get("teachingStaffCostPercent"),
-                            "support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
-                            "premises_cost_pct": data.get("premisesStaffCostPercent"),
-                        })
-                elif resp.status_code not in (404, 400):
-                    # 404/400 are treated as "no data for this school", not as errors.
-                    errors += 1
-            except Exception:
-                # Best-effort: one failing school must not abort the whole run.
-                errors += 1
-
-            time.sleep(RATE_LIMIT_DELAY)
-
-    df = pd.DataFrame(records, columns=_FBIT_COLUMNS)
-    df.to_csv(dest_file, index=False)
-    print(f"  Finance: saved {len(records)} records to {dest_file} ({errors} errors)")
-    return dest_file
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Upsert one FBIT CSV into the school_finance table.
-
-    When ``path`` is not given, the last ``fbit_*.csv`` (by sorted name) in the
-    finance directory is used; FileNotFoundError is raised if there is none.
-    Rows whose URN is not numeric are dropped. Returns a summary dict whose
-    "inserted" value is the number of rows sent to the database.
-    """
-    if path is None:
-        finance_dir = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
-        candidates = sorted(finance_dir.glob("fbit_*.csv"))
-        if not candidates:
-            raise FileNotFoundError(f"No finance CSV found in {finance_dir}")
-        path = candidates[-1]
-
-    print(f"  Finance: loading {path} ...")
-    df = pd.read_csv(path)
-
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    # SQL bind name -> CSV column for the optional numeric fields.
-    optional_fields = {
-        "per_pupil": "per_pupil_spend",
-        "staff": "staff_cost_pct",
-        "teacher": "teacher_cost_pct",
-        "support": "support_staff_cost_pct",
-        "premises": "premises_cost_pct",
-    }
-
-    def _opt_float(record, column):
-        # Missing column or NaN cell -> SQL NULL.
-        cell = record.get(column)
-        return float(cell) if pd.notna(cell) else None
-
-    inserted = 0
-    with get_session() as session:
-        from sqlalchemy import text
-        upsert = text("""
-                    INSERT INTO school_finance
-                        (urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
-                         support_staff_cost_pct, premises_cost_pct)
-                    VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
-                    ON CONFLICT (urn, year) DO UPDATE SET
-                        per_pupil_spend        = EXCLUDED.per_pupil_spend,
-                        staff_cost_pct         = EXCLUDED.staff_cost_pct,
-                        teacher_cost_pct       = EXCLUDED.teacher_cost_pct,
-                        support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
-                        premises_cost_pct      = EXCLUDED.premises_cost_pct
-                """)
-        for _, record in df.iterrows():
-            params = {"urn": int(record["urn"]), "year": int(record["year"])}
-            for bind_name, column in optional_fields.items():
-                params[bind_name] = _opt_float(record, column)
-            session.execute(upsert, params)
-            inserted += 1
-            # Push pending statements to the database every 2000 rows.
-            if inserted % 2000 == 0:
-                session.flush()
-
-    print(f"  Finance: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    # CLI: --action selects download, load, or both (default); --data-dir
-    # overrides the base data directory.
-    cli = argparse.ArgumentParser()
-    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
-    cli.add_argument("--data-dir", type=Path, default=None)
-    opts = cli.parse_args()
-    wants_download = opts.action in ("download", "all")
-    wants_load = opts.action in ("load", "all")
-    if wants_download:
-        download(opts.data_dir)
-    if wants_load:
-        load(data_dir=opts.data_dir)
@@ -1,159 +0,0 @@
-"""
-GIAS (Get Information About Schools) bulk CSV downloader and loader.
-
-Source: https://get-information-schools.service.gov.uk/Downloads
-Update: Daily; we refresh weekly.
-Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
-"""
-import argparse
-import sys
-from datetime import date
-from pathlib import Path
-
-import pandas as pd
-import requests
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-
-DEST_DIR = SUPPLEMENTARY_DIR / "gias"
-
-# GIAS bulk download URL — date is injected at runtime
-GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv"
-
-COLUMN_MAP = {
-    "URN": "urn",
-    "SchoolWebsite": "website",
-    "SchoolCapacity": "capacity",
-    "TrustName": "trust_name",
-    "TrustUID": "trust_uid",
-    "Gender (name)": "gender",
-    "NurseryProvision (name)": "nursery_provision_raw",
-    "HeadTitle": "head_title",
-    "HeadFirstName": "head_first",
-    "HeadLastName": "head_last",
-}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the GIAS bulk CSV for today, falling back to yesterday on 404.
-
-    For each candidate date, an already-present ``gias_<YYYYMMDD>.csv`` is
-    returned without any network request. The body is streamed to a ``.part``
-    file and renamed once complete, so an interrupted transfer is not later
-    mistaken for a finished download.
-
-    Args:
-        data_dir: optional base directory; files go to
-            ``<data_dir>/supplementary/gias``. Defaults to DEST_DIR.
-
-    Returns:
-        Path of the CSV (existing or newly downloaded).
-
-    Raises:
-        requests.HTTPError: on an error response other than a 404 for today.
-    """
-    from datetime import timedelta
-
-    dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    today = date.today()
-    # GIAS may not have today's file yet — yesterday is the only fallback.
-    candidates = [today, today - timedelta(days=1)]
-
-    for attempt, day in enumerate(candidates):
-        stamp = day.strftime("%Y%m%d")
-        filename = f"gias_{stamp}.csv"
-        dest_file = dest / filename
-
-        if dest_file.exists():
-            print(f"  GIAS: {filename} already exists, skipping download.")
-            return dest_file
-
-        url = GIAS_URL_TEMPLATE.format(date=stamp)
-        print(f"  GIAS: downloading {url} ...")
-        partial = dest_file.with_name(dest_file.name + ".part")
-        # The context manager closes the streamed response, including the
-        # 404 response that is abandoned when falling back.
-        with requests.get(url, timeout=300, stream=True) as resp:
-            has_fallback = attempt < len(candidates) - 1
-            if resp.status_code == 404 and has_fallback:
-                continue
-            resp.raise_for_status()
-            with open(partial, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=65536):
-                    f.write(chunk)
-        partial.replace(dest_file)
-
-        print(f"  GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
-        return dest_file
-
-    # Not reachable: the last candidate either returns or raises above.
-    raise RuntimeError("GIAS download failed for all candidate dates")
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Update rows of the schools table from a GIAS bulk CSV.
-
-    When ``path`` is not given, the last ``gias_*.csv`` (by sorted name) in the
-    GIAS directory is used. Columns are renamed through COLUMN_MAP, rows
-    without a numeric URN are dropped, and one UPDATE is issued per row.
-
-    Returns:
-        Summary dict; "updated" counts UPDATE statements issued, whether or
-        not a schools row with that URN exists.
-
-    Raises:
-        FileNotFoundError: no CSV is available.
-        ValueError: the file has no URN column.
-    """
-    if path is None:
-        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
-        files = sorted(dest.glob("gias_*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
-        path = files[-1]
-
-    print(f"  GIAS: loading {path} ...")
-    # Read every column as text. With type inference, a numeric column that
-    # contains blanks becomes float64, so a capacity of 210 arrives as "210.0"
-    # (failing the isdigit() check below) and trust UIDs gain a ".0" suffix.
-    df = pd.read_csv(path, encoding="latin-1", low_memory=False, dtype=str)
-    df.rename(columns=COLUMN_MAP, inplace=True)
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    def clean_str(val):
-        """Stripped string, or None for missing/blank/"nan"/"none" values."""
-        s = str(val).strip() if pd.notna(val) else None
-        return s if s and s.lower() not in ("nan", "none", "") else None
-
-    def build_name(row):
-        """Join title, first and last name, skipping missing parts."""
-        # clean_str keeps NaN cells from being rendered as the text "nan".
-        parts = [clean_str(row.get(key)) for key in ("head_title", "head_first", "head_last")]
-        return " ".join(p for p in parts if p) or None
-
-    def parse_nursery(val):
-        """True when the value starts with "has", None when missing, else False."""
-        if pd.isna(val):
-            return None
-        return str(val).strip().lower().startswith("has")
-
-    def parse_capacity(val):
-        """Capacity as int when the cell is purely digits, else None."""
-        digits = clean_str(val)
-        return int(digits) if digits and digits.isdigit() else None
-
-    updated = 0
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            session.execute(
-                text("""
-                    UPDATE schools SET
-                        website            = :website,
-                        headteacher_name   = :headteacher_name,
-                        capacity           = :capacity,
-                        trust_name         = :trust_name,
-                        trust_uid          = :trust_uid,
-                        gender             = :gender,
-                        nursery_provision  = :nursery_provision
-                    WHERE urn = :urn
-                """),
-                {
-                    "urn": urn,
-                    "website": clean_str(row.get("website")),
-                    "headteacher_name": build_name(row),
-                    "capacity": parse_capacity(row.get("capacity")),
-                    "trust_name": clean_str(row.get("trust_name")),
-                    "trust_uid": clean_str(row.get("trust_uid")),
-                    "gender": clean_str(row.get("gender")),
-                    # row.get() yields None when the column is absent, which
-                    # parse_nursery maps to NULL rather than NaN.
-                    "nursery_provision": parse_nursery(row.get("nursery_provision_raw")),
-                },
-            )
-            updated += 1
-            if updated % 5000 == 0:
-                session.flush()
-                print(f"    Updated {updated} schools...")
-
-    print(f"  GIAS: updated {updated} school records")
-    return {"inserted": 0, "updated": updated, "skipped": 0}
-
-
-if __name__ == "__main__":
-    # CLI: --action selects download, load, or both (default); --data-dir
-    # overrides the base data directory.
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
-    parser.add_argument("--data-dir", type=Path, default=None)
-    args = parser.parse_args()
-
-    downloaded = None
-    if args.action in ("download", "all"):
-        downloaded = download(args.data_dir)
-    if args.action in ("load", "all"):
-        # After a download in this run, load exactly that file; with
-        # path=None load() falls back to the newest gias_*.csv on disk.
-        load(path=downloaded, data_dir=args.data_dir)
@@ -1,176 +0,0 @@
-"""
-IDACI (Income Deprivation Affecting Children Index) loader.
-
-Source: English Indices of Deprivation 2019
-https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019
-
-This is a one-time download (5-yearly release). We join school postcodes to LSOAs
-via postcodes.io, then look up IDACI scores from the IoD2019 file.
-
-Update: ~5-yearly (next release expected 2025/26)
-"""
-import argparse
-import sys
-from pathlib import Path
-
-import pandas as pd
-import requests
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-
-DEST_DIR = SUPPLEMENTARY_DIR / "idaci"
-
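-# IoD 2019 workbook from which load() reads the per-LSOA IDACI score.
-# NOTE(review): the URL below names "File 1 — IMD2019 Index of Multiple Deprivation";
-# confirm that this workbook actually carries an IDACI score column.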
-IOD_2019_URL = (
-    "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/"
-    "attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx"
-)
-
-POSTCODES_IO_BATCH = "https://api.postcodes.io/postcodes"
-BATCH_SIZE = 100
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the IoD2019 workbook to ``iod2019_idaci.xlsx``.
-
-    An existing file is returned without any network request. The body is
-    streamed to a ``.part`` file and renamed once complete, so an interrupted
-    transfer is not later mistaken for a finished download.
-
-    Args:
-        data_dir: optional base directory; the file goes to
-            ``<data_dir>/supplementary/idaci``. Defaults to DEST_DIR.
-
-    Returns:
-        Path of the workbook.
-
-    Raises:
-        requests.HTTPError: on an error response.
-    """
-    dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    filename = "iod2019_idaci.xlsx"
-    dest_file = dest / filename
-    if dest_file.exists():
-        print(f"  IDACI: {filename} already exists, skipping download.")
-        return dest_file
-
-    print("  IDACI: downloading IoD2019 file ...")
-    partial = dest_file.with_name(dest_file.name + ".part")
-    # The context manager releases the streamed connection even on failure.
-    with requests.get(IOD_2019_URL, timeout=300, stream=True) as resp:
-        resp.raise_for_status()
-        with open(partial, "wb") as f:
-            for chunk in resp.iter_content(chunk_size=65536):
-                f.write(chunk)
-    partial.replace(dest_file)
-
-    print(f"  IDACI: saved {dest_file}")
-    return dest_file
-
-
-def _postcode_to_lsoa(postcodes: list[str]) -> dict[str, str]:
-    """Batch-resolve postcodes to LSOA codes via postcodes.io.
-
-    Postcodes are stripped, upper-cased and de-duplicated; values shorter than
-    5 characters are ignored. Requests are sent in batches of BATCH_SIZE.
-    A failed batch is reported and skipped, so the result may be partial.
-
-    Returns:
-        Mapping of upper-cased query postcode -> LSOA code.
-    """
-    result = {}
-    # str() guards against non-string values (e.g. numbers from the database).
-    valid = sorted({
-        str(p).strip().upper()
-        for p in postcodes
-        if p and len(str(p).strip()) >= 5
-    })
-
-    for i in range(0, len(valid), BATCH_SIZE):
-        batch = valid[i:i + BATCH_SIZE]
-        try:
-            resp = requests.post(POSTCODES_IO_BATCH, json={"postcodes": batch}, timeout=30)
-            if resp.status_code != 200:
-                print(f"  Warning: postcodes.io batch returned HTTP {resp.status_code}")
-                continue
-            for item in resp.json().get("result") or []:
-                match = (item or {}).get("result")
-                if not match:
-                    continue
-                # In the postcodes.io payload "lsoa" is the area's name; the
-                # code used to join against the IoD file is under "codes".
-                # NOTE(review): IoD2019 is keyed on 2011 LSOA boundaries —
-                # confirm which boundary vintage codes.lsoa currently returns.
-                lsoa_code = (match.get("codes") or {}).get("lsoa")
-                if lsoa_code:
-                    result[str(item["query"]).strip().upper()] = lsoa_code
-        except Exception as e:
-            print(f"  Warning: postcodes.io batch failed: {e}")
-
-    return result
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Load IDACI scores from the IoD2019 workbook into school_deprivation.
-
-    Steps: read the workbook, derive a score and decile per LSOA, resolve every
-    school postcode to an LSOA through _postcode_to_lsoa(), then upsert one row
-    per school that could be matched.
-
-    Args:
-        path: workbook to read; defaults to the last ``*.xlsx`` (by sorted
-            name) in the IDACI directory.
-        data_dir: optional base directory; the IDACI directory is then
-            ``<data_dir>/supplementary/idaci`` instead of DEST_DIR.
-
-    Returns:
-        Summary dict; "inserted" counts upserts issued, "skipped" counts
-        schools whose postcode or LSOA could not be matched.
-
-    Raises:
-        FileNotFoundError: no workbook is available.
-        RuntimeError: the workbook could not be read.
-        ValueError: no LSOA-code or IDACI-score column was found.
-    """
-    dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR
-    if path is None:
-        files = sorted(dest.glob("*.xlsx"))
-        if not files:
-            raise FileNotFoundError(f"No IDACI file found in {dest}")
-        path = files[-1]
-
-    print(f"  IDACI: loading IoD2019 from {path} ...")
-
-    # IoD2019 File 1 — sheet "IoD2019 IDACI" or similar
-    try:
-        # sheet_name=None loads every sheet into a {name: DataFrame} dict.
-        iod_df = pd.read_excel(path, sheet_name=None)
-        # Pick the first sheet whose name or column headers mention IDACI
-        idaci_sheet = None
-        for name, df in iod_df.items():
-            if "IDACI" in name.upper() or "IDACI" in str(df.columns.tolist()).upper():
-                idaci_sheet = name
-                break
-        # No match: fall back to the first sheet; the column check below then
-        # decides whether it is usable.
-        if idaci_sheet is None:
-            idaci_sheet = list(iod_df.keys())[0]
-        df_iod = iod_df[idaci_sheet]
-    except Exception as e:
-        raise RuntimeError(f"Could not read IoD2019 file: {e}")
-
-    # Locate the needed columns by substring match on their headers
-    col_lsoa = next((c for c in df_iod.columns if "LSOA" in str(c).upper() and "code" in str(c).lower()), None)
-    col_score = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "score" in str(c).lower()), None)
-    # col_rank is looked up but not used further in this function.
-    col_rank = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "rank" in str(c).lower()), None)
-
-    if not col_lsoa or not col_score:
-        print(f"  IDACI columns available: {list(df_iod.columns)[:20]}")
-        raise ValueError("Could not find LSOA code or IDACI score columns")
-
-    df_iod = df_iod[[col_lsoa, col_score]].copy()
-    df_iod.columns = ["lsoa_code", "idaci_score"]
-    df_iod = df_iod.dropna()
-
-    # Deciles are derived from the score distribution via qcut (the rank
-    # column is not consulted). `total` is not used further.
-    total = len(df_iod)
-    df_iod = df_iod.sort_values("idaci_score", ascending=False)
-    # qcut labels run 0..9 from lowest to highest score; +1 makes them 1..10.
-    df_iod["idaci_decile"] = (pd.qcut(df_iod["idaci_score"], 10, labels=False) + 1).astype(int)
-    # Decile 1 = most deprived (highest IDACI score)
-    df_iod["idaci_decile"] = 11 - df_iod["idaci_decile"]
-
-    # {lsoa_code: {"idaci_score": ..., "idaci_decile": ...}}
-    lsoa_lookup = df_iod.set_index("lsoa_code")[["idaci_score", "idaci_decile"]].to_dict("index")
-    print(f"  IDACI: loaded {len(lsoa_lookup)} LSOA records")
-
-    # Fetch all school postcodes from the database
-    with get_session() as session:
-        from sqlalchemy import text
-        rows = session.execute(text("SELECT urn, postcode FROM schools WHERE postcode IS NOT NULL")).fetchall()
-
-    postcodes = [r[1] for r in rows]
-    print(f"  IDACI: resolving {len(postcodes)} postcodes via postcodes.io ...")
-    # NOTE(review): the join below only matches if _postcode_to_lsoa() returns
-    # values in the same form as the workbook's LSOA code column — confirm.
-    pc_to_lsoa = _postcode_to_lsoa(postcodes)
-    print(f"  IDACI: resolved {len(pc_to_lsoa)} postcodes to LSOAs")
-
-    inserted = skipped = 0
-    with get_session() as session:
-        from sqlalchemy import text
-        for urn, postcode in rows:
-            # Same normalisation (strip + upper) as the keys of pc_to_lsoa.
-            lsoa = pc_to_lsoa.get(str(postcode).strip().upper())
-            if not lsoa:
-                skipped += 1
-                continue
-            iod = lsoa_lookup.get(lsoa)
-            if not iod:
-                skipped += 1
-                continue
-
-            session.execute(
-                text("""
-                    INSERT INTO school_deprivation (urn, lsoa_code, idaci_score, idaci_decile)
-                    VALUES (:urn, :lsoa, :score, :decile)
-                    ON CONFLICT (urn) DO UPDATE SET
-                        lsoa_code    = EXCLUDED.lsoa_code,
-                        idaci_score  = EXCLUDED.idaci_score,
-                        idaci_decile = EXCLUDED.idaci_decile
-                """),
-                {"urn": urn, "lsoa": lsoa, "score": float(iod["idaci_score"]), "decile": int(iod["idaci_decile"])},
-            )
-            inserted += 1
-            # Push pending statements to the database every 2000 rows.
-            if inserted % 2000 == 0:
-                session.flush()
-
-    print(f"  IDACI: upserted {inserted}, skipped {skipped}")
-    return {"inserted": inserted, "updated": 0, "skipped": skipped}
-
-
-if __name__ == "__main__":
-    # CLI: --action selects download, load, or both (default); --data-dir
-    # overrides the base data directory.
-    cli = argparse.ArgumentParser()
-    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
-    cli.add_argument("--data-dir", type=Path, default=None)
-    opts = cli.parse_args()
-    wants_download = opts.action in ("download", "all")
-    wants_load = opts.action in ("load", "all")
-    if wants_download:
-        download(opts.data_dir)
-    if wants_load:
-        load(data_dir=opts.data_dir)
@@ -1,49 +0,0 @@
-"""
-KS2 attainment data re-importer.
-
-Triggers a full re-import of the KS2 CSV data by calling the backend's
-admin endpoint. The backend owns the migration logic and CSV column mappings;
-this module is a thin trigger so the re-import can be orchestrated via Kestra
-like all other data sources.
-
-The CSV files must already be present in the data volume under
-  /data/{year}/england_ks2final.csv
-(populated at deploy time from the repo's data/ directory).
-"""
-import requests
-from config import BACKEND_URL, ADMIN_API_KEY
-
-HEADERS = {"X-API-Key": ADMIN_API_KEY}
-
-
-def download():
-    """Placeholder download step: prints a notice and reports the step as skipped."""
-    notice = "KS2 CSVs are bundled in the data volume; no download needed."
-    print(notice)
-    outcome = {"skipped": True}
-    return outcome
-
-
-def load():
-    """Ask the backend to start a KS2 re-import (with geocoding) and return its reply.
-
-    Sends one POST to the admin re-import endpoint, authenticated through
-    HEADERS, and returns the decoded JSON body. Per the endpoint's contract as
-    described by the original author, the import itself continues in the
-    background on the backend and can take up to an hour; progress can be
-    polled at GET /api/admin/reimport-ks2/status.
-
-    Raises requests.HTTPError on an error response.
-    """
-    endpoint = f"{BACKEND_URL}/api/admin/reimport-ks2?geocode=true"
-    print(f"POST {endpoint}")
-    response = requests.post(endpoint, headers=HEADERS, timeout=30)
-    response.raise_for_status()
-    payload = response.json()
-    print(f"Result: {payload}")
-    return payload
-
-
-if __name__ == "__main__":
-    # CLI: --action selects download, load, or both (default).
-    import argparse
-    cli = argparse.ArgumentParser()
-    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
-    opts = cli.parse_args()
-    wants_download = opts.action in ("download", "all")
-    wants_load = opts.action in ("load", "all")
-    if wants_download:
-        download()
-    if wants_load:
-        load()
@@ -1,418 +0,0 @@
-"""
-Ofsted Monthly Management Information CSV downloader and loader.
-
-Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes
-Update: Monthly (released ~2 weeks into each month)
-"""
-import argparse
-import re
-import sys
-from datetime import date, datetime
-from pathlib import Path
-
-import pandas as pd
-import requests
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-
-# GOV.UK landing page for the Ofsted MI release. The CSV/ZIP asset URL is not
-# hard-coded: _discover_csv_url() scrapes it from this page on every download.
-GOV_UK_PAGE = "https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes"
-
-# Column name → internal field, listed in priority order per field.
-# First matching column wins; later entries are fallbacks for older file formats.
-COLUMN_PRIORITY = {
-    "urn": ["URN", "Urn", "urn"],
-    "inspection_date": [
-        "Inspection start date of latest OEIF graded inspection",
-        "Inspection start date",
-        "Inspection date",
-        "InspectionDate",
-    ],
-    "publication_date": [
-        "Publication date of latest OEIF graded inspection",
-        "Publication date",
-        "PublicationDate",
-    ],
-    "inspection_type": [
-        "Inspection type of latest OEIF graded inspection",
-        "Inspection type",
-        "InspectionType",
-    ],
-    "overall_effectiveness": [
-        "Latest OEIF overall effectiveness",
-        "Overall effectiveness",
-        "OverallEffectiveness",
-    ],
-    "quality_of_education": [
-        "Latest OEIF quality of education",
-        "Quality of education",
-        "QualityOfEducation",
-    ],
-    "behaviour_attitudes": [
-        "Latest OEIF behaviour and attitudes",
-        "Behaviour and attitudes",
-        "BehaviourAndAttitudes",
-    ],
-    "personal_development": [
-        "Latest OEIF personal development",
-        "Personal development",
-        "PersonalDevelopment",
-    ],
-    "leadership_management": [
-        "Latest OEIF effectiveness of leadership and management",
-        "Leadership and management",
-        "LeadershipAndManagement",
-    ],
-    "early_years_provision": [
-        "Latest OEIF early years provision (where applicable)",
-        "Early years provision",
-        "EarlyYearsProvision",
-    ],
-}
-
-# OEIF grade label or numeral → integer grade (1 = Outstanding … 4 = Inadequate).
-# _parse_grade() looks values up by their stripped string form, so only the
-# string keys can match there; the integer keys are not reachable through it.
-GRADE_MAP = {
-    "Outstanding": 1, "1": 1, 1: 1,
-    "Good": 2, "2": 2, 2: 2,
-    "Requires improvement": 3, "3": 3, 3: 3,
-    "Requires Improvement": 3,
-    "Inadequate": 4, "4": 4, 4: 4,
-}
-
-# Report Card grade text → integer (1=Exceptional … 5=Urgent improvement)
-RC_GRADE_MAP = {
-    "exceptional": 1,
-    "strong standard": 2,
-    "strong": 2,
-    "expected standard": 3,
-    "expected": 3,
-    "needs attention": 4,
-    "urgent improvement": 5,
-}
-
-# Column name priority for Report Card fields (best-guess names; Ofsted may vary)
-RC_COLUMN_PRIORITY = {
-    "rc_safeguarding": [
-        "Safeguarding",
-        "safeguarding",
-        "Safeguarding standards",
-    ],
-    "rc_inclusion": [
-        "Inclusion",
-        "inclusion",
-    ],
-    "rc_curriculum_teaching": [
-        "Curriculum and teaching",
-        "curriculum_and_teaching",
-        "Curriculum & teaching",
-    ],
-    "rc_achievement": [
-        "Achievement",
-        "achievement",
-    ],
-    "rc_attendance_behaviour": [
-        "Attendance and behaviour",
-        "attendance_and_behaviour",
-        "Attendance & behaviour",
-    ],
-    "rc_personal_development": [
-        "Personal development and well-being",
-        "Personal development and wellbeing",
-        "personal_development_and_wellbeing",
-        "Personal development & well-being",
-    ],
-    "rc_leadership_governance": [
-        "Leadership and governance",
-        "leadership_and_governance",
-        "Leadership & governance",
-    ],
-    "rc_early_years": [
-        "Early years",
-        "early_years",
-        "Early years provision",
-    ],
-    "rc_sixth_form": [
-        "Sixth form",
-        "sixth_form",
-        "Sixth form in schools",
-    ],
-}
-
-DEST_DIR = SUPPLEMENTARY_DIR / "ofsted"
-
-
-def _discover_csv_url() -> str | None:
-    """Return the first CSV/ZIP asset link found on the GOV.UK page, or None.
-
-    The page is fetched and searched for an ``href`` pointing at
-    assets.publishing.service.gov.uk and ending in .csv or .zip. Any failure
-    while fetching is reported as a warning and yields None.
-    NOTE(review): "first link on the page" is assumed to be the most recent
-    release — confirm against the page layout.
-    """
-    try:
-        page = requests.get(GOV_UK_PAGE, timeout=30)
-        page.raise_for_status()
-        # Look for links to assets.publishing.service.gov.uk CSV or ZIP files
-        first_link = re.search(
-            r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"',
-            page.text,
-            re.IGNORECASE,
-        )
-    except Exception as e:
-        print(f"  Warning: could not scrape GOV.UK page: {e}")
-        return None
-    return first_link.group(1) if first_link else None
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the Ofsted MI file whose URL is discovered on the GOV.UK page.
-
-    The local filename is the last path segment of the discovered URL; an
-    existing file of that name is returned without downloading. The body is
-    streamed to a ``.part`` file and renamed once complete, so an interrupted
-    transfer is not later mistaken for a finished download.
-
-    Args:
-        data_dir: optional base directory; files go to
-            ``<data_dir>/supplementary/ofsted``. Defaults to DEST_DIR.
-
-    Returns:
-        Path of the CSV/ZIP file.
-
-    Raises:
-        RuntimeError: no download link could be discovered.
-        requests.HTTPError: on an error response.
-    """
-    dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    url = _discover_csv_url()
-    if not url:
-        # There is no manual URL setting in this module; the workable
-        # fallback is to drop the file where load() searches for it.
-        raise RuntimeError(
-            "Could not discover Ofsted MI download URL. "
-            "Visit https://www.gov.uk/government/statistical-data-sets/"
-            "monthly-management-information-ofsteds-school-inspections-outcomes "
-            f"and save the latest CSV/ZIP into {dest} manually."
-        )
-
-    filename = url.split("/")[-1]
-    dest_file = dest / filename
-
-    if dest_file.exists():
-        print(f"  Ofsted: {filename} already exists, skipping download.")
-        return dest_file
-
-    print(f"  Ofsted: downloading {url} ...")
-    partial = dest_file.with_name(dest_file.name + ".part")
-    # The context manager releases the streamed connection even on failure.
-    with requests.get(url, timeout=120, stream=True) as resp:
-        resp.raise_for_status()
-        with open(partial, "wb") as f:
-            for chunk in resp.iter_content(chunk_size=65536):
-                f.write(chunk)
-    partial.replace(dest_file)
-
-    print(f"  Ofsted: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
-    return dest_file
-
-
-def _parse_grade(val) -> int | None:
-    """Map an OEIF grade cell to its integer grade via GRADE_MAP.
-
-    Returns None for missing values and for anything GRADE_MAP does not know.
-    Whole-number floats are accepted too: a numeric column that contains
-    blanks is read by pandas as float, so grade 2 arrives as 2.0 / "2.0".
-    """
-    if pd.isna(val):
-        return None
-    key = str(val).strip()
-    grade = GRADE_MAP.get(key)
-    if grade is not None:
-        return grade
-    # Retry numeric text such as "2.0" under its integer spelling "2".
-    try:
-        number = float(key)
-    except ValueError:
-        return None
-    if number.is_integer():
-        return GRADE_MAP.get(str(int(number)))
-    return None
-
-
-def _parse_rc_grade(val) -> int | None:
-    """Look up a Report Card grade label (case-insensitive, stripped) in RC_GRADE_MAP.
-
-    Missing values and unknown labels both give None.
-    """
-    return None if pd.isna(val) else RC_GRADE_MAP.get(str(val).strip().lower())
-
-
-def _parse_safeguarding(val) -> bool | None:
-    """Translate a safeguarding cell into True ('met'), False ('not met') or None.
-
-    Matching is case-insensitive and ignores surrounding whitespace; missing
-    or unrecognised values give None.
-    """
-    if pd.isna(val):
-        return None
-    outcomes = {"met": True, "not met": False, "not_met": False}
-    return outcomes.get(str(val).strip().lower())
-
-
-def _parse_date(val) -> date | None:
-    """Parse a date cell, trying each supported format in turn.
-
-    Supported: DD/MM/YYYY, YYYY-MM-DD, DD-MM-YYYY and "D Month YYYY".
-    Missing values and text matching none of the formats give None.
-    """
-    if pd.isna(val):
-        return None
-    raw = str(val).strip()
-    for layout in ("%d/%m/%Y", "%Y-%m-%d", "%d-%m-%Y", "%d %B %Y"):
-        try:
-            parsed = datetime.strptime(raw, layout)
-        except ValueError:
-            continue
-        return parsed.date()
-    return None
-
-
-def _framework_for_row(row) -> str | None:
-    """Classify a school row as "ReportCard", "OEIF" or None.
-
-    Report Card columns take precedence: one populated value among them is
-    enough. Otherwise a populated OEIF column marks the row as OEIF. A value
-    counts as populated unless it is None or a float NaN. With no populated
-    column in either group the result is None.
-    """
-    def _populated(column) -> bool:
-        cell = row.get(column)
-        if cell is None:
-            return False
-        return not (isinstance(cell, float) and pd.isna(cell))
-
-    report_card_columns = (
-        "rc_inclusion", "rc_curriculum_teaching", "rc_achievement",
-        "rc_attendance_behaviour", "rc_personal_development",
-        "rc_leadership_governance", "rc_safeguarding",
-    )
-    oeif_columns = ("overall_effectiveness", "quality_of_education")
-
-    if any(_populated(c) for c in report_card_columns):
-        return "ReportCard"
-    if any(_populated(c) for c in oeif_columns):
-        return "OEIF"
-    return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    if path is None:
-        dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.csv")) + sorted(dest.glob("*.zip"))
-        if not files:
-            raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
-        path = files[-1]
-
-    print(f"  Ofsted: loading {path} ...")
-
-    def _find_header_row(filepath, encoding="latin-1"):
-        """Scan up to 10 rows to find the one containing a URN column."""
-        for i in range(10):
-            peek = pd.read_csv(filepath, encoding=encoding, header=i, nrows=0)
-            if any(str(c).strip() in ("URN", "Urn", "urn") for c in peek.columns):
-                return i
-        return 0
-
-    if str(path).endswith(".zip"):
-        import zipfile, io
-        with zipfile.ZipFile(path) as z:
-            csv_names = [n for n in z.namelist() if n.endswith(".csv")]
-            if not csv_names:
-                raise ValueError("No CSV found inside Ofsted ZIP")
-            # Extract to a temp file so we can scan for the header row
-            import tempfile, os
-            with tempfile.NamedTemporaryFile(suffix=".csv", delete=False) as tmp:
-                tmp.write(z.read(csv_names[0]))
-                tmp_path = tmp.name
-            try:
-                hdr = _find_header_row(tmp_path)
-                df = pd.read_csv(tmp_path, encoding="latin-1", low_memory=False, header=hdr)
-            finally:
-                os.unlink(tmp_path)
-    else:
-        hdr = _find_header_row(path)
-        df = pd.read_csv(path, encoding="latin-1", low_memory=False, header=hdr)
-
-    # Normalise OEIF column names: for each target field pick the first source column present
-    available = set(df.columns)
-    for target, sources in COLUMN_PRIORITY.items():
-        for src in sources:
-            if src in available:
-                df.rename(columns={src: target}, inplace=True)
-                break
-
-    # Normalise Report Card column names (if present)
-    available = set(df.columns)
-    for target, sources in RC_COLUMN_PRIORITY.items():
-        for src in sources:
-            if src in available:
-                df.rename(columns={src: target}, inplace=True)
-                break
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    # Only keep rows with a valid URN
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    inserted = updated = skipped = 0
-
-    with get_session() as session:
-        # Keep only the most recent inspection per URN
-        if "inspection_date" in df.columns:
-            df["_date_parsed"] = df["inspection_date"].apply(_parse_date)
-            df = df.sort_values("_date_parsed", ascending=False).groupby("urn").first().reset_index()
-
-        from sqlalchemy import text
-
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-
-            record = {
-                "urn": urn,
-                "framework": _framework_for_row(row),
-                "inspection_date": _parse_date(row.get("inspection_date")),
-                "publication_date": _parse_date(row.get("publication_date")),
-                "inspection_type": str(row.get("inspection_type", "")).strip() or None,
-                # OEIF fields
-                "overall_effectiveness": _parse_grade(row.get("overall_effectiveness")),
-                "quality_of_education": _parse_grade(row.get("quality_of_education")),
-                "behaviour_attitudes": _parse_grade(row.get("behaviour_attitudes")),
-                "personal_development": _parse_grade(row.get("personal_development")),
-                "leadership_management": _parse_grade(row.get("leadership_management")),
-                "early_years_provision": _parse_grade(row.get("early_years_provision")),
-                "previous_overall": None,
-                # Report Card fields
-                "rc_safeguarding_met": _parse_safeguarding(row.get("rc_safeguarding")),
-                "rc_inclusion": _parse_rc_grade(row.get("rc_inclusion")),
-                "rc_curriculum_teaching": _parse_rc_grade(row.get("rc_curriculum_teaching")),
-                "rc_achievement": _parse_rc_grade(row.get("rc_achievement")),
-                "rc_attendance_behaviour": _parse_rc_grade(row.get("rc_attendance_behaviour")),
-                "rc_personal_development": _parse_rc_grade(row.get("rc_personal_development")),
-                "rc_leadership_governance": _parse_rc_grade(row.get("rc_leadership_governance")),
-                "rc_early_years": _parse_rc_grade(row.get("rc_early_years")),
-                "rc_sixth_form": _parse_rc_grade(row.get("rc_sixth_form")),
-            }
-
-            session.execute(
-                text("""
-                    INSERT INTO ofsted_inspections
-                        (urn, framework, inspection_date, publication_date, inspection_type,
-                         overall_effectiveness, quality_of_education, behaviour_attitudes,
-                         personal_development, leadership_management, early_years_provision,
-                         previous_overall,
-                         rc_safeguarding_met, rc_inclusion, rc_curriculum_teaching,
-                         rc_achievement, rc_attendance_behaviour, rc_personal_development,
-                         rc_leadership_governance, rc_early_years, rc_sixth_form)
-                    VALUES
-                        (:urn, :framework, :inspection_date, :publication_date, :inspection_type,
-                         :overall_effectiveness, :quality_of_education, :behaviour_attitudes,
-                         :personal_development, :leadership_management, :early_years_provision,
-                         :previous_overall,
-                         :rc_safeguarding_met, :rc_inclusion, :rc_curriculum_teaching,
-                         :rc_achievement, :rc_attendance_behaviour, :rc_personal_development,
-                         :rc_leadership_governance, :rc_early_years, :rc_sixth_form)
-                    ON CONFLICT (urn) DO UPDATE SET
-                        previous_overall      = ofsted_inspections.overall_effectiveness,
-                        framework             = EXCLUDED.framework,
-                        inspection_date       = EXCLUDED.inspection_date,
-                        publication_date      = EXCLUDED.publication_date,
-                        inspection_type       = EXCLUDED.inspection_type,
-                        overall_effectiveness = EXCLUDED.overall_effectiveness,
-                        quality_of_education  = EXCLUDED.quality_of_education,
-                        behaviour_attitudes   = EXCLUDED.behaviour_attitudes,
-                        personal_development  = EXCLUDED.personal_development,
-                        leadership_management = EXCLUDED.leadership_management,
-                        early_years_provision = EXCLUDED.early_years_provision,
-                        rc_safeguarding_met      = EXCLUDED.rc_safeguarding_met,
-                        rc_inclusion             = EXCLUDED.rc_inclusion,
-                        rc_curriculum_teaching   = EXCLUDED.rc_curriculum_teaching,
-                        rc_achievement           = EXCLUDED.rc_achievement,
-                        rc_attendance_behaviour  = EXCLUDED.rc_attendance_behaviour,
-                        rc_personal_development  = EXCLUDED.rc_personal_development,
-                        rc_leadership_governance = EXCLUDED.rc_leadership_governance,
-                        rc_early_years           = EXCLUDED.rc_early_years,
-                        rc_sixth_form            = EXCLUDED.rc_sixth_form
-                """),
-                record,
-            )
-            inserted += 1
-
-            if inserted % 5000 == 0:
-                session.flush()
-                print(f"    Processed {inserted} records...")
-
-    print(f"  Ofsted: upserted {inserted} records")
-    return {"inserted": inserted, "updated": updated, "skipped": skipped}
-
-
-if __name__ == "__main__":
-    # Command-line entry point: --action picks the download step, the load step, or both.
-    parser = argparse.ArgumentParser()
-    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
-    parser.add_argument("--data-dir", type=Path, default=None)
-    args = parser.parse_args()
-    wants_download = args.action in ("download", "all")
-    wants_load = args.action in ("load", "all")
-
-    if wants_download:
-        path = download(args.data_dir)
-    if wants_load:
-        load(data_dir=args.data_dir)
@@ -1,229 +0,0 @@
-"""
-Ofsted Parent View open data downloader and loader.
-
-Source: https://parentview.ofsted.gov.uk/open-data
-Update: ~3 times/year (Spring, Autumn, Summer)
-"""
-import argparse
-import re
-import sys
-from datetime import date, datetime
-from pathlib import Path
-
-import pandas as pd
-import requests
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-
-# Default directory for Parent View files, under the shared supplementary data root.
-DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
-# Landing page that download() scrapes for the first .xlsx/.csv/.zip link.
-OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
-
-# Question column mapping — Parent View open data uses descriptive column headers
-# Map any variant to our internal field names
-# NOTE(review): neither download() nor load() in this module reads QUESTION_MAP;
-# load() looks up fixed "Q<n> - ... %" columns instead. Confirm whether another
-# module imports this mapping before relying on it.
-QUESTION_MAP = {
-    # Q1 — happiness
-    "My child is happy at this school": "q_happy_pct",
-    "Happy": "q_happy_pct",
-    # Q2 — safety
-    "My child feels safe at this school": "q_safe_pct",
-    "Safe": "q_safe_pct",
-    # Q3 — behaviour
-    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
-    "Well Behaved": "q_behaviour_pct",
-    # Q4 — bullying dealt with (sometimes separate)
-    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
-    "Bullying": "q_bullying_pct",
-    # Q5 — curriculum info
-    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
-    "Aware of learning": "q_communication_pct",
-    # Q6 — concerns dealt with (shares the q_communication_pct field with Q5)
-    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
-    # Q7 — child does well
-    "My child does well at this school": "q_progress_pct",
-    "Does well": "q_progress_pct",
-    # Q8 — teaching
-    "The teaching is good at this school": "q_teaching_pct",
-    "Good teaching": "q_teaching_pct",
-    # Q9 — progress info
-    "I receive valuable information from the school about my child's progress": "q_information_pct",
-    "Progress information": "q_information_pct",
-    # Q10 — curriculum breadth
-    "My child is taught a broad range of subjects": "q_curriculum_pct",
-    "Broad subjects": "q_curriculum_pct",
-    # Q11 — prepares for future
-    "The school prepares my child well for the future": "q_future_pct",
-    "Prepared for future": "q_future_pct",
-    # Q12 — leadership
-    "The school is led and managed effectively": "q_leadership_pct",
-    "Led well": "q_leadership_pct",
-    # Q13 — wellbeing
-    "The school supports my child's wider personal development": "q_wellbeing_pct",
-    "Personal development": "q_wellbeing_pct",
-    # Q14 — recommendation
-    "I would recommend this school to another parent": "q_recommend_pct",
-    "Recommend": "q_recommend_pct",
-}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the latest Parent View open-data file unless it is already on disk.
-
-    The open data page is scraped for the first link ending in .xlsx, .csv or
-    .zip, and that file is saved under its own filename.
-
-    Args:
-        data_dir: Optional base directory; the file is stored in
-            ``data_dir/supplementary/parent_view``. Falls back to ``DEST_DIR``.
-
-    Returns:
-        Path of the downloaded (or previously downloaded) file.
-
-    Raises:
-        RuntimeError: If the page cannot be fetched or holds no download link.
-        requests.HTTPError: If the file request itself returns an error status.
-    """
-    from urllib.parse import urljoin
-
-    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    # Scrape the open data page for the download link
-    try:
-        resp = requests.get(OPEN_DATA_PAGE, timeout=30)
-        resp.raise_for_status()
-        pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"'
-        urls = re.findall(pattern, resp.text, re.IGNORECASE)
-        if not urls:
-            raise RuntimeError("No download link found on Parent View open data page")
-        # urljoin leaves absolute links untouched and correctly resolves
-        # root-relative, page-relative and protocol-relative hrefs.
-        url = urljoin(OPEN_DATA_PAGE, urls[0])
-    except Exception as e:
-        # Chain the cause so the original traceback is not lost
-        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e
-
-    filename = url.split("/")[-1].split("?")[0]
-    dest_file = dest / filename
-
-    if dest_file.exists():
-        print(f"  ParentView: {filename} already exists, skipping download.")
-        return dest_file
-
-    print(f"  ParentView: downloading {url} ...")
-    # Stream into a temporary sibling and rename only on success, so that an
-    # interrupted download is never mistaken for a complete file by the
-    # exists() check above on the next run.
-    tmp_file = dest_file.with_name(dest_file.name + ".part")
-    try:
-        with requests.get(url, timeout=120, stream=True) as resp:
-            resp.raise_for_status()
-            with open(tmp_file, "wb") as f:
-                for chunk in resp.iter_content(chunk_size=65536):
-                    f.write(chunk)
-        tmp_file.replace(dest_file)
-    finally:
-        # No-op after a successful rename; removes the partial file on failure
-        tmp_file.unlink(missing_ok=True)
-
-    print(f"  ParentView: saved {dest_file}")
-    return dest_file
-
-
-def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
-    """Sum 'Strongly agree' + 'Agree' percentages for a question.
-
-    Args:
-        row: One row of the Parent View table.
-        q_col_base: Column prefix of the question, e.g. ``"Q1"``.
-
-    Returns:
-        The summed percentage rounded to one decimal place, or ``None`` when
-        neither column holds a value or a value cannot be parsed as a number.
-        A genuine total of 0 is returned as ``0.0`` rather than ``None``, so it
-        stays distinguishable from missing data.
-    """
-    # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
-    # The capitalisation of "agree" varies; take the first variant holding a value.
-    strongly = None
-    for label in ("Strongly agree %", "Strongly Agree %"):
-        candidate = row.get(f"{q_col_base} - {label}")
-        if pd.notna(candidate):
-            strongly = candidate
-            break
-    agree = row.get(f"{q_col_base} - Agree %")
-
-    present = [value for value in (strongly, agree) if pd.notna(value)]
-    if not present:
-        return None
-    try:
-        return round(sum(float(value) for value in present), 1)
-    except (TypeError, ValueError):
-        # Non-numeric marker in either column: treat the question as unavailable
-        return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Upsert Parent View results into ``ofsted_parent_view``, keyed on URN.
-
-    Args:
-        path: File to load. A name ending in ".xlsx" is read with
-            ``pd.read_excel``; anything else is read as a latin-1 CSV. When
-            omitted, the last entry of "sorted *.xlsx followed by sorted *.csv"
-            in the destination directory is used, so any CSV present takes
-            precedence over every XLSX.
-        data_dir: Optional base directory; files are looked up in
-            ``data_dir/supplementary/parent_view``. Falls back to ``DEST_DIR``.
-
-    Returns:
-        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` counts every
-        executed upsert (new and existing rows alike).
-
-    Raises:
-        FileNotFoundError: If no path is given and no candidate file exists.
-        ValueError: If the file has no URN column.
-    """
-    if path is None:
-        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No Parent View file found in {dest}")
-        path = files[-1]
-
-    print(f"  ParentView: loading {path} ...")
-
-    if str(path).endswith(".xlsx"):
-        df = pd.read_excel(path)
-    else:
-        df = pd.read_csv(path, encoding="latin-1", low_memory=False)
-
-    # Normalise URN column
-    # (header match ignores case and surrounding whitespace; rows whose URN is
-    # not numeric are dropped)
-    urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None)
-    if not urn_col:
-        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
-    df.rename(columns={urn_col: "urn"}, inplace=True)
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    # Try to find total responses column
-    # (first header containing both "total" and "respon", case-insensitive)
-    resp_col = next((c for c in df.columns if "total" in c.lower() and "respon" in c.lower()), None)
-
-    inserted = 0
-    # survey_date is stamped with the day of the load, not a date from the file
-    today = date.today()
-
-    # NOTE(review): get_session() presumably commits when the block exits — confirm in db.py
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            total = int(row[resp_col]) if resp_col and pd.notna(row.get(resp_col)) else None
-
-            # Try to extract % positive per question from wide-format columns
-            # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year)
-            # Q6 is not read here, and q_sen_pct is always written as NULL.
-            record = {
-                "urn": urn,
-                "survey_date": today,
-                "total_responses": total,
-                "q_happy_pct": _positive_pct(row, "Q1"),
-                "q_safe_pct": _positive_pct(row, "Q2"),
-                "q_behaviour_pct": _positive_pct(row, "Q3"),
-                "q_bullying_pct": _positive_pct(row, "Q4"),
-                "q_communication_pct": _positive_pct(row, "Q5"),
-                "q_progress_pct": _positive_pct(row, "Q7"),
-                "q_teaching_pct": _positive_pct(row, "Q8"),
-                "q_information_pct": _positive_pct(row, "Q9"),
-                "q_curriculum_pct": _positive_pct(row, "Q10"),
-                "q_future_pct": _positive_pct(row, "Q11"),
-                "q_leadership_pct": _positive_pct(row, "Q12"),
-                "q_wellbeing_pct": _positive_pct(row, "Q13"),
-                "q_recommend_pct": _positive_pct(row, "Q14"),
-                "q_sen_pct": None,
-            }
-
-            # Upsert: an existing row for the URN has every column overwritten
-            session.execute(
-                text("""
-                    INSERT INTO ofsted_parent_view
-                        (urn, survey_date, total_responses,
-                         q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
-                         q_communication_pct, q_progress_pct, q_teaching_pct,
-                         q_information_pct, q_curriculum_pct, q_future_pct,
-                         q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
-                    VALUES
-                        (:urn, :survey_date, :total_responses,
-                         :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
-                         :q_communication_pct, :q_progress_pct, :q_teaching_pct,
-                         :q_information_pct, :q_curriculum_pct, :q_future_pct,
-                         :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
-                    ON CONFLICT (urn) DO UPDATE SET
-                        survey_date = EXCLUDED.survey_date,
-                        total_responses = EXCLUDED.total_responses,
-                        q_happy_pct = EXCLUDED.q_happy_pct,
-                        q_safe_pct = EXCLUDED.q_safe_pct,
-                        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
-                        q_bullying_pct = EXCLUDED.q_bullying_pct,
-                        q_communication_pct = EXCLUDED.q_communication_pct,
-                        q_progress_pct = EXCLUDED.q_progress_pct,
-                        q_teaching_pct = EXCLUDED.q_teaching_pct,
-                        q_information_pct = EXCLUDED.q_information_pct,
-                        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
-                        q_future_pct = EXCLUDED.q_future_pct,
-                        q_leadership_pct = EXCLUDED.q_leadership_pct,
-                        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
-                        q_recommend_pct = EXCLUDED.q_recommend_pct,
-                        q_sen_pct = EXCLUDED.q_sen_pct
-                """),
-                record,
-            )
-            inserted += 1
-            # Flush every 2000 statements
-            if inserted % 2000 == 0:
-                session.flush()
-
-    print(f"  ParentView: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    # Command-line entry point: run the download step, the load step, or both.
-    cli = argparse.ArgumentParser()
-    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
-    cli.add_argument("--data-dir", type=Path, default=None)
-    opts = cli.parse_args()
-    should_download = opts.action in ("download", "all")
-    should_load = opts.action in ("load", "all")
-
-    if should_download:
-        download(opts.data_dir)
-    if should_load:
-        load(data_dir=opts.data_dir)
@@ -1,132 +0,0 @@
-"""
-Phonics Screening Check downloader and loader.
-
-Source: EES publication "phonics-screening-check-and-key-stage-1-assessments-england"
-Update: Annual (September/October)
-"""
-import argparse
-import sys
-from pathlib import Path
-
-import pandas as pd
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-from sources.ees import get_latest_csv_url, download_csv
-
-# Default directory for phonics CSVs, under the shared supplementary data root.
-DEST_DIR = SUPPLEMENTARY_DIR / "phonics"
-# Publication identifier passed to get_latest_csv_url() by download().
-PUBLICATION_SLUG = "phonics-screening-check-and-key-stage-1-assessments-england"
-
-# Known column names in the phonics CSV (vary by year)
-# Several source headers map to one internal name; load() applies this mapping
-# with DataFrame.rename.
-# NOTE(review): a file carrying two variants of the same field (e.g. both PPTA1
-# and PPTA1B) would presumably end up with duplicate column names — confirm.
-COLUMN_MAP = {
-    "URN": "urn",
-    "urn": "urn",
-    # Year 1 pass rate
-    "PPTA1": "year1_phonics_pct",          # % meeting expected standard Y1
-    "PPTA1B": "year1_phonics_pct",
-    "PT_MET_PHON_Y1": "year1_phonics_pct",
-    "Y1_MET_EXPECTED_PCT": "year1_phonics_pct",
-    # Year 2 (re-takers)
-    "PPTA2": "year2_phonics_pct",
-    "PT_MET_PHON_Y2": "year2_phonics_pct",
-    "Y2_MET_EXPECTED_PCT": "year2_phonics_pct",
-    # Year label
-    "YEAR": "year",
-    "Year": "year",
-}
-
-# Markers that _parse_pct() treats as "no value". It strips, upper-cases and
-# removes "%" before comparing, so the entries here are upper-case.
-NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the latest phonics CSV located via ``get_latest_csv_url``.
-
-    Args:
-        data_dir: Optional base directory; the file is stored in
-            ``data_dir/supplementary/phonics``. Falls back to ``DEST_DIR``.
-
-    Returns:
-        Whatever ``download_csv`` returns for the target file (annotated as a Path).
-
-    Raises:
-        RuntimeError: If no CSV URL is found for the publication.
-    """
-    dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
-    dest.mkdir(parents=True, exist_ok=True)
-
-    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
-    if not url:
-        # Plain literal: the message has no placeholders, so no f-prefix is needed
-        raise RuntimeError("Could not find CSV URL for phonics publication")
-
-    # Last path segment without the query string; fixed name if the URL ends in "/"
-    filename = url.split("/")[-1].split("?")[0] or "phonics_latest.csv"
-    return download_csv(url, dest / filename)
-
-
-def _parse_pct(val) -> float | None:
-    """Convert a raw percentage cell to a float.
-
-    Returns ``None`` for missing values, for the markers in ``NULL_VALUES``
-    (compared after stripping, upper-casing and removing "%"), and for text
-    that is not a number.
-    """
-    if not pd.isna(val):
-        cleaned = str(val).strip().upper().replace("%", "")
-        if cleaned not in NULL_VALUES:
-            try:
-                return float(cleaned)
-            except ValueError:
-                pass
-    return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Upsert phonics results into the ``phonics`` table, keyed on (urn, year).
-
-    Args:
-        path: CSV file to load. When omitted, the last file (by sorted name)
-            matching ``*.csv`` in the destination directory is used.
-        data_dir: Optional base directory; files are looked up in
-            ``data_dir/supplementary/phonics``. Falls back to ``DEST_DIR``.
-
-    Returns:
-        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` counts every
-        executed upsert. Rows skipped for lack of a year are not counted.
-
-    Raises:
-        FileNotFoundError: If no path is given and no CSV exists.
-        ValueError: If the file has no URN column after renaming.
-    """
-    if path is None:
-        dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No phonics CSV found in {dest}")
-        path = files[-1]
-
-    print(f"  Phonics: loading {path} ...")
-    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
-    df.rename(columns=COLUMN_MAP, inplace=True)
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    # Drop rows whose URN is not numeric, then store URNs as integers
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    # Infer year from filename if not in data
-    # (first "20xx" found in the file stem; used only as a per-row fallback)
-    year = None
-    import re
-    m = re.search(r"20(\d{2})", path.stem)
-    if m:
-        year = int("20" + m.group(1))
-
-    inserted = 0
-    # NOTE(review): get_session() presumably commits when the block exits — confirm in db.py
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            # Prefer the row's own year; fall back to the filename year
-            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
-            if not row_year:
-                # No usable year for this row: skip it (not reflected in "skipped")
-                continue
-
-            session.execute(
-                text("""
-                    INSERT INTO phonics (urn, year, year1_phonics_pct, year2_phonics_pct)
-                    VALUES (:urn, :year, :y1, :y2)
-                    ON CONFLICT (urn, year) DO UPDATE SET
-                        year1_phonics_pct = EXCLUDED.year1_phonics_pct,
-                        year2_phonics_pct = EXCLUDED.year2_phonics_pct
-                """),
-                {
-                    "urn": urn,
-                    "year": row_year,
-                    "y1": _parse_pct(row.get("year1_phonics_pct")),
-                    "y2": _parse_pct(row.get("year2_phonics_pct")),
-                },
-            )
-            inserted += 1
-            # Flush every 5000 statements
-            if inserted % 5000 == 0:
-                session.flush()
-
-    print(f"  Phonics: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    # Script entry point; --action selects which of the two steps to run.
-    arg_parser = argparse.ArgumentParser()
-    arg_parser.add_argument("--action", choices=["download", "load", "all"], default="all")
-    arg_parser.add_argument("--data-dir", type=Path, default=None)
-    cli_args = arg_parser.parse_args()
-    selected = cli_args.action
-    base_dir = cli_args.data_dir
-
-    if selected in ("download", "all"):
-        download(base_dir)
-    if selected in ("load", "all"):
-        load(data_dir=base_dir)
@@ -1,150 +0,0 @@
-"""
-SEN (Special Educational Needs) primary need type breakdown.
-
-Source: EES publication "special-educational-needs-in-england"
-Update: Annual (September)
-"""
-import argparse
-import re
-import sys
-from pathlib import Path
-
-import pandas as pd
-
-sys.path.insert(0, str(Path(__file__).parent.parent))
-from config import SUPPLEMENTARY_DIR
-from db import get_session
-from sources.ees import get_latest_csv_url, download_csv
-
-# Default directory for SEN CSVs, under the shared supplementary data root.
-DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
-# Publication identifier passed to get_latest_csv_url() by download().
-PUBLICATION_SLUG = "special-educational-needs-in-england"
-
-# Markers that _parse_pct() treats as "no value". It strips, upper-cases and
-# removes "%" before comparing, so the entries here are upper-case.
-NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}
-
-# Source header -> internal column name, applied by load() with DataFrame.rename.
-# Two naming schemes are accepted for each primary-need field.
-# NOTE(review): a file carrying both schemes for one field would presumably end
-# up with duplicate column names — confirm.
-COLUMN_MAP = {
-    "URN": "urn",
-    "urn": "urn",
-    "YEAR": "year",
-    "Year": "year",
-    # Primary need types — DfE abbreviated codes
-    "PT_SPEECH": "primary_need_speech_pct",       # SLCN
-    "PT_ASD": "primary_need_autism_pct",           # ASD
-    "PT_MLD": "primary_need_mld_pct",             # Moderate learning difficulty
-    "PT_SPLD": "primary_need_spld_pct",           # Specific learning difficulty
-    "PT_SEMH": "primary_need_semh_pct",           # Social, emotional, mental health
-    "PT_PHYSICAL": "primary_need_physical_pct",   # Physical/sensory
-    "PT_OTHER": "primary_need_other_pct",
-    # Alternative naming
-    "SLCN_PCT": "primary_need_speech_pct",
-    "ASD_PCT": "primary_need_autism_pct",
-    "MLD_PCT": "primary_need_mld_pct",
-    "SPLD_PCT": "primary_need_spld_pct",
-    "SEMH_PCT": "primary_need_semh_pct",
-    "PHYSICAL_PCT": "primary_need_physical_pct",
-    "OTHER_PCT": "primary_need_other_pct",
-}
-
-
-def download(data_dir: Path | None = None) -> Path:
-    """Download the latest SEN CSV located via ``get_latest_csv_url``.
-
-    The lookup is tried with ``keyword="school"`` first and, if that yields
-    nothing, again without a keyword. The file is stored in
-    ``data_dir/supplementary/sen_detail`` (or ``DEST_DIR`` when no ``data_dir``
-    is given) and the result of ``download_csv`` is returned.
-
-    Raises:
-        RuntimeError: If neither lookup produces a URL.
-    """
-    if data_dir:
-        target_dir = data_dir / "supplementary" / "sen_detail"
-    else:
-        target_dir = DEST_DIR
-    target_dir.mkdir(parents=True, exist_ok=True)
-
-    csv_url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school") or get_latest_csv_url(PUBLICATION_SLUG)
-    if not csv_url:
-        raise RuntimeError("Could not find CSV URL for SEN publication")
-
-    # Last path segment without the query string; fixed name if that is empty
-    last_segment = csv_url.split("/")[-1]
-    basename = last_segment.split("?")[0]
-    if not basename:
-        basename = "sen_latest.csv"
-    return download_csv(csv_url, target_dir / basename)
-
-
-def _parse_pct(val) -> float | None:
-    """Parse a percentage cell into a float, or ``None`` when unavailable.
-
-    Missing values, the markers listed in ``NULL_VALUES`` (matched after
-    stripping, upper-casing and dropping "%") and non-numeric text all give
-    ``None``.
-    """
-    if pd.isna(val):
-        return None
-    token = str(val).strip().upper().replace("%", "")
-    try:
-        return None if token in NULL_VALUES else float(token)
-    except ValueError:
-        return None
-
-
-def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
-    """Upsert SEN primary-need percentages into ``sen_detail``, keyed on (urn, year).
-
-    Args:
-        path: CSV file to load. When omitted, the last file (by sorted name)
-            matching ``*.csv`` in the destination directory is used.
-        data_dir: Optional base directory; files are looked up in
-            ``data_dir/supplementary/sen_detail``. Falls back to ``DEST_DIR``.
-
-    Returns:
-        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` counts every
-        executed upsert. Rows skipped for lack of a year are not counted.
-
-    Raises:
-        FileNotFoundError: If no path is given and no CSV exists.
-        ValueError: If the file has no URN column after renaming.
-    """
-    if path is None:
-        dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
-        files = sorted(dest.glob("*.csv"))
-        if not files:
-            raise FileNotFoundError(f"No SEN CSV found in {dest}")
-        path = files[-1]
-
-    print(f"  SEN Detail: loading {path} ...")
-    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
-    df.rename(columns=COLUMN_MAP, inplace=True)
-
-    if "urn" not in df.columns:
-        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")
-
-    # Drop rows whose URN is not numeric, then store URNs as integers
-    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
-    df = df.dropna(subset=["urn"])
-    df["urn"] = df["urn"].astype(int)
-
-    # Fallback year: first "20xx" found in the file stem
-    year = None
-    m = re.search(r"20(\d{2})", path.stem)
-    if m:
-        year = int("20" + m.group(1))
-
-    inserted = 0
-    # NOTE(review): get_session() presumably commits when the block exits — confirm in db.py
-    with get_session() as session:
-        from sqlalchemy import text
-        for _, row in df.iterrows():
-            urn = int(row["urn"])
-            # Prefer the row's own year; fall back to the filename year
-            row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year
-            if not row_year:
-                # No usable year for this row: skip it (not reflected in "skipped")
-                continue
-
-            session.execute(
-                text("""
-                    INSERT INTO sen_detail
-                        (urn, year, primary_need_speech_pct, primary_need_autism_pct,
-                         primary_need_mld_pct, primary_need_spld_pct, primary_need_semh_pct,
-                         primary_need_physical_pct, primary_need_other_pct)
-                    VALUES (:urn, :year, :speech, :autism, :mld, :spld, :semh, :physical, :other)
-                    ON CONFLICT (urn, year) DO UPDATE SET
-                        primary_need_speech_pct   = EXCLUDED.primary_need_speech_pct,
-                        primary_need_autism_pct   = EXCLUDED.primary_need_autism_pct,
-                        primary_need_mld_pct      = EXCLUDED.primary_need_mld_pct,
-                        primary_need_spld_pct     = EXCLUDED.primary_need_spld_pct,
-                        primary_need_semh_pct     = EXCLUDED.primary_need_semh_pct,
-                        primary_need_physical_pct = EXCLUDED.primary_need_physical_pct,
-                        primary_need_other_pct    = EXCLUDED.primary_need_other_pct
-                """),
-                {
-                    "urn": urn, "year": row_year,
-                    "speech": _parse_pct(row.get("primary_need_speech_pct")),
-                    "autism": _parse_pct(row.get("primary_need_autism_pct")),
-                    "mld": _parse_pct(row.get("primary_need_mld_pct")),
-                    "spld": _parse_pct(row.get("primary_need_spld_pct")),
-                    "semh": _parse_pct(row.get("primary_need_semh_pct")),
-                    "physical": _parse_pct(row.get("primary_need_physical_pct")),
-                    "other": _parse_pct(row.get("primary_need_other_pct")),
-                },
-            )
-            inserted += 1
-            # Flush every 5000 statements
-            if inserted % 5000 == 0:
-                session.flush()
-
-    print(f"  SEN Detail: upserted {inserted} records")
-    return {"inserted": inserted, "updated": 0, "skipped": 0}
-
-
-if __name__ == "__main__":
-    # CLI wrapper around download() and load(); argparse restricts --action to
-    # "download", "load" or "all", so each step runs unless the other one was
-    # requested exclusively.
-    cli = argparse.ArgumentParser()
-    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
-    cli.add_argument("--data-dir", type=Path, default=None)
-    options = cli.parse_args()
-
-    if options.action != "load":
-        download(options.data_dir)
-    if options.action != "download":
-        load(data_dir=options.data_dir)
@@ -1,70 +0,0 @@
-"""
-Data integrator HTTP server.
-Kestra calls this server via HTTP tasks to trigger download/load operations.
-"""
-import importlib
-import sys
-import traceback
-from pathlib import Path
-
-from fastapi import FastAPI, HTTPException
-from fastapi.responses import JSONResponse
-
-# Put the scripts directory first on the import path; the handlers below import
-# "sources.<name>" modules, presumably from this directory — confirm the layout.
-sys.path.insert(0, "/app/scripts")
-
-app = FastAPI(title="SchoolCompare Data Integrator", version="1.0.0")
-
-# Source names accepted by /run/{source} and iterated by /run-all; each one is
-# imported on demand as the module sources.<name>.
-SOURCES = {
-    "ofsted", "gias", "parent_view",
-    "census", "admissions", "sen_detail",
-    "phonics", "idaci", "finance", "ks2",
-}
-
-
-@app.get("/health")
-def health():
-    """Health endpoint: always answers with a static OK payload."""
-    payload = {"status": "ok"}
-    return payload
-
-
-@app.post("/run/{source}")
-def run_source(source: str, action: str = "all"):
-    """
-    Run the download and/or load step of a single data source.
-
-    source: one of the names in SOURCES (404 otherwise)
-    action: "download" | "load" | "all" (400 otherwise)
-
-    Returns the source, the action and the value returned by the source's
-    load() (an empty dict when only a download was requested). Any exception
-    raised while importing or running the source is reported as a 500 whose
-    detail carries the error message and the traceback.
-    """
-    if source not in SOURCES:
-        raise HTTPException(status_code=404, detail=f"Unknown source '{source}'. Available: {sorted(SOURCES)}")
-
-    do_download = action in ("download", "all")
-    do_load = action in ("load", "all")
-    if not (do_download or do_load):
-        raise HTTPException(status_code=400, detail="action must be 'download', 'load', or 'all'")
-
-    try:
-        module = importlib.import_module(f"sources.{source}")
-        if do_download:
-            module.download()
-        outcome = module.load() if do_load else {}
-        return {"source": source, "action": action, "result": outcome}
-    except Exception as e:
-        raise HTTPException(
-            status_code=500,
-            detail={"error": str(e), "traceback": traceback.format_exc()},
-        )
-
-
-@app.post("/run-all")
-def run_all(action: str = "all"):
-    """
-    Trigger all sources in sequence (sorted by name).
-    action: "download" | "load" | "all"
-
-    Returns a dict keyed by source name: the value returned by the source's
-    load() on success, or {"error": "<message>"} if the source raised. A failing
-    source does not stop the remaining ones. Sources that only downloaded
-    successfully have no entry.
-    """
-    # Same validation as /run/{source}: without it an unknown action would run
-    # nothing and still return an empty, success-looking result.
-    if action not in ("download", "load", "all"):
-        raise HTTPException(status_code=400, detail="action must be 'download', 'load', or 'all'")
-
-    results = {}
-    for source in sorted(SOURCES):
-        try:
-            mod = importlib.import_module(f"sources.{source}")
-            if action in ("download", "all"):
-                mod.download()
-            if action in ("load", "all"):
-                results[source] = mod.load()
-        except Exception as e:
-            # Best-effort batch: record the failure and carry on with the next source
-            results[source] = {"error": str(e)}
-    return results