"""
GIAS (Get Information About Schools) bulk CSV downloader and loader.

Source: https://get-information-schools.service.gov.uk/Downloads
Update: Daily; we refresh weekly.
Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision
"""
import argparse
import sys
from datetime import date
from pathlib import Path

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

# Default directory for GIAS CSV snapshots; download() and load() use it when
# no data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "gias"

# GIAS bulk download URL — download() fills {date} with a YYYYMMDD stamp at runtime
GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv"

# Maps GIAS CSV header names to the column names load() works with.
# The head_* and nursery_provision_raw entries are intermediate values that
# load() combines/converts into headteacher_name and nursery_provision.
# NOTE(review): DataFrame.rename silently skips headers that are absent from
# the CSV — confirm these names against a current GIAS extract.
COLUMN_MAP = {
    "URN": "urn",
    "SchoolWebsite": "website",
    "SchoolCapacity": "capacity",
    "TrustName": "trust_name",
    "TrustUID": "trust_uid",
    "Gender (name)": "gender",
    "NurseryProvision (name)": "nursery_provision_raw",
    "HeadTitle": "head_title",
    "HeadFirstName": "head_first",
    "HeadLastName": "head_last",
}


def _stream_to_file(resp, dest_file: Path) -> None:
    """Write a streamed response body to *dest_file* via a temporary ``.part`` file.

    The body is written next to the destination and renamed into place only
    once fully received, so an interrupted transfer never leaves a truncated
    ``gias_*.csv`` that later runs would mistake for a finished download.
    """
    tmp_file = dest_file.with_name(dest_file.name + ".part")
    try:
        with open(tmp_file, "wb") as f:
            for chunk in resp.iter_content(chunk_size=65536):
                f.write(chunk)
        tmp_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the partial file on failure.
        tmp_file.unlink(missing_ok=True)


def download(data_dir: Path | None = None) -> Path:
    """Download the GIAS bulk CSV for today, falling back to yesterday on a 404.

    Args:
        data_dir: Optional data root; files go to ``data_dir/supplementary/gias``.
            When omitted, ``DEST_DIR`` is used.

    Returns:
        Path of the dated CSV (``gias_YYYYMMDD.csv``). If that file already
        exists locally it is returned without any network request.

    Raises:
        requests.HTTPError: If the server returns an error status (including a
            404 for yesterday's file).
        requests.RequestException: On connection problems or timeouts.
    """
    from datetime import timedelta

    dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Evaluate "today" once so both candidates derive from the same date even
    # if the run straddles midnight.
    today = date.today()
    # GIAS may not have today's file yet — fall back to yesterday
    candidates = (today, today - timedelta(days=1))

    for attempt, day in enumerate(candidates):
        stamp = day.strftime("%Y%m%d")
        filename = f"gias_{stamp}.csv"
        dest_file = dest / filename

        if dest_file.exists():
            print(f"  GIAS: {filename} already exists, skipping download.")
            return dest_file

        url = GIAS_URL_TEMPLATE.format(date=stamp)
        print(f"  GIAS: downloading {url} ...")
        is_last = attempt == len(candidates) - 1

        # The context manager releases the streamed connection on every path,
        # including the 404 fallback.
        with requests.get(url, timeout=300, stream=True) as resp:
            if resp.status_code == 404 and not is_last:
                continue
            resp.raise_for_status()
            _stream_to_file(resp, dest_file)

        print(f"  GIAS: saved {dest_file} ({dest_file.stat().st_size // 1024} KB)")
        return dest_file

    # Defensive: unreachable because the last candidate either returns or raises.
    raise RuntimeError("GIAS: no download candidate was attempted")


_HEAD_NAME_COLUMNS = ("head_title", "head_first", "head_last")


def _clean_str(val) -> str | None:
    """Return *val* as a stripped string, or None if missing, blank, "nan" or "none"."""
    if pd.isna(val):
        return None
    s = str(val).strip()
    return s if s and s.lower() not in ("nan", "none") else None


def _to_int(val) -> int | None:
    """Parse a whole-number cell ("210" or a float-rendered "210.0") to int, else None."""
    s = _clean_str(val)
    if s is None:
        return None
    # Tolerate values that pandas rendered as floats because of blanks in the column.
    if s.endswith(".0"):
        s = s[:-2]
    return int(s) if s.isascii() and s.isdigit() else None


def _build_head_name(row: dict) -> str | None:
    """Join the non-empty title / first / last name parts; None when all are missing."""
    parts = (_clean_str(row.get(col)) for col in _HEAD_NAME_COLUMNS)
    return " ".join(p for p in parts if p) or None


def _parse_nursery(val) -> bool | None:
    """Return True when the value starts with "has", False for other text, None if missing."""
    if pd.isna(val):
        return None
    return str(val).strip().lower().startswith("has")


def _row_params(row: dict) -> dict:
    """Build the bind parameters for the UPDATE statement from one CSV record."""
    return {
        "urn": int(row["urn"]),
        "website": _clean_str(row.get("website")),
        "headteacher_name": _build_head_name(row),
        "capacity": _to_int(row.get("capacity")),
        "trust_name": _clean_str(row.get("trust_name")),
        "trust_uid": _clean_str(row.get("trust_uid")),
        "gender": _clean_str(row.get("gender")),
        "nursery_provision": _parse_nursery(row.get("nursery_provision_raw")),
    }


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a GIAS CSV and update the matching rows of the ``schools`` table.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``gias_*.csv`` in the GIAS directory is used (the newest, given
            the ``gias_YYYYMMDD.csv`` naming).
        data_dir: Optional data root; the GIAS directory is then
            ``data_dir/supplementary/gias`` instead of ``DEST_DIR``.

    Returns:
        ``{"inserted": 0, "updated": n, "skipped": 0}`` where ``n`` is the
        number of UPDATE statements issued (one per CSV row with a numeric
        URN), not the number of rows the database actually matched.

    Raises:
        FileNotFoundError: If no path is given and no GIAS CSV is found.
        ValueError: If the CSV has no ``URN`` column.
    """
    if path is None:
        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        path = files[-1]

    print(f"  GIAS: loading {path} ...")
    # Read every column as text: otherwise numeric columns containing blanks are
    # inferred as float, turning "210" into 210.0 (which then fails digit checks)
    # and identifiers such as the trust UID into "1234.0".
    df = pd.read_csv(path, encoding="latin-1", low_memory=False, dtype=str)
    df = df.rename(columns=COLUMN_MAP)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Columns absent from the CSV are written as NULL below; surface that loudly
    # so a changed header name upstream does not silently wipe data.
    missing = [src for src, dst in COLUMN_MAP.items() if dst not in df.columns]
    if missing:
        print(f"  GIAS: WARNING - columns not found in CSV, values will be NULL: {missing}")

    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])

    # Keep only the columns we actually use before materialising row dicts.
    wanted = [col for col in COLUMN_MAP.values() if col in df.columns]
    records = df[wanted].to_dict("records")

    from sqlalchemy import text

    # Built once; the statement is identical for every row.
    stmt = text("""
        UPDATE schools SET
            website            = :website,
            headteacher_name   = :headteacher_name,
            capacity           = :capacity,
            trust_name         = :trust_name,
            trust_uid          = :trust_uid,
            gender             = :gender,
            nursery_provision  = :nursery_provision
        WHERE urn = :urn
    """)

    updated = 0
    # NOTE(review): presumably get_session() commits on exit — confirm in db.py.
    with get_session() as session:
        for row in records:
            session.execute(stmt, _row_params(row))
            updated += 1
            if updated % 5000 == 0:
                session.flush()
                print(f"    Updated {updated} schools...")

    print(f"  GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": 0}


if __name__ == "__main__":
    # Command-line entry point: download the GIAS CSV, load it, or both.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    # Hand the file that was just downloaded straight to load() so "all" loads
    # exactly that file; with --action load it stays None and load() picks the
    # newest CSV on disk.
    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        load(path=downloaded, data_dir=args.data_dir)