# school_compare/integrator/scripts/sources/parent_view.py

"""
Ofsted Parent View open data downloader and loader.

Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

# Default directory where downloaded Parent View files are stored.
DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
# Landing page that is scraped for the link to the current data file.
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"

# Question column mapping: descriptive Parent View column headers (full question
# wording and short labels) mapped to our internal percentage field names.
# NOTE(review): in the code visible in this file, load() reads "Qn - ... %"
# columns directly and does not reference this mapping — confirm whether it is
# still used elsewhere.
QUESTION_MAP = {
    # Q1 — happiness
    "My child is happy at this school": "q_happy_pct",
    "Happy": "q_happy_pct",
    # Q2 — safety
    "My child feels safe at this school": "q_safe_pct",
    "Safe": "q_safe_pct",
    # Q3 — behaviour
    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
    "Well Behaved": "q_behaviour_pct",
    # Q4 — bullying dealt with (sometimes separate)
    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
    "Bullying": "q_bullying_pct",
    # Q5 — curriculum info
    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
    "Aware of learning": "q_communication_pct",
    # Q6 — concerns dealt with
    # NOTE(review): maps to the same field as Q5 (q_communication_pct) — confirm
    # this sharing is intended.
    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
    # Q7 — child does well
    "My child does well at this school": "q_progress_pct",
    "Does well": "q_progress_pct",
    # Q8 — teaching
    "The teaching is good at this school": "q_teaching_pct",
    "Good teaching": "q_teaching_pct",
    # Q9 — progress info
    "I receive valuable information from the school about my child's progress": "q_information_pct",
    "Progress information": "q_information_pct",
    # Q10 — curriculum breadth
    "My child is taught a broad range of subjects": "q_curriculum_pct",
    "Broad subjects": "q_curriculum_pct",
    # Q11 — prepares for future
    "The school prepares my child well for the future": "q_future_pct",
    "Prepared for future": "q_future_pct",
    # Q12 — leadership
    "The school is led and managed effectively": "q_leadership_pct",
    "Led well": "q_leadership_pct",
    # Q13 — wellbeing
    "The school supports my child's wider personal development": "q_wellbeing_pct",
    "Personal development": "q_wellbeing_pct",
    # Q14 — recommendation
    "I would recommend this school to another parent": "q_recommend_pct",
    "Recommend": "q_recommend_pct",
}


def download(data_dir: Path | None = None) -> Path:
    """Download the current Parent View open-data file unless it is already on disk.

    The open-data page is scraped for the first link ending in ``.xlsx``,
    ``.csv`` or ``.zip``; the file is stored under its remote file name.

    Args:
        data_dir: Optional data root. When given, files go to
            ``<data_dir>/supplementary/parent_view``; otherwise ``DEST_DIR``.

    Returns:
        Path of the downloaded (or already present) file.

    Raises:
        RuntimeError: If the open-data page cannot be fetched or contains no
            download link.
        requests.RequestException: If fetching the data file itself fails.
    """
    from urllib.parse import urljoin

    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Scrape the open data page for the download link.
    try:
        page = requests.get(OPEN_DATA_PAGE, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e

    pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"'
    urls = re.findall(pattern, page.text, re.IGNORECASE)
    if not urls:
        raise RuntimeError(
            "Could not discover Parent View download URL: "
            "No download link found on Parent View open data page"
        )
    # urljoin resolves absolute, root-relative, page-relative and
    # protocol-relative hrefs against the page URL.
    url = urljoin(OPEN_DATA_PAGE, urls[0])

    filename = url.split("/")[-1].split("?")[0]
    dest_file = dest / filename
    # NOTE(review): a ".zip" link is accepted here, but load() only looks for
    # "*.xlsx" / "*.csv" files — confirm whether archives need unpacking.

    if dest_file.exists():
        print(f"  ParentView: {filename} already exists, skipping download.")
        return dest_file

    print(f"  ParentView: downloading {url} ...")
    # Stream into a temporary ".part" file and rename only on success, so an
    # interrupted download never leaves a truncated file that a later run
    # would mistake for a complete one.
    tmp_file = dest_file.with_name(dest_file.name + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp_file, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        tmp_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the partial file on failure.
        tmp_file.unlink(missing_ok=True)

    print(f"  ParentView: saved {dest_file}")
    return dest_file


def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
    """Sum 'Strongly agree' + 'Agree' percentages for a question."""
    # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
    strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
    agree = row.get(f"{q_col_base} - Agree %")
    try:
        total = 0.0
        if pd.notna(strongly):
            total += float(strongly)
        if pd.notna(agree):
            total += float(agree)
        return round(total, 1) if total > 0 else None
    except (TypeError, ValueError):
        return None


# Destination percentage field -> question prefix used by the wide-format
# columns (e.g. "Q1 - Strongly agree %"). Q6 has no destination field here.
_PCT_FIELD_TO_QUESTION = {
    "q_happy_pct": "Q1",
    "q_safe_pct": "Q2",
    "q_behaviour_pct": "Q3",
    "q_bullying_pct": "Q4",
    "q_communication_pct": "Q5",
    "q_progress_pct": "Q7",
    "q_teaching_pct": "Q8",
    "q_information_pct": "Q9",
    "q_curriculum_pct": "Q10",
    "q_future_pct": "Q11",
    "q_leadership_pct": "Q12",
    "q_wellbeing_pct": "Q13",
    "q_recommend_pct": "Q14",
}

# Upsert keyed on urn: one row per school, overwritten on every load.
_PARENT_VIEW_UPSERT_SQL = """
    INSERT INTO ofsted_parent_view
        (urn, survey_date, total_responses,
         q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
         q_communication_pct, q_progress_pct, q_teaching_pct,
         q_information_pct, q_curriculum_pct, q_future_pct,
         q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
    VALUES
        (:urn, :survey_date, :total_responses,
         :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
         :q_communication_pct, :q_progress_pct, :q_teaching_pct,
         :q_information_pct, :q_curriculum_pct, :q_future_pct,
         :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
    ON CONFLICT (urn) DO UPDATE SET
        survey_date = EXCLUDED.survey_date,
        total_responses = EXCLUDED.total_responses,
        q_happy_pct = EXCLUDED.q_happy_pct,
        q_safe_pct = EXCLUDED.q_safe_pct,
        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
        q_bullying_pct = EXCLUDED.q_bullying_pct,
        q_communication_pct = EXCLUDED.q_communication_pct,
        q_progress_pct = EXCLUDED.q_progress_pct,
        q_teaching_pct = EXCLUDED.q_teaching_pct,
        q_information_pct = EXCLUDED.q_information_pct,
        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
        q_future_pct = EXCLUDED.q_future_pct,
        q_leadership_pct = EXCLUDED.q_leadership_pct,
        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
        q_recommend_pct = EXCLUDED.q_recommend_pct,
        q_sen_pct = EXCLUDED.q_sen_pct
"""


def _to_int_or_none(value) -> int | None:
    """Convert a cell to int, returning None for missing or non-numeric values."""
    if pd.isna(value):
        return None
    try:
        return int(float(value))
    except (TypeError, ValueError):
        # e.g. suppressed counts such as "<10"
        return None


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a Parent View file and upsert one row per URN into ofsted_parent_view.

    Args:
        path: File to load (``.xlsx`` is read with ``read_excel``, anything
            else as latin-1 CSV). When omitted, the most recently modified
            ``*.xlsx`` / ``*.csv`` file in the Parent View directory is used.
        data_dir: Optional data root; the directory searched is
            ``<data_dir>/supplementary/parent_view`` or ``DEST_DIR``.

    Returns:
        ``{"inserted": <rows upserted>, "updated": 0, "skipped": <rows dropped
        because their URN was missing or not numeric>}``.

    Raises:
        FileNotFoundError: If no path is given and no candidate file exists.
        ValueError: If the table has no URN column.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
        candidates = [*dest.glob("*.xlsx"), *dest.glob("*.csv")]
        if not candidates:
            raise FileNotFoundError(f"No Parent View file found in {dest}")
        # Pick the newest file regardless of extension (name breaks ties), so
        # a stale CSV cannot shadow a newer XLSX.
        path = max(candidates, key=lambda p: (p.stat().st_mtime, p.name))
    path = Path(path)

    print(f"  ParentView: loading {path} ...")

    if path.suffix.lower() == ".xlsx":
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # Normalise URN column (headers may be non-string, hence str()).
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
    rows_read = len(df)
    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = rows_read - len(df)

    # Try to find total responses column
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )

    upsert = text(_PARENT_VIEW_UPSERT_SQL)
    # The load date is stored as survey_date; the file's own survey period is
    # not read here.
    today = date.today()
    inserted = 0

    with get_session() as session:
        for _, row in df.iterrows():
            total = _to_int_or_none(row.get(resp_col)) if resp_col is not None else None
            record = {
                "urn": int(row["urn"]),
                "survey_date": today,
                "total_responses": total,
                # % positive per question from the wide-format "Qn - ..." columns
                **{
                    field: _positive_pct(row, question)
                    for field, question in _PCT_FIELD_TO_QUESTION.items()
                },
                "q_sen_pct": None,
            }
            session.execute(upsert, record)
            inserted += 1
            if inserted % 2000 == 0:
                session.flush()

    print(f"  ParentView: upserted {inserted} records")
    if skipped:
        print(f"  ParentView: skipped {skipped} rows without a valid URN")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    # Command-line entry point: run the download step, the load step, or both.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    # argparse restricts --action to the three choices above, so each step
    # runs unless the *other* single step was requested.
    wants_download = opts.action != "load"
    wants_load = opts.action != "download"

    if wants_download:
        download(opts.data_dir)
    if wants_load:
        load(data_dir=opts.data_dir)