integrator/scripts/sources/parent_view.py

"""
Ofsted Parent View open data downloader and loader.

Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path

import pandas as pd
import requests

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session

# Directory used for downloaded Parent View files when no data_dir is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
# Landing page that is scraped for the link to the open-data file.
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"

# Question column mapping — Parent View open data uses descriptive column headers.
# Each question is listed under its full wording and, where known, a short
# header variant; both map to the same internal field name.
# NOTE(review): nothing else in this module as shown references QUESTION_MAP —
# load() reads "Q<n> - ..." prefixed columns instead. Confirm whether it is
# used elsewhere before relying on it.
QUESTION_MAP = {
    # Q1 — happiness
    "My child is happy at this school": "q_happy_pct",
    "Happy": "q_happy_pct",
    # Q2 — safety
    "My child feels safe at this school": "q_safe_pct",
    "Safe": "q_safe_pct",
    # Q3 — pupil behaviour
    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
    "Well Behaved": "q_behaviour_pct",
    # Q4 — bullying dealt with (sometimes separate)
    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
    "Bullying": "q_bullying_pct",
    # Q5 — curriculum info
    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
    "Aware of learning": "q_communication_pct",
    # Q6 — concerns dealt with (shares q_communication_pct with Q5; no short
    # header variant is listed for this question)
    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
    # Q7 — child does well
    "My child does well at this school": "q_progress_pct",
    "Does well": "q_progress_pct",
    # Q8 — teaching
    "The teaching is good at this school": "q_teaching_pct",
    "Good teaching": "q_teaching_pct",
    # Q9 — progress info
    "I receive valuable information from the school about my child's progress": "q_information_pct",
    "Progress information": "q_information_pct",
    # Q10 — curriculum breadth
    "My child is taught a broad range of subjects": "q_curriculum_pct",
    "Broad subjects": "q_curriculum_pct",
    # Q11 — prepares for future
    "The school prepares my child well for the future": "q_future_pct",
    "Prepared for future": "q_future_pct",
    # Q12 — leadership
    "The school is led and managed effectively": "q_leadership_pct",
    "Led well": "q_leadership_pct",
    # Q13 — wellbeing
    "The school supports my child's wider personal development": "q_wellbeing_pct",
    "Personal development": "q_wellbeing_pct",
    # Q14 — recommendation
    "I would recommend this school to another parent": "q_recommend_pct",
    "Recommend": "q_recommend_pct",
}


def download(data_dir: Path | None = None) -> Path:
    """Fetch the Parent View open-data file unless it is already on disk.

    The open-data landing page is scraped for the first ``href`` ending in
    ``.xlsx``, ``.csv`` or ``.zip``. That file is streamed to a temporary
    ``*.part`` file and renamed into place only once the transfer finished,
    so an interrupted download is never mistaken for a complete one.

    Args:
        data_dir: Optional data root. When given, files are stored under
            ``<data_dir>/supplementary/parent_view``; otherwise ``DEST_DIR``
            is used.

    Returns:
        Path of the downloaded file, or of the existing file when a file
        with the same name is already present.

    Raises:
        RuntimeError: If the landing page cannot be fetched or exposes no
            download link.
        requests.RequestException: If downloading the file itself fails.
    """
    # Local import keeps this block self-contained; urllib is stdlib.
    from urllib.parse import urljoin, urlsplit

    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Scrape the open data page for the download link.
    try:
        page = requests.get(OPEN_DATA_PAGE, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e

    hrefs = re.findall(r'href="([^"]+\.(?:xlsx|csv|zip))"', page.text, re.IGNORECASE)
    if not hrefs:
        raise RuntimeError(
            "Could not discover Parent View download URL: "
            "No download link found on Parent View open data page"
        )

    # NOTE(review): load() only picks up *.xlsx / *.csv, so a .zip selected
    # here would not be loaded automatically.
    # urljoin resolves absolute, root-relative, page-relative and
    # protocol-relative links against the page that was actually served.
    url = urljoin(page.url or OPEN_DATA_PAGE, hrefs[0])

    filename = Path(urlsplit(url).path).name
    dest_file = dest / filename

    if dest_file.exists():
        print(f"  ParentView: {filename} already exists, skipping download.")
        return dest_file

    print(f"  ParentView: downloading {url} ...")
    partial = dest_file.with_name(dest_file.name + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(partial, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        # Rename only after the full body was written.
        partial.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the fragment on failure.
        partial.unlink(missing_ok=True)

    print(f"  ParentView: saved {dest_file}")
    return dest_file


def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
    """Sum 'Strongly agree' + 'Agree' percentages for a question."""
    # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
    strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
    agree = row.get(f"{q_col_base} - Agree %")
    try:
        total = 0.0
        if pd.notna(strongly):
            total += float(strongly)
        if pd.notna(agree):
            total += float(agree)
        return round(total, 1) if total > 0 else None
    except (TypeError, ValueError):
        return None


# Internal percentage column -> question prefix in the wide-format export.
# Q6 is not read; q_communication_pct is taken from Q5 only.
_QUESTION_PREFIXES = {
    "q_happy_pct": "Q1",
    "q_safe_pct": "Q2",
    "q_behaviour_pct": "Q3",
    "q_bullying_pct": "Q4",
    "q_communication_pct": "Q5",
    "q_progress_pct": "Q7",
    "q_teaching_pct": "Q8",
    "q_information_pct": "Q9",
    "q_curriculum_pct": "Q10",
    "q_future_pct": "Q11",
    "q_leadership_pct": "Q12",
    "q_wellbeing_pct": "Q13",
    "q_recommend_pct": "Q14",
}

# Number of records sent to the database per execute call.
_UPSERT_BATCH_SIZE = 2000

# Insert one row per URN; an existing row for the same URN is overwritten.
_UPSERT_SQL = """
    INSERT INTO ofsted_parent_view
        (urn, survey_date, total_responses,
         q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
         q_communication_pct, q_progress_pct, q_teaching_pct,
         q_information_pct, q_curriculum_pct, q_future_pct,
         q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
    VALUES
        (:urn, :survey_date, :total_responses,
         :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
         :q_communication_pct, :q_progress_pct, :q_teaching_pct,
         :q_information_pct, :q_curriculum_pct, :q_future_pct,
         :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
    ON CONFLICT (urn) DO UPDATE SET
        survey_date = EXCLUDED.survey_date,
        total_responses = EXCLUDED.total_responses,
        q_happy_pct = EXCLUDED.q_happy_pct,
        q_safe_pct = EXCLUDED.q_safe_pct,
        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
        q_bullying_pct = EXCLUDED.q_bullying_pct,
        q_communication_pct = EXCLUDED.q_communication_pct,
        q_progress_pct = EXCLUDED.q_progress_pct,
        q_teaching_pct = EXCLUDED.q_teaching_pct,
        q_information_pct = EXCLUDED.q_information_pct,
        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
        q_future_pct = EXCLUDED.q_future_pct,
        q_leadership_pct = EXCLUDED.q_leadership_pct,
        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
        q_recommend_pct = EXCLUDED.q_recommend_pct,
        q_sen_pct = EXCLUDED.q_sen_pct
"""


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert a Parent View export into the ``ofsted_parent_view`` table.

    Args:
        path: File to load (``.xlsx`` is read with ``read_excel``, anything
            else as latin-1 CSV). When omitted, the most recently modified
            ``*.xlsx`` / ``*.csv`` file in the destination directory is used.
        data_dir: Optional data root; the destination directory is
            ``<data_dir>/supplementary/parent_view`` or ``DEST_DIR``.

    Returns:
        ``{"inserted": <rows upserted>, "updated": 0, "skipped": 0}``.

    Raises:
        FileNotFoundError: If no file is given and none is found on disk.
        ValueError: If the file has no ``URN`` column.

    Note:
        ``survey_date`` is set to the day the load runs, not to a date read
        from the file.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
        files = [*dest.glob("*.xlsx"), *dest.glob("*.csv")]
        if not files:
            raise FileNotFoundError(f"No Parent View file found in {dest}")
        # Choose by modification time (name as tie-break) so that a stale CSV
        # cannot shadow a newer XLSX.
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))
    path = Path(path)

    print(f"  ParentView: loading {path} ...")

    if path.suffix.lower() == ".xlsx":
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # Normalise URN column; str() guards against non-string headers.
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # .copy() so the assignments below write to an independent frame.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Try to find total responses column.
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )
    if resp_col is not None:
        # Non-numeric cells become NaN (stored as NULL) instead of aborting.
        df[resp_col] = pd.to_numeric(df[resp_col], errors="coerce")

    if not any(str(c).endswith(" - Agree %") for c in df.columns):
        print("  ParentView: WARNING no 'Q<n> - Agree %' columns found; percentages will be NULL")

    upsert = text(_UPSERT_SQL)
    today = date.today()
    inserted = 0
    batch: list[dict] = []

    with get_session() as session:
        for _, row in df.iterrows():
            total = row[resp_col] if resp_col is not None else None
            record = {
                "urn": int(row["urn"]),
                "survey_date": today,
                "total_responses": int(total) if pd.notna(total) else None,
                "q_sen_pct": None,
            }
            # Percent positive per question from the wide-format columns.
            for field, prefix in _QUESTION_PREFIXES.items():
                record[field] = _positive_pct(row, prefix)

            batch.append(record)
            if len(batch) >= _UPSERT_BATCH_SIZE:
                # A list of parameter dicts is executed as one batch.
                session.execute(upsert, batch)
                inserted += len(batch)
                batch = []

        if batch:
            session.execute(upsert, batch)
            inserted += len(batch)

    print(f"  ParentView: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}


if __name__ == "__main__":
    # Command-line entry point: --action selects download, load, or both
    # (the default); --data-dir optionally overrides the data root.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    options = cli.parse_args()

    # argparse restricts --action to the three choices above, so "not load"
    # means download/all and "not download" means load/all.
    wants_download = options.action != "load"
    wants_load = options.action != "download"

    if wants_download:
        download(options.data_dir)
    if wants_load:
        load(data_dir=options.data_dir)
-												feat(data): integrate 9 UK government data sources via Kestra

Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>

											
										
										
											2026-03-24 11:44:04 +00:00
+								"""
 								Ofsted Parent View open data downloader and loader.
 								Source: https://parentview.ofsted.gov.uk/open-data
 								Update: ~3 times/year (Spring, Autumn, Summer)
 								"""
 								import argparse
 								import re
 								import sys
 								from datetime import date, datetime
 								from pathlib import Path
 								import pandas as pd
 								import requests
 								sys.path.insert(0, str(Path(__file__).parent.parent))
 								from config import SUPPLEMENTARY_DIR
 								from db import get_session
 								DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
 								OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
 								# Question column mapping — Parent View open data uses descriptive column headers
 								# Map any variant to our internal field names
 								QUESTION_MAP = {
 								    # Q1 — happiness
 								    "My child is happy at this school": "q_happy_pct",
 								    "Happy": "q_happy_pct",
 								    # Q2 — safety
 								    "My child feels safe at this school": "q_safe_pct",
 								    "Safe": "q_safe_pct",
 								    # Q3 — bullying
 								    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
 								    "Well Behaved": "q_behaviour_pct",
 								    # Q4 — bullying dealt with (sometimes separate)
 								    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
 								    "Bullying": "q_bullying_pct",
 								    # Q5 — curriculum info
 								    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
 								    "Aware of learning": "q_communication_pct",
 								    # Q6 — concerns dealt with
 								    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
 								    # Q7 — child does well
 								    "My child does well at this school": "q_progress_pct",
 								    "Does well": "q_progress_pct",
 								    # Q8 — teaching
 								    "The teaching is good at this school": "q_teaching_pct",
 								    "Good teaching": "q_teaching_pct",
 								    # Q9 — progress info
 								    "I receive valuable information from the school about my child's progress": "q_information_pct",
 								    "Progress information": "q_information_pct",
 								    # Q10 — curriculum breadth
 								    "My child is taught a broad range of subjects": "q_curriculum_pct",
 								    "Broad subjects": "q_curriculum_pct",
 								    # Q11 — prepares for future
 								    "The school prepares my child well for the future": "q_future_pct",
 								    "Prepared for future": "q_future_pct",
 								    # Q12 — leadership
 								    "The school is led and managed effectively": "q_leadership_pct",
 								    "Led well": "q_leadership_pct",
 								    # Q13 — wellbeing
 								    "The school supports my child's wider personal development": "q_wellbeing_pct",
 								    "Personal development": "q_wellbeing_pct",
 								    # Q14 — recommendation
 								    "I would recommend this school to another parent": "q_recommend_pct",
 								    "Recommend": "q_recommend_pct",
 								}
 								def download(data_dir: Path | None = None) -> Path:
 								    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
 								    dest.mkdir(parents=True, exist_ok=True)
 								    # Scrape the open data page for the download link
 								    try:
 								        resp = requests.get(OPEN_DATA_PAGE, timeout=30)
 								        resp.raise_for_status()
 								        pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"'
 								        urls = re.findall(pattern, resp.text, re.IGNORECASE)
 								        if not urls:
 								            raise RuntimeError("No download link found on Parent View open data page")
 								        url = urls[0] if urls[0].startswith("http") else "https://parentview.ofsted.gov.uk" + urls[0]
 								    except Exception as e:
 								        raise RuntimeError(f"Could not discover Parent View download URL: {e}")
 								    filename = url.split("/")[-1].split("?")[0]
 								    dest_file = dest / filename
 								    if dest_file.exists():
 								        print(f"  ParentView: {filename} already exists, skipping download.")
 								        return dest_file
 								    print(f"  ParentView: downloading {url} ...")
 								    resp = requests.get(url, timeout=120, stream=True)
 								    resp.raise_for_status()
 								    with open(dest_file, "wb") as f:
 								        for chunk in resp.iter_content(chunk_size=65536):
 								            f.write(chunk)
 								    print(f"  ParentView: saved {dest_file}")
 								    return dest_file
 								def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
 								    """Sum 'Strongly agree' + 'Agree' percentages for a question."""
 								    # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
 								    strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
 								    agree = row.get(f"{q_col_base} - Agree %")
 								    try:
 								        total = 0.0
 								        if pd.notna(strongly):
 								            total += float(strongly)
 								        if pd.notna(agree):
 								            total += float(agree)
 								        return round(total, 1) if total > 0 else None
 								    except (TypeError, ValueError):
 								        return None
 								def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
 								    if path is None:
 								        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
 								        files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv"))
 								        if not files:
 								            raise FileNotFoundError(f"No Parent View file found in {dest}")
 								        path = files[-1]
 								    print(f"  ParentView: loading {path} ...")
 								    if str(path).endswith(".xlsx"):
 								        df = pd.read_excel(path)
 								    else:
 								        df = pd.read_csv(path, encoding="latin-1", low_memory=False)
 								    # Normalise URN column
 								    urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None)
 								    if not urn_col:
 								        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
 								    df.rename(columns={urn_col: "urn"}, inplace=True)
 								    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
 								    df = df.dropna(subset=["urn"])
 								    df["urn"] = df["urn"].astype(int)
 								    # Try to find total responses column
 								    resp_col = next((c for c in df.columns if "total" in c.lower() and "respon" in c.lower()), None)
 								    inserted = 0
 								    today = date.today()
 								    with get_session() as session:
 								        from sqlalchemy import text
 								        for _, row in df.iterrows():
 								            urn = int(row["urn"])
 								            total = int(row[resp_col]) if resp_col and pd.notna(row.get(resp_col)) else None
 								            # Try to extract % positive per question from wide-format columns
 								            # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year)
 								            record = {
 								                "urn": urn,
 								                "survey_date": today,
 								                "total_responses": total,
 								                "q_happy_pct": _positive_pct(row, "Q1"),
 								                "q_safe_pct": _positive_pct(row, "Q2"),
 								                "q_behaviour_pct": _positive_pct(row, "Q3"),
 								                "q_bullying_pct": _positive_pct(row, "Q4"),
 								                "q_communication_pct": _positive_pct(row, "Q5"),
 								                "q_progress_pct": _positive_pct(row, "Q7"),
 								                "q_teaching_pct": _positive_pct(row, "Q8"),
 								                "q_information_pct": _positive_pct(row, "Q9"),
 								                "q_curriculum_pct": _positive_pct(row, "Q10"),
 								                "q_future_pct": _positive_pct(row, "Q11"),
 								                "q_leadership_pct": _positive_pct(row, "Q12"),
 								                "q_wellbeing_pct": _positive_pct(row, "Q13"),
 								                "q_recommend_pct": _positive_pct(row, "Q14"),
 								                "q_sen_pct": None,
 								            }
 								            session.execute(
 								                text("""
 								                    INSERT INTO ofsted_parent_view
 								                        (urn, survey_date, total_responses,
 								                         q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
 								                         q_communication_pct, q_progress_pct, q_teaching_pct,
 								                         q_information_pct, q_curriculum_pct, q_future_pct,
 								                         q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
 								                    VALUES
 								                        (:urn, :survey_date, :total_responses,
 								                         :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
 								                         :q_communication_pct, :q_progress_pct, :q_teaching_pct,
 								                         :q_information_pct, :q_curriculum_pct, :q_future_pct,
 								                         :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
 								                    ON CONFLICT (urn) DO UPDATE SET
 								                        survey_date = EXCLUDED.survey_date,
 								                        total_responses = EXCLUDED.total_responses,
 								                        q_happy_pct = EXCLUDED.q_happy_pct,
 								                        q_safe_pct = EXCLUDED.q_safe_pct,
 								                        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
 								                        q_bullying_pct = EXCLUDED.q_bullying_pct,
 								                        q_communication_pct = EXCLUDED.q_communication_pct,
 								                        q_progress_pct = EXCLUDED.q_progress_pct,
 								                        q_teaching_pct = EXCLUDED.q_teaching_pct,
 								                        q_information_pct = EXCLUDED.q_information_pct,
 								                        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
 								                        q_future_pct = EXCLUDED.q_future_pct,
 								                        q_leadership_pct = EXCLUDED.q_leadership_pct,
 								                        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
 								                        q_recommend_pct = EXCLUDED.q_recommend_pct,
 								                        q_sen_pct = EXCLUDED.q_sen_pct
 								                """),
 								                record,
 								            )
 								            inserted += 1
 								            if inserted % 2000 == 0:
 								                session.flush()
 								    print(f"  ParentView: upserted {inserted} records")
 								    return {"inserted": inserted, "updated": 0, "skipped": 0}
 								if __name__ == "__main__":
 								    parser = argparse.ArgumentParser()
 								    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
 								    parser.add_argument("--data-dir", type=Path, default=None)
 								    args = parser.parse_args()
 								    if args.action in ("download", "all"):
 								        download(args.data_dir)
 								    if args.action in ("load", "all"):
 								        load(data_dir=args.data_dir)