""" Ofsted Parent View open data downloader and loader. Source: https://parentview.ofsted.gov.uk/open-data Update: ~3 times/year (Spring, Autumn, Summer) """ import argparse import re import sys from datetime import date, datetime from pathlib import Path import pandas as pd import requests sys.path.insert(0, str(Path(__file__).parent.parent)) from config import SUPPLEMENTARY_DIR from db import get_session DEST_DIR = SUPPLEMENTARY_DIR / "parent_view" OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data" # Question column mapping — Parent View open data uses descriptive column headers # Map any variant to our internal field names QUESTION_MAP = { # Q1 — happiness "My child is happy at this school": "q_happy_pct", "Happy": "q_happy_pct", # Q2 — safety "My child feels safe at this school": "q_safe_pct", "Safe": "q_safe_pct", # Q3 — bullying "The school makes sure its pupils are well behaved": "q_behaviour_pct", "Well Behaved": "q_behaviour_pct", # Q4 — bullying dealt with (sometimes separate) "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct", "Bullying": "q_bullying_pct", # Q5 — curriculum info "The school makes me aware of what my child will learn during the year": "q_communication_pct", "Aware of learning": "q_communication_pct", # Q6 — concerns dealt with "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct", # Q7 — child does well "My child does well at this school": "q_progress_pct", "Does well": "q_progress_pct", # Q8 — teaching "The teaching is good at this school": "q_teaching_pct", "Good teaching": "q_teaching_pct", # Q9 — progress info "I receive valuable information from the school about my child's progress": "q_information_pct", "Progress information": "q_information_pct", # Q10 — curriculum breadth "My child is taught a broad range of subjects": "q_curriculum_pct", "Broad subjects": "q_curriculum_pct", # Q11 — prepares for future "The school prepares my child well for the future": "q_future_pct", "Prepared for future": "q_future_pct", # Q12 — leadership "The school is led and managed effectively": "q_leadership_pct", "Led well": "q_leadership_pct", # Q13 — wellbeing "The school supports my child's wider personal development": "q_wellbeing_pct", "Personal development": "q_wellbeing_pct", # Q14 — recommendation "I would recommend this school to another parent": "q_recommend_pct", "Recommend": "q_recommend_pct", } def download(data_dir: Path | None = None) -> Path: dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR dest.mkdir(parents=True, exist_ok=True) # Scrape the open data page for the download link try: resp = requests.get(OPEN_DATA_PAGE, timeout=30) resp.raise_for_status() pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"' urls = re.findall(pattern, resp.text, re.IGNORECASE) if not urls: raise RuntimeError("No download link found on Parent View open data page") url = urls[0] if urls[0].startswith("http") else "https://parentview.ofsted.gov.uk" + urls[0] except Exception as e: raise RuntimeError(f"Could not discover Parent View download URL: {e}") filename = url.split("/")[-1].split("?")[0] dest_file = dest / filename if dest_file.exists(): print(f" ParentView: {filename} already exists, skipping download.") return dest_file print(f" ParentView: downloading {url} ...") resp = requests.get(url, timeout=120, stream=True) resp.raise_for_status() with open(dest_file, "wb") as f: for chunk in resp.iter_content(chunk_size=65536): f.write(chunk) print(f" ParentView: saved {dest_file}") return dest_file def _positive_pct(row: pd.Series, q_col_base: str) -> float | None: """Sum 'Strongly agree' + 'Agree' percentages for a question.""" # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %" strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %") agree = row.get(f"{q_col_base} - Agree %") try: total = 0.0 if pd.notna(strongly): total += float(strongly) if pd.notna(agree): total += float(agree) return round(total, 1) if total > 0 else None except (TypeError, ValueError): return None def load(path: Path | None = None, data_dir: Path | None = None) -> dict: if path is None: dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv")) if not files: raise FileNotFoundError(f"No Parent View file found in {dest}") path = files[-1] print(f" ParentView: loading {path} ...") if str(path).endswith(".xlsx"): df = pd.read_excel(path) else: df = pd.read_csv(path, encoding="latin-1", low_memory=False) # Normalise URN column urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None) if not urn_col: raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}") df.rename(columns={urn_col: "urn"}, inplace=True) df["urn"] = pd.to_numeric(df["urn"], errors="coerce") df = df.dropna(subset=["urn"]) df["urn"] = df["urn"].astype(int) # Try to find total responses column resp_col = next((c for c in df.columns if "total" in c.lower() and "respon" in c.lower()), None) inserted = 0 today = date.today() with get_session() as session: from sqlalchemy import text for _, row in df.iterrows(): urn = int(row["urn"]) total = int(row[resp_col]) if resp_col and pd.notna(row.get(resp_col)) else None # Try to extract % positive per question from wide-format columns # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year) record = { "urn": urn, "survey_date": today, "total_responses": total, "q_happy_pct": _positive_pct(row, "Q1"), "q_safe_pct": _positive_pct(row, "Q2"), "q_behaviour_pct": _positive_pct(row, "Q3"), "q_bullying_pct": _positive_pct(row, "Q4"), "q_communication_pct": _positive_pct(row, "Q5"), "q_progress_pct": _positive_pct(row, "Q7"), "q_teaching_pct": _positive_pct(row, "Q8"), "q_information_pct": _positive_pct(row, "Q9"), "q_curriculum_pct": _positive_pct(row, "Q10"), "q_future_pct": _positive_pct(row, "Q11"), "q_leadership_pct": _positive_pct(row, "Q12"), "q_wellbeing_pct": _positive_pct(row, "Q13"), "q_recommend_pct": _positive_pct(row, "Q14"), "q_sen_pct": None, } session.execute( text(""" INSERT INTO ofsted_parent_view (urn, survey_date, total_responses, q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct, q_communication_pct, q_progress_pct, q_teaching_pct, q_information_pct, q_curriculum_pct, q_future_pct, q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct) VALUES (:urn, :survey_date, :total_responses, :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct, :q_communication_pct, :q_progress_pct, :q_teaching_pct, :q_information_pct, :q_curriculum_pct, :q_future_pct, :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct) ON CONFLICT (urn) DO UPDATE SET survey_date = EXCLUDED.survey_date, total_responses = EXCLUDED.total_responses, q_happy_pct = EXCLUDED.q_happy_pct, q_safe_pct = EXCLUDED.q_safe_pct, q_behaviour_pct = EXCLUDED.q_behaviour_pct, q_bullying_pct = EXCLUDED.q_bullying_pct, q_communication_pct = EXCLUDED.q_communication_pct, q_progress_pct = EXCLUDED.q_progress_pct, q_teaching_pct = EXCLUDED.q_teaching_pct, q_information_pct = EXCLUDED.q_information_pct, q_curriculum_pct = EXCLUDED.q_curriculum_pct, q_future_pct = EXCLUDED.q_future_pct, q_leadership_pct = EXCLUDED.q_leadership_pct, q_wellbeing_pct = EXCLUDED.q_wellbeing_pct, q_recommend_pct = EXCLUDED.q_recommend_pct, q_sen_pct = EXCLUDED.q_sen_pct """), record, ) inserted += 1 if inserted % 2000 == 0: session.flush() print(f" ParentView: upserted {inserted} records") return {"inserted": inserted, "updated": 0, "skipped": 0} if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument("--action", choices=["download", "load", "all"], default="all") parser.add_argument("--data-dir", type=Path, default=None) args = parser.parse_args() if args.action in ("download", "all"): download(args.data_dir) if args.action in ("load", "all"): load(data_dir=args.data_dir)