Files
school_compare/integrator/scripts/sources/parent_view.py
Tudor dd49ef28b2
Some checks failed
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 47s
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled
feat(data): integrate 9 UK government data sources via Kestra
Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.

Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
  ofsted_parent_view, school_census, admissions, sen_detail, phonics,
  school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date

Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
  sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry

Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
  survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
  Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-24 11:44:04 +00:00

230 lines
9.6 KiB
Python
Raw Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
"""
Ofsted Parent View open data downloader and loader.
Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# Default directory for Parent View files; used by download() and load() when
# no explicit data_dir is passed.
DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
# Page that download() fetches and scans for the data-file link.
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
# Question column mapping — Parent View open data uses descriptive column headers.
# Maps each known header variant (full question text or short label) to our
# internal field name.
# NOTE(review): nothing in the visible part of this file reads QUESTION_MAP;
# load() looks columns up by "Q<n>" prefix instead — confirm whether this
# mapping is still needed or should drive load().
QUESTION_MAP = {
    # Q1 — happiness
    "My child is happy at this school": "q_happy_pct",
    "Happy": "q_happy_pct",
    # Q2 — safety
    "My child feels safe at this school": "q_safe_pct",
    "Safe": "q_safe_pct",
    # Q3 — behaviour
    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
    "Well Behaved": "q_behaviour_pct",
    # Q4 — bullying dealt with (sometimes separate)
    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
    "Bullying": "q_bullying_pct",
    # Q5 — curriculum info
    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
    "Aware of learning": "q_communication_pct",
    # Q6 — concerns dealt with (shares q_communication_pct with Q5; there is
    # no separate field for it in this mapping)
    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
    # Q7 — child does well
    "My child does well at this school": "q_progress_pct",
    "Does well": "q_progress_pct",
    # Q8 — teaching
    "The teaching is good at this school": "q_teaching_pct",
    "Good teaching": "q_teaching_pct",
    # Q9 — progress info
    "I receive valuable information from the school about my child's progress": "q_information_pct",
    "Progress information": "q_information_pct",
    # Q10 — curriculum breadth
    "My child is taught a broad range of subjects": "q_curriculum_pct",
    "Broad subjects": "q_curriculum_pct",
    # Q11 — prepares for future
    "The school prepares my child well for the future": "q_future_pct",
    "Prepared for future": "q_future_pct",
    # Q12 — leadership
    "The school is led and managed effectively": "q_leadership_pct",
    "Led well": "q_leadership_pct",
    # Q13 — wellbeing
    "The school supports my child's wider personal development": "q_wellbeing_pct",
    "Personal development": "q_wellbeing_pct",
    # Q14 — recommendation
    "I would recommend this school to another parent": "q_recommend_pct",
    "Recommend": "q_recommend_pct",
}
def download(data_dir: Path | None = None) -> Path:
    """Download the Parent View data file linked from the open data page.

    The open data page is fetched and scanned for the first ``href`` ending
    in ``.xlsx``, ``.csv`` or ``.zip``. That file is saved into the
    destination directory under its own file name. If a file with that name
    already exists there, nothing is downloaded.

    Args:
        data_dir: Optional data root. When given, the file is stored in
            ``data_dir / "supplementary" / "parent_view"``; otherwise in
            ``DEST_DIR``.

    Returns:
        Path of the existing or newly downloaded file.

    Raises:
        RuntimeError: If the open data page cannot be fetched or contains no
            matching download link.
        requests.RequestException: If the file download itself fails.
    """
    # Function-scope imports (standard library) keep this block self-contained.
    from html import unescape
    from urllib.parse import urljoin

    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Scrape the open data page for the download link. Only the network call
    # sits inside the try, so unrelated errors are not re-labelled.
    try:
        page = requests.get(OPEN_DATA_PAGE, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e

    pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"'
    links = re.findall(pattern, page.text, re.IGNORECASE)
    if not links:
        raise RuntimeError(
            "Could not discover Parent View download URL: "
            "No download link found on Parent View open data page"
        )

    # urljoin handles absolute, root-relative, page-relative and
    # protocol-relative hrefs; unescape undoes HTML entity encoding
    # (e.g. "&amp;") that attribute values may carry.
    url = urljoin(OPEN_DATA_PAGE, unescape(links[0]))

    filename = url.split("/")[-1].split("?")[0]
    dest_file = dest / filename
    if dest_file.exists():
        print(f" ParentView: {filename} already exists, skipping download.")
        return dest_file

    print(f" ParentView: downloading {url} ...")
    # Stream into a temporary ".part" file and rename only on success, so an
    # interrupted transfer never leaves a truncated file that the
    # "already exists" check above would later treat as complete.
    part_file = dest / (filename + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(part_file, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        part_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the leftover on failure.
        part_file.unlink(missing_ok=True)

    print(f" ParentView: saved {dest_file}")
    return dest_file
def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
"""Sum 'Strongly agree' + 'Agree' percentages for a question."""
# Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
agree = row.get(f"{q_col_base} - Agree %")
try:
total = 0.0
if pd.notna(strongly):
total += float(strongly)
if pd.notna(agree):
total += float(agree)
return round(total, 1) if total > 0 else None
except (TypeError, ValueError):
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a Parent View file and upsert one row per URN into ``ofsted_parent_view``.

    Args:
        path: File to load. ``.xlsx`` files (any letter case) are read with
            ``pd.read_excel``; anything else is read as latin-1 CSV. When
            omitted, the most recently modified ``*.xlsx`` / ``*.csv`` file
            in the destination directory is used. ``.zip`` files, which
            ``download()`` may save, are not picked up here.
        data_dir: Optional data root used to locate the destination
            directory (``data_dir / "supplementary" / "parent_view"``) when
            *path* is omitted; defaults to ``DEST_DIR``.

    Returns:
        ``{"inserted": <rows upserted>, "updated": 0, "skipped": 0}``.

    Raises:
        FileNotFoundError: If *path* is omitted and no candidate file exists.
        ValueError: If the file has no ``URN`` column.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
        files = [*dest.glob("*.xlsx"), *dest.glob("*.csv")]
        if not files:
            raise FileNotFoundError(f"No Parent View file found in {dest}")
        # Choose the newest file by modification time (name breaks ties), so
        # an old CSV can no longer shadow a more recent XLSX.
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))

    print(f" ParentView: loading {path} ...")
    if Path(path).suffix.lower() == ".xlsx":
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # Normalise URN column. str() guards against non-string column labels.
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Try to find total responses column
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )
    if resp_col is not None:
        # Coerce once up front: a stray non-numeric cell becomes a NULL
        # total instead of aborting the whole load with a ValueError.
        df[resp_col] = pd.to_numeric(df[resp_col], errors="coerce")

    # Destination field -> question prefix in the wide-format columns.
    # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year).
    # Q6 is not read; q_sen_pct has no source column and is stored as NULL.
    question_fields = {
        "q_happy_pct": "Q1",
        "q_safe_pct": "Q2",
        "q_behaviour_pct": "Q3",
        "q_bullying_pct": "Q4",
        "q_communication_pct": "Q5",
        "q_progress_pct": "Q7",
        "q_teaching_pct": "Q8",
        "q_information_pct": "Q9",
        "q_curriculum_pct": "Q10",
        "q_future_pct": "Q11",
        "q_leadership_pct": "Q12",
        "q_wellbeing_pct": "Q13",
        "q_recommend_pct": "Q14",
    }

    # Built once; the statement is identical for every row.
    upsert = text("""
        INSERT INTO ofsted_parent_view
        (urn, survey_date, total_responses,
        q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
        q_communication_pct, q_progress_pct, q_teaching_pct,
        q_information_pct, q_curriculum_pct, q_future_pct,
        q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
        VALUES
        (:urn, :survey_date, :total_responses,
        :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
        :q_communication_pct, :q_progress_pct, :q_teaching_pct,
        :q_information_pct, :q_curriculum_pct, :q_future_pct,
        :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
        ON CONFLICT (urn) DO UPDATE SET
        survey_date = EXCLUDED.survey_date,
        total_responses = EXCLUDED.total_responses,
        q_happy_pct = EXCLUDED.q_happy_pct,
        q_safe_pct = EXCLUDED.q_safe_pct,
        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
        q_bullying_pct = EXCLUDED.q_bullying_pct,
        q_communication_pct = EXCLUDED.q_communication_pct,
        q_progress_pct = EXCLUDED.q_progress_pct,
        q_teaching_pct = EXCLUDED.q_teaching_pct,
        q_information_pct = EXCLUDED.q_information_pct,
        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
        q_future_pct = EXCLUDED.q_future_pct,
        q_leadership_pct = EXCLUDED.q_leadership_pct,
        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
        q_recommend_pct = EXCLUDED.q_recommend_pct,
        q_sen_pct = EXCLUDED.q_sen_pct
    """)

    inserted = 0
    # NOTE(review): survey_date is stamped with the load date; the survey
    # period is not parsed from the file — confirm this is intended.
    today = date.today()
    with get_session() as session:
        for _, row in df.iterrows():
            total = row[resp_col] if resp_col is not None else None
            record = {
                "urn": int(row["urn"]),
                "survey_date": today,
                "total_responses": int(total) if pd.notna(total) else None,
                "q_sen_pct": None,
            }
            # Percentage of positive answers per question.
            for field, question in question_fields.items():
                record[field] = _positive_pct(row, question)

            session.execute(upsert, record)
            inserted += 1
            if inserted % 2000 == 0:
                session.flush()

    print(f" ParentView: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
    # Command-line entry point: fetch the file, load it, or do both (default).
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    run_everything = opts.action == "all"
    if run_everything or opts.action == "download":
        download(opts.data_dir)
    if run_everything or opts.action == "load":
        load(data_dir=opts.data_dir)