Files
school_compare/integrator/scripts/sources/parent_view.py

230 lines
9.6 KiB
Python
Raw Normal View History

"""
Ofsted Parent View open data downloader and loader.
Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
import argparse
import re
import sys
from datetime import date, datetime
from pathlib import Path
import pandas as pd
import requests
sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
# Default directory for Parent View files: download() saves into it and load()
# searches it when no explicit data_dir/path is supplied.
DEST_DIR = SUPPLEMENTARY_DIR / "parent_view"
# Page that download() fetches and scans for the first .xlsx/.csv/.zip link.
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
# Parent View open data labels each question with a descriptive header (either
# the full survey wording or a short label). The table below groups the known
# header variants under the internal field they feed; QUESTION_MAP is the flat
# header -> field lookup derived from it.
# NOTE(review): QUESTION_MAP is not referenced by the other code visible in
# this module (load() reads "Q<n> - ... %" columns instead) — confirm usage.
_HEADERS_BY_FIELD = (
    # Q1 — happiness
    ("q_happy_pct", ("My child is happy at this school", "Happy")),
    # Q2 — safety
    ("q_safe_pct", ("My child feels safe at this school", "Safe")),
    # Q3 — behaviour
    (
        "q_behaviour_pct",
        ("The school makes sure its pupils are well behaved", "Well Behaved"),
    ),
    # Q4 — bullying dealt with (sometimes separate)
    (
        "q_bullying_pct",
        (
            "My child has been bullied and the school dealt with the bullying quickly and effectively",
            "Bullying",
        ),
    ),
    # Q5 — curriculum info
    (
        "q_communication_pct",
        (
            "The school makes me aware of what my child will learn during the year",
            "Aware of learning",
        ),
    ),
    # Q6 — concerns dealt with (shares the communication field with Q5)
    (
        "q_communication_pct",
        ("When I have raised concerns with the school, they have been dealt with properly",),
    ),
    # Q7 — child does well
    ("q_progress_pct", ("My child does well at this school", "Does well")),
    # Q8 — teaching
    ("q_teaching_pct", ("The teaching is good at this school", "Good teaching")),
    # Q9 — progress info
    (
        "q_information_pct",
        (
            "I receive valuable information from the school about my child's progress",
            "Progress information",
        ),
    ),
    # Q10 — curriculum breadth
    ("q_curriculum_pct", ("My child is taught a broad range of subjects", "Broad subjects")),
    # Q11 — prepares for future
    (
        "q_future_pct",
        ("The school prepares my child well for the future", "Prepared for future"),
    ),
    # Q12 — leadership
    ("q_leadership_pct", ("The school is led and managed effectively", "Led well")),
    # Q13 — wellbeing
    (
        "q_wellbeing_pct",
        ("The school supports my child's wider personal development", "Personal development"),
    ),
    # Q14 — recommendation
    ("q_recommend_pct", ("I would recommend this school to another parent", "Recommend")),
)

QUESTION_MAP = {
    header: field
    for field, headers in _HEADERS_BY_FIELD
    for header in headers
}
def download(data_dir: Path | None = None) -> Path:
    """Download the current Parent View open-data file, unless already present.

    The open-data page is fetched and scanned for the first link ending in
    ``.xlsx``, ``.csv`` or ``.zip``; that file is saved under its own name.

    Args:
        data_dir: Optional base directory. When given, files go to
            ``data_dir / "supplementary" / "parent_view"``; otherwise to
            ``DEST_DIR``.

    Returns:
        Path of the local file (freshly downloaded or already existing).

    Raises:
        RuntimeError: If the open-data page cannot be fetched or contains no
            matching download link.
        requests.RequestException: If the data file download itself fails.
    """
    from urllib.parse import urljoin

    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Scrape the open data page for the download link.
    try:
        page = requests.get(OPEN_DATA_PAGE, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e

    links = re.findall(r'href="([^"]+\.(?:xlsx|csv|zip))"', page.text, re.IGNORECASE)
    if not links:
        raise RuntimeError(
            "Could not discover Parent View download URL: "
            "No download link found on Parent View open data page"
        )
    # urljoin leaves absolute links untouched and resolves both root-relative
    # ("/files/x.xlsx") and page-relative ("files/x.xlsx") links correctly.
    url = urljoin(OPEN_DATA_PAGE, links[0])

    filename = url.split("/")[-1].split("?")[0]
    dest_file = dest / filename
    if dest_file.exists():
        print(f" ParentView: {filename} already exists, skipping download.")
        return dest_file

    print(f" ParentView: downloading {url} ...")
    # Stream into a temporary ".part" file and rename only on success, so an
    # interrupted transfer can never be mistaken for a complete file by the
    # "already exists" check on the next run.
    part_file = dest_file.with_name(filename + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(part_file, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        part_file.replace(dest_file)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        part_file.unlink(missing_ok=True)

    print(f" ParentView: saved {dest_file}")
    return dest_file
def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
"""Sum 'Strongly agree' + 'Agree' percentages for a question."""
# Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
agree = row.get(f"{q_col_base} - Agree %")
try:
total = 0.0
if pd.notna(strongly):
total += float(strongly)
if pd.notna(agree):
total += float(agree)
return round(total, 1) if total > 0 else None
except (TypeError, ValueError):
return None
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a Parent View file and upsert one ``ofsted_parent_view`` row per URN.

    Args:
        path: File to load. When omitted, the directory
            ``data_dir / "supplementary" / "parent_view"`` (or ``DEST_DIR`` if
            ``data_dir`` is not given) is searched for ``*.xlsx`` and ``*.csv``
            files and the last one (xlsx sorted first, then csv) is used.
        data_dir: Optional base directory used only when ``path`` is omitted.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` is the
        number of rows sent to the database (inserts and updates combined).

    Raises:
        FileNotFoundError: If no file is given and none is found.
        ValueError: If the file has no ``URN`` column.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No Parent View file found in {dest}")
        path = files[-1]

    print(f" ParentView: loading {path} ...")
    # Compare the extension case-insensitively so "FILE.XLSX" is not fed to
    # the CSV reader.
    if str(path).lower().endswith(".xlsx"):
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # Normalise URN column. Headers are passed through str() because Excel
    # sheets can yield non-string column labels.
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # .copy() so the assignments below act on an independent frame.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Try to find total responses column; non-numeric cells become NaN (and
    # are stored as NULL) instead of aborting the whole load.
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )
    if resp_col is not None:
        df[resp_col] = pd.to_numeric(df[resp_col], errors="coerce")

    # Internal field -> question prefix in the wide-format columns.
    # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year).
    # Q6 is not read; q_communication_pct comes from Q5.
    question_fields = {
        "q_happy_pct": "Q1",
        "q_safe_pct": "Q2",
        "q_behaviour_pct": "Q3",
        "q_bullying_pct": "Q4",
        "q_communication_pct": "Q5",
        "q_progress_pct": "Q7",
        "q_teaching_pct": "Q8",
        "q_information_pct": "Q9",
        "q_curriculum_pct": "Q10",
        "q_future_pct": "Q11",
        "q_leadership_pct": "Q12",
        "q_wellbeing_pct": "Q13",
        "q_recommend_pct": "Q14",
    }

    # Built once: the statement is identical for every row.
    upsert = text("""
        INSERT INTO ofsted_parent_view
        (urn, survey_date, total_responses,
        q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
        q_communication_pct, q_progress_pct, q_teaching_pct,
        q_information_pct, q_curriculum_pct, q_future_pct,
        q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
        VALUES
        (:urn, :survey_date, :total_responses,
        :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
        :q_communication_pct, :q_progress_pct, :q_teaching_pct,
        :q_information_pct, :q_curriculum_pct, :q_future_pct,
        :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
        ON CONFLICT (urn) DO UPDATE SET
        survey_date = EXCLUDED.survey_date,
        total_responses = EXCLUDED.total_responses,
        q_happy_pct = EXCLUDED.q_happy_pct,
        q_safe_pct = EXCLUDED.q_safe_pct,
        q_behaviour_pct = EXCLUDED.q_behaviour_pct,
        q_bullying_pct = EXCLUDED.q_bullying_pct,
        q_communication_pct = EXCLUDED.q_communication_pct,
        q_progress_pct = EXCLUDED.q_progress_pct,
        q_teaching_pct = EXCLUDED.q_teaching_pct,
        q_information_pct = EXCLUDED.q_information_pct,
        q_curriculum_pct = EXCLUDED.q_curriculum_pct,
        q_future_pct = EXCLUDED.q_future_pct,
        q_leadership_pct = EXCLUDED.q_leadership_pct,
        q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
        q_recommend_pct = EXCLUDED.q_recommend_pct,
        q_sen_pct = EXCLUDED.q_sen_pct
    """)

    inserted = 0
    # The load date is stored as survey_date for every row.
    today = date.today()
    # NOTE(review): only flush() is called below; committing is presumably
    # handled by get_session() on exit — confirm.
    with get_session() as session:
        for _, row in df.iterrows():
            total = row[resp_col] if resp_col is not None else None
            record = {
                "urn": int(row["urn"]),
                "survey_date": today,
                "total_responses": int(total) if pd.notna(total) else None,
                "q_sen_pct": None,
            }
            # Percent positive per question from the wide-format columns.
            for field, question in question_fields.items():
                record[field] = _positive_pct(row, question)

            session.execute(upsert, record)
            inserted += 1
            if inserted % 2000 == 0:
                session.flush()

    print(f" ParentView: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}
if __name__ == "__main__":
    # Command-line entry point: download and/or load the Parent View data.
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()

    downloaded = None
    if args.action in ("download", "all"):
        downloaded = download(args.data_dir)
    if args.action in ("load", "all"):
        # With "all", load exactly the file download() returned instead of
        # re-scanning the directory, which could pick an older file. With
        # "load" alone, downloaded is None and load() searches as before.
        load(path=downloaded, data_dir=args.data_dir)