Adds a full data integration pipeline for enriching school profiles with
supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT.
Backend:
- Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections,
ofsted_parent_view, school_census, admissions, sen_detail, phonics,
school_deprivation, school_finance) plus GIAS columns on schools
- Expose all supplementary data via GET /api/schools/{urn}
- Enrich school list responses with ofsted_grade + ofsted_date
Integrator (new service):
- FastAPI HTTP microservice; Kestra calls POST /run/{source}
- 9 source modules: ofsted, gias, parent_view, census, admissions,
sen_detail, phonics, idaci, finance
- 9 Kestra flow YAMLs with scheduled triggers and 3× retry
Frontend:
- SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate)
- SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View
survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation
Context, Finances
- types.ts: 8 new interfaces + extended School/SchoolDetailsResponse
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
230 lines
9.6 KiB
Python
230 lines
9.6 KiB
Python
"""
Ofsted Parent View open data downloader and loader.

Source: https://parentview.ofsted.gov.uk/open-data
Update: ~3 times/year (Spring, Autumn, Summer)
"""
|
||
import argparse
|
||
import re
|
||
import sys
|
||
from datetime import date, datetime
|
||
from pathlib import Path
|
||
|
||
import pandas as pd
|
||
import requests
|
||
|
||
sys.path.insert(0, str(Path(__file__).parent.parent))
|
||
from config import SUPPLEMENTARY_DIR
|
||
from db import get_session
|
||
|
||
# Directory where downloaded Parent View files are stored by default.
DEST_DIR = SUPPLEMENTARY_DIR.joinpath("parent_view")

# Landing page that is scraped for the current open-data download link.
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
|
||
|
||
# Question column mapping: Parent View open data uses descriptive column
# headers. Each known header variant (full question text or short label) maps
# to one of our internal percentage field names.
#
# NOTE(review): nothing in the visible part of this module reads this map;
# confirm whether it is consumed elsewhere or is dead data.
QUESTION_MAP = {
    # Q1 — happiness
    "My child is happy at this school": "q_happy_pct",
    "Happy": "q_happy_pct",
    # Q2 — safety
    "My child feels safe at this school": "q_safe_pct",
    "Safe": "q_safe_pct",
    # Q3 — behaviour
    "The school makes sure its pupils are well behaved": "q_behaviour_pct",
    "Well Behaved": "q_behaviour_pct",
    # Q4 — bullying dealt with (sometimes separate)
    "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct",
    "Bullying": "q_bullying_pct",
    # Q5 — awareness of what the child will learn
    "The school makes me aware of what my child will learn during the year": "q_communication_pct",
    "Aware of learning": "q_communication_pct",
    # Q6 — concerns dealt with
    # NOTE(review): shares q_communication_pct with Q5, so the two questions
    # cannot be told apart downstream — confirm this is intentional.
    "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct",
    # Q7 — child does well
    "My child does well at this school": "q_progress_pct",
    "Does well": "q_progress_pct",
    # Q8 — teaching
    "The teaching is good at this school": "q_teaching_pct",
    "Good teaching": "q_teaching_pct",
    # Q9 — progress info
    "I receive valuable information from the school about my child's progress": "q_information_pct",
    "Progress information": "q_information_pct",
    # Q10 — curriculum breadth
    "My child is taught a broad range of subjects": "q_curriculum_pct",
    "Broad subjects": "q_curriculum_pct",
    # Q11 — prepares for future
    "The school prepares my child well for the future": "q_future_pct",
    "Prepared for future": "q_future_pct",
    # Q12 — leadership
    "The school is led and managed effectively": "q_leadership_pct",
    "Led well": "q_leadership_pct",
    # Q13 — wellbeing
    "The school supports my child's wider personal development": "q_wellbeing_pct",
    "Personal development": "q_wellbeing_pct",
    # Q14 — recommendation
    "I would recommend this school to another parent": "q_recommend_pct",
    "Recommend": "q_recommend_pct",
}
|
||
|
||
|
||
def download(data_dir: Path | None = None) -> Path:
    """Download the current Parent View open-data file unless already cached.

    The open-data landing page is scraped for the first link whose target
    ends in ``.xlsx``, ``.csv`` or ``.zip``. If a file with the same name is
    already present in the destination directory, nothing is downloaded.

    Args:
        data_dir: Optional base data directory. When given, files are stored
            under ``<data_dir>/supplementary/parent_view``; otherwise the
            module-level ``DEST_DIR`` is used.

    Returns:
        Path of the downloaded (or previously downloaded) file.

    Raises:
        RuntimeError: If the landing page cannot be fetched or contains no
            download link.
        requests.RequestException: If fetching the data file itself fails.
    """
    from urllib.parse import urljoin

    dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    # Scrape the open data page for the download link.
    try:
        page = requests.get(OPEN_DATA_PAGE, timeout=30)
        page.raise_for_status()
    except requests.RequestException as e:
        raise RuntimeError(f"Could not discover Parent View download URL: {e}") from e

    # NOTE(review): .zip links are accepted here, but load() only globs for
    # *.xlsx / *.csv — confirm whether zip archives need unpacking.
    links = re.findall(r'href="([^"]+\.(?:xlsx|csv|zip))"', page.text, re.IGNORECASE)
    if not links:
        raise RuntimeError(
            "Could not discover Parent View download URL: "
            "No download link found on Parent View open data page"
        )

    # urljoin leaves absolute URLs untouched and resolves root-relative,
    # page-relative and protocol-relative hrefs correctly.
    url = urljoin(OPEN_DATA_PAGE, links[0])

    filename = url.split("/")[-1].split("?")[0]
    dest_file = dest / filename

    if dest_file.exists():
        print(f" ParentView: {filename} already exists, skipping download.")
        return dest_file

    print(f" ParentView: downloading {url} ...")

    # Stream into a temporary ".part" file and rename on success, so that an
    # interrupted download is never mistaken for a complete cached file.
    tmp_file = dest_file.with_name(dest_file.name + ".part")
    try:
        with requests.get(url, timeout=120, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp_file, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        tmp_file.replace(dest_file)
    except Exception:
        tmp_file.unlink(missing_ok=True)
        raise

    print(f" ParentView: saved {dest_file}")
    return dest_file
|
||
|
||
|
||
def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
|
||
"""Sum 'Strongly agree' + 'Agree' percentages for a question."""
|
||
# Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %"
|
||
strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
|
||
agree = row.get(f"{q_col_base} - Agree %")
|
||
try:
|
||
total = 0.0
|
||
if pd.notna(strongly):
|
||
total += float(strongly)
|
||
if pd.notna(agree):
|
||
total += float(agree)
|
||
return round(total, 1) if total > 0 else None
|
||
except (TypeError, ValueError):
|
||
return None
|
||
|
||
|
||
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a Parent View data file and upsert it into ``ofsted_parent_view``.

    One row is written per URN. For each question the stored value is the
    'Strongly agree' + 'Agree' percentage computed by ``_positive_pct`` from
    the ``"Qn - ..."`` columns. ``survey_date`` is set to today's date (the
    load date, not a date read from the file) and ``q_sen_pct`` is always
    written as NULL. Existing rows for the same URN are overwritten.

    Args:
        path: File to load (``.xlsx`` is read as Excel, anything else as
            latin-1 CSV). When omitted, the most recently modified
            ``*.xlsx`` / ``*.csv`` file in the destination directory is used.
        data_dir: Optional base data directory; the destination directory is
            ``<data_dir>/supplementary/parent_view`` when given, otherwise the
            module-level ``DEST_DIR``. Only consulted when ``path`` is omitted.

    Returns:
        ``{"inserted": <rows upserted>, "updated": 0, "skipped": 0}``.

    Raises:
        FileNotFoundError: If ``path`` is omitted and no data file exists.
        ValueError: If the file has no URN column.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR
        files = [*dest.glob("*.xlsx"), *dest.glob("*.csv")]
        if not files:
            raise FileNotFoundError(f"No Parent View file found in {dest}")
        # Pick the newest download regardless of extension; the file name is
        # only a deterministic tie-breaker.
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))
    path = Path(path)

    print(f" ParentView: loading {path} ...")

    if path.suffix.lower() == ".xlsx":
        df = pd.read_excel(path)
    else:
        df = pd.read_csv(path, encoding="latin-1", low_memory=False)

    # Normalise the URN column. Headers are passed through str() because
    # spreadsheet headers are not guaranteed to be strings.
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        raise ValueError(f"URN column not found. Columns: {list(df.columns)[:20]}")
    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    # Rows without a numeric URN cannot be keyed and are dropped; copy() so
    # the assignments below write to an independent frame.
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)

    # Try to find the total responses column; non-numeric entries become NaN
    # and are stored as NULL instead of aborting the load.
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )
    if resp_col is not None:
        df[resp_col] = pd.to_numeric(df[resp_col], errors="coerce")

    # Internal field -> question prefix used in the wide-format columns.
    # Q6 is not loaded.
    question_fields = {
        "q_happy_pct": "Q1",
        "q_safe_pct": "Q2",
        "q_behaviour_pct": "Q3",
        "q_bullying_pct": "Q4",
        "q_communication_pct": "Q5",
        "q_progress_pct": "Q7",
        "q_teaching_pct": "Q8",
        "q_information_pct": "Q9",
        "q_curriculum_pct": "Q10",
        "q_future_pct": "Q11",
        "q_leadership_pct": "Q12",
        "q_wellbeing_pct": "Q13",
        "q_recommend_pct": "Q14",
    }

    upsert = text("""
        INSERT INTO ofsted_parent_view
            (urn, survey_date, total_responses,
             q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct,
             q_communication_pct, q_progress_pct, q_teaching_pct,
             q_information_pct, q_curriculum_pct, q_future_pct,
             q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct)
        VALUES
            (:urn, :survey_date, :total_responses,
             :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct,
             :q_communication_pct, :q_progress_pct, :q_teaching_pct,
             :q_information_pct, :q_curriculum_pct, :q_future_pct,
             :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct)
        ON CONFLICT (urn) DO UPDATE SET
            survey_date = EXCLUDED.survey_date,
            total_responses = EXCLUDED.total_responses,
            q_happy_pct = EXCLUDED.q_happy_pct,
            q_safe_pct = EXCLUDED.q_safe_pct,
            q_behaviour_pct = EXCLUDED.q_behaviour_pct,
            q_bullying_pct = EXCLUDED.q_bullying_pct,
            q_communication_pct = EXCLUDED.q_communication_pct,
            q_progress_pct = EXCLUDED.q_progress_pct,
            q_teaching_pct = EXCLUDED.q_teaching_pct,
            q_information_pct = EXCLUDED.q_information_pct,
            q_curriculum_pct = EXCLUDED.q_curriculum_pct,
            q_future_pct = EXCLUDED.q_future_pct,
            q_leadership_pct = EXCLUDED.q_leadership_pct,
            q_wellbeing_pct = EXCLUDED.q_wellbeing_pct,
            q_recommend_pct = EXCLUDED.q_recommend_pct,
            q_sen_pct = EXCLUDED.q_sen_pct
    """)

    batch_size = 2000
    inserted = 0
    today = date.today()
    batch: list[dict] = []

    with get_session() as session:
        for _, row in df.iterrows():
            total = None
            if resp_col is not None and pd.notna(row[resp_col]):
                total = int(row[resp_col])

            batch.append(
                {
                    "urn": int(row["urn"]),
                    "survey_date": today,
                    "total_responses": total,
                    **{
                        field: _positive_pct(row, question)
                        for field, question in question_fields.items()
                    },
                    "q_sen_pct": None,
                }
            )

            # Passing a list of parameter dicts runs the statement as an
            # executemany, sending rows to the database in batches.
            if len(batch) >= batch_size:
                session.execute(upsert, batch)
                inserted += len(batch)
                batch = []

        if batch:
            session.execute(upsert, batch)
            inserted += len(batch)

    print(f" ParentView: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": 0}
|
||
|
||
|
||
if __name__ == "__main__":
    # Command-line entry point: optionally download the latest Parent View
    # file, then optionally load it into the database.
    cli = argparse.ArgumentParser()
    cli.add_argument("--action", choices=["download", "load", "all"], default="all")
    cli.add_argument("--data-dir", type=Path, default=None)
    opts = cli.parse_args()

    run_everything = opts.action == "all"

    if run_everything or opts.action == "download":
        download(opts.data_dir)

    if run_everything or opts.action == "load":
        load(data_dir=opts.data_dir)
|