From dd49ef28b21a9c9e5e3789fe9c8272a7019f12a4 Mon Sep 17 00:00:00 2001 From: Tudor Date: Tue, 24 Mar 2026 11:44:04 +0000 Subject: [PATCH] feat(data): integrate 9 UK government data sources via Kestra MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds a full data integration pipeline for enriching school profiles with supplementary data from Ofsted, GIAS, EES, IDACI, and FBIT. Backend: - Bump SCHEMA_VERSION to 3; add 8 new DB tables (ofsted_inspections, ofsted_parent_view, school_census, admissions, sen_detail, phonics, school_deprivation, school_finance) plus GIAS columns on schools - Expose all supplementary data via GET /api/schools/{urn} - Enrich school list responses with ofsted_grade + ofsted_date Integrator (new service): - FastAPI HTTP microservice; Kestra calls POST /run/{source} - 9 source modules: ofsted, gias, parent_view, census, admissions, sen_detail, phonics, idaci, finance - 9 Kestra flow YAMLs with scheduled triggers and 3× retry Frontend: - SchoolRow: colour-coded Ofsted badge (Outstanding/Good/RI/Inadequate) - SchoolDetailView: 7 new sections — Ofsted sub-judgements, Parent View survey bars, Admissions, Pupils & Inclusion / SEN, Phonics, Deprivation Context, Finances - types.ts: 8 new interfaces + extended School/SchoolDetailsResponse Co-Authored-By: Claude Sonnet 4.6 --- backend/app.py | 26 ++ backend/data_loader.py | 161 +++++++++++- backend/models.py | 176 +++++++++++++- backend/version.py | 3 +- docker-compose.yml | 61 +++++ integrator/Dockerfile | 13 + integrator/flows/admissions.yml | 25 ++ integrator/flows/census.yml | 25 ++ integrator/flows/finance.yml | 25 ++ integrator/flows/gias.yml | 30 +++ integrator/flows/idaci.yml | 25 ++ integrator/flows/ofsted.yml | 32 +++ integrator/flows/parent_view.yml | 30 +++ integrator/flows/phonics.yml | 25 ++ integrator/flows/sen_detail.yml | 25 ++ integrator/requirements.txt | 7 + integrator/scripts/config.py | 11 + integrator/scripts/db.py | 23 ++ 
integrator/scripts/sources/__init__.py | 0 integrator/scripts/sources/admissions.py | 158 ++++++++++++ integrator/scripts/sources/census.py | 148 +++++++++++ integrator/scripts/sources/ees.py | 53 ++++ integrator/scripts/sources/finance.py | 143 +++++++++++ integrator/scripts/sources/gias.py | 159 ++++++++++++ integrator/scripts/sources/idaci.py | 176 ++++++++++++++ integrator/scripts/sources/ofsted.py | 226 +++++++++++++++++ integrator/scripts/sources/parent_view.py | 229 ++++++++++++++++++ integrator/scripts/sources/phonics.py | 132 ++++++++++ integrator/scripts/sources/sen_detail.py | 150 ++++++++++++ integrator/server.py | 70 ++++++ nextjs-app/app/school/[urn]/page.tsx | 10 +- .../components/SchoolDetailView.module.css | 117 +++++++++ nextjs-app/components/SchoolDetailView.tsx | 227 ++++++++++++++++- nextjs-app/components/SchoolRow.module.css | 17 ++ nextjs-app/components/SchoolRow.tsx | 14 +- nextjs-app/lib/types.ts | 105 ++++++++ 36 files changed, 2849 insertions(+), 8 deletions(-) create mode 100644 integrator/Dockerfile create mode 100644 integrator/flows/admissions.yml create mode 100644 integrator/flows/census.yml create mode 100644 integrator/flows/finance.yml create mode 100644 integrator/flows/gias.yml create mode 100644 integrator/flows/idaci.yml create mode 100644 integrator/flows/ofsted.yml create mode 100644 integrator/flows/parent_view.yml create mode 100644 integrator/flows/phonics.yml create mode 100644 integrator/flows/sen_detail.yml create mode 100644 integrator/requirements.txt create mode 100644 integrator/scripts/config.py create mode 100644 integrator/scripts/db.py create mode 100644 integrator/scripts/sources/__init__.py create mode 100644 integrator/scripts/sources/admissions.py create mode 100644 integrator/scripts/sources/census.py create mode 100644 integrator/scripts/sources/ees.py create mode 100644 integrator/scripts/sources/finance.py create mode 100644 integrator/scripts/sources/gias.py create mode 100644 
integrator/scripts/sources/idaci.py create mode 100644 integrator/scripts/sources/ofsted.py create mode 100644 integrator/scripts/sources/parent_view.py create mode 100644 integrator/scripts/sources/phonics.py create mode 100644 integrator/scripts/sources/sen_detail.py create mode 100644 integrator/server.py diff --git a/backend/app.py b/backend/app.py index 3ab895d..22a6bea 100644 --- a/backend/app.py +++ b/backend/app.py @@ -24,6 +24,7 @@ from .data_loader import ( clear_cache, load_school_data, geocode_single_postcode, + get_supplementary_data, ) from .data_loader import get_data_info as get_db_info from .database import check_and_migrate_if_needed @@ -384,6 +385,16 @@ async def get_school_details(request: Request, urn: int): # Get latest info for the school latest = school_data.iloc[-1] + # Fetch supplementary data (Ofsted, Parent View, admissions, etc.) + from .database import SessionLocal + supplementary = {} + try: + db = SessionLocal() + supplementary = get_supplementary_data(db, urn) + db.close() + except Exception: + pass + return { "school_info": { "urn": urn, @@ -396,8 +407,23 @@ async def get_school_details(request: Request, urn: int): "latitude": latest.get("latitude"), "longitude": latest.get("longitude"), "phase": "Primary", + # GIAS fields + "website": latest.get("website"), + "headteacher_name": latest.get("headteacher_name"), + "capacity": latest.get("capacity"), + "trust_name": latest.get("trust_name"), + "gender": latest.get("gender"), }, "yearly_data": clean_for_json(school_data), + # Supplementary data (null if not yet populated by Kestra) + "ofsted": supplementary.get("ofsted"), + "parent_view": supplementary.get("parent_view"), + "census": supplementary.get("census"), + "admissions": supplementary.get("admissions"), + "sen_detail": supplementary.get("sen_detail"), + "phonics": supplementary.get("phonics"), + "deprivation": supplementary.get("deprivation"), + "finance": supplementary.get("finance"), } diff --git a/backend/data_loader.py 
b/backend/data_loader.py index 1364c19..61a0fca 100644 --- a/backend/data_loader.py +++ b/backend/data_loader.py @@ -16,7 +16,11 @@ from sqlalchemy.orm import joinedload, Session from .config import settings from .database import SessionLocal, get_db_session -from .models import School, SchoolResult +from .models import ( + School, SchoolResult, + OfstedInspection, OfstedParentView, SchoolCensus, + SchoolAdmissions, SenDetail, Phonics, SchoolDeprivation, SchoolFinance, +) from .schemas import SCHOOL_TYPE_MAP # Cache for user search postcode geocoding (not for school data) @@ -381,6 +385,12 @@ def school_to_dict(school: School, include_results: bool = False) -> dict: "postcode": school.postcode, "latitude": school.latitude, "longitude": school.longitude, + # GIAS fields + "website": school.website, + "headteacher_name": school.headteacher_name, + "capacity": school.capacity, + "trust_name": school.trust_name, + "gender": school.gender, } if include_results and school.results: @@ -455,8 +465,25 @@ def load_school_data_as_dataframe(db: Session = None) -> pd.DataFrame: # Query all schools with their results schools = db.query(School).options(joinedload(School.results)).all() + # Load Ofsted data into a lookup dict (urn → grade, date) + ofsted_lookup: Dict[int, dict] = {} + try: + ofsted_rows = db.query( + OfstedInspection.urn, + OfstedInspection.overall_effectiveness, + OfstedInspection.inspection_date, + ).all() + for o in ofsted_rows: + ofsted_lookup[o.urn] = { + "ofsted_grade": o.overall_effectiveness, + "ofsted_date": o.inspection_date.isoformat() if o.inspection_date else None, + } + except Exception: + pass # Table may not exist yet on first run + rows = [] for school in schools: + ofsted = ofsted_lookup.get(school.urn, {}) for result in school.results: row = { "urn": school.urn, @@ -468,6 +495,15 @@ def load_school_data_as_dataframe(db: Session = None) -> pd.DataFrame: "postcode": school.postcode, "latitude": school.latitude, "longitude": school.longitude, + # 
GIAS fields + "website": school.website, + "headteacher_name": school.headteacher_name, + "capacity": school.capacity, + "trust_name": school.trust_name, + "gender": school.gender, + # Ofsted (for list view) + "ofsted_grade": ofsted.get("ofsted_grade"), + "ofsted_date": ofsted.get("ofsted_date"), **result_to_dict(result) } rows.append(row) @@ -511,3 +547,126 @@ def clear_cache(): """Clear all caches.""" global _df_cache _df_cache = None + + +def get_supplementary_data(db: Session, urn: int) -> dict: + """ + Fetch all supplementary data for a single school URN. + Returns a dict with keys: ofsted, parent_view, census, admissions, sen_detail, + phonics, deprivation, finance. Values are dicts or None. + """ + result = {} + + def safe_query(model, pk_field, latest_year_field=None): + try: + if latest_year_field: + row = ( + db.query(model) + .filter(getattr(model, pk_field) == urn) + .order_by(getattr(model, latest_year_field).desc()) + .first() + ) + else: + row = db.query(model).filter(getattr(model, pk_field) == urn).first() + return row + except Exception: + return None + + # Ofsted inspection + o = safe_query(OfstedInspection, "urn") + result["ofsted"] = { + "overall_effectiveness": o.overall_effectiveness, + "quality_of_education": o.quality_of_education, + "behaviour_attitudes": o.behaviour_attitudes, + "personal_development": o.personal_development, + "leadership_management": o.leadership_management, + "early_years_provision": o.early_years_provision, + "previous_overall": o.previous_overall, + "inspection_date": o.inspection_date.isoformat() if o.inspection_date else None, + "inspection_type": o.inspection_type, + } if o else None + + # Parent View + pv = safe_query(OfstedParentView, "urn") + result["parent_view"] = { + "survey_date": pv.survey_date.isoformat() if pv.survey_date else None, + "total_responses": pv.total_responses, + "q_happy_pct": pv.q_happy_pct, + "q_safe_pct": pv.q_safe_pct, + "q_behaviour_pct": pv.q_behaviour_pct, + "q_bullying_pct": 
pv.q_bullying_pct, + "q_communication_pct": pv.q_communication_pct, + "q_progress_pct": pv.q_progress_pct, + "q_teaching_pct": pv.q_teaching_pct, + "q_information_pct": pv.q_information_pct, + "q_curriculum_pct": pv.q_curriculum_pct, + "q_future_pct": pv.q_future_pct, + "q_leadership_pct": pv.q_leadership_pct, + "q_wellbeing_pct": pv.q_wellbeing_pct, + "q_recommend_pct": pv.q_recommend_pct, + "q_sen_pct": pv.q_sen_pct, + } if pv else None + + # School Census (latest year) + c = safe_query(SchoolCensus, "urn", "year") + result["census"] = { + "year": c.year, + "class_size_avg": c.class_size_avg, + "ethnicity_white_pct": c.ethnicity_white_pct, + "ethnicity_asian_pct": c.ethnicity_asian_pct, + "ethnicity_black_pct": c.ethnicity_black_pct, + "ethnicity_mixed_pct": c.ethnicity_mixed_pct, + "ethnicity_other_pct": c.ethnicity_other_pct, + } if c else None + + # Admissions (latest year) + a = safe_query(SchoolAdmissions, "urn", "year") + result["admissions"] = { + "year": a.year, + "published_admission_number": a.published_admission_number, + "total_applications": a.total_applications, + "first_preference_offers_pct": a.first_preference_offers_pct, + "oversubscribed": a.oversubscribed, + } if a else None + + # SEN Detail (latest year) + s = safe_query(SenDetail, "urn", "year") + result["sen_detail"] = { + "year": s.year, + "primary_need_speech_pct": s.primary_need_speech_pct, + "primary_need_autism_pct": s.primary_need_autism_pct, + "primary_need_mld_pct": s.primary_need_mld_pct, + "primary_need_spld_pct": s.primary_need_spld_pct, + "primary_need_semh_pct": s.primary_need_semh_pct, + "primary_need_physical_pct": s.primary_need_physical_pct, + "primary_need_other_pct": s.primary_need_other_pct, + } if s else None + + # Phonics (latest year) + ph = safe_query(Phonics, "urn", "year") + result["phonics"] = { + "year": ph.year, + "year1_phonics_pct": ph.year1_phonics_pct, + "year2_phonics_pct": ph.year2_phonics_pct, + } if ph else None + + # Deprivation + d = 
safe_query(SchoolDeprivation, "urn") + result["deprivation"] = { + "lsoa_code": d.lsoa_code, + "idaci_score": d.idaci_score, + "idaci_decile": d.idaci_decile, + } if d else None + + # Finance (latest year) + f = safe_query(SchoolFinance, "urn", "year") + result["finance"] = { + "year": f.year, + "per_pupil_spend": f.per_pupil_spend, + "staff_cost_pct": f.staff_cost_pct, + "teacher_cost_pct": f.teacher_cost_pct, + "support_staff_cost_pct": f.support_staff_cost_pct, + "premises_cost_pct": f.premises_cost_pct, + } if f else None + + return result diff --git a/backend/models.py b/backend/models.py index b280342..4bfd094 100644 --- a/backend/models.py +++ b/backend/models.py @@ -7,7 +7,7 @@ from datetime import datetime from sqlalchemy import ( Column, Integer, String, Float, ForeignKey, Index, UniqueConstraint, - Text, Boolean, DateTime + Text, Boolean, DateTime, Date ) from sqlalchemy.orm import relationship from .database import Base @@ -38,7 +38,16 @@ class School(Base): # Geocoding (cached) latitude = Column(Float) longitude = Column(Float) - + + # GIAS enrichment fields + website = Column(String(255)) + headteacher_name = Column(String(200)) + capacity = Column(Integer) + trust_name = Column(String(255)) + trust_uid = Column(String(20)) + gender = Column(String(20)) # Mixed / Girls / Boys + nursery_provision = Column(Boolean) + # Relationships results = relationship("SchoolResult", back_populates="school", cascade="all, delete-orphan") @@ -150,6 +159,169 @@ class SchemaVersion(Base): return f"" +# --------------------------------------------------------------------------- +# Supplementary data tables (populated by the Kestra data integrator) +# --------------------------------------------------------------------------- + +class OfstedInspection(Base): + """Latest Ofsted inspection judgement per school.""" + __tablename__ = "ofsted_inspections" + + urn = Column(Integer, primary_key=True) + inspection_date = Column(Date) + publication_date = Column(Date) + 
inspection_type = Column(String(100)) # Section 5 / Section 8 etc. + # 1=Outstanding 2=Good 3=Requires improvement 4=Inadequate + overall_effectiveness = Column(Integer) + quality_of_education = Column(Integer) + behaviour_attitudes = Column(Integer) + personal_development = Column(Integer) + leadership_management = Column(Integer) + early_years_provision = Column(Integer) # nullable — not all schools + previous_overall = Column(Integer) # for trend display + + def __repr__(self): + return f"" + + +class OfstedParentView(Base): + """Ofsted Parent View survey — latest per school. 14 questions, % saying Yes.""" + __tablename__ = "ofsted_parent_view" + + urn = Column(Integer, primary_key=True) + survey_date = Column(Date) + total_responses = Column(Integer) + q_happy_pct = Column(Float) # My child is happy at this school + q_safe_pct = Column(Float) # My child feels safe at this school + q_bullying_pct = Column(Float) # School deals with bullying well + q_communication_pct = Column(Float) # School keeps me informed + q_progress_pct = Column(Float) # My child does well / good progress + q_teaching_pct = Column(Float) # Teaching is good + q_information_pct = Column(Float) # I receive valuable info about progress + q_curriculum_pct = Column(Float) # Broad range of subjects taught + q_future_pct = Column(Float) # Prepares child well for the future + q_leadership_pct = Column(Float) # Led and managed effectively + q_wellbeing_pct = Column(Float) # Supports wider personal development + q_behaviour_pct = Column(Float) # Pupils are well behaved + q_recommend_pct = Column(Float) # I would recommend this school + q_sen_pct = Column(Float) # Good information about child's SEN (where applicable) + + def __repr__(self): + return f"" + + +class SchoolCensus(Base): + """Annual school census snapshot — class sizes and ethnicity breakdown.""" + __tablename__ = "school_census" + + urn = Column(Integer, primary_key=True) + year = Column(Integer, primary_key=True) + class_size_avg = 
Column(Float) + ethnicity_white_pct = Column(Float) + ethnicity_asian_pct = Column(Float) + ethnicity_black_pct = Column(Float) + ethnicity_mixed_pct = Column(Float) + ethnicity_other_pct = Column(Float) + + __table_args__ = ( + Index('ix_school_census_urn_year', 'urn', 'year'), + ) + + def __repr__(self): + return f"" + + +class SchoolAdmissions(Base): + """Annual admissions statistics per school.""" + __tablename__ = "school_admissions" + + urn = Column(Integer, primary_key=True) + year = Column(Integer, primary_key=True) + published_admission_number = Column(Integer) # PAN + total_applications = Column(Integer) + first_preference_offers_pct = Column(Float) # % receiving 1st choice + oversubscribed = Column(Boolean) + + __table_args__ = ( + Index('ix_school_admissions_urn_year', 'urn', 'year'), + ) + + def __repr__(self): + return f"" + + +class SenDetail(Base): + """SEN primary need type breakdown — more granular than school_results context fields.""" + __tablename__ = "sen_detail" + + urn = Column(Integer, primary_key=True) + year = Column(Integer, primary_key=True) + primary_need_speech_pct = Column(Float) # SLCN + primary_need_autism_pct = Column(Float) # ASD + primary_need_mld_pct = Column(Float) # Moderate learning difficulty + primary_need_spld_pct = Column(Float) # Specific learning difficulty (dyslexia etc.) 
+ primary_need_semh_pct = Column(Float) # Social, emotional, mental health + primary_need_physical_pct = Column(Float) # Physical/sensory + primary_need_other_pct = Column(Float) + + __table_args__ = ( + Index('ix_sen_detail_urn_year', 'urn', 'year'), + ) + + def __repr__(self): + return f"" + + +class Phonics(Base): + """Phonics Screening Check pass rates.""" + __tablename__ = "phonics" + + urn = Column(Integer, primary_key=True) + year = Column(Integer, primary_key=True) + year1_phonics_pct = Column(Float) # % reaching expected standard in Year 1 + year2_phonics_pct = Column(Float) # % reaching standard in Year 2 (re-takers) + + __table_args__ = ( + Index('ix_phonics_urn_year', 'urn', 'year'), + ) + + def __repr__(self): + return f"" + + +class SchoolDeprivation(Base): + """IDACI deprivation index — derived via postcode → LSOA lookup.""" + __tablename__ = "school_deprivation" + + urn = Column(Integer, primary_key=True) + lsoa_code = Column(String(20)) + idaci_score = Column(Float) # 0–1, higher = more deprived + idaci_decile = Column(Integer) # 1 = most deprived, 10 = least deprived + + def __repr__(self): + return f"" + + +class SchoolFinance(Base): + """FBIT financial benchmarking data.""" + __tablename__ = "school_finance" + + urn = Column(Integer, primary_key=True) + year = Column(Integer, primary_key=True) + per_pupil_spend = Column(Float) # £ total expenditure per pupil + staff_cost_pct = Column(Float) # % of budget on all staff + teacher_cost_pct = Column(Float) # % on teachers specifically + support_staff_cost_pct = Column(Float) + premises_cost_pct = Column(Float) + + __table_args__ = ( + Index('ix_school_finance_urn_year', 'urn', 'year'), + ) + + def __repr__(self): + return f"" + + # Mapping from CSV columns to model fields SCHOOL_FIELD_MAPPING = { 'urn': 'urn', diff --git a/backend/version.py b/backend/version.py index 49d8f5b..c5a21f8 100644 --- a/backend/version.py +++ b/backend/version.py @@ -13,10 +13,11 @@ WHEN TO BUMP: """ # Current schema 
version - increment when models change -SCHEMA_VERSION = 2 +SCHEMA_VERSION = 3 # Changelog for documentation SCHEMA_CHANGELOG = { 1: "Initial schema with School and SchoolResult tables", 2: "Added pupil absence fields (reading, maths, gps, writing, science)", + 3: "Added supplementary data tables: ofsted, parent_view, census, admissions, sen_detail, phonics, deprivation, finance; GIAS columns on schools", } diff --git a/docker-compose.yml b/docker-compose.yml index 9434c70..de40bd5 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -77,9 +77,70 @@ services: retries: 3 start_period: 40s + # Kestra — workflow orchestrator (UI at http://localhost:8080) + kestra: + image: kestra/kestra:latest + container_name: schoolcompare_kestra + ports: + - "8080:8080" + volumes: + - kestra_storage:/app/storage + - ./integrator/flows:/flows + environment: + KESTRA_CONFIGURATION: | + datasources: + postgres: + url: jdbc:postgresql://db:5432/kestra + driverClassName: org.postgresql.Driver + username: schoolcompare + password: schoolcompare + kestra: + repository: + type: postgres + queue: + type: postgres + storage: + type: local + local: + base-path: /app/storage + depends_on: + db: + condition: service_healthy + networks: + - schoolcompare-network + restart: unless-stopped + + # Data integrator — Python microservice called by Kestra + integrator: + build: + context: ./integrator + dockerfile: Dockerfile + container_name: schoolcompare_integrator + ports: + - "8001:8001" + environment: + DATABASE_URL: postgresql://schoolcompare:schoolcompare@db:5432/schoolcompare + DATA_DIR: /data + PYTHONUNBUFFERED: 1 + volumes: + - ./data:/data + depends_on: + db: + condition: service_healthy + networks: + - schoolcompare-network + restart: unless-stopped + healthcheck: + test: ["CMD", "curl", "-f", "http://localhost:8001/health"] + interval: 30s + timeout: 10s + retries: 3 + start_period: 15s + networks: schoolcompare-network: driver: bridge volumes: postgres_data: + kestra_storage: diff 
--git a/integrator/Dockerfile b/integrator/Dockerfile new file mode 100644 index 0000000..c21e55d --- /dev/null +++ b/integrator/Dockerfile @@ -0,0 +1,13 @@ +FROM python:3.12-slim + +WORKDIR /app + +# Install dependencies +COPY requirements.txt . +RUN pip install --no-cache-dir -r requirements.txt + +# Copy application code +COPY scripts/ ./scripts/ +COPY server.py . + +CMD ["uvicorn", "server:app", "--host", "0.0.0.0", "--port", "8001"] diff --git a/integrator/flows/admissions.yml b/integrator/flows/admissions.yml new file mode 100644 index 0000000..51cab2d --- /dev/null +++ b/integrator/flows/admissions.yml @@ -0,0 +1,25 @@ +id: admissions-annual-update +namespace: schoolcompare.data +description: Download and load school admissions data via EES API + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 4 1 7 *" # 1 July annually at 04:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/admissions?action=download + method: POST + timeout: PT20M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/admissions?action=load + method: POST + timeout: PT30M + +retry: + maxAttempts: 3 + delay: PT15M diff --git a/integrator/flows/census.yml b/integrator/flows/census.yml new file mode 100644 index 0000000..df557e5 --- /dev/null +++ b/integrator/flows/census.yml @@ -0,0 +1,25 @@ +id: census-annual-update +namespace: schoolcompare.data +description: Download and load School Census (SPC) data via EES API + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 4 1 9 *" # 1 September annually at 04:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/census?action=download + method: POST + timeout: PT20M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/census?action=load + method: POST + timeout: PT30M + +retry: 
+ maxAttempts: 3 + delay: PT15M diff --git a/integrator/flows/finance.yml b/integrator/flows/finance.yml new file mode 100644 index 0000000..e1e84f8 --- /dev/null +++ b/integrator/flows/finance.yml @@ -0,0 +1,25 @@ +id: finance-annual-update +namespace: schoolcompare.data +description: Fetch FBIT financial benchmarking data from DfE API for all schools + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 4 1 12 *" # 1 December annually at 04:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/finance?action=download + method: POST + timeout: PT120M # Fetches per-school from API — ~20k schools + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/finance?action=load + method: POST + timeout: PT30M + +retry: + maxAttempts: 2 + delay: PT30M diff --git a/integrator/flows/gias.yml b/integrator/flows/gias.yml new file mode 100644 index 0000000..78d1d06 --- /dev/null +++ b/integrator/flows/gias.yml @@ -0,0 +1,30 @@ +id: gias-weekly-update +namespace: schoolcompare.data +description: Download and load GIAS (Get Information About Schools) bulk CSV + +triggers: + - id: weekly-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 3 * * 0" # Every Sunday at 03:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/gias?action=download + method: POST + timeout: PT30M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/gias?action=load + method: POST + timeout: PT30M + +errors: + - id: notify-failure + type: io.kestra.plugin.core.log.Log + message: "GIAS update FAILED: {{ error.message }}" + +retry: + maxAttempts: 3 + delay: PT10M diff --git a/integrator/flows/idaci.yml b/integrator/flows/idaci.yml new file mode 100644 index 0000000..5b6bc35 --- /dev/null +++ b/integrator/flows/idaci.yml @@ -0,0 +1,25 @@ +id: idaci-annual-check +namespace: 
schoolcompare.data +description: Download IoD2019 IDACI file and compute deprivation scores for all schools + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 5 1 1 *" # 1 January annually at 05:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/idaci?action=download + method: POST + timeout: PT10M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/idaci?action=load + method: POST + timeout: PT60M + +retry: + maxAttempts: 2 + delay: PT30M diff --git a/integrator/flows/ofsted.yml b/integrator/flows/ofsted.yml new file mode 100644 index 0000000..a309246 --- /dev/null +++ b/integrator/flows/ofsted.yml @@ -0,0 +1,32 @@ +id: ofsted-monthly-update +namespace: schoolcompare.data +description: Download and load Ofsted Monthly Management Information CSV + +triggers: + - id: monthly-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 2 1 * *" # 1st of each month at 02:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/ofsted?action=download + method: POST + allowFailed: false + timeout: PT10M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/ofsted?action=load + method: POST + allowFailed: false + timeout: PT30M + +errors: + - id: notify-failure + type: io.kestra.plugin.core.log.Log + message: "Ofsted update FAILED: {{ error.message }}" + +retry: + maxAttempts: 3 + delay: PT10M diff --git a/integrator/flows/parent_view.yml b/integrator/flows/parent_view.yml new file mode 100644 index 0000000..2914cef --- /dev/null +++ b/integrator/flows/parent_view.yml @@ -0,0 +1,30 @@ +id: parent-view-monthly-check +namespace: schoolcompare.data +description: Download and load Ofsted Parent View open data (released ~3x/year) + +triggers: + - id: monthly-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 3 1 * *" # 1st of each 
month at 03:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/parent_view?action=download + method: POST + timeout: PT10M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/parent_view?action=load + method: POST + timeout: PT20M + +errors: + - id: notify-failure + type: io.kestra.plugin.core.log.Log + message: "Parent View update FAILED: {{ error.message }}" + +retry: + maxAttempts: 3 + delay: PT10M diff --git a/integrator/flows/phonics.yml b/integrator/flows/phonics.yml new file mode 100644 index 0000000..6241d0d --- /dev/null +++ b/integrator/flows/phonics.yml @@ -0,0 +1,25 @@ +id: phonics-annual-update +namespace: schoolcompare.data +description: Download and load Phonics Screening Check data via EES API + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 5 1 9 *" # 1 September annually at 05:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/phonics?action=download + method: POST + timeout: PT20M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/phonics?action=load + method: POST + timeout: PT30M + +retry: + maxAttempts: 3 + delay: PT15M diff --git a/integrator/flows/sen_detail.yml b/integrator/flows/sen_detail.yml new file mode 100644 index 0000000..4ee883c --- /dev/null +++ b/integrator/flows/sen_detail.yml @@ -0,0 +1,25 @@ +id: sen-detail-annual-update +namespace: schoolcompare.data +description: Download and load SEN primary need breakdown via EES API + +triggers: + - id: annual-schedule + type: io.kestra.plugin.core.trigger.Schedule + cron: "0 4 15 9 *" # 15 September annually at 04:00 + +tasks: + - id: download + type: io.kestra.plugin.core.http.Request + uri: http://integrator:8001/run/sen_detail?action=download + method: POST + timeout: PT20M + + - id: load + type: io.kestra.plugin.core.http.Request + uri: 
http://integrator:8001/run/sen_detail?action=load + method: POST + timeout: PT30M + +retry: + maxAttempts: 3 + delay: PT15M diff --git a/integrator/requirements.txt b/integrator/requirements.txt new file mode 100644 index 0000000..676ff15 --- /dev/null +++ b/integrator/requirements.txt @@ -0,0 +1,7 @@ +fastapi==0.115.0 +uvicorn[standard]==0.30.6 +requests==2.32.3 +pandas==2.2.3 +openpyxl==3.1.5 +psycopg2-binary==2.9.9 +sqlalchemy==2.0.35 diff --git a/integrator/scripts/config.py b/integrator/scripts/config.py new file mode 100644 index 0000000..873eada --- /dev/null +++ b/integrator/scripts/config.py @@ -0,0 +1,11 @@ +"""Configuration for the data integrator.""" +import os +from pathlib import Path + +DATABASE_URL = os.environ.get( + "DATABASE_URL", + "postgresql://schoolcompare:schoolcompare@db:5432/schoolcompare", +) + +DATA_DIR = Path(os.environ.get("DATA_DIR", "/data")) +SUPPLEMENTARY_DIR = DATA_DIR / "supplementary" diff --git a/integrator/scripts/db.py b/integrator/scripts/db.py new file mode 100644 index 0000000..2e89b32 --- /dev/null +++ b/integrator/scripts/db.py @@ -0,0 +1,23 @@ +"""Database connection for the integrator.""" +from contextlib import contextmanager + +from sqlalchemy import create_engine +from sqlalchemy.orm import sessionmaker + +from config import DATABASE_URL + +engine = create_engine(DATABASE_URL, pool_pre_ping=True) +SessionLocal = sessionmaker(autocommit=False, autoflush=False, bind=engine) + + +@contextmanager +def get_session(): + session = SessionLocal() + try: + yield session + session.commit() + except Exception: + session.rollback() + raise + finally: + session.close() diff --git a/integrator/scripts/sources/__init__.py b/integrator/scripts/sources/__init__.py new file mode 100644 index 0000000..e69de29 diff --git a/integrator/scripts/sources/admissions.py b/integrator/scripts/sources/admissions.py new file mode 100644 index 0000000..ff38548 --- /dev/null +++ b/integrator/scripts/sources/admissions.py @@ -0,0 +1,158 @@ +""" 
+School Admissions data downloader and loader. + +Source: EES publication "secondary-and-primary-school-applications-and-offers" +Update: Annual (June/July post-offer round) +""" +import argparse +import re +import sys +from pathlib import Path + +import pandas as pd + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session +from sources.ees import get_latest_csv_url, download_csv + +DEST_DIR = SUPPLEMENTARY_DIR / "admissions" +PUBLICATION_SLUG = "secondary-and-primary-school-applications-and-offers" + +NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""} + +COLUMN_MAP = { + "URN": "urn", + "urn": "urn", + "YEAR": "year", + "Year": "year", + # PAN + "PAN": "pan", + "published_admission_number": "pan", + "admissions_number": "pan", + # Applications + "total_applications": "total_applications", + "TAPP": "total_applications", + "applications_received": "total_applications", + # 1st preference offers + "first_preference_offers_pct": "first_preference_offers_pct", + "pct_1st_preference": "first_preference_offers_pct", + "PT1PREF": "first_preference_offers_pct", + # Oversubscription + "oversubscribed": "oversubscribed", +} + + +def download(data_dir: Path | None = None) -> Path: + dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR + dest.mkdir(parents=True, exist_ok=True) + + url = get_latest_csv_url(PUBLICATION_SLUG, keyword="primary") + if not url: + url = get_latest_csv_url(PUBLICATION_SLUG) + if not url: + raise RuntimeError("Could not find CSV URL for admissions publication") + + filename = url.split("/")[-1].split("?")[0] or "admissions_latest.csv" + return download_csv(url, dest / filename) + + +def _parse_int(val) -> int | None: + if pd.isna(val): + return None + s = str(val).strip().upper().replace(",", "") + if s in NULL_VALUES: + return None + try: + return int(float(s)) + except ValueError: + return None + + +def _parse_pct(val) -> float | None: + if 
_TRUE_FLAGS = frozenset({"1", "TRUE", "T", "YES", "Y"})
_FALSE_FLAGS = frozenset({"0", "FALSE", "F", "NO", "N"})


def _parse_flag(val) -> bool | None:
    """Interpret a CSV cell as a yes/no flag; None when missing or unrecognised."""
    if pd.isna(val):
        return None
    if isinstance(val, str):
        # bool("No") is True, so text flags must be matched explicitly.
        token = val.strip().upper()
        if token in _TRUE_FLAGS:
            return True
        if token in _FALSE_FLAGS:
            return False
        return None
    return bool(val)


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert school admissions rows from a CSV into ``school_admissions``.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``*.csv`` in the admissions directory is used.
        data_dir: Optional data root; the CSV is then searched for under
            ``<data_dir>/supplementary/admissions`` instead of ``DEST_DIR``.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}``. ``n`` counts upserted
        rows (inserts and updates are not distinguished); ``m`` counts rows
        dropped because no year could be determined for them.

    Raises:
        FileNotFoundError: No CSV is available to load.
        ValueError: The CSV has no URN column after header normalisation.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "admissions") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No admissions CSV found in {dest}")
        path = files[-1]

    print(f" Admissions: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Fallback year for rows that carry none: first "20xx" in the file name.
    file_year = None
    m = re.search(r"20(\d{2})", path.stem)
    if m:
        file_year = int("20" + m.group(1))

    upsert = text("""
        INSERT INTO school_admissions
            (urn, year, published_admission_number, total_applications,
             first_preference_offers_pct, oversubscribed)
        VALUES (:urn, :year, :pan, :total_apps, :pct_1st, :oversubscribed)
        ON CONFLICT (urn, year) DO UPDATE SET
            published_admission_number = EXCLUDED.published_admission_number,
            total_applications = EXCLUDED.total_applications,
            first_preference_offers_pct = EXCLUDED.first_preference_offers_pct,
            oversubscribed = EXCLUDED.oversubscribed
    """)

    inserted = skipped = 0
    with get_session() as session:
        for _, row in df.iterrows():
            # _parse_int tolerates "2024.0" and suppression markers, so one
            # malformed year cell no longer aborts the whole load.
            row_year = _parse_int(row.get("year")) or file_year
            if not row_year:
                skipped += 1
                continue

            pan = _parse_int(row.get("pan"))
            total_apps = _parse_int(row.get("total_applications"))
            pct_1st = _parse_pct(row.get("first_preference_offers_pct"))

            oversubscribed = _parse_flag(row.get("oversubscribed"))
            if oversubscribed is None and pan and total_apps and total_apps > pan:
                # No usable explicit flag: infer it from applications vs PAN.
                oversubscribed = True

            session.execute(
                upsert,
                {
                    "urn": int(row["urn"]),
                    "year": row_year,
                    "pan": pan,
                    "total_apps": total_apps,
                    "pct_1st": pct_1st,
                    "oversubscribed": oversubscribed,
                },
            )
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" Admissions: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}
+ +Source: EES publication "schools-pupils-and-their-characteristics" +Update: Annual (June) +Adds: class_size_avg, ethnicity breakdown by school +""" +import argparse +import re +import sys +from pathlib import Path + +import pandas as pd + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session +from sources.ees import get_latest_csv_url, download_csv + +DEST_DIR = SUPPLEMENTARY_DIR / "census" +PUBLICATION_SLUG = "schools-pupils-and-their-characteristics" + +NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""} + +COLUMN_MAP = { + "URN": "urn", + "urn": "urn", + "YEAR": "year", + "Year": "year", + # Class size + "average_class_size": "class_size_avg", + "AVCLAS": "class_size_avg", + "avg_class_size": "class_size_avg", + # Ethnicity — DfE uses ethnicity major group percentages + "perc_white": "ethnicity_white_pct", + "perc_asian": "ethnicity_asian_pct", + "perc_black": "ethnicity_black_pct", + "perc_mixed": "ethnicity_mixed_pct", + "perc_other_ethnic": "ethnicity_other_pct", + "PTWHITE": "ethnicity_white_pct", + "PTASIAN": "ethnicity_asian_pct", + "PTBLACK": "ethnicity_black_pct", + "PTMIXED": "ethnicity_mixed_pct", + "PTOTHER": "ethnicity_other_pct", +} + + +def download(data_dir: Path | None = None) -> Path: + dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR + dest.mkdir(parents=True, exist_ok=True) + + url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school") + if not url: + raise RuntimeError(f"Could not find CSV URL for census publication") + + filename = url.split("/")[-1].split("?")[0] or "census_latest.csv" + return download_csv(url, dest / filename) + + +def _parse_pct(val) -> float | None: + if pd.isna(val): + return None + s = str(val).strip().upper().replace("%", "") + if s in NULL_VALUES: + return None + try: + return float(s) + except ValueError: + return None + + +def load(path: Path | None = None, data_dir: Path | None = None) -> dict: + if path is 
None: + dest = (data_dir / "supplementary" / "census") if data_dir else DEST_DIR + files = sorted(dest.glob("*.csv")) + if not files: + raise FileNotFoundError(f"No census CSV found in {dest}") + path = files[-1] + + print(f" Census: loading {path} ...") + df = pd.read_csv(path, encoding="latin-1", low_memory=False) + df.rename(columns=COLUMN_MAP, inplace=True) + + if "urn" not in df.columns: + raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}") + + df["urn"] = pd.to_numeric(df["urn"], errors="coerce") + df = df.dropna(subset=["urn"]) + df["urn"] = df["urn"].astype(int) + + year = None + m = re.search(r"20(\d{2})", path.stem) + if m: + year = int("20" + m.group(1)) + + inserted = 0 + with get_session() as session: + from sqlalchemy import text + for _, row in df.iterrows(): + urn = int(row["urn"]) + row_year = int(row["year"]) if "year" in df.columns and pd.notna(row.get("year")) else year + if not row_year: + continue + + session.execute( + text(""" + INSERT INTO school_census + (urn, year, class_size_avg, + ethnicity_white_pct, ethnicity_asian_pct, ethnicity_black_pct, + ethnicity_mixed_pct, ethnicity_other_pct) + VALUES (:urn, :year, :class_size_avg, + :white, :asian, :black, :mixed, :other) + ON CONFLICT (urn, year) DO UPDATE SET + class_size_avg = EXCLUDED.class_size_avg, + ethnicity_white_pct = EXCLUDED.ethnicity_white_pct, + ethnicity_asian_pct = EXCLUDED.ethnicity_asian_pct, + ethnicity_black_pct = EXCLUDED.ethnicity_black_pct, + ethnicity_mixed_pct = EXCLUDED.ethnicity_mixed_pct, + ethnicity_other_pct = EXCLUDED.ethnicity_other_pct + """), + { + "urn": urn, + "year": row_year, + "class_size_avg": _parse_pct(row.get("class_size_avg")), + "white": _parse_pct(row.get("ethnicity_white_pct")), + "asian": _parse_pct(row.get("ethnicity_asian_pct")), + "black": _parse_pct(row.get("ethnicity_black_pct")), + "mixed": _parse_pct(row.get("ethnicity_mixed_pct")), + "other": _parse_pct(row.get("ethnicity_other_pct")), + }, + ) + inserted += 1 + 
def get_publication_files(publication_slug: str) -> list[dict]:
    """Fetch the data-set file descriptors listed for one publication.

    Issues a GET to ``{API_BASE}/publications/<slug>/data-set-files`` and
    returns the ``results`` array of the JSON body, or an empty list when the
    body has no such key. A non-success HTTP status raises via
    ``raise_for_status()``.
    """
    endpoint = f"{API_BASE}/publications/{publication_slug}/data-set-files"
    response = requests.get(endpoint, timeout=TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    return payload.get("results", [])
def download_csv(url: str, dest_path: Path) -> Path:
    """Download a CSV from EES to dest_path, skipping files already present.

    The body is streamed to a sibling ``<name>.part`` file and renamed onto
    ``dest_path`` only after the transfer completes. An interrupted download
    therefore never leaves a truncated file that a later run would mistake
    for a finished one.

    Args:
        url: Absolute download URL.
        dest_path: Final location of the file; parent directories are created.

    Returns:
        ``dest_path``.

    Raises:
        requests.HTTPError: The server answered with a non-success status.
    """
    if dest_path.exists():
        print(f" EES: {dest_path.name} already exists, skipping.")
        return dest_path

    print(f" EES: downloading {url} ...")
    dest_path.parent.mkdir(parents=True, exist_ok=True)
    # ".part" keeps the temporary file out of the loaders' "*.csv" globs.
    tmp_path = dest_path.with_name(dest_path.name + ".part")
    try:
        # The context manager releases the streamed connection on all paths.
        with requests.get(url, timeout=300, stream=True) as resp:
            resp.raise_for_status()
            with open(tmp_path, "wb") as f:
                for chunk in resp.iter_content(chunk_size=65536):
                    f.write(chunk)
        tmp_path.replace(dest_path)
    finally:
        # No-op after a successful rename; removes the partial file otherwise.
        tmp_path.unlink(missing_ok=True)

    print(f" EES: saved {dest_path} ({dest_path.stat().st_size // 1024} KB)")
    return dest_path
def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert per-school financial figures from a ``fbit_*.csv`` file.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``fbit_*.csv`` in the finance directory is used.
        data_dir: Optional data root; the CSV is then searched for under
            ``<data_dir>/supplementary/finance`` instead of ``DEST_DIR``.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}``. ``n`` counts upserted
        rows (inserts and updates are not distinguished); ``m`` counts rows
        dropped because their URN or year is missing or non-numeric.

    Raises:
        FileNotFoundError: No finance CSV is available to load.
        ValueError: The CSV lacks the ``urn`` or ``year`` column.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "finance") if data_dir else DEST_DIR
        files = sorted(dest.glob("fbit_*.csv"))
        if not files:
            raise FileNotFoundError(f"No finance CSV found in {dest}")
        path = files[-1]

    print(f" Finance: loading {path} ...")
    try:
        df = pd.read_csv(path)
    except pd.errors.EmptyDataError:
        # A CSV written from zero fetched records has no header row at all.
        print(f" Finance: {path} contains no records, nothing to load")
        return {"inserted": 0, "updated": 0, "skipped": 0}

    missing = {"urn", "year"} - set(df.columns)
    if missing:
        raise ValueError(
            f"Finance CSV is missing required column(s) {sorted(missing)}. "
            f"Available: {list(df.columns)[:20]}"
        )

    for key_column in ("urn", "year"):
        df[key_column] = pd.to_numeric(df[key_column], errors="coerce")
    rows_before = len(df)
    df = df.dropna(subset=["urn", "year"])
    skipped = rows_before - len(df)

    # SQL bind parameter -> CSV column holding its value.
    value_columns = {
        "per_pupil": "per_pupil_spend",
        "staff": "staff_cost_pct",
        "teacher": "teacher_cost_pct",
        "support": "support_staff_cost_pct",
        "premises": "premises_cost_pct",
    }

    upsert = text("""
        INSERT INTO school_finance
            (urn, year, per_pupil_spend, staff_cost_pct, teacher_cost_pct,
             support_staff_cost_pct, premises_cost_pct)
        VALUES (:urn, :year, :per_pupil, :staff, :teacher, :support, :premises)
        ON CONFLICT (urn, year) DO UPDATE SET
            per_pupil_spend = EXCLUDED.per_pupil_spend,
            staff_cost_pct = EXCLUDED.staff_cost_pct,
            teacher_cost_pct = EXCLUDED.teacher_cost_pct,
            support_staff_cost_pct = EXCLUDED.support_staff_cost_pct,
            premises_cost_pct = EXCLUDED.premises_cost_pct
    """)

    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            params = {"urn": int(row["urn"]), "year": int(row["year"])}
            for bind_name, column in value_columns.items():
                raw = row.get(column)
                params[bind_name] = float(raw) if pd.notna(raw) else None

            session.execute(upsert, params)
            inserted += 1
            if inserted % 2000 == 0:
                session.flush()

    print(f" Finance: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}
--git a/integrator/scripts/sources/gias.py b/integrator/scripts/sources/gias.py new file mode 100644 index 0000000..19b232c --- /dev/null +++ b/integrator/scripts/sources/gias.py @@ -0,0 +1,159 @@ +""" +GIAS (Get Information About Schools) bulk CSV downloader and loader. + +Source: https://get-information-schools.service.gov.uk/Downloads +Update: Daily; we refresh weekly. +Adds: website, headteacher_name, capacity, trust_name, trust_uid, gender, nursery_provision +""" +import argparse +import sys +from datetime import date +from pathlib import Path + +import pandas as pd +import requests + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session + +DEST_DIR = SUPPLEMENTARY_DIR / "gias" + +# GIAS bulk download URL — date is injected at runtime +GIAS_URL_TEMPLATE = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata{date}.csv" + +COLUMN_MAP = { + "URN": "urn", + "SchoolWebsite": "website", + "SchoolCapacity": "capacity", + "TrustName": "trust_name", + "TrustUID": "trust_uid", + "Gender (name)": "gender", + "NurseryProvision (name)": "nursery_provision_raw", + "HeadTitle": "head_title", + "HeadFirstName": "head_first", + "HeadLastName": "head_last", +} + + +def download(data_dir: Path | None = None) -> Path: + dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR + dest.mkdir(parents=True, exist_ok=True) + + today = date.today().strftime("%Y%m%d") + url = GIAS_URL_TEMPLATE.format(date=today) + filename = f"gias_{today}.csv" + dest_file = dest / filename + + if dest_file.exists(): + print(f" GIAS: {filename} already exists, skipping download.") + return dest_file + + print(f" GIAS: downloading {url} ...") + resp = requests.get(url, timeout=300, stream=True) + + # GIAS may not have today's file yet — fall back to yesterday + if resp.status_code == 404: + from datetime import timedelta + yesterday = (date.today() - timedelta(days=1)).strftime("%Y%m%d") + 
def _gias_str(val) -> str | None:
    """Return a stripped string, or None for missing, blank or 'nan'/'none' cells."""
    if pd.isna(val):
        return None
    if isinstance(val, float) and val.is_integer():
        # Identifier columns containing blanks are read as floats (1234.0).
        return str(int(val))
    s = str(val).strip()
    return s if s and s.lower() not in ("nan", "none") else None


def _gias_int(val) -> int | None:
    """Return a non-negative whole-number cell as int, else None.

    Accepts the float rendering pandas uses for integer columns that contain
    blanks (``210.0``), which ``str.isdigit()`` rejects.
    """
    if pd.isna(val):
        return None
    try:
        number = float(str(val).strip())
    except ValueError:
        return None
    if number < 0 or not number.is_integer():
        return None
    return int(number)


def _gias_nursery(val) -> bool | None:
    """Map the nursery-provision text to a flag: True when it starts with 'has'."""
    if pd.isna(val):
        return None
    return str(val).strip().lower().startswith("has")


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Copy GIAS attributes from a bulk CSV onto existing ``schools`` rows.

    Each CSV row issues an ``UPDATE schools ... WHERE urn = :urn`` that sets
    website, headteacher_name, capacity, trust_name, trust_uid, gender and
    nursery_provision. Values missing from the CSV are written as NULL.

    Args:
        path: CSV file to load. When omitted, the lexicographically last
            ``gias_*.csv`` in the GIAS directory is used.
        data_dir: Optional data root; the CSV is then searched for under
            ``<data_dir>/supplementary/gias`` instead of ``DEST_DIR``.

    Returns:
        ``{"inserted": 0, "updated": n, "skipped": m}``. ``n`` counts CSV rows
        whose URN matched a ``schools`` row; ``m`` counts rows whose URN
        matched nothing.

    Raises:
        FileNotFoundError: No GIAS CSV is available to load.
        ValueError: The CSV has no URN column after header normalisation.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "gias") if data_dir else DEST_DIR
        files = sorted(dest.glob("gias_*.csv"))
        if not files:
            raise FileNotFoundError(f"No GIAS CSV found in {dest}")
        path = files[-1]

    print(f" GIAS: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    update_stmt = text("""
        UPDATE schools SET
            website = :website,
            headteacher_name = :headteacher_name,
            capacity = :capacity,
            trust_name = :trust_name,
            trust_uid = :trust_uid,
            gender = :gender,
            nursery_provision = :nursery_provision
        WHERE urn = :urn
    """)

    updated = skipped = 0
    with get_session() as session:
        for processed, (_, row) in enumerate(df.iterrows(), start=1):
            # Missing name parts are NaN, which is truthy: clean each part
            # first so "nan" never ends up inside the stored name.
            name_parts = [
                _gias_str(row.get(column))
                for column in ("head_title", "head_first", "head_last")
            ]
            headteacher_name = " ".join(p for p in name_parts if p) or None

            result = session.execute(
                update_stmt,
                {
                    "urn": int(row["urn"]),
                    "website": _gias_str(row.get("website")),
                    "headteacher_name": headteacher_name,
                    "capacity": _gias_int(row.get("capacity")),
                    "trust_name": _gias_str(row.get("trust_name")),
                    "trust_uid": _gias_str(row.get("trust_uid")),
                    "gender": _gias_str(row.get("gender")),
                    "nursery_provision": _gias_nursery(row.get("nursery_provision_raw")),
                },
            )
            # rowcount is 0 when the URN is not present in the schools table.
            if result.rowcount == 0:
                skipped += 1
            else:
                updated += 1

            if processed % 5000 == 0:
                session.flush()
                print(f" Updated {updated} schools...")

    print(f" GIAS: updated {updated} school records")
    return {"inserted": 0, "updated": updated, "skipped": skipped}
choices=["download", "load", "all"], default="all") + parser.add_argument("--data-dir", type=Path, default=None) + args = parser.parse_args() + + if args.action in ("download", "all"): + path = download(args.data_dir) + if args.action in ("load", "all"): + load(data_dir=args.data_dir) diff --git a/integrator/scripts/sources/idaci.py b/integrator/scripts/sources/idaci.py new file mode 100644 index 0000000..bbcd199 --- /dev/null +++ b/integrator/scripts/sources/idaci.py @@ -0,0 +1,176 @@ +""" +IDACI (Income Deprivation Affecting Children Index) loader. + +Source: English Indices of Deprivation 2019 +https://www.gov.uk/government/statistics/english-indices-of-deprivation-2019 + +This is a one-time download (5-yearly release). We join school postcodes to LSOAs +via postcodes.io, then look up IDACI scores from the IoD2019 file. + +Update: ~5-yearly (next release expected 2025/26) +""" +import argparse +import sys +from pathlib import Path + +import pandas as pd +import requests + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session + +DEST_DIR = SUPPLEMENTARY_DIR / "idaci" + +# IoD 2019 supplementary data — "Income Deprivation Affecting Children Index (IDACI)" +IOD_2019_URL = ( + "https://assets.publishing.service.gov.uk/government/uploads/system/uploads/" + "attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx" +) + +POSTCODES_IO_BATCH = "https://api.postcodes.io/postcodes" +BATCH_SIZE = 100 + + +def download(data_dir: Path | None = None) -> Path: + dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR + dest.mkdir(parents=True, exist_ok=True) + + filename = "iod2019_idaci.xlsx" + dest_file = dest / filename + if dest_file.exists(): + print(f" IDACI: {filename} already exists, skipping download.") + return dest_file + + print(f" IDACI: downloading IoD2019 file ...") + resp = requests.get(IOD_2019_URL, timeout=300, stream=True) + resp.raise_for_status() + 
with open(dest_file, "wb") as f: + for chunk in resp.iter_content(chunk_size=65536): + f.write(chunk) + + print(f" IDACI: saved {dest_file}") + return dest_file + + +def _postcode_to_lsoa(postcodes: list[str]) -> dict[str, str]: + """Batch-resolve postcodes to LSOA codes via postcodes.io.""" + result = {} + valid = [p.strip().upper() for p in postcodes if p and len(str(p).strip()) >= 5] + valid = list(set(valid)) + + for i in range(0, len(valid), BATCH_SIZE): + batch = valid[i:i + BATCH_SIZE] + try: + resp = requests.post(POSTCODES_IO_BATCH, json={"postcodes": batch}, timeout=30) + if resp.status_code == 200: + for item in resp.json().get("result", []): + if item and item.get("result"): + lsoa = item["result"].get("lsoa") + if lsoa: + result[item["query"].upper()] = lsoa + except Exception as e: + print(f" Warning: postcodes.io batch failed: {e}") + + return result + + +def load(path: Path | None = None, data_dir: Path | None = None) -> dict: + dest = (data_dir / "supplementary" / "idaci") if data_dir else DEST_DIR + if path is None: + files = sorted(dest.glob("*.xlsx")) + if not files: + raise FileNotFoundError(f"No IDACI file found in {dest}") + path = files[-1] + + print(f" IDACI: loading IoD2019 from {path} ...") + + # IoD2019 File 1 — sheet "IoD2019 IDACI" or similar + try: + iod_df = pd.read_excel(path, sheet_name=None) + # Find sheet with IDACI data + idaci_sheet = None + for name, df in iod_df.items(): + if "IDACI" in name.upper() or "IDACI" in str(df.columns.tolist()).upper(): + idaci_sheet = name + break + if idaci_sheet is None: + idaci_sheet = list(iod_df.keys())[0] + df_iod = iod_df[idaci_sheet] + except Exception as e: + raise RuntimeError(f"Could not read IoD2019 file: {e}") + + # Normalise column names — IoD2019 uses specific headers + col_lsoa = next((c for c in df_iod.columns if "LSOA" in str(c).upper() and "code" in str(c).lower()), None) + col_score = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "score" in str(c).lower()), 
None) + col_rank = next((c for c in df_iod.columns if "IDACI" in str(c).upper() and "rank" in str(c).lower()), None) + + if not col_lsoa or not col_score: + print(f" IDACI columns available: {list(df_iod.columns)[:20]}") + raise ValueError("Could not find LSOA code or IDACI score columns") + + df_iod = df_iod[[col_lsoa, col_score]].copy() + df_iod.columns = ["lsoa_code", "idaci_score"] + df_iod = df_iod.dropna() + + # Compute decile from rank (or from score distribution) + total = len(df_iod) + df_iod = df_iod.sort_values("idaci_score", ascending=False) + df_iod["idaci_decile"] = (pd.qcut(df_iod["idaci_score"], 10, labels=False) + 1).astype(int) + # Decile 1 = most deprived (highest IDACI score) + df_iod["idaci_decile"] = 11 - df_iod["idaci_decile"] + + lsoa_lookup = df_iod.set_index("lsoa_code")[["idaci_score", "idaci_decile"]].to_dict("index") + print(f" IDACI: loaded {len(lsoa_lookup)} LSOA records") + + # Fetch all school postcodes from the database + with get_session() as session: + from sqlalchemy import text + rows = session.execute(text("SELECT urn, postcode FROM schools WHERE postcode IS NOT NULL")).fetchall() + + postcodes = [r[1] for r in rows] + print(f" IDACI: resolving {len(postcodes)} postcodes via postcodes.io ...") + pc_to_lsoa = _postcode_to_lsoa(postcodes) + print(f" IDACI: resolved {len(pc_to_lsoa)} postcodes to LSOAs") + + inserted = skipped = 0 + with get_session() as session: + from sqlalchemy import text + for urn, postcode in rows: + lsoa = pc_to_lsoa.get(str(postcode).strip().upper()) + if not lsoa: + skipped += 1 + continue + iod = lsoa_lookup.get(lsoa) + if not iod: + skipped += 1 + continue + + session.execute( + text(""" + INSERT INTO school_deprivation (urn, lsoa_code, idaci_score, idaci_decile) + VALUES (:urn, :lsoa, :score, :decile) + ON CONFLICT (urn) DO UPDATE SET + lsoa_code = EXCLUDED.lsoa_code, + idaci_score = EXCLUDED.idaci_score, + idaci_decile = EXCLUDED.idaci_decile + """), + {"urn": urn, "lsoa": lsoa, "score": 
float(iod["idaci_score"]), "decile": int(iod["idaci_decile"])}, + ) + inserted += 1 + if inserted % 2000 == 0: + session.flush() + + print(f" IDACI: upserted {inserted}, skipped {skipped}") + return {"inserted": inserted, "updated": 0, "skipped": skipped} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--action", choices=["download", "load", "all"], default="all") + parser.add_argument("--data-dir", type=Path, default=None) + args = parser.parse_args() + if args.action in ("download", "all"): + download(args.data_dir) + if args.action in ("load", "all"): + load(data_dir=args.data_dir) diff --git a/integrator/scripts/sources/ofsted.py b/integrator/scripts/sources/ofsted.py new file mode 100644 index 0000000..924c5e2 --- /dev/null +++ b/integrator/scripts/sources/ofsted.py @@ -0,0 +1,226 @@ +""" +Ofsted Monthly Management Information CSV downloader and loader. + +Source: https://www.gov.uk/government/statistical-data-sets/monthly-management-information-ofsteds-school-inspections-outcomes +Update: Monthly (released ~2 weeks into each month) +""" +import argparse +import re +import sys +from datetime import date, datetime +from pathlib import Path + +import pandas as pd +import requests + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session + +# Current Ofsted MI download URL — update this when Ofsted releases a new file. +# The URL follows a predictable pattern; we attempt to discover it from the GOV.UK page. 
def _discover_csv_url() -> str | None:
    """Return the first CSV/ZIP asset link found on the GOV.UK listing page.

    Fetches ``GOV_UK_PAGE`` and scans its HTML for an ``href`` that points at
    assets.publishing.service.gov.uk and ends in ``.csv`` or ``.zip``
    (case-insensitive). Any failure while fetching or scanning is reported
    as a warning and results in None, as does a page without such a link.
    """
    link_pattern = r'href="(https://assets\.publishing\.service\.gov\.uk[^"]+\.(?:csv|zip))"'
    try:
        page = requests.get(GOV_UK_PAGE, timeout=30)
        page.raise_for_status()
        # The earliest match in document order is the link that gets used.
        first_link = re.search(link_pattern, page.text, re.IGNORECASE)
        if first_link is not None:
            return first_link.group(1)
    except Exception as e:
        print(f" Warning: could not scrape GOV.UK page: {e}")
    return None
def _read_ofsted_frame(path: Path) -> pd.DataFrame:
    """Read the MI file, opening the first CSV member when given a ZIP."""
    if not str(path).lower().endswith(".zip"):
        return pd.read_csv(path, encoding="latin-1", low_memory=False)

    import io
    import zipfile

    with zipfile.ZipFile(path) as archive:
        members = [n for n in archive.namelist() if n.lower().endswith(".csv")]
        if not members:
            raise ValueError("No CSV found inside Ofsted ZIP")
        with archive.open(members[0]) as fh:
            return pd.read_csv(io.TextIOWrapper(fh, encoding="latin-1"), low_memory=False)


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Upsert the most recent Ofsted inspection per school.

    Args:
        path: CSV or ZIP file to load. When omitted, the most recently
            modified ``*.csv``/``*.zip`` in the Ofsted directory is used.
        data_dir: Optional data root; the file is then searched for under
            ``<data_dir>/supplementary/ofsted`` instead of ``DEST_DIR``.

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": 0}`` where ``n`` counts
        upserted rows (inserts and updates are not distinguished).

    Raises:
        FileNotFoundError: No Ofsted file is available to load.
        ValueError: The file has no URN column, or a ZIP contains no CSV.
    """
    from sqlalchemy import text

    if path is None:
        dest = (data_dir / "supplementary" / "ofsted") if data_dir else DEST_DIR
        files = [*dest.glob("*.csv"), *dest.glob("*.zip")]
        if not files:
            raise FileNotFoundError(f"No Ofsted MI file found in {dest}")
        # File names are not comparable across .csv and .zip, so pick the
        # newest download rather than "whichever ZIP sorts last".
        path = max(files, key=lambda p: (p.stat().st_mtime, p.name))

    print(f" Ofsted: loading {path} ...")
    df = _read_ofsted_frame(path)

    # Normalise column names
    df.rename(columns=COLUMN_MAP, inplace=True)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    # Only keep rows with a valid URN
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])
    df["urn"] = df["urn"].astype(int)

    # Keep only the most recent inspection per URN. drop_duplicates keeps a
    # whole row; groupby().first() would instead take the first non-null
    # value of every column and could blend several inspections together.
    if "inspection_date" in df.columns:
        df["_date_parsed"] = pd.to_datetime(
            df["inspection_date"].apply(_parse_date), errors="coerce"
        )
        df = df.sort_values(
            "_date_parsed", ascending=False, na_position="last", kind="stable"
        ).drop_duplicates(subset="urn", keep="first")

    upsert = text("""
        INSERT INTO ofsted_inspections
            (urn, inspection_date, publication_date, inspection_type,
             overall_effectiveness, quality_of_education, behaviour_attitudes,
             personal_development, leadership_management, early_years_provision,
             previous_overall)
        VALUES
            (:urn, :inspection_date, :publication_date, :inspection_type,
             :overall_effectiveness, :quality_of_education, :behaviour_attitudes,
             :personal_development, :leadership_management, :early_years_provision,
             :previous_overall)
        ON CONFLICT (urn) DO UPDATE SET
            previous_overall = CASE
                WHEN ofsted_inspections.inspection_date
                     IS DISTINCT FROM EXCLUDED.inspection_date
                THEN ofsted_inspections.overall_effectiveness
                ELSE ofsted_inspections.previous_overall
            END,
            inspection_date = EXCLUDED.inspection_date,
            publication_date = EXCLUDED.publication_date,
            inspection_type = EXCLUDED.inspection_type,
            overall_effectiveness = EXCLUDED.overall_effectiveness,
            quality_of_education = EXCLUDED.quality_of_education,
            behaviour_attitudes = EXCLUDED.behaviour_attitudes,
            personal_development = EXCLUDED.personal_development,
            leadership_management = EXCLUDED.leadership_management,
            early_years_provision = EXCLUDED.early_years_provision
    """)
    # The CASE above shifts the stored grade into previous_overall only when
    # a different inspection arrives; re-loading the same file is a no-op
    # for that column instead of overwriting it with the current grade.

    grade_columns = (
        "overall_effectiveness",
        "quality_of_education",
        "behaviour_attitudes",
        "personal_development",
        "leadership_management",
        "early_years_provision",
    )

    inserted = updated = skipped = 0
    with get_session() as session:
        for _, row in df.iterrows():
            raw_type = row.get("inspection_type")
            record = {
                "urn": int(row["urn"]),
                "inspection_date": _parse_date(row.get("inspection_date")),
                "publication_date": _parse_date(row.get("publication_date")),
                # str(NaN) is "nan": only stringify cells that hold a value.
                "inspection_type": (str(raw_type).strip() or None) if pd.notna(raw_type) else None,
                "previous_overall": None,
            }
            for column in grade_columns:
                record[column] = _parse_grade(row.get(column))

            session.execute(upsert, record)
            inserted += 1

            if inserted % 5000 == 0:
                session.flush()
                print(f" Processed {inserted} records...")

    print(f" Ofsted: upserted {inserted} records")
    return {"inserted": inserted, "updated": updated, "skipped": skipped}
+ +Source: https://parentview.ofsted.gov.uk/open-data +Update: ~3 times/year (Spring, Autumn, Summer) +""" +import argparse +import re +import sys +from datetime import date, datetime +from pathlib import Path + +import pandas as pd +import requests + +sys.path.insert(0, str(Path(__file__).parent.parent)) +from config import SUPPLEMENTARY_DIR +from db import get_session + +DEST_DIR = SUPPLEMENTARY_DIR / "parent_view" +OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data" + +# Question column mapping — Parent View open data uses descriptive column headers +# Map any variant to our internal field names +QUESTION_MAP = { + # Q1 — happiness + "My child is happy at this school": "q_happy_pct", + "Happy": "q_happy_pct", + # Q2 — safety + "My child feels safe at this school": "q_safe_pct", + "Safe": "q_safe_pct", + # Q3 — bullying + "The school makes sure its pupils are well behaved": "q_behaviour_pct", + "Well Behaved": "q_behaviour_pct", + # Q4 — bullying dealt with (sometimes separate) + "My child has been bullied and the school dealt with the bullying quickly and effectively": "q_bullying_pct", + "Bullying": "q_bullying_pct", + # Q5 — curriculum info + "The school makes me aware of what my child will learn during the year": "q_communication_pct", + "Aware of learning": "q_communication_pct", + # Q6 — concerns dealt with + "When I have raised concerns with the school, they have been dealt with properly": "q_communication_pct", + # Q7 — child does well + "My child does well at this school": "q_progress_pct", + "Does well": "q_progress_pct", + # Q8 — teaching + "The teaching is good at this school": "q_teaching_pct", + "Good teaching": "q_teaching_pct", + # Q9 — progress info + "I receive valuable information from the school about my child's progress": "q_information_pct", + "Progress information": "q_information_pct", + # Q10 — curriculum breadth + "My child is taught a broad range of subjects": "q_curriculum_pct", + "Broad subjects": "q_curriculum_pct", + # Q11 
— prepares for future + "The school prepares my child well for the future": "q_future_pct", + "Prepared for future": "q_future_pct", + # Q12 — leadership + "The school is led and managed effectively": "q_leadership_pct", + "Led well": "q_leadership_pct", + # Q13 — wellbeing + "The school supports my child's wider personal development": "q_wellbeing_pct", + "Personal development": "q_wellbeing_pct", + # Q14 — recommendation + "I would recommend this school to another parent": "q_recommend_pct", + "Recommend": "q_recommend_pct", +} + + +def download(data_dir: Path | None = None) -> Path: + dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR + dest.mkdir(parents=True, exist_ok=True) + + # Scrape the open data page for the download link + try: + resp = requests.get(OPEN_DATA_PAGE, timeout=30) + resp.raise_for_status() + pattern = r'href="([^"]+\.(?:xlsx|csv|zip))"' + urls = re.findall(pattern, resp.text, re.IGNORECASE) + if not urls: + raise RuntimeError("No download link found on Parent View open data page") + url = urls[0] if urls[0].startswith("http") else "https://parentview.ofsted.gov.uk" + urls[0] + except Exception as e: + raise RuntimeError(f"Could not discover Parent View download URL: {e}") + + filename = url.split("/")[-1].split("?")[0] + dest_file = dest / filename + + if dest_file.exists(): + print(f" ParentView: {filename} already exists, skipping download.") + return dest_file + + print(f" ParentView: downloading {url} ...") + resp = requests.get(url, timeout=120, stream=True) + resp.raise_for_status() + with open(dest_file, "wb") as f: + for chunk in resp.iter_content(chunk_size=65536): + f.write(chunk) + + print(f" ParentView: saved {dest_file}") + return dest_file + + +def _positive_pct(row: pd.Series, q_col_base: str) -> float | None: + """Sum 'Strongly agree' + 'Agree' percentages for a question.""" + # Parent View open data has columns like "Q1 - Strongly agree %", "Q1 - Agree %" + strongly = row.get(f"{q_col_base} - 
Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %") + agree = row.get(f"{q_col_base} - Agree %") + try: + total = 0.0 + if pd.notna(strongly): + total += float(strongly) + if pd.notna(agree): + total += float(agree) + return round(total, 1) if total > 0 else None + except (TypeError, ValueError): + return None + + +def load(path: Path | None = None, data_dir: Path | None = None) -> dict: + if path is None: + dest = (data_dir / "supplementary" / "parent_view") if data_dir else DEST_DIR + files = sorted(dest.glob("*.xlsx")) + sorted(dest.glob("*.csv")) + if not files: + raise FileNotFoundError(f"No Parent View file found in {dest}") + path = files[-1] + + print(f" ParentView: loading {path} ...") + + if str(path).endswith(".xlsx"): + df = pd.read_excel(path) + else: + df = pd.read_csv(path, encoding="latin-1", low_memory=False) + + # Normalise URN column + urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None) + if not urn_col: + raise ValueError(f"URN column not found. 
Columns: {list(df.columns)[:20]}") + df.rename(columns={urn_col: "urn"}, inplace=True) + df["urn"] = pd.to_numeric(df["urn"], errors="coerce") + df = df.dropna(subset=["urn"]) + df["urn"] = df["urn"].astype(int) + + # Try to find total responses column + resp_col = next((c for c in df.columns if "total" in c.lower() and "respon" in c.lower()), None) + + inserted = 0 + today = date.today() + + with get_session() as session: + from sqlalchemy import text + for _, row in df.iterrows(): + urn = int(row["urn"]) + total = int(row[resp_col]) if resp_col and pd.notna(row.get(resp_col)) else None + + # Try to extract % positive per question from wide-format columns + # Parent View has numbered questions Q1–Q12 (or Q1–Q14 depending on year) + record = { + "urn": urn, + "survey_date": today, + "total_responses": total, + "q_happy_pct": _positive_pct(row, "Q1"), + "q_safe_pct": _positive_pct(row, "Q2"), + "q_behaviour_pct": _positive_pct(row, "Q3"), + "q_bullying_pct": _positive_pct(row, "Q4"), + "q_communication_pct": _positive_pct(row, "Q5"), + "q_progress_pct": _positive_pct(row, "Q7"), + "q_teaching_pct": _positive_pct(row, "Q8"), + "q_information_pct": _positive_pct(row, "Q9"), + "q_curriculum_pct": _positive_pct(row, "Q10"), + "q_future_pct": _positive_pct(row, "Q11"), + "q_leadership_pct": _positive_pct(row, "Q12"), + "q_wellbeing_pct": _positive_pct(row, "Q13"), + "q_recommend_pct": _positive_pct(row, "Q14"), + "q_sen_pct": None, + } + + session.execute( + text(""" + INSERT INTO ofsted_parent_view + (urn, survey_date, total_responses, + q_happy_pct, q_safe_pct, q_behaviour_pct, q_bullying_pct, + q_communication_pct, q_progress_pct, q_teaching_pct, + q_information_pct, q_curriculum_pct, q_future_pct, + q_leadership_pct, q_wellbeing_pct, q_recommend_pct, q_sen_pct) + VALUES + (:urn, :survey_date, :total_responses, + :q_happy_pct, :q_safe_pct, :q_behaviour_pct, :q_bullying_pct, + :q_communication_pct, :q_progress_pct, :q_teaching_pct, + :q_information_pct, 
:q_curriculum_pct, :q_future_pct, + :q_leadership_pct, :q_wellbeing_pct, :q_recommend_pct, :q_sen_pct) + ON CONFLICT (urn) DO UPDATE SET + survey_date = EXCLUDED.survey_date, + total_responses = EXCLUDED.total_responses, + q_happy_pct = EXCLUDED.q_happy_pct, + q_safe_pct = EXCLUDED.q_safe_pct, + q_behaviour_pct = EXCLUDED.q_behaviour_pct, + q_bullying_pct = EXCLUDED.q_bullying_pct, + q_communication_pct = EXCLUDED.q_communication_pct, + q_progress_pct = EXCLUDED.q_progress_pct, + q_teaching_pct = EXCLUDED.q_teaching_pct, + q_information_pct = EXCLUDED.q_information_pct, + q_curriculum_pct = EXCLUDED.q_curriculum_pct, + q_future_pct = EXCLUDED.q_future_pct, + q_leadership_pct = EXCLUDED.q_leadership_pct, + q_wellbeing_pct = EXCLUDED.q_wellbeing_pct, + q_recommend_pct = EXCLUDED.q_recommend_pct, + q_sen_pct = EXCLUDED.q_sen_pct + """), + record, + ) + inserted += 1 + if inserted % 2000 == 0: + session.flush() + + print(f" ParentView: upserted {inserted} records") + return {"inserted": inserted, "updated": 0, "skipped": 0} + + +if __name__ == "__main__": + parser = argparse.ArgumentParser() + parser.add_argument("--action", choices=["download", "load", "all"], default="all") + parser.add_argument("--data-dir", type=Path, default=None) + args = parser.parse_args() + + if args.action in ("download", "all"): + download(args.data_dir) + if args.action in ("load", "all"): + load(data_dir=args.data_dir) diff --git a/integrator/scripts/sources/phonics.py b/integrator/scripts/sources/phonics.py new file mode 100644 index 0000000..4a5264b --- /dev/null +++ b/integrator/scripts/sources/phonics.py @@ -0,0 +1,132 @@ +""" +Phonics Screening Check downloader and loader. 
"""
Phonics Screening Check downloader and loader.

Source: EES publication "phonics-screening-check-and-key-stage-1-assessments-england"
Update: Annual (September/October)
"""
import argparse
import math
import re
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv

DEST_DIR = SUPPLEMENTARY_DIR / "phonics"
PUBLICATION_SLUG = "phonics-screening-check-and-key-stage-1-assessments-england"

# Known column names in the phonics CSV (vary by year)
COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    # Year 1 pass rate
    "PPTA1": "year1_phonics_pct",  # % meeting expected standard Y1
    "PPTA1B": "year1_phonics_pct",
    "PT_MET_PHON_Y1": "year1_phonics_pct",
    "Y1_MET_EXPECTED_PCT": "year1_phonics_pct",
    # Year 2 (re-takers)
    "PPTA2": "year2_phonics_pct",
    "PT_MET_PHON_Y2": "year2_phonics_pct",
    "Y2_MET_EXPECTED_PCT": "year2_phonics_pct",
    # Year label
    "YEAR": "year",
    "Year": "year",
}

# Placeholder tokens treated as "no value" in percentage cells.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", ""}


def download(data_dir: Path | None = None) -> Path:
    """Download the latest school-level phonics CSV and return its local path.

    Raises:
        RuntimeError: no CSV URL could be found for the publication.
    """
    dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school")
    if not url:
        raise RuntimeError("Could not find CSV URL for phonics publication")

    filename = url.split("/")[-1].split("?")[0] or "phonics_latest.csv"
    return download_csv(url, dest / filename)


def _parse_pct(val) -> float | None:
    """Convert a percentage cell to ``float``; ``None`` for missing/suppressed.

    A ``%`` sign is stripped, tokens in ``NULL_VALUES`` and any other
    non-numeric or non-finite text yield ``None``.
    """
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace("%", "").strip()
    if s in NULL_VALUES:
        return None
    try:
        number = float(s)
    except ValueError:
        return None
    # float() accepts "NAN"/"INF"; those are not usable percentages.
    return number if math.isfinite(number) else None


def _normalise_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename source columns per ``COLUMN_MAP`` without creating duplicates.

    Several source names map to the same target.  If a file carries more than
    one of them, renaming all would yield duplicate column labels, after which
    ``row.get(target)`` returns a Series and the scalar parsing breaks.  Only
    the first matching source per target is renamed; a column that already
    bears the target name takes precedence.
    """
    targets = set(COLUMN_MAP.values())
    claimed = {col for col in df.columns if col in targets}
    renames = {}
    for col in df.columns:
        target = COLUMN_MAP.get(col)
        if target is None or target == col or target in claimed:
            continue
        claimed.add(target)
        renames[col] = target
    return df.rename(columns=renames)


def _row_year(row: pd.Series, fallback: int | None) -> int | None:
    """Return the row's own year if it is an integer, else *fallback*."""
    val = row.get("year")
    if val is None or pd.isna(val):
        return fallback
    try:
        return int(val)
    except (TypeError, ValueError):
        return fallback


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a phonics CSV and upsert one row per (urn, year).

    Args:
        path: CSV to load; defaults to the last CSV (sorted by name) in the
            phonics download directory.
        data_dir: Optional data root (``data_dir/supplementary/phonics``).

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}`` where ``m`` counts rows
        dropped for lacking a numeric URN or any usable year.

    Raises:
        FileNotFoundError: no CSV exists.
        ValueError: the file has no URN column.
    """
    from sqlalchemy import text  # function-scope import, as in the original

    if path is None:
        dest = (data_dir / "supplementary" / "phonics") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No phonics CSV found in {dest}")
        path = files[-1]

    print(f" Phonics: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df = _normalise_columns(df)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    rows_read = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = rows_read - len(df)

    # Infer year from filename if not in data
    match = re.search(r"20\d{2}", Path(path).stem)
    file_year = int(match.group()) if match else None

    upsert = text("""
        INSERT INTO phonics (urn, year, year1_phonics_pct, year2_phonics_pct)
        VALUES (:urn, :year, :y1, :y2)
        ON CONFLICT (urn, year) DO UPDATE SET
            year1_phonics_pct = EXCLUDED.year1_phonics_pct,
            year2_phonics_pct = EXCLUDED.year2_phonics_pct
    """)

    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            row_year = _row_year(row, file_year)
            if not row_year:
                skipped += 1
                continue

            session.execute(
                upsert,
                {
                    "urn": int(row["urn"]),
                    "year": row_year,
                    "y1": _parse_pct(row.get("year1_phonics_pct")),
                    "y2": _parse_pct(row.get("year2_phonics_pct")),
                },
            )
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" Phonics: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()
    if args.action in ("download", "all"):
        download(args.data_dir)
    if args.action in ("load", "all"):
        load(data_dir=args.data_dir)
"""
SEN (Special Educational Needs) primary need type breakdown.

Source: EES publication "special-educational-needs-in-england"
Update: Annual (September)
"""
import argparse
import math
import re
import sys
from pathlib import Path

import pandas as pd

sys.path.insert(0, str(Path(__file__).parent.parent))
from config import SUPPLEMENTARY_DIR
from db import get_session
from sources.ees import get_latest_csv_url, download_csv

DEST_DIR = SUPPLEMENTARY_DIR / "sen_detail"
PUBLICATION_SLUG = "special-educational-needs-in-england"

# Placeholder tokens treated as "no value" in percentage cells.
NULL_VALUES = {"SUPP", "NE", "NA", "NP", "NEW", "LOW", "X", ""}

COLUMN_MAP = {
    "URN": "urn",
    "urn": "urn",
    "YEAR": "year",
    "Year": "year",
    # Primary need types — DfE abbreviated codes
    "PT_SPEECH": "primary_need_speech_pct",  # SLCN
    "PT_ASD": "primary_need_autism_pct",  # ASD
    "PT_MLD": "primary_need_mld_pct",  # Moderate learning difficulty
    "PT_SPLD": "primary_need_spld_pct",  # Specific learning difficulty
    "PT_SEMH": "primary_need_semh_pct",  # Social, emotional, mental health
    "PT_PHYSICAL": "primary_need_physical_pct",  # Physical/sensory
    "PT_OTHER": "primary_need_other_pct",
    # Alternative naming
    "SLCN_PCT": "primary_need_speech_pct",
    "ASD_PCT": "primary_need_autism_pct",
    "MLD_PCT": "primary_need_mld_pct",
    "SPLD_PCT": "primary_need_spld_pct",
    "SEMH_PCT": "primary_need_semh_pct",
    "PHYSICAL_PCT": "primary_need_physical_pct",
    "OTHER_PCT": "primary_need_other_pct",
}

# Bind-parameter name -> table column, in INSERT order.
_NEED_COLUMNS = {
    need: f"primary_need_{need}_pct"
    for need in ("speech", "autism", "mld", "spld", "semh", "physical", "other")
}


def download(data_dir: Path | None = None) -> Path:
    """Download the latest SEN CSV and return its local path.

    A school-level file is preferred; if none is found the publication's
    default CSV is used instead.

    Raises:
        RuntimeError: no CSV URL could be found for the publication.
    """
    dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
    dest.mkdir(parents=True, exist_ok=True)

    url = get_latest_csv_url(PUBLICATION_SLUG, keyword="school") or get_latest_csv_url(PUBLICATION_SLUG)
    if not url:
        raise RuntimeError("Could not find CSV URL for SEN publication")

    filename = url.split("/")[-1].split("?")[0] or "sen_latest.csv"
    return download_csv(url, dest / filename)


def _parse_pct(val) -> float | None:
    """Convert a percentage cell to ``float``; ``None`` for missing/suppressed.

    A ``%`` sign is stripped, tokens in ``NULL_VALUES`` and any other
    non-numeric or non-finite text yield ``None``.
    """
    if pd.isna(val):
        return None
    s = str(val).strip().upper().replace("%", "").strip()
    if s in NULL_VALUES:
        return None
    try:
        number = float(s)
    except ValueError:
        return None
    # float() accepts "NAN"/"INF"; those are not usable percentages.
    return number if math.isfinite(number) else None


def _normalise_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Rename source columns per ``COLUMN_MAP`` without creating duplicates.

    Both naming schemes in ``COLUMN_MAP`` point at the same targets.  Renaming
    every match in a file that carries both would produce duplicate labels,
    after which ``row.get(target)`` returns a Series and scalar parsing breaks.
    Only the first matching source per target is renamed; a column already
    bearing the target name takes precedence.
    """
    targets = set(COLUMN_MAP.values())
    claimed = {col for col in df.columns if col in targets}
    renames = {}
    for col in df.columns:
        target = COLUMN_MAP.get(col)
        if target is None or target == col or target in claimed:
            continue
        claimed.add(target)
        renames[col] = target
    return df.rename(columns=renames)


def _row_year(row: pd.Series, fallback: int | None) -> int | None:
    """Return the row's own year if it is an integer, else *fallback*."""
    val = row.get("year")
    if val is None or pd.isna(val):
        return fallback
    try:
        return int(val)
    except (TypeError, ValueError):
        return fallback


def load(path: Path | None = None, data_dir: Path | None = None) -> dict:
    """Load a SEN CSV and upsert one primary-need row per (urn, year).

    Args:
        path: CSV to load; defaults to the last CSV (sorted by name) in the
            SEN download directory.
        data_dir: Optional data root (``data_dir/supplementary/sen_detail``).

    Returns:
        ``{"inserted": n, "updated": 0, "skipped": m}`` where ``m`` counts rows
        dropped for lacking a numeric URN or any usable year.

    Raises:
        FileNotFoundError: no CSV exists.
        ValueError: the file has no URN column.
    """
    from sqlalchemy import text  # function-scope import, as in the original

    if path is None:
        dest = (data_dir / "supplementary" / "sen_detail") if data_dir else DEST_DIR
        files = sorted(dest.glob("*.csv"))
        if not files:
            raise FileNotFoundError(f"No SEN CSV found in {dest}")
        path = files[-1]

    print(f" SEN Detail: loading {path} ...")
    df = pd.read_csv(path, encoding="latin-1", low_memory=False)
    df = _normalise_columns(df)

    if "urn" not in df.columns:
        raise ValueError(f"URN column not found. Available: {list(df.columns)[:20]}")

    rows_read = len(df)
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"]).copy()
    df["urn"] = df["urn"].astype(int)
    skipped = rows_read - len(df)

    # Fall back to a year embedded in the filename when rows carry none.
    match = re.search(r"20\d{2}", Path(path).stem)
    file_year = int(match.group()) if match else None

    # Column names come from the fixed table above; values are bound params.
    need_cols = list(_NEED_COLUMNS.values())
    upsert = text(
        "INSERT INTO sen_detail (urn, year, " + ", ".join(need_cols) + ") "
        "VALUES (:urn, :year, " + ", ".join(f":{need}" for need in _NEED_COLUMNS) + ") "
        "ON CONFLICT (urn, year) DO UPDATE SET "
        + ", ".join(f"{col} = EXCLUDED.{col}" for col in need_cols)
    )

    inserted = 0
    with get_session() as session:
        for _, row in df.iterrows():
            row_year = _row_year(row, file_year)
            if not row_year:
                skipped += 1
                continue

            params = {"urn": int(row["urn"]), "year": row_year}
            for need, column in _NEED_COLUMNS.items():
                params[need] = _parse_pct(row.get(column))

            session.execute(upsert, params)
            inserted += 1
            if inserted % 5000 == 0:
                session.flush()

    print(f" SEN Detail: upserted {inserted} records")
    return {"inserted": inserted, "updated": 0, "skipped": skipped}


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument("--action", choices=["download", "load", "all"], default="all")
    parser.add_argument("--data-dir", type=Path, default=None)
    args = parser.parse_args()
    if args.action in ("download", "all"):
        download(args.data_dir)
    if args.action in ("load", "all"):
        load(data_dir=args.data_dir)
"""
Data integrator HTTP server.
Kestra calls this server via HTTP tasks to trigger download/load operations.
"""
import importlib
import logging
import sys
import traceback
from pathlib import Path

from fastapi import FastAPI, HTTPException
from fastapi.responses import JSONResponse

sys.path.insert(0, "/app/scripts")

logger = logging.getLogger(__name__)

app = FastAPI(title="SchoolCompare Data Integrator", version="1.0.0")

SOURCES = {
    "ofsted", "gias", "parent_view",
    "census", "admissions", "sen_detail",
    "phonics", "idaci", "finance",
}

_ACTIONS = ("download", "load", "all")


def _require_valid_action(action: str) -> None:
    """Reject an unknown *action* with HTTP 400."""
    if action not in _ACTIONS:
        raise HTTPException(status_code=400, detail="action must be 'download', 'load', or 'all'")


def _run(source: str, action: str) -> dict:
    """Import ``sources.<source>`` and run the requested step(s).

    Returns whatever the module's ``load()`` returned, or ``{}`` when only a
    download was requested.
    """
    mod = importlib.import_module(f"sources.{source}")
    result = {}
    if action in ("download", "all"):
        mod.download()
    if action in ("load", "all"):
        result = mod.load()
    return result


@app.get("/health")
def health():
    """Liveness probe."""
    return {"status": "ok"}


@app.post("/run/{source}")
def run_source(source: str, action: str = "all"):
    """
    Trigger a data source download and/or load.
    action: "download" | "load" | "all"

    Responds 404 for an unknown source, 400 for an unknown action, and 500
    (with the error message and traceback in ``detail``) if the source fails.
    """
    if source not in SOURCES:
        raise HTTPException(status_code=404, detail=f"Unknown source '{source}'. Available: {sorted(SOURCES)}")
    _require_valid_action(action)

    try:
        result = _run(source, action)
    except Exception as e:
        logger.exception("Source %s failed (action=%s)", source, action)
        tb = traceback.format_exc()
        raise HTTPException(status_code=500, detail={"error": str(e), "traceback": tb}) from e

    return {"source": source, "action": action, "result": result}


@app.post("/run-all")
def run_all(action: str = "all"):
    """Trigger all sources in sequence.

    A failing source does not stop the others; its entry becomes
    ``{"error": "<message>"}``.  Every source gets an entry, including on
    download-only runs (``{}``).  An unknown action is rejected with HTTP 400
    instead of silently doing nothing.
    """
    _require_valid_action(action)

    results = {}
    for source in sorted(SOURCES):
        try:
            results[source] = _run(source, action)
        except Exception as e:
            # Best-effort batch: record the failure and carry on.
            logger.exception("Source %s failed (action=%s)", source, action)
            results[source] = {"error": str(e)}
    return results
null} + senDetail={sen_detail ?? null} + phonics={phonics ?? null} + deprivation={deprivation ?? null} + finance={finance ?? null} /> ); diff --git a/nextjs-app/components/SchoolDetailView.module.css b/nextjs-app/components/SchoolDetailView.module.css index 1932f03..d4ea91f 100644 --- a/nextjs-app/components/SchoolDetailView.module.css +++ b/nextjs-app/components/SchoolDetailView.module.css @@ -424,3 +424,120 @@ color: var(--text-muted); font-style: italic; } + +/* ── Supplementary Data Sections ──────────────────────── */ +.supplementarySection { + background: var(--bg-card, white); + border: 1px solid var(--border-color, #e5dfd5); + border-radius: 10px; + padding: 1.25rem 1.5rem; +} + +.supplementarySubtitle { + font-size: 0.85rem; + color: var(--text-muted, #8a847a); + margin-bottom: 1rem; +} + +.subSectionTitle { + font-size: 0.875rem; + font-weight: 600; + color: var(--text-secondary, #5c564d); + margin: 1.25rem 0 0.75rem; +} + +/* Ofsted */ +.ofstedHeader { + display: flex; + align-items: center; + gap: 0.75rem; + margin-bottom: 1rem; +} + +.ofstedGrade { + display: inline-block; + padding: 0.3rem 0.75rem; + font-size: 1rem; + font-weight: 700; + border-radius: 6px; + white-space: nowrap; +} + +.ofstedGrade1 { background: rgba(45, 125, 125, 0.12); color: var(--accent-teal, #2d7d7d); } +.ofstedGrade2 { background: rgba(60, 140, 60, 0.12); color: #3c8c3c; } +.ofstedGrade3 { background: rgba(201, 162, 39, 0.15); color: #b8920e; } +.ofstedGrade4 { background: rgba(224, 114, 86, 0.15); color: var(--accent-coral, #e07256); } + +.ofstedDate { + font-size: 0.85rem; + color: var(--text-muted, #8a847a); +} + +.ofstedType { + font-size: 0.8rem; + color: var(--text-muted, #8a847a); + margin-top: 0.5rem; + font-style: italic; +} + +/* Parent View */ +.parentViewGrid { + display: flex; + flex-direction: column; + gap: 0.5rem; +} + +.parentViewRow { + display: flex; + align-items: center; + gap: 0.75rem; + font-size: 0.875rem; +} + +.parentViewLabel { + flex: 0 0 18rem; + 
color: var(--text-secondary, #5c564d); + font-size: 0.8125rem; +} + +.parentViewBar { + flex: 1; + height: 0.5rem; + background: var(--bg-secondary, #f3ede4); + border-radius: 4px; + overflow: hidden; +} + +.parentViewFill { + height: 100%; + background: var(--accent-teal, #2d7d7d); + border-radius: 4px; + transition: width 0.4s ease; +} + +.parentViewPct { + flex: 0 0 2.75rem; + text-align: right; + font-size: 0.8125rem; + font-weight: 600; + color: var(--text-primary, #1a1612); +} + +/* Metric hint (small label below metricValue) */ +.metricHint { + font-size: 0.75rem; + color: var(--text-muted, #8a847a); + margin-top: 0.25rem; + font-style: italic; +} + +/* ── Mobile ──────────────────────────────────────────── */ +@media (max-width: 640px) { + .supplementarySection { + padding: 1rem; + } + + .parentViewLabel { + flex: 0 0 10rem; + } +} diff --git a/nextjs-app/components/SchoolDetailView.tsx b/nextjs-app/components/SchoolDetailView.tsx index aacd98b..d1f8d49 100644 --- a/nextjs-app/components/SchoolDetailView.tsx +++ b/nextjs-app/components/SchoolDetailView.tsx @@ -9,17 +9,37 @@ import { useRouter } from 'next/navigation'; import { useComparison } from '@/hooks/useComparison'; import { PerformanceChart } from './PerformanceChart'; import { SchoolMap } from './SchoolMap'; -import type { School, SchoolResult, AbsenceData } from '@/lib/types'; +import type { + School, SchoolResult, AbsenceData, + OfstedInspection, OfstedParentView, SchoolCensus, + SchoolAdmissions, SenDetail, Phonics, + SchoolDeprivation, SchoolFinance, +} from '@/lib/types'; import { formatPercentage, formatProgress, calculateTrend } from '@/lib/utils'; import styles from './SchoolDetailView.module.css'; +const OFSTED_LABELS: Record = { + 1: 'Outstanding', 2: 'Good', 3: 'Requires Improvement', 4: 'Inadequate', +}; + interface SchoolDetailViewProps { schoolInfo: School; yearlyData: SchoolResult[]; absenceData: AbsenceData | null; + ofsted: OfstedInspection | null; + parentView: OfstedParentView | 
null; + census: SchoolCensus | null; + admissions: SchoolAdmissions | null; + senDetail: SenDetail | null; + phonics: Phonics | null; + deprivation: SchoolDeprivation | null; + finance: SchoolFinance | null; } -export function SchoolDetailView({ schoolInfo, yearlyData, absenceData }: SchoolDetailViewProps) { +export function SchoolDetailView({ + schoolInfo, yearlyData, absenceData, + ofsted, parentView, census, admissions, senDetail, phonics, deprivation, finance, +}: SchoolDetailViewProps) { const router = useRouter(); const { addSchool, removeSchool, isSelected } = useComparison(); const isInComparison = isSelected(schoolInfo.urn); @@ -322,6 +342,209 @@ export function SchoolDetailView({ schoolInfo, yearlyData, absenceData }: School )} + + {/* Ofsted Section */} + {ofsted && ( +
+

Ofsted Inspection

+
+ + {ofsted.overall_effectiveness ? OFSTED_LABELS[ofsted.overall_effectiveness] : 'Not rated'} + + {ofsted.inspection_date && ( + + Inspected: {new Date(ofsted.inspection_date).toLocaleDateString('en-GB', { day: 'numeric', month: 'long', year: 'numeric' })} + + )} +
+
+ {[ + { label: 'Quality of Education', value: ofsted.quality_of_education }, + { label: 'Behaviour & Attitudes', value: ofsted.behaviour_attitudes }, + { label: 'Personal Development', value: ofsted.personal_development }, + { label: 'Leadership & Management', value: ofsted.leadership_management }, + ...(ofsted.early_years_provision != null ? [{ label: 'Early Years', value: ofsted.early_years_provision }] : []), + ].map(({ label, value }) => value != null && ( +
+
{label}
+
+ {OFSTED_LABELS[value]} +
+
+ ))} +
+ {ofsted.inspection_type && ( +

{ofsted.inspection_type}

+ )} +
+ )} + + {/* What Parents Think */} + {parentView && parentView.total_responses != null && parentView.total_responses > 0 && ( +
+

What Parents Think

+

+ Based on {parentView.total_responses.toLocaleString()} parent responses to the Ofsted Parent View survey. +

+
+ {[ + { label: 'My child is happy here', pct: parentView.q_happy_pct }, + { label: 'My child feels safe here', pct: parentView.q_safe_pct }, + { label: 'Would recommend this school', pct: parentView.q_recommend_pct }, + { label: 'Teaching is good', pct: parentView.q_teaching_pct }, + { label: 'My child makes good progress', pct: parentView.q_progress_pct }, + { label: 'School looks after wellbeing', pct: parentView.q_wellbeing_pct }, + { label: 'Led and managed effectively', pct: parentView.q_leadership_pct }, + { label: 'Behaviour is well managed', pct: parentView.q_behaviour_pct }, + { label: 'Communicates well with parents', pct: parentView.q_communication_pct }, + ].filter(q => q.pct != null).map(({ label, pct }) => ( +
+ {label} +
+
+
+ {pct}% +
+ ))} +
+
+ )} + + {/* Admissions */} + {admissions && ( +
+

Admissions ({admissions.year})

+
+ {admissions.published_admission_number != null && ( +
+
Places available
+
{admissions.published_admission_number}
+
+ )} + {admissions.total_applications != null && ( +
+
Applications received
+
{admissions.total_applications.toLocaleString()}
+
+ )} + {admissions.first_preference_offers_pct != null && ( +
+
Got first choice
+
{admissions.first_preference_offers_pct}%
+
+ )} + {admissions.oversubscribed != null && ( +
+
Oversubscribed
+
{admissions.oversubscribed ? 'Yes' : 'No'}
+
+ )} +
+
+ )} + + {/* Pupils & Inclusion (Census + SEN) */} + {(census || senDetail) && ( +
+

Pupils & Inclusion

+
+ {census?.class_size_avg != null && ( +
+
Average class size
+
{census.class_size_avg.toFixed(1)}
+
+ )} +
+ {senDetail && ( + <> +

Primary SEN Needs (latest year)

+
+ {[ + { label: 'Speech & Language', pct: senDetail.primary_need_speech_pct }, + { label: 'Autism (ASD)', pct: senDetail.primary_need_autism_pct }, + { label: 'Learning Difficulties', pct: senDetail.primary_need_mld_pct }, + { label: 'Specific Learning (Dyslexia etc.)', pct: senDetail.primary_need_spld_pct }, + { label: 'Social, Emotional & Mental Health', pct: senDetail.primary_need_semh_pct }, + { label: 'Physical / Sensory', pct: senDetail.primary_need_physical_pct }, + ].filter(n => n.pct != null).map(({ label, pct }) => ( +
+
{label}
+
{pct}%
+
+ ))} +
+ + )} +
+ )} + + {/* Year 1 Phonics */} + {phonics && phonics.year1_phonics_pct != null && ( +
+

Year 1 Phonics ({phonics.year})

+
+
+
Reached expected standard
+
{formatPercentage(phonics.year1_phonics_pct)}
+
+ {phonics.year2_phonics_pct != null && ( +
+
Year 2 (re-takers) standard
+
{formatPercentage(phonics.year2_phonics_pct)}
+
+ )} +
+
+ )} + + {/* Deprivation Context */} + {deprivation && deprivation.idaci_decile != null && ( +
+

Deprivation Context

+
+
+
Area deprivation decile
+
{deprivation.idaci_decile} / 10
+
+ 1 = most deprived, 10 = least deprived +
+
+ {deprivation.idaci_score != null && ( +
+
IDACI score
+
{deprivation.idaci_score.toFixed(3)}
+
+ )} +
+
+ )} + + {/* Finances */} + {finance && finance.per_pupil_spend != null && ( +
+

Finances ({finance.year})

+
+ {finance.per_pupil_spend != null && ( +
+
Spend per pupil
+
£{Math.round(finance.per_pupil_spend).toLocaleString()}
+
+ )} + {finance.teacher_cost_pct != null && ( +
+
Teacher costs
+
{finance.teacher_cost_pct.toFixed(1)}% of budget
+
+ )} + {finance.staff_cost_pct != null && ( +
+
All staff costs
+
{finance.staff_cost_pct.toFixed(1)}% of budget
+
+ )} +
+
+ )} ); } diff --git a/nextjs-app/components/SchoolRow.module.css b/nextjs-app/components/SchoolRow.module.css index bf0bc16..73b1591 100644 --- a/nextjs-app/components/SchoolRow.module.css +++ b/nextjs-app/components/SchoolRow.module.css @@ -211,6 +211,23 @@ color: var(--text-primary, #1a1612); } +/* ── Ofsted badge ────────────────────────────────────── */ +.ofstedBadge { + display: inline-block; + padding: 0.0625rem 0.375rem; + font-size: 0.6875rem; + font-weight: 600; + border-radius: 3px; + white-space: nowrap; + flex-shrink: 0; + line-height: 1.4; +} + +.ofsted1 { background: rgba(45, 125, 125, 0.12); color: var(--accent-teal, #2d7d7d); } +.ofsted2 { background: rgba(60, 140, 60, 0.12); color: #3c8c3c; } +.ofsted3 { background: rgba(201, 162, 39, 0.15); color: #b8920e; } +.ofsted4 { background: rgba(224, 114, 86, 0.15); color: var(--accent-coral, #e07256); } + /* ── Mobile ──────────────────────────────────────────── */ @media (max-width: 640px) { .row { diff --git a/nextjs-app/components/SchoolRow.tsx b/nextjs-app/components/SchoolRow.tsx index 7f99c04..e266aeb 100644 --- a/nextjs-app/components/SchoolRow.tsx +++ b/nextjs-app/components/SchoolRow.tsx @@ -12,6 +12,13 @@ import { formatPercentage, formatProgress, calculateTrend } from '@/lib/utils'; import { progressBand } from '@/lib/metrics'; import styles from './SchoolRow.module.css'; +const OFSTED_LABELS: Record = { + 1: 'Outstanding', + 2: 'Good', + 3: 'Req. Improvement', + 4: 'Inadequate', +}; + interface SchoolRowProps { school: School; isLocationSearch?: boolean; @@ -46,7 +53,7 @@ export function SchoolRow({ {/* Left: three content lines */}
- {/* Line 1: School name + type */} + {/* Line 1: School name + type + Ofsted badge */} {/* Line 2: Key stats */} diff --git a/nextjs-app/lib/types.ts b/nextjs-app/lib/types.ts index cb7c3cb..515a049 100644 --- a/nextjs-app/lib/types.ts +++ b/nextjs-app/lib/types.ts @@ -47,6 +47,102 @@ export interface School { // Location search fields distance?: number | null; + + // GIAS enrichment fields + website?: string | null; + headteacher_name?: string | null; + capacity?: number | null; + trust_name?: string | null; + gender?: string | null; + + // Ofsted (for list view — summary only) + ofsted_grade?: 1 | 2 | 3 | 4 | null; + ofsted_date?: string | null; +} + +// ============================================================================ +// Supplementary Data Types (populated by Kestra data integrator) +// ============================================================================ + +export interface OfstedInspection { + overall_effectiveness: 1 | 2 | 3 | 4 | null; + quality_of_education: number | null; + behaviour_attitudes: number | null; + personal_development: number | null; + leadership_management: number | null; + early_years_provision: number | null; + previous_overall: number | null; + inspection_date: string | null; + inspection_type: string | null; +} + +export interface OfstedParentView { + survey_date: string | null; + total_responses: number | null; + q_happy_pct: number | null; + q_safe_pct: number | null; + q_behaviour_pct: number | null; + q_bullying_pct: number | null; + q_communication_pct: number | null; + q_progress_pct: number | null; + q_teaching_pct: number | null; + q_information_pct: number | null; + q_curriculum_pct: number | null; + q_future_pct: number | null; + q_leadership_pct: number | null; + q_wellbeing_pct: number | null; + q_recommend_pct: number | null; + q_sen_pct: number | null; +} + +export interface SchoolCensus { + year: number; + class_size_avg: number | null; + ethnicity_white_pct: number | null; + ethnicity_asian_pct: 
number | null; + ethnicity_black_pct: number | null; + ethnicity_mixed_pct: number | null; + ethnicity_other_pct: number | null; +} + +export interface SchoolAdmissions { + year: number; + published_admission_number: number | null; + total_applications: number | null; + first_preference_offers_pct: number | null; + oversubscribed: boolean | null; +} + +export interface SenDetail { + year: number; + primary_need_speech_pct: number | null; + primary_need_autism_pct: number | null; + primary_need_mld_pct: number | null; + primary_need_spld_pct: number | null; + primary_need_semh_pct: number | null; + primary_need_physical_pct: number | null; + primary_need_other_pct: number | null; +} + +export interface Phonics { + year: number; + year1_phonics_pct: number | null; + year2_phonics_pct: number | null; +} + +export interface SchoolDeprivation { + lsoa_code: string | null; + idaci_score: number | null; + idaci_decile: number | null; +} + +export interface SchoolFinance { + year: number; + per_pupil_spend: number | null; + staff_cost_pct: number | null; + teacher_cost_pct: number | null; + support_staff_cost_pct: number | null; + premises_cost_pct: number | null; } // ============================================================================ @@ -152,6 +248,15 @@ export interface SchoolDetailsResponse { school_info: School; yearly_data: SchoolResult[]; absence_data: AbsenceData | null; + // Supplementary data (null until Kestra populates) + ofsted: OfstedInspection | null; + parent_view: OfstedParentView | null; + census: SchoolCensus | null; + admissions: SchoolAdmissions | null; + sen_detail: SenDetail | null; + phonics: Phonics | null; + deprivation: SchoolDeprivation | null; + finance: SchoolFinance | null; } export interface ComparisonData {