Files
school_compare/backend/data_loader.py
Tudor 5eff9af69c
Some checks failed
Build and Push Docker Images / Build Frontend (Next.js) (push) Has been cancelled
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Has been cancelled
Build and Push Docker Images / Trigger Portainer Update (push) Has been cancelled
Build and Push Docker Images / Build Backend (FastAPI) (push) Has been cancelled
feat: add secondary school support with KS4 data and metric tooltips
- Backend: replace INNER JOIN ks2 with UNION ALL (ks2 + ks4) so primary
  and secondary schools both appear in the main DataFrame
- Backend: add /api/national-averages endpoint computing means from live
  data, replacing the hardcoded NATIONAL_AVG constant on the frontend
- Backend: add phase filter param to /api/schools; return phases from
  /api/filters; fix hardcoded "phase": "Primary" in school detail endpoint
- Backend: add KS4 metric definitions (Attainment 8, Progress 8, EBacc,
  English & Maths pass rates) to METRIC_DEFINITIONS and RANKING_COLUMNS
- Frontend: SchoolDetailView is now phase-aware — secondary schools show
  a GCSE Results section (Att8, P8, E&M, EBacc) instead of SATs; phonics
  tab hidden for secondary; admissions says Year 7 instead of Year 3;
  history table shows KS4 columns; chart datasets switch for secondary
- Frontend: new MetricTooltip component (CSS-only ⓘ icon) backed by
  METRIC_EXPLANATIONS — added to RWM, GPS, SEN, EAL, IDACI, progress
  scores and all KS4 metrics throughout SchoolDetailView and SchoolCard
- Frontend: METRIC_EXPLANATIONS extended with KS4 terms (Attainment 8,
  Progress 8, EBacc) and previously missing terms (SEN, EHCP, EAL, IDACI)
- Frontend: SchoolCard expands "RWM" to "Reading, Writing & Maths" and
  shows Attainment 8 / English & Maths Grade 4+ for secondary schools
- Frontend: FilterBar adds Phase dropdown (Primary / Secondary / All-through)
- Frontend: HomeView hero copy updated; compact list shows phase-aware metric
- Global metadata updated to remove "primary only" framing

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
2026-03-28 14:59:40 +00:00

542 lines
17 KiB
Python

"""
Data loading module — reads from marts.* tables built by dbt.
Provides efficient queries with caching.
"""
import pandas as pd
import numpy as np
from typing import Optional, Dict, Tuple, List
import requests
from sqlalchemy import text
from sqlalchemy.orm import Session
from .config import settings
from .database import SessionLocal, engine
from .models import (
DimSchool, DimLocation, KS2Performance,
FactOfstedInspection, FactParentView, FactAdmissions,
FactDeprivation, FactFinance,
)
from .schemas import SCHOOL_TYPE_MAP
# In-process memo of successful geocoding lookups, filled by
# geocode_single_postcode(): stripped, upper-cased postcode -> (latitude, longitude).
_postcode_cache: Dict[str, Tuple[float, float]] = {}
# Shared Typesense client. Stays None until _get_typesense_client() manages to
# build one, after which the same instance is reused.
_typesense_client = None
def _get_typesense_client():
    """Return the shared Typesense client, building it on first use.

    The node is derived from ``settings.typesense_url``. The URL scheme selects
    the protocol (defaulting to ``http`` when the URL has no scheme) and the
    port defaults to 8108 when the URL does not specify one. Any path or
    trailing slash in the URL is ignored.

    Returns:
        The client, or None when the URL or API key is not configured, the URL
        has no host, or the client cannot be constructed (for example when the
        ``typesense`` package is not installed).
    """
    global _typesense_client
    if _typesense_client is not None:
        return _typesense_client

    url = settings.typesense_url
    key = settings.typesense_api_key
    if not url or not key:
        return None

    try:
        import typesense
        from urllib.parse import urlsplit

        # urlsplit only treats the text as a network location when it follows
        # "//", so add the marker for bare "host:port" values.
        parts = urlsplit(url if "//" in url else f"//{url}")
        if not parts.hostname:
            return None
        _typesense_client = typesense.Client({
            "nodes": [{
                "host": parts.hostname,
                "port": str(parts.port or 8108),
                "protocol": parts.scheme or "http",
            }],
            "api_key": key,
            "connection_timeout_seconds": 2,
        })
        return _typesense_client
    except Exception as exc:
        # Search is optional: report the problem and let callers fall back.
        import logging

        logging.getLogger(__name__).warning(
            "Typesense client unavailable: %s", exc
        )
        return None
def search_schools_typesense(query: str, limit: int = 250) -> List[int]:
    """Look up schools in the Typesense "schools" collection.

    Args:
        query: Free-text search string, matched against school name, local
            authority and postcode.
        limit: Maximum number of hits requested; values above 250 are capped
            at 250.

    Returns:
        URNs of the hits in the order Typesense returned them. An empty list
        is returned when no client is available or the search raises.
    """
    client = _get_typesense_client()
    if client is None:
        return []

    try:
        search_params = {
            "q": query,
            "query_by": "school_name,local_authority,postcode",
            "per_page": min(limit, 250),
            "typo_tokens_threshold": 1,
        }
        response = client.collections["schools"].documents.search(search_params)
        urns = []
        for hit in response.get("hits", []):
            urns.append(int(hit["document"]["urn"]))
        return urns
    except Exception:
        return []
def normalize_school_type(school_type: Optional[str]) -> Optional[str]:
    """Map a raw school-type code to its display name.

    The code is stripped and upper-cased before being looked up in
    ``SCHOOL_TYPE_MAP``. Values with no mapping are returned exactly as given,
    and empty or missing input yields None.
    """
    if school_type:
        lookup_key = school_type.strip().upper()
        return (
            SCHOOL_TYPE_MAP[lookup_key]
            if lookup_key in SCHOOL_TYPE_MAP
            else school_type
        )
    return None
def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]:
    """Geocode a single postcode using the postcodes.io API.

    Successful lookups are memoized in ``_postcode_cache`` under the stripped,
    upper-cased postcode, so repeat lookups make no network call.

    Args:
        postcode: Postcode in any case, with or without surrounding whitespace.

    Returns:
        ``(latitude, longitude)``, or None when the postcode is empty or blank,
        the request fails, the response is not a 200 with a usable ``result``,
        or either coordinate is missing.
    """
    if not postcode:
        return None
    postcode = postcode.strip().upper()
    if not postcode:
        # Whitespace-only input: nothing to look up, so skip the request.
        return None
    if postcode in _postcode_cache:
        return _postcode_cache[postcode]

    import logging
    from urllib.parse import quote

    try:
        response = requests.get(
            # Escape the postcode so characters such as "/" or "?" cannot
            # change the request path.
            f"https://api.postcodes.io/postcodes/{quote(postcode, safe='')}",
            timeout=10,
        )
        if response.status_code != 200:
            return None
        payload = response.json()
    except (requests.RequestException, ValueError) as exc:
        # Best-effort lookup: record the failure and report "not found".
        logging.getLogger(__name__).warning(
            "Postcode lookup failed for %s: %s", postcode, exc
        )
        return None

    result = payload.get("result") if isinstance(payload, dict) else None
    if not isinstance(result, dict):
        return None
    lat = result.get("latitude")
    lon = result.get("longitude")
    # Compare against None rather than truthiness: 0.0 is a valid coordinate.
    if lat is None or lon is None:
        return None
    _postcode_cache[postcode] = (lat, lon)
    return (lat, lon)
def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """Return the great-circle distance in miles between two points.

    All four arguments are in degrees. The haversine formula is applied with
    an Earth radius of 3956 miles.
    """
    import math

    earth_radius_miles = 3956

    phi1 = math.radians(lat1)
    lam1 = math.radians(lon1)
    phi2 = math.radians(lat2)
    lam2 = math.radians(lon2)

    delta_phi = phi2 - phi1
    delta_lam = lam2 - lam1

    hav = (
        math.sin(delta_phi / 2) ** 2
        + math.cos(phi1) * math.cos(phi2) * math.sin(delta_lam / 2) ** 2
    )
    return 2 * math.asin(math.sqrt(hav)) * earth_radius_miles
# =============================================================================
# MAIN DATA LOAD — dim_school + dim_location joined to KS2 and KS4 performance
# =============================================================================
# Two SELECT branches are combined with UNION ALL, so both list the same
# columns in the same order:
#   * Branch 1 inner-joins marts.fact_ks2_performance and fills every KS4
#     column with NULL::numeric.
#   * Branch 2 inner-joins marts.fact_ks4_performance and fills the KS2
#     columns with NULL::numeric, except sen_support_pct / sen_ehcp_pct, which
#     it reads from the KS4 table.
# Every join is an inner join, so a school only appears in a branch when it
# has a dim_location row and at least one performance row in that branch's
# fact table.
# The trailing ORDER BY sorts the combined result by school_name, then year.
_MAIN_QUERY = text("""
-- Branch 1: Primary schools (KS2 data; KS4 columns NULL)
SELECT
s.urn,
s.school_name,
s.phase,
s.school_type,
s.academy_trust_name AS trust_name,
s.academy_trust_uid AS trust_uid,
s.religious_character AS religious_denomination,
s.gender,
s.age_range,
s.capacity,
s.headteacher_name,
s.website,
s.ofsted_grade,
s.ofsted_date,
s.ofsted_framework,
l.local_authority_name AS local_authority,
l.local_authority_code,
l.address_line1 AS address1,
l.address_line2 AS address2,
l.town,
l.postcode,
l.latitude,
l.longitude,
k.year,
k.source_urn,
k.total_pupils,
k.eligible_pupils,
-- KS2 columns
k.rwm_expected_pct,
k.rwm_high_pct,
k.reading_expected_pct,
k.reading_high_pct,
k.reading_avg_score,
k.reading_progress,
k.writing_expected_pct,
k.writing_high_pct,
k.writing_progress,
k.maths_expected_pct,
k.maths_high_pct,
k.maths_avg_score,
k.maths_progress,
k.gps_expected_pct,
k.gps_high_pct,
k.gps_avg_score,
k.science_expected_pct,
k.reading_absence_pct,
k.writing_absence_pct,
k.maths_absence_pct,
k.gps_absence_pct,
k.science_absence_pct,
k.rwm_expected_boys_pct,
k.rwm_high_boys_pct,
k.rwm_expected_girls_pct,
k.rwm_high_girls_pct,
k.rwm_expected_disadvantaged_pct,
k.rwm_expected_non_disadvantaged_pct,
k.disadvantaged_gap,
k.disadvantaged_pct,
k.eal_pct,
k.sen_support_pct,
k.sen_ehcp_pct,
k.stability_pct,
-- KS4 columns (NULL for primary)
NULL::numeric AS attainment_8_score,
NULL::numeric AS progress_8_score,
NULL::numeric AS progress_8_lower_ci,
NULL::numeric AS progress_8_upper_ci,
NULL::numeric AS progress_8_english,
NULL::numeric AS progress_8_maths,
NULL::numeric AS progress_8_ebacc,
NULL::numeric AS progress_8_open,
NULL::numeric AS english_maths_strong_pass_pct,
NULL::numeric AS english_maths_standard_pass_pct,
NULL::numeric AS ebacc_entry_pct,
NULL::numeric AS ebacc_strong_pass_pct,
NULL::numeric AS ebacc_standard_pass_pct,
NULL::numeric AS ebacc_avg_score,
NULL::numeric AS gcse_grade_91_pct,
NULL::numeric AS prior_attainment_avg
FROM marts.dim_school s
JOIN marts.dim_location l ON s.urn = l.urn
JOIN marts.fact_ks2_performance k ON s.urn = k.urn
UNION ALL
-- Branch 2: Secondary schools (KS4 data; KS2 columns NULL)
SELECT
s.urn,
s.school_name,
s.phase,
s.school_type,
s.academy_trust_name AS trust_name,
s.academy_trust_uid AS trust_uid,
s.religious_character AS religious_denomination,
s.gender,
s.age_range,
s.capacity,
s.headteacher_name,
s.website,
s.ofsted_grade,
s.ofsted_date,
s.ofsted_framework,
l.local_authority_name AS local_authority,
l.local_authority_code,
l.address_line1 AS address1,
l.address_line2 AS address2,
l.town,
l.postcode,
l.latitude,
l.longitude,
k4.year,
k4.source_urn,
k4.total_pupils,
k4.eligible_pupils,
-- KS2 columns (NULL for secondary)
NULL::numeric AS rwm_expected_pct,
NULL::numeric AS rwm_high_pct,
NULL::numeric AS reading_expected_pct,
NULL::numeric AS reading_high_pct,
NULL::numeric AS reading_avg_score,
NULL::numeric AS reading_progress,
NULL::numeric AS writing_expected_pct,
NULL::numeric AS writing_high_pct,
NULL::numeric AS writing_progress,
NULL::numeric AS maths_expected_pct,
NULL::numeric AS maths_high_pct,
NULL::numeric AS maths_avg_score,
NULL::numeric AS maths_progress,
NULL::numeric AS gps_expected_pct,
NULL::numeric AS gps_high_pct,
NULL::numeric AS gps_avg_score,
NULL::numeric AS science_expected_pct,
NULL::numeric AS reading_absence_pct,
NULL::numeric AS writing_absence_pct,
NULL::numeric AS maths_absence_pct,
NULL::numeric AS gps_absence_pct,
NULL::numeric AS science_absence_pct,
NULL::numeric AS rwm_expected_boys_pct,
NULL::numeric AS rwm_high_boys_pct,
NULL::numeric AS rwm_expected_girls_pct,
NULL::numeric AS rwm_high_girls_pct,
NULL::numeric AS rwm_expected_disadvantaged_pct,
NULL::numeric AS rwm_expected_non_disadvantaged_pct,
NULL::numeric AS disadvantaged_gap,
NULL::numeric AS disadvantaged_pct,
NULL::numeric AS eal_pct,
k4.sen_support_pct,
k4.sen_ehcp_pct,
NULL::numeric AS stability_pct,
-- KS4 columns
k4.attainment_8_score,
k4.progress_8_score,
k4.progress_8_lower_ci,
k4.progress_8_upper_ci,
k4.progress_8_english,
k4.progress_8_maths,
k4.progress_8_ebacc,
k4.progress_8_open,
k4.english_maths_strong_pass_pct,
k4.english_maths_standard_pass_pct,
k4.ebacc_entry_pct,
k4.ebacc_strong_pass_pct,
k4.ebacc_standard_pass_pct,
k4.ebacc_avg_score,
k4.gcse_grade_91_pct,
k4.prior_attainment_avg
FROM marts.dim_school s
JOIN marts.dim_location l ON s.urn = l.urn
JOIN marts.fact_ks4_performance k4 ON s.urn = k4.urn
ORDER BY school_name, year
""")
# Columns combined, in this order, into the display "address" string.
_ADDRESS_COLUMNS = ("address1", "address2", "town", "postcode")


def _format_address(parts) -> str:
    """Join the usable address parts with ", ", skipping missing or empty ones."""
    kept = []
    for part in parts:
        if pd.isna(part):
            # None / NaN: would otherwise show up as "None" or "nan".
            continue
        piece = str(part)
        if piece and piece != "None":
            kept.append(piece)
    return ", ".join(kept)


def load_school_data_as_dataframe() -> pd.DataFrame:
    """Load all school rows (KS2 and KS4 performance) as a pandas DataFrame.

    Runs ``_MAIN_QUERY`` against ``engine``, then adds an ``address`` column
    built from the address1/address2/town/postcode columns and replaces
    ``school_type`` codes with their display names.

    Returns:
        The enriched DataFrame. An empty DataFrame is returned when the query
        raises (a warning is printed) or returns no rows.
    """
    try:
        df = pd.read_sql(_MAIN_QUERY, engine)
    except Exception as exc:
        print(f"Warning: Could not load school data from marts: {exc}")
        return pd.DataFrame()
    if df.empty:
        return df

    # Build address string. Only the four address columns are iterated, not
    # every column of every row; reindex supplies NaN for any column that is
    # absent, which _format_address then skips.
    address_parts = df.reindex(columns=list(_ADDRESS_COLUMNS))
    df["address"] = [
        _format_address(row)
        for row in address_parts.itertuples(index=False, name=None)
    ]

    # Normalize school type
    df["school_type"] = df["school_type"].apply(normalize_school_type)
    return df
# Cache for DataFrame. None means "nothing usable has been loaded yet".
_df_cache: Optional[pd.DataFrame] = None


def load_school_data() -> pd.DataFrame:
    """Return the school DataFrame, loading it on first use and caching it.

    Only a non-empty load is cached. ``load_school_data_as_dataframe()``
    returns an empty frame both when the query fails and when the marts have
    no rows yet. Caching that result would keep serving "no data" after the
    database becomes available, so an empty result is returned as-is and the
    load is attempted again on the next call.

    Returns:
        The cached DataFrame, or an empty DataFrame when no data could be
        loaded on this call.
    """
    global _df_cache
    if _df_cache is not None:
        return _df_cache

    print("Loading school data from marts...")
    df = load_school_data_as_dataframe()
    if df.empty:
        print("No data found in marts (EES data may not have been loaded yet)")
        return df

    _df_cache = df
    print(f"Total records loaded: {len(_df_cache)}")
    print(f"Unique schools: {_df_cache['urn'].nunique()}")
    print(f"Years: {sorted(_df_cache['year'].unique())}")
    return _df_cache
def clear_cache() -> None:
    """Drop the cached school DataFrame.

    Resets ``_df_cache`` to None, so the next ``load_school_data()`` call
    queries the marts again. Only the DataFrame cache is cleared; the postcode
    cache and the Typesense client are left as they are.
    """
    global _df_cache
    _df_cache = None
# =============================================================================
# METADATA QUERIES
# =============================================================================
def get_available_years(db: Session = None) -> List[int]:
    """List the distinct years present in the KS2 performance table, ascending.

    Args:
        db: Session to query with. When omitted, a session is opened for the
            call and closed before returning.

    Returns:
        The years, or an empty list if the query raises.
    """
    owns_session = db is None
    session = SessionLocal() if owns_session else db
    try:
        rows = (
            session.query(KS2Performance.year)
            .distinct()
            .order_by(KS2Performance.year)
            .all()
        )
        years = []
        for row in rows:
            years.append(row[0])
        return years
    except Exception:
        return []
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if owns_session:
            session.close()
def get_available_local_authorities(db: Session = None) -> List[str]:
    """List the distinct, non-empty local authority names in alphabetical order.

    Args:
        db: Session to query with. When omitted, a session is opened for the
            call and closed before returning.

    Returns:
        The names, or an empty list if the query raises.
    """
    owns_session = db is None
    session = SessionLocal() if owns_session else db
    try:
        query = db_query = session.query(DimLocation.local_authority_name)
        query = db_query.filter(DimLocation.local_authority_name.isnot(None))
        query = query.distinct().order_by(DimLocation.local_authority_name)
        names = []
        for row in query.all():
            # NULLs are filtered in SQL; this also drops empty strings.
            if row[0]:
                names.append(row[0])
        return names
    except Exception:
        return []
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if owns_session:
            session.close()
def get_schools_count(db: Session = None) -> int:
    """Count the rows in the school dimension.

    Args:
        db: Session to query with. When omitted, a session is opened for the
            call and closed before returning.

    Returns:
        The row count, or 0 if the query raises.
    """
    owns_session = db is None
    session = SessionLocal() if owns_session else db
    try:
        total = session.query(DimSchool).count()
    except Exception:
        total = 0
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if owns_session:
            session.close()
    return total
def get_data_info(db: Session = None) -> dict:
    """Summarize what data is available.

    Args:
        db: Session shared by the three underlying lookups. When omitted, a
            session is opened for the call and closed before returning.

    Returns:
        A dict with ``total_schools``, ``years_available``,
        ``local_authorities_count`` and a fixed ``data_source`` label.
    """
    owns_session = db is None
    session = SessionLocal() if owns_session else db
    try:
        total_schools = get_schools_count(session)
        years_available = get_available_years(session)
        authority_names = get_available_local_authorities(session)
        summary = {
            "total_schools": total_schools,
            "years_available": years_available,
            "local_authorities_count": len(authority_names),
            "data_source": "PostgreSQL (marts)",
        }
    finally:
        # Only close sessions created here; a caller-supplied one stays open.
        if owns_session:
            session.close()
    return summary
# =============================================================================
# SUPPLEMENTARY DATA — per-school detail page
# =============================================================================
def get_supplementary_data(db: Session, urn: int) -> dict:
    """Collect the detail-page extras for one school, keyed by section.

    Args:
        db: Session used for every lookup.
        urn: URN of the school.

    Returns:
        A dict with the keys ``ofsted``, ``parent_view``, ``census``,
        ``admissions``, ``sen_detail``, ``phonics``, ``deprivation`` and
        ``finance``. A section is None when no row was found or its query
        failed; ``census``, ``sen_detail`` and ``phonics`` are always None.
    """

    def fetch_one(model, pk_field, latest_field=None):
        """Return the first *model* row for this URN, newest first if *latest_field* is given."""
        try:
            query = db.query(model).filter(getattr(model, pk_field) == urn)
            if latest_field:
                query = query.order_by(getattr(model, latest_field).desc())
            return query.first()
        except Exception as e:
            import logging
            logging.getLogger(__name__).error("safe_query failed for %s: %s", model.__name__, e)
            # Roll back before the remaining lookups reuse this session.
            db.rollback()
            return None

    def iso_or_none(value):
        """Format a date-like value as ISO text; falsy values become None."""
        return value.isoformat() if value else None

    def project(row, field_names):
        """Copy the named attributes of *row* into a dict, in the given order."""
        return {field: getattr(row, field) for field in field_names}

    sections = {}

    # Latest Ofsted inspection
    inspection = fetch_one(FactOfstedInspection, "urn", "inspection_date")
    if inspection:
        ofsted = {
            "framework": inspection.framework,
            "inspection_date": iso_or_none(inspection.inspection_date),
        }
        ofsted.update(project(inspection, (
            "inspection_type",
            "overall_effectiveness",
            "quality_of_education",
            "behaviour_attitudes",
            "personal_development",
            "leadership_management",
            "early_years_provision",
            "sixth_form_provision",
        )))
        ofsted["previous_overall"] = None  # Not available in new schema
        ofsted.update(project(inspection, (
            "rc_safeguarding_met",
            "rc_inclusion",
            "rc_curriculum_teaching",
            "rc_achievement",
            "rc_attendance_behaviour",
            "rc_personal_development",
            "rc_leadership_governance",
            "rc_early_years",
            "rc_sixth_form",
            "report_url",
        )))
    else:
        ofsted = None
    sections["ofsted"] = ofsted

    # Parent View
    survey = fetch_one(FactParentView, "urn")
    if survey:
        parent_view = {"survey_date": iso_or_none(survey.survey_date)}
        parent_view.update(project(survey, (
            "total_responses",
            "q_happy_pct",
            "q_safe_pct",
            "q_behaviour_pct",
            "q_bullying_pct",
            "q_communication_pct",
            "q_progress_pct",
            "q_teaching_pct",
            "q_information_pct",
            "q_curriculum_pct",
            "q_future_pct",
            "q_leadership_pct",
            "q_wellbeing_pct",
            "q_recommend_pct",
        )))
    else:
        parent_view = None
    sections["parent_view"] = parent_view

    # Census (fact_pupil_characteristics — minimal until census columns are verified)
    sections["census"] = None

    # Admissions (latest year)
    admissions = fetch_one(FactAdmissions, "urn", "year")
    sections["admissions"] = (
        project(admissions, (
            "year",
            "school_phase",
            "published_admission_number",
            "total_applications",
            "first_preference_applications",
            "first_preference_offers",
            "first_preference_offer_pct",
            "oversubscribed",
        ))
        if admissions
        else None
    )

    # SEN detail — not available in current marts
    sections["sen_detail"] = None

    # Phonics — no school-level data on EES
    sections["phonics"] = None

    # Deprivation
    deprivation = fetch_one(FactDeprivation, "urn")
    sections["deprivation"] = (
        project(deprivation, ("lsoa_code", "idaci_score", "idaci_decile"))
        if deprivation
        else None
    )

    # Finance (latest year)
    finance = fetch_one(FactFinance, "urn", "year")
    sections["finance"] = (
        project(finance, (
            "year",
            "per_pupil_spend",
            "staff_cost_pct",
            "teacher_cost_pct",
            "support_staff_cost_pct",
            "premises_cost_pct",
        ))
        if finance
        else None
    )

    return sections