feat(pipeline): implement parent-view, fbit, idaci Singer taps + align staging/mart models
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Port extraction logic from integrator scripts into Singer SDK taps:
- tap-uk-parent-view: scrapes Ofsted open data portal, parses survey responses (14 questions)
- tap-uk-fbit: queries FBIT API per-URN with rate limiting, computes per-pupil spend
- tap-uk-idaci: downloads IoD2019 XLSX, batch-resolves postcodes→LSOAs via postcodes.io

Update dbt models to match actual tap output schemas:
- stg_idaci now includes URN (tap does the postcode→LSOA→school join)
- stg_parent_view expanded from 8 to 13 question columns
- fact_deprivation simplified (no longer needs postcode→LSOA join in dbt)
- fact_parent_view expanded to include all 13 question metrics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,10 +1,33 @@
|
||||
"""Parent View Singer tap — extracts survey data from Ofsted Parent View portal."""
|
||||
"""Parent View Singer tap — extracts survey data from Ofsted Parent View open data portal."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
from datetime import date
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
|
||||
|
||||
|
||||
def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
|
||||
"""Sum 'Strongly agree' + 'Agree' percentages for a question."""
|
||||
strongly = row.get(f"{q_col_base} - Strongly agree %") or row.get(f"{q_col_base} - Strongly Agree %")
|
||||
agree = row.get(f"{q_col_base} - Agree %")
|
||||
try:
|
||||
total = 0.0
|
||||
if pd.notna(strongly):
|
||||
total += float(strongly)
|
||||
if pd.notna(agree):
|
||||
total += float(agree)
|
||||
return round(total, 1) if total > 0 else None
|
||||
except (TypeError, ValueError):
|
||||
return None
|
||||
|
||||
|
||||
class ParentViewStream(Stream):
|
||||
"""Stream: Parent View survey responses per school."""
|
||||
@@ -19,27 +42,106 @@ class ParentViewStream(Stream):
|
||||
th.Property("total_responses", th.IntegerType),
|
||||
th.Property("q_happy_pct", th.NumberType),
|
||||
th.Property("q_safe_pct", th.NumberType),
|
||||
th.Property("q_progress_pct", th.NumberType),
|
||||
th.Property("q_well_taught_pct", th.NumberType),
|
||||
th.Property("q_well_led_pct", th.NumberType),
|
||||
th.Property("q_behaviour_pct", th.NumberType),
|
||||
th.Property("q_bullying_pct", th.NumberType),
|
||||
th.Property("q_communication_pct", th.NumberType),
|
||||
th.Property("q_progress_pct", th.NumberType),
|
||||
th.Property("q_teaching_pct", th.NumberType),
|
||||
th.Property("q_information_pct", th.NumberType),
|
||||
th.Property("q_curriculum_pct", th.NumberType),
|
||||
th.Property("q_future_pct", th.NumberType),
|
||||
th.Property("q_leadership_pct", th.NumberType),
|
||||
th.Property("q_wellbeing_pct", th.NumberType),
|
||||
th.Property("q_recommend_pct", th.NumberType),
|
||||
).to_dict()
|
||||
|
||||
def _discover_download_url(self) -> str:
    """Scrape the open data page and return the first data-file link.

    Fetches ``OPEN_DATA_PAGE`` and takes the first ``href`` ending in
    ``.xlsx``, ``.csv`` or ``.zip`` (case-insensitive).

    Returns:
        The absolute URL of the first matching link.

    Raises:
        requests.HTTPError: If the page request returns an error status.
        RuntimeError: If the page contains no matching link.
    """
    # Function-scope imports keep this method self-contained.
    from html import unescape
    from urllib.parse import urljoin

    resp = requests.get(OPEN_DATA_PAGE, timeout=30)
    resp.raise_for_status()

    urls = re.findall(r'href="([^"]+\.(?:xlsx|csv|zip))"', resp.text, re.IGNORECASE)
    if not urls:
        msg = "No download link found on Parent View open data page"
        raise RuntimeError(msg)

    # urljoin resolves every href form against the page URL: absolute links
    # pass through, "/path" and "path" become site URLs, and "//host/path"
    # inherits the scheme. Plain concatenation only handled "/path".
    # unescape turns HTML entities such as "&amp;" back into literal text.
    return urljoin(OPEN_DATA_PAGE, unescape(urls[0]))
|
||||
|
||||
def get_records(self, context):
    """Yield one Parent View record per school row in the data file.

    The file URL comes from the ``download_url`` config value when set,
    otherwise from ``_discover_download_url()``. The file is read as Excel
    when the URL ends in ``.xlsx`` (any case) and as latin-1 CSV otherwise.

    Args:
        context: Stream partition context; not used here.

    Yields:
        Dicts with ``urn``, ``survey_date`` (today's date, ISO format),
        ``total_responses`` and one ``q_*_pct`` value per mapped question.
        Nothing is yielded (an error is logged) when no URN column exists.

    Raises:
        requests.HTTPError: If the download returns an error status.
    """
    # A configured direct URL takes precedence over scraping the portal.
    url = self.config.get("download_url") or self._discover_download_url()
    self.logger.info("Downloading Parent View data: %s", url)

    resp = requests.get(url, timeout=120)
    resp.raise_for_status()

    # Discovery matches extensions case-insensitively, so compare likewise.
    # NOTE(review): ".zip" links also match discovery but fall through to
    # the CSV reader here — confirm whether zip archives need unpacking.
    if url.lower().endswith(".xlsx"):
        df = pd.read_excel(io.BytesIO(resp.content))
    else:
        df = pd.read_csv(
            io.BytesIO(resp.content),
            encoding="latin-1",
            low_memory=False,
        )

    # Normalise URN column (str() guards against non-string header labels).
    urn_col = next((c for c in df.columns if str(c).strip().upper() == "URN"), None)
    if urn_col is None:
        self.logger.error("URN column not found. Columns: %s", list(df.columns)[:20])
        return

    df = df.rename(columns={urn_col: "urn"})
    df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
    df = df.dropna(subset=["urn"])

    # Find total responses column
    resp_col = next(
        (c for c in df.columns if "total" in str(c).lower() and "respon" in str(c).lower()),
        None,
    )

    today = date.today().isoformat()

    # Output field -> question column prefix. Q6 is not mapped to any field.
    question_fields = (
        ("q_happy_pct", "Q1"),
        ("q_safe_pct", "Q2"),
        ("q_behaviour_pct", "Q3"),
        ("q_bullying_pct", "Q4"),
        ("q_communication_pct", "Q5"),
        ("q_progress_pct", "Q7"),
        ("q_teaching_pct", "Q8"),
        ("q_information_pct", "Q9"),
        ("q_curriculum_pct", "Q10"),
        ("q_future_pct", "Q11"),
        ("q_leadership_pct", "Q12"),
        ("q_wellbeing_pct", "Q13"),
        ("q_recommend_pct", "Q14"),
    )

    for _, row in df.iterrows():
        try:
            urn = int(row["urn"])
        except (ValueError, TypeError, OverflowError):
            # Skip rows whose URN cannot be represented as an integer.
            continue

        total = None
        if resp_col is not None and pd.notna(row.get(resp_col)):
            try:
                total = int(row[resp_col])
            except (ValueError, TypeError):
                # Leave the count empty when the cell is not a whole number.
                total = None

        record = {
            "urn": urn,
            "survey_date": today,
            "total_responses": total,
        }
        for field, question in question_fields:
            record[field] = _positive_pct(row, question)
        yield record
|
||||
|
||||
|
||||
class TapUKParentView(Tap):
    """Singer tap for UK Ofsted Parent View."""

    name = "tap-uk-parent-view"

    # Single optional setting: a direct link to the Parent View data file.
    config_jsonschema = th.PropertiesList(
        th.Property("download_url", th.StringType, description="Direct URL to Parent View data file"),
    ).to_dict()

    def discover_streams(self):
        """Return the tap's streams: a single ``ParentViewStream``."""
        return [ParentViewStream(self)]
|
||||
|
||||
Reference in New Issue
Block a user