feat(pipeline): implement parent-view, fbit, idaci Singer taps + align staging/mart models

Port extraction logic from integrator scripts into Singer SDK taps:
- tap-uk-parent-view: scrapes Ofsted open data portal, parses survey responses (14 questions)
- tap-uk-fbit: queries FBIT API per-URN with rate limiting, computes per-pupil spend
- tap-uk-idaci: downloads IoD2019 XLSX, batch-resolves postcodes→LSOAs via postcodes.io

Update dbt models to match actual tap output schemas:
- stg_idaci now includes URN (tap does the postcode→LSOA→school join)
- stg_parent_view expanded from 8 to 13 question columns
- fact_deprivation simplified (no longer needs postcode→LSOA join in dbt)
- fact_parent_view expanded to include all 13 question metrics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
2026-03-26 10:38:07 +00:00
parent 904093ea8a
commit 97d975114a
9 changed files with 360 additions and 60 deletions
@@ -1,10 +1,33 @@
-"""Parent View Singer tap — extracts survey data from Ofsted Parent View portal."""
+"""Parent View Singer tap — extracts survey data from Ofsted Parent View open data portal."""

 from __future__ import annotations

+import io
+import re
+from datetime import date
+
+import pandas as pd
+import requests
 from singer_sdk import Stream, Tap
 from singer_sdk import typing as th

+OPEN_DATA_PAGE = "https://parentview.ofsted.gov.uk/open-data"
+
+
+def _positive_pct(row: pd.Series, q_col_base: str) -> float | None:
+    """Return the combined 'Strongly agree' + 'Agree' percentage for a question.
+
+    Args:
+        row: One survey row, keyed by source column name.
+        q_col_base: Question column prefix, e.g. ``"Q1"``.
+
+    Returns:
+        The sum of the available percentages rounded to one decimal place
+        (a genuine ``0.0`` is preserved), or ``None`` when neither column
+        holds a value or a value cannot be converted to ``float``.
+    """
+    strongly = row.get(f"{q_col_base} - Strongly agree %")
+    if strongly is None:
+        # Header absent under this spelling; try the other capitalisation.
+        strongly = row.get(f"{q_col_base} - Strongly Agree %")
+    agree = row.get(f"{q_col_base} - Agree %")
+    try:
+        present = [float(value) for value in (strongly, agree) if pd.notna(value)]
+    except (TypeError, ValueError):
+        return None
+    # Distinguish "no data" (None) from a real 0 % positive result (0.0).
+    if not present:
+        return None
+    return round(sum(present), 1)
+

 class ParentViewStream(Stream):
    """Stream: Parent View survey responses per school."""
@@ -19,27 +42,106 @@ class ParentViewStream(Stream):
        th.Property("total_responses", th.IntegerType),
        th.Property("q_happy_pct", th.NumberType),
        th.Property("q_safe_pct", th.NumberType),
-        th.Property("q_progress_pct", th.NumberType),
-        th.Property("q_well_taught_pct", th.NumberType),
-        th.Property("q_well_led_pct", th.NumberType),
        th.Property("q_behaviour_pct", th.NumberType),
        th.Property("q_bullying_pct", th.NumberType),
+        th.Property("q_communication_pct", th.NumberType),
+        th.Property("q_progress_pct", th.NumberType),
+        th.Property("q_teaching_pct", th.NumberType),
+        th.Property("q_information_pct", th.NumberType),
+        th.Property("q_curriculum_pct", th.NumberType),
+        th.Property("q_future_pct", th.NumberType),
+        th.Property("q_leadership_pct", th.NumberType),
+        th.Property("q_wellbeing_pct", th.NumberType),
        th.Property("q_recommend_pct", th.NumberType),
    ).to_dict()

+    def _discover_download_url(self) -> str:
+        """Scrape the open data page and return the first data-file link.
+
+        Returns:
+            Absolute URL of the first ``.xlsx``/``.csv``/``.zip`` href found
+            on the page.
+
+        Raises:
+            requests.HTTPError: If the open data page returns an error status.
+            RuntimeError: If the page contains no matching download link.
+        """
+        from urllib.parse import urljoin
+
+        resp = requests.get(OPEN_DATA_PAGE, timeout=30)
+        resp.raise_for_status()
+        urls = re.findall(r'href="([^"]+\.(?:xlsx|csv|zip))"', resp.text, re.IGNORECASE)
+        if not urls:
+            msg = "No download link found on Parent View open data page"
+            raise RuntimeError(msg)
+        # urljoin resolves root-relative, page-relative and protocol-relative
+        # hrefs against the page URL; absolute URLs pass through unchanged.
+        return urljoin(OPEN_DATA_PAGE, urls[0])
+
    def get_records(self, context):
-        # TODO: Implement Parent View data extraction
-        # Source: Ofsted Parent View portal XLSX/CSV download
-        # URL discovery requires scraping parentview.ofsted.gov.uk
-        self.logger.warning("Parent View extraction not yet implemented")
-        return iter([])
+        url = self._discover_download_url()
+        self.logger.info("Downloading Parent View data: %s", url)
+
+        resp = requests.get(url, timeout=120)
+        resp.raise_for_status()
+
+        if url.endswith(".xlsx"):
+            df = pd.read_excel(io.BytesIO(resp.content))
+        else:
+            df = pd.read_csv(
+                io.BytesIO(resp.content),
+                encoding="latin-1",
+                low_memory=False,
+            )
+
+        # Normalise URN column
+        urn_col = next((c for c in df.columns if c.strip().upper() == "URN"), None)
+        if not urn_col:
+            self.logger.error("URN column not found. Columns: %s", list(df.columns)[:20])
+            return
+
+        df.rename(columns={urn_col: "urn"}, inplace=True)
+        df["urn"] = pd.to_numeric(df["urn"], errors="coerce")
+        df = df.dropna(subset=["urn"])
+
+        # Find total responses column
+        resp_col = next(
+            (c for c in df.columns if "total" in c.lower() and "respon" in c.lower()),
+            None,
+        )
+
+        today = date.today().isoformat()
+
+        for _, row in df.iterrows():
+            try:
+                urn = int(row["urn"])
+            except (ValueError, TypeError):
+                continue
+
+            total = None
+            if resp_col and pd.notna(row.get(resp_col)):
+                try:
+                    total = int(row[resp_col])
+                except (ValueError, TypeError):
+                    pass
+
+            yield {
+                "urn": urn,
+                "survey_date": today,
+                "total_responses": total,
+                "q_happy_pct": _positive_pct(row, "Q1"),
+                "q_safe_pct": _positive_pct(row, "Q2"),
+                "q_behaviour_pct": _positive_pct(row, "Q3"),
+                "q_bullying_pct": _positive_pct(row, "Q4"),
+                "q_communication_pct": _positive_pct(row, "Q5"),
+                "q_progress_pct": _positive_pct(row, "Q7"),
+                "q_teaching_pct": _positive_pct(row, "Q8"),
+                "q_information_pct": _positive_pct(row, "Q9"),
+                "q_curriculum_pct": _positive_pct(row, "Q10"),
+                "q_future_pct": _positive_pct(row, "Q11"),
+                "q_leadership_pct": _positive_pct(row, "Q12"),
+                "q_wellbeing_pct": _positive_pct(row, "Q13"),
+                "q_recommend_pct": _positive_pct(row, "Q14"),
+            }


 class TapUKParentView(Tap):
     """Singer tap for UK Ofsted Parent View."""
 
     name = "tap-uk-parent-view"
-    config_jsonschema = th.PropertiesList().to_dict()
+    # Settings schema: declares a single string property, ``download_url``.
+    config_jsonschema = th.PropertiesList(
+        th.Property("download_url", th.StringType, description="Direct URL to Parent View data file"),
+    ).to_dict()
 
     def discover_streams(self):
+        """Return the tap's streams: a single ``ParentViewStream`` bound to this tap."""
         return [ParentViewStream(self)]