feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,18 @@
# Build backend configuration (PEP 517/518).
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

# Package metadata (PEP 621).
[project]
name = "tap-uk-ofsted"
version = "0.1.0"
description = "Singer tap for UK Ofsted Management Information"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",  # Singer tap framework (Tap/Stream base classes)
    "requests>=2.31",    # downloads the GOV.UK MI attachments
    "pandas>=2.0",       # parses the CSV/ODS data files
    "odfpy>=1.4",        # ODS engine used by pandas.read_excel(engine="odf")
]

# Console entry point: running `tap-uk-ofsted` invokes the tap's CLI.
[project.scripts]
tap-uk-ofsted = "tap_uk_ofsted.tap:TapUKOfsted.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-ofsted: Singer tap for Ofsted Management Information."""

View File

@@ -0,0 +1,176 @@
"""Ofsted MI Singer tap — extracts inspection records from GOV.UK CSV/ODS."""
from __future__ import annotations
import io
import re
import requests
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
# GOV.UK statistics page that hosts the monthly Ofsted MI attachments;
# scraped by discover_csv_url() when no direct URL is configured.
GOV_UK_PAGE = (
    "https://www.gov.uk/government/statistical-data-sets/"
    "monthly-management-information-ofsteds-school-inspections-outcomes"
)

# Internal record field → candidate source column headers, in priority order
# (first header found in the file wins). Multiple candidates per field handle
# both the current and older Ofsted file formats, whose headers differ.
COLUMN_PRIORITY = {
    "urn": ["URN", "Urn", "urn"],
    "inspection_date": [
        "Inspection start date of latest OEIF graded inspection",
        "Inspection start date",
        "Inspection date",
    ],
    "inspection_type": [
        "Inspection type of latest OEIF graded inspection",
        "Inspection type",
    ],
    "event_type_grouping": [
        "Event type grouping",
        "Inspection type grouping",
    ],
    "overall_effectiveness": [
        "Latest OEIF overall effectiveness",
        "Overall effectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education",
    ],
    "behaviour_and_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development",
    ],
    "effectiveness_of_leadership_and_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Effectiveness of leadership and management",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision",
        "Early years provision (where applicable)",
    ],
    "sixth_form_provision": [
        "Latest OEIF sixth form provision",
        "Sixth form provision (where applicable)",
    ],
}
def discover_csv_url() -> str | None:
    """Scrape the GOV.UK page to find the latest MI download link.

    Attachment links on GOV.UK point at assets.publishing.service.gov.uk.
    CSV is preferred; ODS is the fallback. The first matching link on the
    page is assumed to be the most recent attachment.

    Returns:
        The attachment URL, or ``None`` when no CSV/ODS link is found.

    Raises:
        requests.HTTPError: if the GOV.UK page request fails.
    """
    resp = requests.get(GOV_UK_PAGE, timeout=30)
    resp.raise_for_status()
    # Same scrape for both formats — only the extension differs, and the
    # tuple order encodes the CSV-before-ODS preference.
    for ext in ("csv", "ods"):
        matches = re.findall(
            rf'href="(https://assets\.publishing\.service\.gov\.uk/[^"]+\.{ext})"',
            resp.text,
        )
        if matches:
            return matches[0]
    return None
class OfstedInspectionsStream(Stream):
    """Stream of Ofsted inspection records parsed from the MI CSV/ODS file."""

    name = "ofsted_inspections"
    primary_keys = ["urn", "inspection_date"]
    replication_key = None  # full refresh each run; source file has no bookmark

    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("inspection_date", th.StringType),
        th.Property("inspection_type", th.StringType),
        th.Property("event_type_grouping", th.StringType),
        th.Property("overall_effectiveness", th.StringType),
        th.Property("quality_of_education", th.StringType),
        th.Property("behaviour_and_attitudes", th.StringType),
        th.Property("personal_development", th.StringType),
        th.Property("effectiveness_of_leadership_and_management", th.StringType),
        th.Property("early_years_provision", th.StringType),
        th.Property("sixth_form_provision", th.StringType),
        th.Property("report_url", th.StringType),
    ).to_dict()

    def _resolve_columns(self, df_columns: list[str]) -> dict[str, str]:
        """Map internal field names to actual CSV column names.

        For each field in COLUMN_PRIORITY, picks the first candidate header
        present in *df_columns*; fields with no match are simply absent from
        the returned mapping.
        """
        mapping: dict[str, str] = {}
        for field, candidates in COLUMN_PRIORITY.items():
            for candidate in candidates:
                if candidate in df_columns:
                    mapping[field] = candidate
                    break
        return mapping

    @staticmethod
    def _find_header_row(text: str) -> int:
        """Return the 0-based index of the CSV header row within *text*.

        The MI files sometimes carry preamble lines before the real header,
        so scan the first 20 lines for one whose comma-separated cells
        include a URN column. Matching "urn" as a whole cell (not as a
        substring) avoids false positives on preamble words such as
        "return" or "Saturn". Falls back to row 0 when nothing matches.
        """
        for i, line in enumerate(text.split("\n")[:20]):
            cells = [cell.strip().strip('"').casefold() for cell in line.split(",")]
            if "urn" in cells:
                return i
        return 0

    def get_records(self, context):
        """Download the MI file and yield one record per inspection row.

        Uses the configured ``mi_url`` when present, otherwise discovers the
        latest attachment from GOV.UK. Rows whose URN cannot be cast to an
        integer (including blank/preamble rows) are skipped.
        """
        import pandas as pd

        url = self.config.get("mi_url") or discover_csv_url()
        if not url:
            self.logger.error("Could not discover Ofsted MI download URL")
            return
        self.logger.info("Downloading Ofsted MI: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()
        if url.endswith(".ods"):
            # odfpy backs the "odf" engine; dtype=str avoids pandas guessing.
            df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
        else:
            # utf-8-sig strips a BOM if present; keep_default_na=False keeps
            # empty cells as "" rather than NaN.
            text = resp.content.decode("utf-8-sig", errors="replace")
            df = pd.read_csv(
                io.StringIO(text),
                skiprows=self._find_header_row(text),
                dtype=str,
                keep_default_na=False,
            )
        col_map = self._resolve_columns(list(df.columns))
        for _, row in df.iterrows():
            record = {field: row.get(col, None) for field, col in col_map.items()}
            # URN is the integer primary key; drop rows where it is missing
            # or non-numeric (e.g. blank trailing lines).
            try:
                record["urn"] = int(record["urn"])
            except (ValueError, KeyError, TypeError):
                continue
            yield record
class TapUKOfsted(Tap):
    """Singer tap exposing UK Ofsted Management Information as one stream."""

    name = "tap-uk-ofsted"

    # Single optional setting: a direct file URL that bypasses the GOV.UK
    # page scrape performed by the stream.
    config_jsonschema = th.PropertiesList(
        th.Property(
            "mi_url",
            th.StringType,
            description="Direct URL to Ofsted MI file",
        ),
    ).to_dict()

    def discover_streams(self):
        """Instantiate and return the tap's streams."""
        streams = [OfstedInspectionsStream(self)]
        return streams


if __name__ == "__main__":
    TapUKOfsted.cli()