feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,18 @@
# Build backend configuration (PEP 517/518).
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

# Package metadata (PEP 621).
[project]
name = "tap-uk-ofsted"
version = "0.1.0"
description = "Singer tap for UK Ofsted Management Information"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",  # Singer tap framework (Tap/Stream base classes)
    "requests>=2.31",    # downloads the GOV.UK MI attachments
    "pandas>=2.0",       # parses the CSV/ODS data files
    "odfpy>=1.4",        # ODS engine used by pandas.read_excel(engine="odf")
]

# Console entry point: running `tap-uk-ofsted` invokes the tap's CLI.
[project.scripts]
tap-uk-ofsted = "tap_uk_ofsted.tap:TapUKOfsted.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-ofsted: Singer tap for Ofsted Management Information."""

View File

@@ -0,0 +1,176 @@
"""Ofsted MI Singer tap — extracts inspection records from GOV.UK CSV/ODS."""
from __future__ import annotations
import io
import re
import requests
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
# GOV.UK statistics page that hosts the monthly Ofsted MI attachments;
# scraped by discover_csv_url() when no direct URL is configured.
GOV_UK_PAGE = (
    "https://www.gov.uk/government/statistical-data-sets/"
    "monthly-management-information-ofsteds-school-inspections-outcomes"
)

# Internal record field → candidate source column headers, in priority order
# (first header found in the file wins). Multiple candidates per field handle
# both the current and older Ofsted file formats, whose headers differ.
COLUMN_PRIORITY = {
    "urn": ["URN", "Urn", "urn"],
    "inspection_date": [
        "Inspection start date of latest OEIF graded inspection",
        "Inspection start date",
        "Inspection date",
    ],
    "inspection_type": [
        "Inspection type of latest OEIF graded inspection",
        "Inspection type",
    ],
    "event_type_grouping": [
        "Event type grouping",
        "Inspection type grouping",
    ],
    "overall_effectiveness": [
        "Latest OEIF overall effectiveness",
        "Overall effectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education",
    ],
    "behaviour_and_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development",
    ],
    "effectiveness_of_leadership_and_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Effectiveness of leadership and management",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision",
        "Early years provision (where applicable)",
    ],
    "sixth_form_provision": [
        "Latest OEIF sixth form provision",
        "Sixth form provision (where applicable)",
    ],
}
def discover_csv_url() -> str | None:
    """Scrape the GOV.UK page to find the latest MI download link.

    Attachment links on GOV.UK point at assets.publishing.service.gov.uk.
    CSV is preferred; ODS is the fallback. The first matching link on the
    page is assumed to be the most recent attachment.

    Returns:
        The attachment URL, or ``None`` when no CSV/ODS link is found.

    Raises:
        requests.HTTPError: if the GOV.UK page request fails.
    """
    resp = requests.get(GOV_UK_PAGE, timeout=30)
    resp.raise_for_status()
    # Same scrape for both formats — only the extension differs, and the
    # tuple order encodes the CSV-before-ODS preference.
    for ext in ("csv", "ods"):
        matches = re.findall(
            rf'href="(https://assets\.publishing\.service\.gov\.uk/[^"]+\.{ext})"',
            resp.text,
        )
        if matches:
            return matches[0]
    return None
class OfstedInspectionsStream(Stream):
    """Stream of Ofsted inspection records parsed from the MI CSV/ODS file."""

    name = "ofsted_inspections"
    primary_keys = ["urn", "inspection_date"]
    replication_key = None  # full refresh each run; source file has no bookmark

    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("inspection_date", th.StringType),
        th.Property("inspection_type", th.StringType),
        th.Property("event_type_grouping", th.StringType),
        th.Property("overall_effectiveness", th.StringType),
        th.Property("quality_of_education", th.StringType),
        th.Property("behaviour_and_attitudes", th.StringType),
        th.Property("personal_development", th.StringType),
        th.Property("effectiveness_of_leadership_and_management", th.StringType),
        th.Property("early_years_provision", th.StringType),
        th.Property("sixth_form_provision", th.StringType),
        th.Property("report_url", th.StringType),
    ).to_dict()

    def _resolve_columns(self, df_columns: list[str]) -> dict[str, str]:
        """Map internal field names to actual CSV column names.

        For each field in COLUMN_PRIORITY, picks the first candidate header
        present in *df_columns*; fields with no match are simply absent from
        the returned mapping.
        """
        mapping: dict[str, str] = {}
        for field, candidates in COLUMN_PRIORITY.items():
            for candidate in candidates:
                if candidate in df_columns:
                    mapping[field] = candidate
                    break
        return mapping

    @staticmethod
    def _find_header_row(text: str) -> int:
        """Return the 0-based index of the CSV header row within *text*.

        The MI files sometimes carry preamble lines before the real header,
        so scan the first 20 lines for one whose comma-separated cells
        include a URN column. Matching "urn" as a whole cell (not as a
        substring) avoids false positives on preamble words such as
        "return" or "Saturn". Falls back to row 0 when nothing matches.
        """
        for i, line in enumerate(text.split("\n")[:20]):
            cells = [cell.strip().strip('"').casefold() for cell in line.split(",")]
            if "urn" in cells:
                return i
        return 0

    def get_records(self, context):
        """Download the MI file and yield one record per inspection row.

        Uses the configured ``mi_url`` when present, otherwise discovers the
        latest attachment from GOV.UK. Rows whose URN cannot be cast to an
        integer (including blank/preamble rows) are skipped.
        """
        import pandas as pd

        url = self.config.get("mi_url") or discover_csv_url()
        if not url:
            self.logger.error("Could not discover Ofsted MI download URL")
            return
        self.logger.info("Downloading Ofsted MI: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()
        if url.endswith(".ods"):
            # odfpy backs the "odf" engine; dtype=str avoids pandas guessing.
            df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
        else:
            # utf-8-sig strips a BOM if present; keep_default_na=False keeps
            # empty cells as "" rather than NaN.
            text = resp.content.decode("utf-8-sig", errors="replace")
            df = pd.read_csv(
                io.StringIO(text),
                skiprows=self._find_header_row(text),
                dtype=str,
                keep_default_na=False,
            )
        col_map = self._resolve_columns(list(df.columns))
        for _, row in df.iterrows():
            record = {field: row.get(col, None) for field, col in col_map.items()}
            # URN is the integer primary key; drop rows where it is missing
            # or non-numeric (e.g. blank trailing lines).
            try:
                record["urn"] = int(record["urn"])
            except (ValueError, KeyError, TypeError):
                continue
            yield record
class TapUKOfsted(Tap):
    """Singer tap exposing UK Ofsted Management Information as one stream."""

    name = "tap-uk-ofsted"

    # Single optional setting: a direct file URL that bypasses the GOV.UK
    # page scrape performed by the stream.
    config_jsonschema = th.PropertiesList(
        th.Property(
            "mi_url",
            th.StringType,
            description="Direct URL to Ofsted MI file",
        ),
    ).to_dict()

    def discover_streams(self):
        """Instantiate and return the tap's streams."""
        streams = [OfstedInspectionsStream(self)]
        return streams


if __name__ == "__main__":
    TapUKOfsted.cli()