feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,17 @@
# Build backend configuration for the tap-uk-ees Singer tap package.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
# Package metadata (PEP 621).
[project]
name = "tap-uk-ees"
version = "0.1.0"
description = "Singer tap for UK Explore Education Statistics (KS2, KS4, Census, Admissions, Phonics)"
requires-python = ">=3.10"
dependencies = [
"singer-sdk~=0.39",
"requests>=2.31",
"pandas>=2.0",
]
# Console entry point: installs a `tap-uk-ees` command wired to the Singer CLI.
[project.scripts]
tap-uk-ees = "tap_uk_ees.tap:TapUKEES.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-ees: Singer tap for Explore Education Statistics API."""

View File

@@ -0,0 +1,154 @@
"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data."""
from __future__ import annotations
import io
import zipfile
import requests
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
CONTENT_API_BASE = (
"https://content.explore-education-statistics.service.gov.uk/api"
)
STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"
TIMEOUT = 120
def get_content_release_id(publication_slug: str) -> str:
    """Look up the most recent release ID for *publication_slug*.

    Queries the EES content API's ``releases/latest`` endpoint and raises
    ``requests.HTTPError`` on a non-2xx response.
    """
    latest_url = (
        f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest"
    )
    response = requests.get(latest_url, timeout=TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    return payload["id"]
def download_release_zip(release_id: str) -> zipfile.ZipFile:
    """Download all data files for a release as an in-memory ZIP archive.

    The whole archive is buffered into memory via ``resp.content``, so the
    original ``stream=True`` bought nothing while leaving the connection
    open; the context manager guarantees the response is closed even if
    ``raise_for_status`` fires.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
    # Generous timeout: release archives can be large.
    with requests.get(url, timeout=300) as resp:
        resp.raise_for_status()
        return zipfile.ZipFile(io.BytesIO(resp.content))
class EESDatasetStream(Stream):
    """Base stream for an EES dataset extracted from a release ZIP.

    Subclasses set ``_publication_slug`` (the EES publication to download)
    and ``_file_keyword`` (a substring used to pick the matching CSV inside
    the release archive).
    """

    # Full-table sync: EES releases replace the data wholesale.
    replication_key = None
    _publication_slug: str = ""
    _file_keyword: str = ""

    def get_records(self, context):
        """Yield one dict per school-level row of the release CSV.

        Downloads the latest release for ``_publication_slug``, picks the
        CSV whose name contains ``_file_keyword`` (falling back to the first
        CSV), and yields each row as a plain dict of strings.
        """
        # Deferred import keeps tap start-up (e.g. --discover) light.
        import pandas as pd

        release_id = get_content_release_id(self._publication_slug)
        self.logger.info(
            "Downloading release %s for %s",
            release_id,
            self._publication_slug,
        )
        zf = download_release_zip(release_id)
        csv_names = [n for n in zf.namelist() if n.endswith(".csv")]
        # Prefer the CSV matching our keyword; otherwise fall back to the
        # first CSV in the archive.
        keyword = self._file_keyword.lower()
        target = next((n for n in csv_names if keyword in n.lower()), None)
        if target is None and csv_names:
            target = csv_names[0]
        if target is None:
            self.logger.warning("No CSV found in release ZIP")
            return
        self.logger.info("Reading %s from ZIP", target)
        with zf.open(target) as f:
            # Read everything as strings so identifiers keep leading zeros;
            # keep_default_na=False preserves empty cells as "".
            df = pd.read_csv(f, dtype=str, keep_default_na=False)
        # Filter to school-level data when the level column exists.
        if "geographic_level" in df.columns:
            df = df[df["geographic_level"] == "School"]
        # to_dict("records") is far faster than iterrows() and yields the
        # same per-row dicts for an all-string frame.
        yield from df.to_dict(orient="records")
class EESKS2Stream(EESDatasetStream):
    """Key stage 2 attainment results, school-level CSV."""

    name = "ees_ks2"
    _publication_slug = "key-stage-2-attainment"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESKS4Stream(EESDatasetStream):
    """Key stage 4 performance (revised), school-level CSV."""

    name = "ees_ks4"
    _publication_slug = "key-stage-4-performance-revised"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESCensusStream(EESDatasetStream):
    """School census (pupils and their characteristics), school-level CSV."""

    name = "ees_census"
    _publication_slug = "school-pupils-and-their-characteristics"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESAdmissionsStream(EESDatasetStream):
    """School applications and offers (admissions), school-level CSV."""

    name = "ees_admissions"
    _publication_slug = "secondary-and-primary-school-applications-and-offers"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESPhonicsStream(EESDatasetStream):
    """Phonics screening check and KS1 assessments, school-level CSV."""

    name = "ees_phonics"
    _publication_slug = "phonics-screening-check-and-key-stage-1-assessments"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class TapUKEES(Tap):
    """Singer tap for UK Explore Education Statistics."""

    name = "tap-uk-ees"
    config_jsonschema = th.PropertiesList(
        th.Property("base_url", th.StringType, description="EES API base URL"),
    ).to_dict()

    # Every dataset stream this tap exposes, in catalog order.
    _stream_classes = (
        EESKS2Stream,
        EESKS4Stream,
        EESCensusStream,
        EESAdmissionsStream,
        EESPhonicsStream,
    )

    def discover_streams(self):
        """Instantiate one stream object per registered dataset class."""
        return [stream_cls(self) for stream_cls in self._stream_classes]
# Allow running the tap directly (e.g. `python tap.py --discover`).
if __name__ == "__main__":
    TapUKEES.cli()