feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Replaces the hand-rolled integrator with a production-grade ELT pipeline using Meltano (Singer taps), dbt Core (medallion architecture), and Apache Airflow (orchestration). Adds Typesense for search and PostGIS for geospatial queries. - 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI) - dbt project: 12 staging, 5 intermediate, 12 mart models - 3 Airflow DAGs (daily/monthly/annual schedules) - Typesense sync + batch geocoding scripts - docker-compose: add Airflow, Typesense; upgrade to PostGIS - Portainer stack definition matching live deployment topology Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
17
pipeline/plugins/extractors/tap-uk-ees/pyproject.toml
Normal file
17
pipeline/plugins/extractors/tap-uk-ees/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Packaging metadata for the tap-uk-ees Singer tap.

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-ees"
version = "0.1.0"
description = "Singer tap for UK Explore Education Statistics (KS2, KS4, Census, Admissions, Phonics)"
requires-python = ">=3.10"
dependencies = [
    # Singer SDK pinned to the 0.39.x minor series (~= allows patch bumps).
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
]

# Console entry point: `tap-uk-ees` runs the SDK-generated CLI.
[project.scripts]
tap-uk-ees = "tap_uk_ees.tap:TapUKEES.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-ees: Singer tap for Explore Education Statistics API."""
|
||||
154
pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
Normal file
154
pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
# EES content API: resolves publications to releases and serves data files.
CONTENT_API_BASE = "https://content.explore-education-statistics.service.gov.uk/api"

# Official statistics API base; not referenced by the streams in this module.
STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"

# Default request timeout, in seconds, for metadata lookups.
TIMEOUT = 120
|
||||
|
||||
|
||||
def get_content_release_id(publication_slug: str) -> str:
    """Resolve *publication_slug* to its most recent release ID.

    Hits the EES content API's ``releases/latest`` endpoint and returns
    the ``id`` field from the JSON payload.

    Raises:
        requests.HTTPError: if the API responds with an error status.
    """
    endpoint = f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest"
    response = requests.get(endpoint, timeout=TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    return payload["id"]
|
||||
|
||||
|
||||
def download_release_zip(release_id: str) -> zipfile.ZipFile:
    """Download all data files of a release as an in-memory ZIP archive.

    Args:
        release_id: EES release identifier (from ``get_content_release_id``).

    Returns:
        An open :class:`zipfile.ZipFile` backed by an in-memory buffer.

    Raises:
        requests.HTTPError: if the download request fails.
    """
    url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
    # The whole archive is buffered via resp.content, so the original
    # stream=True flag was a no-op and has been removed. Generous timeout:
    # release ZIPs can be large.
    resp = requests.get(url, timeout=300)
    resp.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(resp.content))
|
||||
|
||||
|
||||
class EESDatasetStream(Stream):
    """Base stream that extracts one CSV from an EES release ZIP.

    Subclasses configure:
      * ``_publication_slug`` — which EES publication to resolve, and
      * ``_file_keyword`` — substring used to pick the CSV inside the ZIP.
    """

    # Full-table extraction: EES republishes releases wholesale, so there
    # is no incremental bookmark to track.
    replication_key = None
    _publication_slug: str = ""
    _file_keyword: str = ""

    def get_records(self, context):
        """Yield one dict per school-level row of the matched CSV.

        Args:
            context: Singer SDK partition context (unused here).
        """
        # Lazy import: keeps tap start-up/discovery cheap when no records
        # are actually extracted.
        import pandas as pd

        release_id = get_content_release_id(self._publication_slug)
        self.logger.info(
            "Downloading release %s for %s",
            release_id,
            self._publication_slug,
        )
        zf = download_release_zip(release_id)

        # Prefer the first CSV whose name contains the keyword; otherwise
        # fall back to the first CSV at all — but warn, since the fallback
        # may not be the intended dataset.
        csv_names = [n for n in zf.namelist() if n.endswith(".csv")]
        target = next(
            (n for n in csv_names if self._file_keyword.lower() in n.lower()),
            None,
        )
        if target is None and csv_names:
            self.logger.warning(
                "No CSV matching %r in release ZIP; falling back to %s",
                self._file_keyword,
                csv_names[0],
            )
            target = csv_names[0]

        if not target:
            self.logger.warning("No CSV found in release ZIP")
            return

        self.logger.info("Reading %s from ZIP", target)
        with zf.open(target) as f:
            # dtype=str + keep_default_na=False preserve suppression codes
            # (e.g. "NA"-style strings) that pandas would otherwise coerce
            # to NaN.
            df = pd.read_csv(f, dtype=str, keep_default_na=False)

        # Keep only school-level rows when the file mixes geographies.
        if "geographic_level" in df.columns:
            df = df[df["geographic_level"] == "School"]

        for _, row in df.iterrows():
            yield row.to_dict()
|
||||
|
||||
|
||||
class EESKS2Stream(EESDatasetStream):
    """Key stage 2 attainment — school-level rows."""

    name = "ees_ks2"
    _publication_slug = "key-stage-2-attainment"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESKS4Stream(EESDatasetStream):
    """Key stage 4 performance (revised) — school-level rows."""

    name = "ees_ks4"
    _publication_slug = "key-stage-4-performance-revised"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESCensusStream(EESDatasetStream):
    """School pupils and their characteristics (census) — school-level rows."""

    name = "ees_census"
    _publication_slug = "school-pupils-and-their-characteristics"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESAdmissionsStream(EESDatasetStream):
    """School applications and offers (admissions) — school-level rows."""

    name = "ees_admissions"
    _publication_slug = "secondary-and-primary-school-applications-and-offers"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESPhonicsStream(EESDatasetStream):
    """Phonics screening check and KS1 assessments — school-level rows."""

    name = "ees_phonics"
    _publication_slug = "phonics-screening-check-and-key-stage-1-assessments"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class TapUKEES(Tap):
    """Singer tap exposing the five EES publication streams."""

    name = "tap-uk-ees"

    # NOTE(review): base_url is accepted in config but not read by any
    # stream in this module — they use the hard-coded CONTENT_API_BASE.
    # Confirm whether it should be wired through.
    config_jsonschema = th.PropertiesList(
        th.Property("base_url", th.StringType, description="EES API base URL"),
    ).to_dict()

    def discover_streams(self):
        """Instantiate one stream per supported EES dataset."""
        stream_types = (
            EESKS2Stream,
            EESKS4Stream,
            EESCensusStream,
            EESAdmissionsStream,
            EESPhonicsStream,
        )
        return [stream_cls(self) for stream_cls in stream_types]
|
||||
|
||||
|
||||
if __name__ == "__main__":
    # Allow running the module directly; delegates to the SDK-built CLI.
    TapUKEES.cli()
|
||||
Reference in New Issue
Block a user