feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,17 @@
# Build backend configuration for the tap-uk-ees Singer tap package.
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
# Package metadata (PEP 621).
[project]
name = "tap-uk-ees"
version = "0.1.0"
description = "Singer tap for UK Explore Education Statistics (KS2, KS4, Census, Admissions, Phonics)"
requires-python = ">=3.10"
dependencies = [
"singer-sdk~=0.39",
"requests>=2.31",
"pandas>=2.0",
]
# Console entry point: installs a `tap-uk-ees` command wired to the Singer CLI.
[project.scripts]
tap-uk-ees = "tap_uk_ees.tap:TapUKEES.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-ees: Singer tap for Explore Education Statistics API."""

View File

@@ -0,0 +1,154 @@
"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data."""
from __future__ import annotations
import io
import zipfile
import requests
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
CONTENT_API_BASE = (
"https://content.explore-education-statistics.service.gov.uk/api"
)
STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"
TIMEOUT = 120
def get_content_release_id(publication_slug: str) -> str:
    """Look up the most recent release ID for *publication_slug*.

    Queries the EES content API's ``releases/latest`` endpoint and raises
    ``requests.HTTPError`` on a non-2xx response.
    """
    latest_url = (
        f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest"
    )
    response = requests.get(latest_url, timeout=TIMEOUT)
    response.raise_for_status()
    payload = response.json()
    return payload["id"]
def download_release_zip(release_id: str) -> zipfile.ZipFile:
    """Download all data files for a release as an in-memory ZIP archive.

    The whole archive is buffered into memory via ``resp.content``, so the
    original ``stream=True`` bought nothing while leaving the connection
    open; the context manager guarantees the response is closed even if
    ``raise_for_status`` fires.

    Raises:
        requests.HTTPError: on a non-2xx response.
    """
    url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
    # Generous timeout: release archives can be large.
    with requests.get(url, timeout=300) as resp:
        resp.raise_for_status()
        return zipfile.ZipFile(io.BytesIO(resp.content))
class EESDatasetStream(Stream):
    """Base stream for an EES dataset extracted from a release ZIP.

    Subclasses set ``_publication_slug`` (the EES publication to download)
    and ``_file_keyword`` (a substring used to pick the matching CSV inside
    the release archive).
    """

    # Full-table sync: EES releases replace the data wholesale.
    replication_key = None
    _publication_slug: str = ""
    _file_keyword: str = ""

    def get_records(self, context):
        """Yield one dict per school-level row of the release CSV.

        Downloads the latest release for ``_publication_slug``, picks the
        CSV whose name contains ``_file_keyword`` (falling back to the first
        CSV), and yields each row as a plain dict of strings.
        """
        # Deferred import keeps tap start-up (e.g. --discover) light.
        import pandas as pd

        release_id = get_content_release_id(self._publication_slug)
        self.logger.info(
            "Downloading release %s for %s",
            release_id,
            self._publication_slug,
        )
        zf = download_release_zip(release_id)
        csv_names = [n for n in zf.namelist() if n.endswith(".csv")]
        # Prefer the CSV matching our keyword; otherwise fall back to the
        # first CSV in the archive.
        keyword = self._file_keyword.lower()
        target = next((n for n in csv_names if keyword in n.lower()), None)
        if target is None and csv_names:
            target = csv_names[0]
        if target is None:
            self.logger.warning("No CSV found in release ZIP")
            return
        self.logger.info("Reading %s from ZIP", target)
        with zf.open(target) as f:
            # Read everything as strings so identifiers keep leading zeros;
            # keep_default_na=False preserves empty cells as "".
            df = pd.read_csv(f, dtype=str, keep_default_na=False)
        # Filter to school-level data when the level column exists.
        if "geographic_level" in df.columns:
            df = df[df["geographic_level"] == "School"]
        # to_dict("records") is far faster than iterrows() and yields the
        # same per-row dicts for an all-string frame.
        yield from df.to_dict(orient="records")
class EESKS2Stream(EESDatasetStream):
    """Key stage 2 attainment results, school-level CSV."""

    name = "ees_ks2"
    _publication_slug = "key-stage-2-attainment"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESKS4Stream(EESDatasetStream):
    """Key stage 4 performance (revised), school-level CSV."""

    name = "ees_ks4"
    _publication_slug = "key-stage-4-performance-revised"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESCensusStream(EESDatasetStream):
    """School census (pupils and their characteristics), school-level CSV."""

    name = "ees_census"
    _publication_slug = "school-pupils-and-their-characteristics"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESAdmissionsStream(EESDatasetStream):
    """School applications and offers (admissions), school-level CSV."""

    name = "ees_admissions"
    _publication_slug = "secondary-and-primary-school-applications-and-offers"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class EESPhonicsStream(EESDatasetStream):
    """Phonics screening check and KS1 assessments, school-level CSV."""

    name = "ees_phonics"
    _publication_slug = "phonics-screening-check-and-key-stage-1-assessments"
    _file_keyword = "school"
    primary_keys = ["urn", "time_period"]
    schema = th.PropertiesList(
        *(th.Property(col, th.StringType, required=True)
          for col in ("urn", "time_period"))
    ).to_dict()
class TapUKEES(Tap):
    """Singer tap for UK Explore Education Statistics."""

    name = "tap-uk-ees"
    config_jsonschema = th.PropertiesList(
        th.Property("base_url", th.StringType, description="EES API base URL"),
    ).to_dict()

    # Every dataset stream this tap exposes, in catalog order.
    _stream_classes = (
        EESKS2Stream,
        EESKS4Stream,
        EESCensusStream,
        EESAdmissionsStream,
        EESPhonicsStream,
    )

    def discover_streams(self):
        """Instantiate one stream object per registered dataset class."""
        return [stream_cls(self) for stream_cls in self._stream_classes]
# Allow running the tap directly (e.g. `python tap.py --discover`).
if __name__ == "__main__":
    TapUKEES.cli()