feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,17 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "tap-uk-gias"
version = "0.1.0"
description = "Singer tap for UK GIAS (Get Information About Schools) bulk data"
requires-python = ">=3.10"
dependencies = [
"singer-sdk~=0.39",
"requests>=2.31",
"pandas>=2.0",
]
[project.scripts]
tap-uk-gias = "tap_uk_gias.tap:TapUKGIAS.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-gias: Singer tap for GIAS bulk establishment data."""

View File

@@ -0,0 +1,135 @@
"""GIAS Singer tap — extracts bulk establishment CSV from GIAS API."""
from __future__ import annotations
from datetime import date
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
GIAS_URL_TEMPLATE = (
"https://ea-edubase-api-prod.azurewebsites.net"
"/edubase/downloads/public/edubasealldata{date}.csv"
)
class GIASEstablishmentsStream(Stream):
    """Stream: GIAS establishments (one row per URN)."""
    name = "gias_establishments"
    primary_keys = ["URN"]
    replication_key = None  # full-table refresh; GIAS bulk file has no cursor
    # Schema is wide (~250 columns); we declare key columns and pass through the rest
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("EstablishmentName", th.StringType),
        th.Property("TypeOfEstablishment (name)", th.StringType),
        th.Property("PhaseOfEducation (name)", th.StringType),
        th.Property("LA (code)", th.IntegerType),
        th.Property("LA (name)", th.StringType),
        th.Property("EstablishmentNumber", th.IntegerType),
        th.Property("EstablishmentStatus (name)", th.StringType),
        th.Property("Postcode", th.StringType),
    ).to_dict()
    def get_records(self, context):
        """Download the GIAS bulk CSV and yield one record per establishment.

        Honors the ``download_url`` config option declared on the tap
        (previously declared but ignored); otherwise falls back to the
        dated public bulk-download URL for today's date.

        Rows with a missing or non-numeric URN are skipped. Columns the
        schema declares as integers (``LA (code)``, ``EstablishmentNumber``)
        are cast to int, or ``None`` when blank/non-numeric, so emitted
        records match the declared schema types.
        """
        import io
        import pandas as pd
        import requests
        # Config override takes precedence over the default dated URL.
        url = self.config.get("download_url") or GIAS_URL_TEMPLATE.format(
            date=date.today().strftime("%Y%m%d")
        )
        self.logger.info("Downloading GIAS bulk CSV: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()
        # dtype=str + keep_default_na=False: every cell is a plain string,
        # blanks stay "" rather than becoming NaN.
        df = pd.read_csv(
            io.StringIO(resp.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )
        for _, row in df.iterrows():
            record = row.to_dict()
            # URN is the primary key — skip rows where it is absent or malformed.
            try:
                record["URN"] = int(record["URN"])
            except (ValueError, KeyError):
                continue
            # Align integer-typed columns with the declared schema.
            for col in ("LA (code)", "EstablishmentNumber"):
                value = record.get(col, "")
                record[col] = int(value) if value.isdigit() else None
            yield record
class GIASLinksStream(Stream):
    """Stream: GIAS school links (predecessor/successor)."""
    name = "gias_links"
    primary_keys = ["URN", "LinkURN"]
    replication_key = None  # links feed has no incremental cursor
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("LinkURN", th.IntegerType, required=True),
        th.Property("LinkType", th.StringType),
        th.Property("LinkEstablishedDate", th.StringType),
    ).to_dict()
    def get_records(self, context):
        """Fetch the GIAS links CSV and yield one record per link row.

        Rows whose URN or LinkURN is missing or non-numeric are skipped,
        since both form the composite primary key.
        """
        import io
        import pandas as pd
        import requests
        links_url = (
            "https://ea-edubase-api-prod.azurewebsites.net"
            "/edubase/downloads/public/links_edubasealldata.csv"
        )
        self.logger.info("Downloading GIAS links CSV: %s", links_url)
        response = requests.get(links_url, timeout=120)
        response.raise_for_status()
        # Read every cell as a string; blanks stay "" instead of NaN.
        frame = pd.read_csv(
            io.StringIO(response.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )
        for _, raw in frame.iterrows():
            link = raw.to_dict()
            try:
                link["URN"], link["LinkURN"] = int(link["URN"]), int(link["LinkURN"])
            except (ValueError, KeyError):
                # Malformed key columns — drop the row.
                continue
            yield link
class TapUKGIAS(Tap):
    """Singer tap for UK GIAS data."""
    name = "tap-uk-gias"
    config_jsonschema = th.PropertiesList(
        th.Property(
            "download_url",
            th.StringType,
            description="Override GIAS CSV download URL",
        ),
    ).to_dict()
    def discover_streams(self):
        """Return one instance of each stream this tap exposes."""
        stream_types = (GIASEstablishmentsStream, GIASLinksStream)
        return [stream_cls(self) for stream_cls in stream_types]
# Allow invoking the tap directly (e.g. `python tap.py --config ...`)
# in addition to the console-script entry point.
if __name__ == "__main__":
    TapUKGIAS.cli()