feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,17 @@
[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"
[project]
name = "tap-uk-gias"
version = "0.1.0"
description = "Singer tap for UK GIAS (Get Information About Schools) bulk data"
requires-python = ">=3.10"
dependencies = [
"singer-sdk~=0.39",
"requests>=2.31",
"pandas>=2.0",
]
[project.scripts]
tap-uk-gias = "tap_uk_gias.tap:TapUKGIAS.cli"

View File

@@ -0,0 +1 @@
"""tap-uk-gias: Singer tap for GIAS bulk establishment data."""

View File

@@ -0,0 +1,135 @@
"""GIAS Singer tap — extracts bulk establishment CSV from GIAS API."""
from __future__ import annotations
from datetime import date
from singer_sdk import Stream, Tap
from singer_sdk import typing as th
GIAS_URL_TEMPLATE = (
"https://ea-edubase-api-prod.azurewebsites.net"
"/edubase/downloads/public/edubasealldata{date}.csv"
)
class GIASEstablishmentsStream(Stream):
    """Stream: GIAS establishments (one row per URN)."""
    name = "gias_establishments"
    primary_keys = ["URN"]
    replication_key = None  # full-table refresh; GIAS bulk file has no cursor
    # Schema is wide (~250 columns); we declare key columns and pass through the rest
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("EstablishmentName", th.StringType),
        th.Property("TypeOfEstablishment (name)", th.StringType),
        th.Property("PhaseOfEducation (name)", th.StringType),
        th.Property("LA (code)", th.IntegerType),
        th.Property("LA (name)", th.StringType),
        th.Property("EstablishmentNumber", th.IntegerType),
        th.Property("EstablishmentStatus (name)", th.StringType),
        th.Property("Postcode", th.StringType),
    ).to_dict()
    def get_records(self, context):
        """Download the GIAS bulk CSV and yield one record per establishment.

        Honors the ``download_url`` config option declared on the tap
        (previously declared but ignored); otherwise falls back to the
        dated public bulk-download URL for today's date.

        Rows with a missing or non-numeric URN are skipped. Columns the
        schema declares as integers (``LA (code)``, ``EstablishmentNumber``)
        are cast to int, or ``None`` when blank/non-numeric, so emitted
        records match the declared schema types.
        """
        import io
        import pandas as pd
        import requests
        # Config override takes precedence over the default dated URL.
        url = self.config.get("download_url") or GIAS_URL_TEMPLATE.format(
            date=date.today().strftime("%Y%m%d")
        )
        self.logger.info("Downloading GIAS bulk CSV: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()
        # dtype=str + keep_default_na=False: every cell is a plain string,
        # blanks stay "" rather than becoming NaN.
        df = pd.read_csv(
            io.StringIO(resp.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )
        for _, row in df.iterrows():
            record = row.to_dict()
            # URN is the primary key — skip rows where it is absent or malformed.
            try:
                record["URN"] = int(record["URN"])
            except (ValueError, KeyError):
                continue
            # Align integer-typed columns with the declared schema.
            for col in ("LA (code)", "EstablishmentNumber"):
                value = record.get(col, "")
                record[col] = int(value) if value.isdigit() else None
            yield record
class GIASLinksStream(Stream):
    """Stream: GIAS school links (predecessor/successor)."""
    name = "gias_links"
    primary_keys = ["URN", "LinkURN"]
    replication_key = None  # links feed has no incremental cursor
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("LinkURN", th.IntegerType, required=True),
        th.Property("LinkType", th.StringType),
        th.Property("LinkEstablishedDate", th.StringType),
    ).to_dict()
    def get_records(self, context):
        """Fetch the GIAS links CSV and yield one record per link row.

        Rows whose URN or LinkURN is missing or non-numeric are skipped,
        since both form the composite primary key.
        """
        import io
        import pandas as pd
        import requests
        links_url = (
            "https://ea-edubase-api-prod.azurewebsites.net"
            "/edubase/downloads/public/links_edubasealldata.csv"
        )
        self.logger.info("Downloading GIAS links CSV: %s", links_url)
        response = requests.get(links_url, timeout=120)
        response.raise_for_status()
        # Read every cell as a string; blanks stay "" instead of NaN.
        frame = pd.read_csv(
            io.StringIO(response.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )
        for _, raw in frame.iterrows():
            link = raw.to_dict()
            try:
                link["URN"], link["LinkURN"] = int(link["URN"]), int(link["LinkURN"])
            except (ValueError, KeyError):
                # Malformed key columns — drop the row.
                continue
            yield link
class TapUKGIAS(Tap):
    """Singer tap for UK GIAS data."""
    name = "tap-uk-gias"
    config_jsonschema = th.PropertiesList(
        th.Property(
            "download_url",
            th.StringType,
            description="Override GIAS CSV download URL",
        ),
    ).to_dict()
    def discover_streams(self):
        """Return one instance of each stream this tap exposes."""
        stream_types = (GIASEstablishmentsStream, GIASLinksStream)
        return [stream_cls(self) for stream_cls in stream_types]
# Allow invoking the tap directly (e.g. `python tap.py --config ...`)
# in addition to the console-script entry point.
if __name__ == "__main__":
    TapUKGIAS.cli()