feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Replaces the hand-rolled integrator with a production-grade ELT pipeline using Meltano (Singer taps), dbt Core (medallion architecture), and Apache Airflow (orchestration). Adds Typesense for search and PostGIS for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
17
pipeline/plugins/extractors/tap-uk-gias/pyproject.toml
Normal file
17
pipeline/plugins/extractors/tap-uk-gias/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
[build-system]
|
||||
requires = ["setuptools>=68", "wheel"]
|
||||
build-backend = "setuptools.build_meta"
|
||||
|
||||
[project]
|
||||
name = "tap-uk-gias"
|
||||
version = "0.1.0"
|
||||
description = "Singer tap for UK GIAS (Get Information About Schools) bulk data"
|
||||
requires-python = ">=3.10"
|
||||
dependencies = [
|
||||
"singer-sdk~=0.39",
|
||||
"requests>=2.31",
|
||||
"pandas>=2.0",
|
||||
]
|
||||
|
||||
[project.scripts]
|
||||
tap-uk-gias = "tap_uk_gias.tap:TapUKGIAS.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-gias: Singer tap for GIAS bulk establishment data."""
|
||||
135
pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
Normal file
135
pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""GIAS Singer tap — extracts bulk establishment CSV from GIAS API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
# URL template for the dated bulk establishments CSV; `{date}` is filled in
# by the establishments stream with a YYYYMMDD string.
GIAS_URL_TEMPLATE = (
    "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/"
    "edubasealldata{date}.csv"
)
|
||||
|
||||
|
||||
class GIASEstablishmentsStream(Stream):
    """Stream: GIAS establishments (one row per URN).

    Downloads the bulk establishments CSV in a single request and emits one
    record per CSV row. Rows whose ``URN`` cannot be parsed as an integer are
    dropped (and counted in a warning log line).
    """

    name = "gias_establishments"
    primary_keys = ["URN"]
    replication_key = None

    # Schema is wide (~250 columns); we declare key columns and pass through the rest
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("EstablishmentName", th.StringType),
        th.Property("TypeOfEstablishment (name)", th.StringType),
        th.Property("PhaseOfEducation (name)", th.StringType),
        th.Property("LA (code)", th.IntegerType),
        th.Property("LA (name)", th.StringType),
        th.Property("EstablishmentNumber", th.IntegerType),
        th.Property("EstablishmentStatus (name)", th.StringType),
        th.Property("Postcode", th.StringType),
    ).to_dict()

    # Optional columns the schema declares as integers. The CSV is read as
    # text (dtype=str), so these must be cast explicitly or the emitted
    # records would not match the declared schema.
    _OPTIONAL_INT_COLUMNS = ("LA (code)", "EstablishmentNumber")

    @staticmethod
    def _int_or_none(value):
        """Return *value* as an int, or None when it is blank or non-numeric."""
        try:
            return int(value)
        except (TypeError, ValueError):
            return None

    def get_records(self, context):
        """Download GIAS CSV and yield rows.

        The URL is taken from the tap's ``download_url`` setting when it is
        set; otherwise it is built from ``GIAS_URL_TEMPLATE`` using today's
        date (YYYYMMDD).

        Args:
            context: Stream partition context supplied by the SDK; unused.

        Yields:
            One dict per CSV row, keyed by CSV column name. ``URN`` is an int,
            the optional integer columns are int or None, and every other
            value is the raw CSV text.

        Raises:
            requests.HTTPError: If the download returns an error status.
        """
        import io

        import pandas as pd
        import requests

        # Honour the configured override; fall back to the dated default URL.
        url = self.config.get("download_url")
        if not url:
            today = date.today().strftime("%Y%m%d")
            url = GIAS_URL_TEMPLATE.format(date=today)

        self.logger.info("Downloading GIAS bulk CSV: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()

        # NOTE(review): `encoding` is passed together with an already-decoded
        # text buffer, so decoding is effectively done by `resp.text` --
        # confirm the source file's real encoding and BOM handling.
        df = pd.read_csv(
            io.StringIO(resp.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )

        columns = list(df.columns)
        skipped = 0
        # itertuples(name=None) yields plain tuples lazily, avoiding the
        # per-row Series construction that iterrows() performs.
        for values in df.itertuples(index=False, name=None):
            record = dict(zip(columns, values))
            try:
                record["URN"] = int(record["URN"])
            except (ValueError, KeyError):
                skipped += 1
                continue
            for column in self._OPTIONAL_INT_COLUMNS:
                if column in record:
                    record[column] = self._int_or_none(record[column])
            yield record

        if skipped:
            # Surface dropped rows (including a missing URN column) instead
            # of discarding them silently.
            self.logger.warning(
                "Skipped %d GIAS establishment rows without a valid integer URN",
                skipped,
            )
|
||||
|
||||
|
||||
class GIASLinksStream(Stream):
    """Stream: GIAS school links (predecessor/successor).

    Downloads the links CSV in a single request and emits one record per CSV
    row. Rows whose ``URN`` or ``LinkURN`` cannot be parsed as an integer are
    dropped (and counted in a warning log line).
    """

    name = "gias_links"
    primary_keys = ["URN", "LinkURN"]
    replication_key = None

    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("LinkURN", th.IntegerType, required=True),
        th.Property("LinkType", th.StringType),
        th.Property("LinkEstablishedDate", th.StringType),
    ).to_dict()

    def get_records(self, context):
        """Download GIAS links CSV and yield rows.

        Args:
            context: Stream partition context supplied by the SDK; unused.

        Yields:
            One dict per CSV row, keyed by CSV column name. ``URN`` and
            ``LinkURN`` are ints; every other value is the raw CSV text.

        Raises:
            requests.HTTPError: If the download returns an error status.
        """
        import io

        import pandas as pd
        import requests

        url = (
            "https://ea-edubase-api-prod.azurewebsites.net"
            "/edubase/downloads/public/links_edubasealldata.csv"
        )

        self.logger.info("Downloading GIAS links CSV: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()

        # NOTE(review): `encoding` is passed together with an already-decoded
        # text buffer, so decoding is effectively done by `resp.text` --
        # confirm the source file's real encoding and BOM handling.
        df = pd.read_csv(
            io.StringIO(resp.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )

        columns = list(df.columns)
        skipped = 0
        # itertuples(name=None) yields plain tuples lazily, avoiding the
        # per-row Series construction that iterrows() performs.
        for values in df.itertuples(index=False, name=None):
            record = dict(zip(columns, values))
            try:
                record["URN"] = int(record["URN"])
                record["LinkURN"] = int(record["LinkURN"])
            except (ValueError, KeyError):
                skipped += 1
                continue
            yield record

        if skipped:
            # Surface dropped rows (including a missing key column) instead
            # of discarding them silently.
            self.logger.warning(
                "Skipped %d GIAS link rows without valid integer URN/LinkURN",
                skipped,
            )
|
||||
|
||||
|
||||
class TapUKGIAS(Tap):
    """Singer tap for UK GIAS data.

    Exposes two streams: the establishments stream and the school links
    stream.
    """

    name = "tap-uk-gias"

    # Settings accepted by the tap; `download_url` is optional.
    config_jsonschema = th.PropertiesList(
        th.Property(
            "download_url",
            th.StringType,
            description="Override GIAS CSV download URL",
        ),
    ).to_dict()

    def discover_streams(self):
        """Return one instance of each stream class, bound to this tap."""
        stream_classes = (GIASEstablishmentsStream, GIASLinksStream)
        return [stream_cls(self) for stream_cls in stream_classes]
|
||||
|
||||
|
||||
# Run the tap's command-line interface when this module is executed directly.
if __name__ == "__main__":
    TapUKGIAS.cli()
|
||||
Reference in New Issue
Block a user