feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -1,9 +1,9 @@
version: '3.8'
services:
# PostgreSQL Database
# PostgreSQL Database with PostGIS
db:
image: postgres:16-alpine
image: postgis/postgis:16-3.4-alpine
container_name: schoolcompare_db
environment:
POSTGRES_USER: schoolcompare
@@ -33,6 +33,8 @@ services:
DATABASE_URL: postgresql://schoolcompare:schoolcompare@db:5432/schoolcompare
PYTHONUNBUFFERED: 1
ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
volumes:
- ./data:/app/data:ro
depends_on:
@@ -58,6 +60,8 @@ services:
NODE_ENV: production
NEXT_PUBLIC_API_URL: http://localhost:8000/api
FASTAPI_URL: http://backend:80/api
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_SEARCH_KEY:-changeme}
depends_on:
backend:
condition: service_healthy
@@ -71,32 +75,49 @@ services:
retries: 3
start_period: 40s
# Kestra — workflow orchestrator (UI at http://localhost:8080)
kestra:
image: kestra/kestra:latest
container_name: schoolcompare_kestra
command: server standalone
# Typesense — search engine
# Exposes the HTTP API on port 8108 and keeps its index in the
# `typesense_data` named volume mounted at /data.
typesense:
  container_name: schoolcompare_typesense
  image: typesense/typesense:27.1
  restart: unless-stopped
  networks:
    - schoolcompare-network
  ports:
    - "8108:8108"
  volumes:
    - typesense_data:/data
  environment:
    TYPESENSE_DATA_DIR: /data
    # Falls back to the literal `changeme` when TYPESENSE_API_KEY is unset.
    TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
  healthcheck:
    # Probes the /health endpoint from inside the container.
    # NOTE(review): this relies on `curl` being present in the image — confirm.
    test:
      - "CMD"
      - "curl"
      - "-sf"
      - "http://localhost:8108/health"
    start_period: 10s
    interval: 15s
    timeout: 5s
    retries: 5
# Apache Airflow — workflow orchestrator (UI at http://localhost:8080)
airflow-webserver:
image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
container_name: schoolcompare_airflow_webserver
command: airflow webserver --port 8080
ports:
- "8080:8080"
environment: &airflow-env
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://schoolcompare:schoolcompare@db:5432/schoolcompare
AIRFLOW__CORE__DAGS_FOLDER: /opt/pipeline/dags
AIRFLOW__CORE__LOAD_EXAMPLES: "false"
AIRFLOW__WEBSERVER__EXPOSE_CONFIG: "false"
PG_HOST: db
PG_PORT: "5432"
PG_USER: schoolcompare
PG_PASSWORD: schoolcompare
PG_DATABASE: schoolcompare
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
volumes:
- kestra_storage:/app/storage
environment:
KESTRA_CONFIGURATION: |
datasources:
postgres:
url: jdbc:postgresql://db:5432/kestra
driverClassName: org.postgresql.Driver
username: schoolcompare
password: schoolcompare
kestra:
repository:
type: postgres
queue:
type: postgres
storage:
type: local
local:
base-path: /app/storage
- ./pipeline/dags:/opt/pipeline/dags:ro
depends_on:
db:
condition: service_healthy
@@ -104,53 +125,42 @@ services:
- schoolcompare-network
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:8081/health | grep -q '\"status\":\"UP\"'"]
interval: 15s
timeout: 10s
retries: 10
start_period: 60s
# One-shot container: imports flow YAMLs into Kestra after it's healthy
kestra-init:
image: privaterepo.sitaru.org/tudor/school_compare-kestra-init:latest
container_name: schoolcompare_kestra_init
environment:
KESTRA_URL: http://kestra:8080
KESTRA_USER: ${KESTRA_USER:-}
KESTRA_PASSWORD: ${KESTRA_PASSWORD:-}
depends_on:
kestra:
condition: service_healthy
networks:
- schoolcompare-network
restart: no
# Data integrator — Python microservice called by Kestra
integrator:
image: privaterepo.sitaru.org/tudor/school_compare-integrator:latest
container_name: schoolcompare_integrator
ports:
- "8001:8001"
environment:
DATABASE_URL: postgresql://schoolcompare:schoolcompare@db:5432/schoolcompare
DATA_DIR: /data
BACKEND_URL: http://backend:80
ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
PYTHONUNBUFFERED: 1
volumes:
- supplementary_data:/data
depends_on:
db:
condition: service_healthy
networks:
- schoolcompare-network
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
retries: 5
start_period: 60s
# Airflow scheduler — runs `airflow scheduler` against the DAGs mounted
# read-only from ./pipeline/dags, sharing the `airflow-env` environment
# anchor with the other Airflow services.
airflow-scheduler:
  image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
  container_name: schoolcompare_airflow_scheduler
  command: airflow scheduler
  environment: *airflow-env
  volumes:
    - ./pipeline/dags:/opt/pipeline/dags:ro
  depends_on:
    db:
      condition: service_healthy
    # Do not start scheduling until the one-shot airflow-init container has
    # exited successfully; a healthy Postgres alone does not mean the Airflow
    # metadata schema exists yet.
    airflow-init:
      condition: service_completed_successfully
  networks:
    - schoolcompare-network
  restart: unless-stopped
# One-shot: initialise Airflow metadata DB
# Runs `airflow db migrate`, then creates the initial Admin user, and exits
# (restart is disabled). The admin credentials come from AIRFLOW_ADMIN_USER /
# AIRFLOW_ADMIN_PASSWORD via Compose interpolation and default to admin/admin
# when those variables are unset; override them outside local development.
airflow-init:
  image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
  container_name: schoolcompare_airflow_init
  command:
    - bash
    - "-c"
    - |
      # Abort with a non-zero exit code if the migration fails. Previously
      # `migrate && create || true` grouped as `(migrate && create) || true`,
      # so a failed migration was reported as success.
      set -e
      airflow db migrate
      # User creation stays best-effort: `|| true` now covers this command only.
      # Values are single-quoted for bash; a value containing a single quote
      # would need escaping.
      airflow users create \
        --username '${AIRFLOW_ADMIN_USER:-admin}' \
        --password '${AIRFLOW_ADMIN_PASSWORD:-admin}' \
        --firstname Admin --lastname User --role Admin \
        --email admin@localhost || true
  environment: *airflow-env
  depends_on:
    db:
      condition: service_healthy
  networks:
    - schoolcompare-network
  # Quoted so YAML 1.1 parsers do not read the value as boolean false.
  restart: "no"
networks:
schoolcompare-network:
@@ -158,5 +168,4 @@ networks:
volumes:
postgres_data:
kestra_storage:
supplementary_data:
typesense_data: