feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Replaces the hand-rolled integrator with a production-grade ELT pipeline
using Meltano (Singer taps), dbt Core (medallion architecture), and
Apache Airflow (orchestration). Adds Typesense for search and PostGIS
for geospatial queries.

- 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI)
- dbt project: 12 staging, 5 intermediate, 12 mart models
- 3 Airflow DAGs (daily/monthly/annual schedules)
- Typesense sync + batch geocoding scripts
- docker-compose: add Airflow, Typesense; upgrade to PostGIS
- Portainer stack definition matching live deployment topology

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 08:37:53 +00:00
parent 8aca0a7a53
commit 8f02b5125e
65 changed files with 2822 additions and 72 deletions

View File

@@ -0,0 +1,289 @@
# Portainer Stack Definition for School Compare
#
# Portainer environment variables (set in Portainer UI -> Stack -> Environment):
# DB_USERNAME — PostgreSQL username
# DB_PASSWORD — PostgreSQL password
# DB_DATABASE_NAME — PostgreSQL database name
# ADMIN_API_KEY — Backend admin API key
# TYPESENSE_API_KEY — Typesense admin API key
# TYPESENSE_SEARCH_KEY — Typesense search-only key (exposed to frontend)
# AIRFLOW_ADMIN_PASSWORD — Airflow web UI admin password
# KESTRA_USER — Kestra UI username (optional)
# KESTRA_PASSWORD — Kestra UI password (optional)
services:
# ── PostgreSQL ────────────────────────────────────────────────────────
sc_database:
container_name: sc_postgres
image: postgis/postgis:18-3.6-alpine
environment:
POSTGRES_PASSWORD: ${DB_PASSWORD}
POSTGRES_USER: ${DB_USERNAME}
POSTGRES_DB: ${DB_DATABASE_NAME}
volumes:
- postgres_data:/var/lib/postgresql
shm_size: 128mb
networks:
backend: {}
macvlan:
ipv4_address: 10.0.1.189
healthcheck:
test: ["CMD-SHELL", "pg_isready -U postgres"]
interval: 10s
timeout: 5s
retries: 5
start_period: 10s
restart: unless-stopped
# ── FastAPI Backend ───────────────────────────────────────────────────
backend:
image: privaterepo.sitaru.org/tudor/school_compare-backend:latest
container_name: schoolcompare_backend
environment:
DATABASE_URL: postgresql://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
PYTHONUNBUFFERED: 1
ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:80/api/data-info"]
interval: 30s
timeout: 10s
retries: 3
start_period: 30s
# ── Next.js Frontend ──────────────────────────────────────────────────
frontend:
image: privaterepo.sitaru.org/tudor/school_compare-frontend:latest
container_name: schoolcompare_nextjs
environment:
- NODE_ENV=production
- NEXT_PUBLIC_API_URL=http://localhost:8000/api
- FASTAPI_URL=http://backend:80/api
- TYPESENSE_URL=http://typesense:8108
- TYPESENSE_API_KEY=${TYPESENSE_SEARCH_KEY:-changeme}
depends_on:
backend:
condition: service_healthy
networks:
backend: {}
macvlan:
ipv4_address: 10.0.1.150
restart: unless-stopped
healthcheck:
test: ["CMD", "node", "-e", "require('http').get('http://localhost:3000/', (r) => {process.exit(r.statusCode === 200 ? 0 : 1)})"]
interval: 30s
timeout: 10s
retries: 3
start_period: 40s
# ── Typesense Search Engine ───────────────────────────────────────────
typesense:
image: typesense/typesense:27.1
container_name: schoolcompare_typesense
environment:
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
TYPESENSE_DATA_DIR: /data
volumes:
- typesense_data:/data
networks:
- backend
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-sf", "http://localhost:8108/health"]
interval: 15s
timeout: 5s
retries: 5
start_period: 10s
# ── Kestra — workflow orchestrator (legacy, kept during migration) ────
kestra:
image: kestra/kestra:latest
container_name: schoolcompare_kestra
command: server standalone
ports:
- "8090:8080"
volumes:
- kestra_storage:/app/storage
environment:
KESTRA_CONFIGURATION: |
datasources:
postgres:
url: jdbc:postgresql://sc_database:5432/kestra
driverClassName: org.postgresql.Driver
username: ${DB_USERNAME}
password: ${DB_PASSWORD}
kestra:
repository:
type: postgres
queue:
type: postgres
storage:
type: local
local:
base-path: /app/storage
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: unless-stopped
healthcheck:
test: ["CMD-SHELL", "curl -sf http://localhost:8081/health | grep -q '\"status\":\"UP\"'"]
interval: 15s
timeout: 10s
retries: 10
start_period: 60s
# ── Kestra init (legacy, kept during migration) ──────────────────────
kestra-init:
image: privaterepo.sitaru.org/tudor/school_compare-kestra-init:latest
container_name: schoolcompare_kestra_init
environment:
KESTRA_URL: http://kestra:8080
KESTRA_USER: ${KESTRA_USER:-}
KESTRA_PASSWORD: ${KESTRA_PASSWORD:-}
depends_on:
kestra:
condition: service_healthy
networks:
- backend
restart: "no"
# ── Data integrator (legacy, kept during migration) ──────────────────
integrator:
image: privaterepo.sitaru.org/tudor/school_compare-integrator:latest
container_name: schoolcompare_integrator
ports:
- "8001:8001"
environment:
DATABASE_URL: postgresql://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
DATA_DIR: /data
BACKEND_URL: http://backend:80
ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
PYTHONUNBUFFERED: 1
volumes:
- supplementary_data:/data
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8001/health"]
interval: 30s
timeout: 10s
retries: 3
start_period: 15s
# ── Airflow Webserver (UI at :8080) ──────────────────────────────────
airflow-webserver:
image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
container_name: schoolcompare_airflow_webserver
command: airflow webserver --port 8080
ports:
- "8080:8080"
environment:
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
AIRFLOW__CORE__DAGS_FOLDER: /opt/pipeline/dags
AIRFLOW__CORE__LOAD_EXAMPLES: "false"
AIRFLOW__WEBSERVER__EXPOSE_CONFIG: "false"
PG_HOST: sc_database
PG_PORT: "5432"
PG_USER: ${DB_USERNAME}
PG_PASSWORD: ${DB_PASSWORD}
PG_DATABASE: ${DB_DATABASE_NAME}
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
volumes:
- airflow_dags:/opt/pipeline/dags:ro
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: unless-stopped
healthcheck:
test: ["CMD", "curl", "-f", "http://localhost:8080/health"]
interval: 30s
timeout: 10s
retries: 5
start_period: 60s
# ── Airflow Scheduler ────────────────────────────────────────────────
airflow-scheduler:
image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
container_name: schoolcompare_airflow_scheduler
command: airflow scheduler
environment:
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
AIRFLOW__CORE__DAGS_FOLDER: /opt/pipeline/dags
AIRFLOW__CORE__LOAD_EXAMPLES: "false"
AIRFLOW__WEBSERVER__EXPOSE_CONFIG: "false"
PG_HOST: sc_database
PG_PORT: "5432"
PG_USER: ${DB_USERNAME}
PG_PASSWORD: ${DB_PASSWORD}
PG_DATABASE: ${DB_DATABASE_NAME}
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
volumes:
- airflow_dags:/opt/pipeline/dags:ro
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: unless-stopped
# ── Airflow DB Init (one-shot) ───────────────────────────────────────
airflow-init:
image: privaterepo.sitaru.org/tudor/school_compare-pipeline:latest
container_name: schoolcompare_airflow_init
command: >
bash -c "
airflow db migrate &&
airflow users create
--username admin
--password $${AIRFLOW_ADMIN_PASSWORD:-admin}
--firstname Admin
--lastname User
--role Admin
--email admin@localhost || true
"
environment:
AIRFLOW__CORE__EXECUTOR: LocalExecutor
AIRFLOW__DATABASE__SQL_ALCHEMY_CONN: postgresql+psycopg2://${DB_USERNAME}:${DB_PASSWORD}@sc_database:5432/${DB_DATABASE_NAME}
AIRFLOW__CORE__DAGS_FOLDER: /opt/pipeline/dags
AIRFLOW__CORE__LOAD_EXAMPLES: "false"
AIRFLOW_ADMIN_PASSWORD: ${AIRFLOW_ADMIN_PASSWORD:-admin}
depends_on:
sc_database:
condition: service_healthy
networks:
- backend
restart: "no"
networks:
backend:
driver: bridge
macvlan:
external:
name: macvlan
volumes:
postgres_data:
kestra_storage:
supplementary_data:
typesense_data:
airflow_dags: