feat(pipeline): implement parent-view, fbit, idaci Singer taps + align staging/mart models
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Port extraction logic from integrator scripts into Singer SDK taps:
- tap-uk-parent-view: scrapes Ofsted open data portal, parses survey responses (14 questions)
- tap-uk-fbit: queries FBIT API per-URN with rate limiting, computes per-pupil spend
- tap-uk-idaci: downloads IoD2019 XLSX, batch-resolves postcodes→LSOAs via postcodes.io

Update dbt models to match actual tap output schemas:
- stg_idaci now includes URN (tap does the postcode→LSOA→school join)
- stg_parent_view expanded from 8 to 13 question columns
- fact_deprivation simplified (no longer needs postcode→LSOA join in dbt)
- fact_parent_view expanded to include all 13 question metrics

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -2,9 +2,16 @@
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import time
|
||||
from datetime import date
|
||||
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
API_BASE = "https://schools-financial-benchmarking.service.gov.uk/api"
|
||||
RATE_LIMIT_DELAY = 0.1 # seconds between requests
|
||||
|
||||
|
||||
class FBITFinanceStream(Stream):
|
||||
"""Stream: School financial benchmarking data."""
|
||||
@@ -23,13 +30,68 @@ class FBITFinanceStream(Stream):
|
||||
th.Property("premises_cost_pct", th.NumberType),
|
||||
).to_dict()
|
||||
|
||||
def _get_school_urns(self) -> list[int]:
    """Return the distinct URNs of establishments GIAS lists as "Open".

    Downloads the GIAS "all data" CSV extract dated with today's date and
    keeps only rows whose ``EstablishmentStatus (name)`` column is exactly
    ``"Open"``.

    Returns:
        The URNs as integers, or an empty list when the download, the CSV
        parse, or the integer conversion fails (the failure is logged).
    """
    import io

    import pandas as pd

    status_col = "EstablishmentStatus (name)"

    # The extract's file name embeds the current date as YYYYMMDD.
    stamp = date.today().strftime("%Y%m%d")
    url = (
        "https://ea-edubase-api-prod.azurewebsites.net"
        "/edubase/downloads/public/edubasealldata" + stamp + ".csv"
    )

    self.logger.info("Fetching URN list from GIAS for FBIT extraction...")
    try:
        response = requests.get(url, timeout=120)
        response.raise_for_status()

        # NOTE(review): the text is already decoded by requests, so the
        # ``encoding`` argument presumably has no effect here - confirm.
        frame = pd.read_csv(
            io.StringIO(response.text),
            encoding="utf-8-sig",
            usecols=["URN", status_col],
            dtype=str,
        )

        # Select the URN column for open establishments only.
        open_urns = frame.loc[frame[status_col] == "Open", "URN"]
        return list(map(int, open_urns.dropna().unique()))
    except Exception as exc:
        # Best effort: report the problem and hand back an empty list.
        self.logger.error("Failed to fetch URN list: %s", exc)
        return []
|
||||
|
||||
def get_records(self, context):
    """Yield one finance record per school for which the API returns data.

    The URN list comes from ``self._get_school_urns()``. Each URN is
    requested individually from ``<base_url>/schoolFinancialDataObject/<urn>``
    with a pause of ``RATE_LIMIT_DELAY`` seconds after every request.

    Args:
        context: Stream partition context supplied by the SDK; unused here.

    Yields:
        dict: ``urn``, ``year`` (the previous calendar year), the computed
        ``per_pupil_spend`` (``None`` when expenditure or pupil count is
        missing or zero) and the cost-percentage fields copied from the
        API payload.
    """
    # Honour the tap's ``base_url`` setting instead of hard-coding the
    # module constant; fall back to the constant when it is unset.
    base_url = str(self.config.get("base_url") or API_BASE).rstrip("/")

    urns = self._get_school_urns()
    year = date.today().year - 1
    total = len(urns)

    self.logger.info("Fetching FBIT data for %d schools (year %d)...", total, year)

    # One session for the whole run so connections are reused.
    with requests.Session() as session:
        for i, urn in enumerate(urns):
            if i % 1000 == 0:
                self.logger.info(" Progress: %d/%d", i, total)

            data = None
            try:
                resp = session.get(
                    f"{base_url}/schoolFinancialDataObject/{urn}",
                    timeout=10,
                )
                if resp.status_code == 200:
                    data = resp.json()
                else:
                    self.logger.debug(
                        "FBIT returned HTTP %s for URN %s", resp.status_code, urn
                    )
            except (requests.RequestException, ValueError) as exc:
                # Still best effort (one bad school must not abort the
                # sync), but no longer silent. ValueError covers a
                # non-JSON body on older requests versions.
                self.logger.warning("FBIT request failed for URN %s: %s", urn, exc)

            if data:
                total_exp = data.get("totalExpenditure")
                num_pupils = data.get("numberOfPupils")
                # Truthiness check skips both missing values and a zero
                # pupil count, which would otherwise divide by zero.
                per_pupil = (
                    round(total_exp / num_pupils, 2)
                    if total_exp and num_pupils
                    else None
                )

                yield {
                    "urn": urn,
                    "year": year,
                    "per_pupil_spend": per_pupil,
                    "staff_cost_pct": data.get("staffCostPercent"),
                    "teacher_cost_pct": data.get("teachingStaffCostPercent"),
                    "support_staff_cost_pct": data.get("educationSupportStaffCostPercent"),
                    "premises_cost_pct": data.get("premisesStaffCostPercent"),
                }

            # Throttle regardless of outcome so failures do not speed up
            # the request rate.
            time.sleep(RATE_LIMIT_DELAY)
|
||||
|
||||
|
||||
class TapUKFBIT(Tap):
|
||||
@@ -41,7 +103,8 @@ class TapUKFBIT(Tap):
|
||||
th.Property(
|
||||
"base_url",
|
||||
th.StringType,
|
||||
default="https://financial-benchmarking-and-insights-tool.education.gov.uk/api",
|
||||
default=API_BASE,
|
||||
description="FBIT API base URL",
|
||||
),
|
||||
).to_dict()
|
||||
|
||||
|
||||
Reference in New Issue
Block a user