feat(pipeline): implement parent-view, fbit, idaci Singer taps + align staging/mart models
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 34s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 57s
Build and Push Docker Images / Build Kestra Init (push) Successful in 31s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m6s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Port extraction logic from integrator scripts into Singer SDK taps: - tap-uk-parent-view: scrapes Ofsted open data portal, parses survey responses (14 questions) - tap-uk-fbit: queries FBIT API per-URN with rate limiting, computes per-pupil spend - tap-uk-idaci: downloads IoD2019 XLSX, batch-resolves postcodes→LSOAs via postcodes.io Update dbt models to match actual tap output schemas: - stg_idaci now includes URN (tap does the postcode→LSOA→school join) - stg_parent_view expanded from 8 to 13 question columns - fact_deprivation simplified (no longer needs postcode→LSOA join in dbt) - fact_parent_view expanded to include all 13 question metrics Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -1,30 +1,157 @@
|
||||
"""IDACI Singer tap — extracts deprivation index lookup data."""
|
||||
"""IDACI Singer tap — extracts deprivation index from IoD2019 + postcodes.io LSOA lookup."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
|
||||
import pandas as pd
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
IOD_2019_URL = (
|
||||
"https://assets.publishing.service.gov.uk/government/uploads/system/uploads/"
|
||||
"attachment_data/file/833970/File_1_-_IMD2019_Index_of_Multiple_Deprivation.xlsx"
|
||||
)
|
||||
|
||||
POSTCODES_IO_BATCH = "https://api.postcodes.io/postcodes"
|
||||
BATCH_SIZE = 100
|
||||
|
||||
|
||||
def _postcode_to_lsoa(postcodes: list[str], logger) -> dict[str, str]:
|
||||
"""Batch-resolve postcodes to LSOA codes via postcodes.io."""
|
||||
result = {}
|
||||
valid = list({p.strip().upper() for p in postcodes if p and len(str(p).strip()) >= 5})
|
||||
|
||||
for i in range(0, len(valid), BATCH_SIZE):
|
||||
batch = valid[i : i + BATCH_SIZE]
|
||||
try:
|
||||
resp = requests.post(POSTCODES_IO_BATCH, json={"postcodes": batch}, timeout=30)
|
||||
if resp.status_code == 200:
|
||||
for item in resp.json().get("result", []):
|
||||
if item and item.get("result"):
|
||||
lsoa = item["result"].get("lsoa")
|
||||
if lsoa:
|
||||
result[item["query"].upper()] = lsoa
|
||||
except Exception as e:
|
||||
logger.warning("postcodes.io batch failed: %s", e)
|
||||
|
||||
if i % 5000 == 0 and i > 0:
|
||||
logger.info(" LSOA resolution: %d/%d postcodes", i, len(valid))
|
||||
|
||||
return result
|
||||
|
||||
|
||||
class IDACIStream(Stream):
    """Stream: IDACI scores joined to school URNs via postcode → LSOA lookup.

    Pipeline per sync:
      1. Download the IoD2019 workbook and build an LSOA → IDACI lookup.
      2. Download the GIAS establishment CSV for open-school (URN, postcode).
      3. Resolve postcodes to LSOAs through postcodes.io.
      4. Emit one record per school that resolved end-to-end.
    """

    name = "idaci"
    # One row per school: URN is the natural key now that the tap performs
    # the postcode → LSOA → school join itself.
    primary_keys = ["urn"]
    replication_key = None  # full-table sync every run

    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("lsoa_code", th.StringType),
        th.Property("idaci_score", th.NumberType),
        th.Property("idaci_decile", th.IntegerType),
    ).to_dict()

    def _load_iod_data(self) -> dict[str, dict]:
        """Download and parse IoD2019 IDACI data into an LSOA-keyed lookup.

        Returns:
            ``{lsoa_code: {"idaci_score": float, "idaci_decile": int}}``,
            or an empty dict when the expected columns cannot be located
            (logged as an error rather than raised, so the tap degrades
            gracefully if the workbook layout changes).
        """
        self.logger.info("Downloading IoD2019 IDACI data...")
        resp = requests.get(IOD_2019_URL, timeout=300)
        resp.raise_for_status()

        # sheet_name=None loads every sheet; the IDACI data's sheet name is
        # version-dependent, so we search rather than hard-code it.
        iod_sheets = pd.read_excel(io.BytesIO(resp.content), sheet_name=None)

        # Find the IDACI sheet by name, falling back to column content.
        idaci_sheet = None
        for sheet_name, df in iod_sheets.items():
            if "IDACI" in sheet_name.upper() or "IDACI" in str(df.columns.tolist()).upper():
                idaci_sheet = sheet_name
                break
        if idaci_sheet is None:
            # Last resort: assume the first sheet holds the data.
            idaci_sheet = next(iter(iod_sheets))

        df_iod = iod_sheets[idaci_sheet]

        # Locate the LSOA-code and IDACI-score columns dynamically — the
        # workbook headers are long and include year/version suffixes.
        col_lsoa = next(
            (c for c in df_iod.columns if "LSOA" in str(c).upper() and "code" in str(c).lower()),
            None,
        )
        col_score = next(
            (c for c in df_iod.columns if "IDACI" in str(c).upper() and "score" in str(c).lower()),
            None,
        )

        if not col_lsoa or not col_score:
            self.logger.error("Could not find LSOA/IDACI columns. Available: %s", list(df_iod.columns)[:20])
            return {}

        df_iod = df_iod[[col_lsoa, col_score]].copy()
        df_iod.columns = ["lsoa_code", "idaci_score"]
        df_iod = df_iod.dropna()

        # Compute deciles so that 1 = most deprived (highest IDACI score):
        # qcut labels 0-9 ascending by score, +1 makes it 1-10, and the
        # 11 - x flip puts the highest scores into decile 1.
        df_iod = df_iod.sort_values("idaci_score", ascending=False)
        df_iod["idaci_decile"] = (pd.qcut(df_iod["idaci_score"], 10, labels=False) + 1).astype(int)
        df_iod["idaci_decile"] = 11 - df_iod["idaci_decile"]

        self.logger.info("Loaded %d LSOA IDACI records", len(df_iod))
        return df_iod.set_index("lsoa_code")[["idaci_score", "idaci_decile"]].to_dict("index")

    def _get_school_postcodes(self) -> list[tuple[int, str]]:
        """Fetch (URN, postcode) pairs for all open schools from GIAS.

        Returns:
            List of ``(urn, postcode)`` tuples, restricted to establishments
            whose status is exactly "Open".
        """
        url = (
            "https://ea-edubase-api-prod.azurewebsites.net"
            "/edubase/downloads/public/edubasealldata.csv"
        )
        self.logger.info("Fetching school postcodes from GIAS...")
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()

        # NOTE(review): `encoding` is ignored by read_csv for a text buffer;
        # this relies on requests having already decoded resp.text correctly
        # (and on no BOM surviving into the first header) — TODO confirm
        # against a live GIAS download.
        df = pd.read_csv(
            io.StringIO(resp.text),
            encoding="utf-8-sig",
            usecols=["URN", "Postcode", "EstablishmentStatus (name)"],
            dtype=str,
        )
        df = df[df["EstablishmentStatus (name)"] == "Open"]
        df = df.dropna(subset=["URN", "Postcode"])

        # zip over columns instead of iterrows: same output, far faster.
        return [(int(urn), pc) for urn, pc in zip(df["URN"], df["Postcode"])]

    def get_records(self, context):
        """Yield one record per open school with a resolvable IDACI decile.

        Schools whose postcode fails LSOA resolution, or whose LSOA is
        missing from the IoD2019 data, are silently skipped (counted only
        in the summary log).
        """
        lsoa_lookup = self._load_iod_data()
        if not lsoa_lookup:
            # Column detection failed upstream; error already logged.
            return

        schools = self._get_school_postcodes()
        postcodes = [pc for _, pc in schools]

        self.logger.info("Resolving %d postcodes to LSOAs...", len(postcodes))
        pc_to_lsoa = _postcode_to_lsoa(postcodes, self.logger)
        self.logger.info("Resolved %d postcodes", len(pc_to_lsoa))

        yielded = 0
        for urn, postcode in schools:
            # _postcode_to_lsoa keys are stripped + upper-cased; mirror that.
            lsoa = pc_to_lsoa.get(postcode.strip().upper())
            if not lsoa:
                continue
            iod = lsoa_lookup.get(lsoa)
            if not iod:
                continue

            yield {
                "urn": urn,
                "lsoa_code": lsoa,
                "idaci_score": float(iod["idaci_score"]),
                "idaci_decile": int(iod["idaci_decile"]),
            }
            yielded += 1

        self.logger.info("Yielded IDACI data for %d schools", yielded)
|
||||
|
||||
|
||||
class TapUKIDACI(Tap):
|
||||
|
||||
Reference in New Issue
Block a user