fix(taps): align with integrator resilience patterns
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m7s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s

Port critical patterns from the working integrator into Singer taps:
- GIAS: add 404 fallback to yesterday's date, increase timeout to 300s,
  use latin-1 encoding, use dated URL for links (static URL returns 500)
- FBIT: add GIAS date fallback, increase timeout, fix encoding to latin-1
- IDACI: use dated GIAS URL with fallback instead of undated static URL,
  fix encoding to latin-1, increase timeout to 300s
- Ofsted: try utf-8-sig then fall back to latin-1 encoding

Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
2026-03-26 11:13:38 +00:00
parent b6a487776b
commit cd75fc4c24
4 changed files with 66 additions and 21 deletions

View File

@@ -33,19 +33,30 @@ class FBITFinanceStream(Stream):
def _get_school_urns(self) -> list[int]: def _get_school_urns(self) -> list[int]:
"""Fetch all open school URNs from GIAS to know what to query.""" """Fetch all open school URNs from GIAS to know what to query."""
import io import io
from datetime import timedelta
import pandas as pd import pandas as pd
url = ( base = (
"https://ea-edubase-api-prod.azurewebsites.net" "https://ea-edubase-api-prod.azurewebsites.net"
f"/edubase/downloads/public/edubasealldata{date.today().strftime('%Y%m%d')}.csv" "/edubase/downloads/public/edubasealldata{date}.csv"
) )
today = date.today()
url = base.format(date=today.strftime("%Y%m%d"))
self.logger.info("Fetching URN list from GIAS for FBIT extraction...") self.logger.info("Fetching URN list from GIAS for FBIT extraction...")
try: try:
resp = requests.get(url, timeout=120) resp = requests.get(url, timeout=300)
# Fall back to yesterday if today's file isn't available
if resp.status_code == 404:
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
url = base.format(date=yesterday)
self.logger.info("Today's GIAS file not available, trying yesterday")
resp = requests.get(url, timeout=300)
resp.raise_for_status() resp.raise_for_status()
df = pd.read_csv( df = pd.read_csv(
io.StringIO(resp.text), io.StringIO(resp.text),
encoding="utf-8-sig", encoding="latin-1",
usecols=["URN", "EstablishmentStatus (name)"], usecols=["URN", "EstablishmentStatus (name)"],
dtype=str, dtype=str,
) )

View File

@@ -2,7 +2,7 @@
from __future__ import annotations from __future__ import annotations
from datetime import date from datetime import date, timedelta
from singer_sdk import Stream, Tap from singer_sdk import Stream, Tap
from singer_sdk import typing as th from singer_sdk import typing as th
@@ -12,6 +12,11 @@ GIAS_URL_TEMPLATE = (
"/edubase/downloads/public/edubasealldata{date}.csv" "/edubase/downloads/public/edubasealldata{date}.csv"
) )
GIAS_LINKS_URL_TEMPLATE = (
"https://ea-edubase-api-prod.azurewebsites.net"
"/edubase/downloads/public/links_edubasealldata{date}.csv"
)
class GIASEstablishmentsStream(Stream): class GIASEstablishmentsStream(Stream):
"""Stream: GIAS establishments (one row per URN).""" """Stream: GIAS establishments (one row per URN)."""
@@ -40,23 +45,30 @@ class GIASEstablishmentsStream(Stream):
import pandas as pd import pandas as pd
import requests import requests
today = date.today().strftime("%Y%m%d") today = date.today()
url = GIAS_URL_TEMPLATE.format(date=today) url = GIAS_URL_TEMPLATE.format(date=today.strftime("%Y%m%d"))
self.logger.info("Downloading GIAS bulk CSV: %s", url) self.logger.info("Downloading GIAS bulk CSV: %s", url)
resp = requests.get(url, timeout=120) resp = requests.get(url, timeout=300)
# GIAS may not have today's file yet — fall back to yesterday
if resp.status_code == 404:
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
url = GIAS_URL_TEMPLATE.format(date=yesterday)
self.logger.info("Today's file not available, trying yesterday: %s", url)
resp = requests.get(url, timeout=300)
resp.raise_for_status() resp.raise_for_status()
df = pd.read_csv( df = pd.read_csv(
io.StringIO(resp.text), io.StringIO(resp.text),
encoding="utf-8-sig", encoding="latin-1",
dtype=str, dtype=str,
keep_default_na=False, keep_default_na=False,
) )
for _, row in df.iterrows(): for _, row in df.iterrows():
record = row.to_dict() record = row.to_dict()
# Cast URN to int
try: try:
record["URN"] = int(record["URN"]) record["URN"] = int(record["URN"])
except (ValueError, KeyError): except (ValueError, KeyError):
@@ -85,18 +97,24 @@ class GIASLinksStream(Stream):
import pandas as pd import pandas as pd
import requests import requests
url = ( today = date.today()
"https://ea-edubase-api-prod.azurewebsites.net" url = GIAS_LINKS_URL_TEMPLATE.format(date=today.strftime("%Y%m%d"))
"/edubase/downloads/public/links_edubasealldata.csv"
)
self.logger.info("Downloading GIAS links CSV: %s", url) self.logger.info("Downloading GIAS links CSV: %s", url)
resp = requests.get(url, timeout=120) resp = requests.get(url, timeout=300)
# Fall back to yesterday if today's file isn't available
if resp.status_code in (404, 500):
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
url = GIAS_LINKS_URL_TEMPLATE.format(date=yesterday)
self.logger.info("Today's links file not available, trying yesterday: %s", url)
resp = requests.get(url, timeout=300)
resp.raise_for_status() resp.raise_for_status()
df = pd.read_csv( df = pd.read_csv(
io.StringIO(resp.text), io.StringIO(resp.text),
encoding="utf-8-sig", encoding="latin-1",
dtype=str, dtype=str,
keep_default_na=False, keep_default_na=False,
) )

View File

@@ -103,17 +103,29 @@ class IDACIStream(Stream):
def _get_school_postcodes(self) -> list[tuple[int, str]]: def _get_school_postcodes(self) -> list[tuple[int, str]]:
"""Fetch URN + postcode pairs from GIAS.""" """Fetch URN + postcode pairs from GIAS."""
url = ( from datetime import date, timedelta
today = date.today()
base = (
"https://ea-edubase-api-prod.azurewebsites.net" "https://ea-edubase-api-prod.azurewebsites.net"
"/edubase/downloads/public/edubasealldata.csv" "/edubase/downloads/public/edubasealldata{date}.csv"
) )
url = base.format(date=today.strftime("%Y%m%d"))
self.logger.info("Fetching school postcodes from GIAS...") self.logger.info("Fetching school postcodes from GIAS...")
resp = requests.get(url, timeout=120) resp = requests.get(url, timeout=300)
# Fall back to yesterday if today's file isn't available
if resp.status_code == 404:
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
url = base.format(date=yesterday)
self.logger.info("Today's GIAS file not available, trying yesterday: %s", url)
resp = requests.get(url, timeout=300)
resp.raise_for_status() resp.raise_for_status()
df = pd.read_csv( df = pd.read_csv(
io.StringIO(resp.text), io.StringIO(resp.text),
encoding="utf-8-sig", encoding="latin-1",
usecols=["URN", "Postcode", "EstablishmentStatus (name)"], usecols=["URN", "Postcode", "EstablishmentStatus (name)"],
dtype=str, dtype=str,
) )

View File

@@ -129,7 +129,11 @@ class OfstedInspectionsStream(Stream):
df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str) df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
else: else:
# Detect header row (may not be row 0) # Detect header row (may not be row 0)
text = resp.content.decode("utf-8-sig", errors="replace") # Try utf-8-sig first, fall back to latin-1 (matches integrator)
try:
text = resp.content.decode("utf-8-sig")
except UnicodeDecodeError:
text = resp.content.decode("latin-1")
lines = text.split("\n") lines = text.split("\n")
header_idx = 0 header_idx = 0
for i, line in enumerate(lines[:20]): for i, line in enumerate(lines[:20]):