fix(taps): align with integrator resilience patterns
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m7s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 32s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m7s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Port critical patterns from the working integrator into Singer taps: - GIAS: add 404 fallback to yesterday's date, increase timeout to 300s, use latin-1 encoding, use dated URL for links (static URL returns 500) - FBIT: add GIAS date fallback, increase timeout, fix encoding to latin-1 - IDACI: use dated GIAS URL with fallback instead of undated static URL, fix encoding to latin-1, increase timeout to 300s - Ofsted: try utf-8-sig then fall back to latin-1 encoding Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -33,19 +33,30 @@ class FBITFinanceStream(Stream):
|
|||||||
def _get_school_urns(self) -> list[int]:
|
def _get_school_urns(self) -> list[int]:
|
||||||
"""Fetch all open school URNs from GIAS to know what to query."""
|
"""Fetch all open school URNs from GIAS to know what to query."""
|
||||||
import io
|
import io
|
||||||
|
from datetime import timedelta
|
||||||
|
|
||||||
import pandas as pd
|
import pandas as pd
|
||||||
|
|
||||||
url = (
|
base = (
|
||||||
"https://ea-edubase-api-prod.azurewebsites.net"
|
"https://ea-edubase-api-prod.azurewebsites.net"
|
||||||
f"/edubase/downloads/public/edubasealldata{date.today().strftime('%Y%m%d')}.csv"
|
"/edubase/downloads/public/edubasealldata{date}.csv"
|
||||||
)
|
)
|
||||||
|
today = date.today()
|
||||||
|
url = base.format(date=today.strftime("%Y%m%d"))
|
||||||
|
|
||||||
self.logger.info("Fetching URN list from GIAS for FBIT extraction...")
|
self.logger.info("Fetching URN list from GIAS for FBIT extraction...")
|
||||||
try:
|
try:
|
||||||
resp = requests.get(url, timeout=120)
|
resp = requests.get(url, timeout=300)
|
||||||
|
# Fall back to yesterday if today's file isn't available
|
||||||
|
if resp.status_code == 404:
|
||||||
|
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
|
||||||
|
url = base.format(date=yesterday)
|
||||||
|
self.logger.info("Today's GIAS file not available, trying yesterday")
|
||||||
|
resp = requests.get(url, timeout=300)
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
df = pd.read_csv(
|
df = pd.read_csv(
|
||||||
io.StringIO(resp.text),
|
io.StringIO(resp.text),
|
||||||
encoding="utf-8-sig",
|
encoding="latin-1",
|
||||||
usecols=["URN", "EstablishmentStatus (name)"],
|
usecols=["URN", "EstablishmentStatus (name)"],
|
||||||
dtype=str,
|
dtype=str,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -2,7 +2,7 @@
|
|||||||
|
|
||||||
from __future__ import annotations
|
from __future__ import annotations
|
||||||
|
|
||||||
from datetime import date
|
from datetime import date, timedelta
|
||||||
|
|
||||||
from singer_sdk import Stream, Tap
|
from singer_sdk import Stream, Tap
|
||||||
from singer_sdk import typing as th
|
from singer_sdk import typing as th
|
||||||
@@ -12,6 +12,11 @@ GIAS_URL_TEMPLATE = (
|
|||||||
"/edubase/downloads/public/edubasealldata{date}.csv"
|
"/edubase/downloads/public/edubasealldata{date}.csv"
|
||||||
)
|
)
|
||||||
|
|
||||||
|
GIAS_LINKS_URL_TEMPLATE = (
|
||||||
|
"https://ea-edubase-api-prod.azurewebsites.net"
|
||||||
|
"/edubase/downloads/public/links_edubasealldata{date}.csv"
|
||||||
|
)
|
||||||
|
|
||||||
|
|
||||||
class GIASEstablishmentsStream(Stream):
|
class GIASEstablishmentsStream(Stream):
|
||||||
"""Stream: GIAS establishments (one row per URN)."""
|
"""Stream: GIAS establishments (one row per URN)."""
|
||||||
@@ -40,23 +45,30 @@ class GIASEstablishmentsStream(Stream):
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
today = date.today().strftime("%Y%m%d")
|
today = date.today()
|
||||||
url = GIAS_URL_TEMPLATE.format(date=today)
|
url = GIAS_URL_TEMPLATE.format(date=today.strftime("%Y%m%d"))
|
||||||
|
|
||||||
self.logger.info("Downloading GIAS bulk CSV: %s", url)
|
self.logger.info("Downloading GIAS bulk CSV: %s", url)
|
||||||
resp = requests.get(url, timeout=120)
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
|
# GIAS may not have today's file yet — fall back to yesterday
|
||||||
|
if resp.status_code == 404:
|
||||||
|
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
|
||||||
|
url = GIAS_URL_TEMPLATE.format(date=yesterday)
|
||||||
|
self.logger.info("Today's file not available, trying yesterday: %s", url)
|
||||||
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
df = pd.read_csv(
|
df = pd.read_csv(
|
||||||
io.StringIO(resp.text),
|
io.StringIO(resp.text),
|
||||||
encoding="utf-8-sig",
|
encoding="latin-1",
|
||||||
dtype=str,
|
dtype=str,
|
||||||
keep_default_na=False,
|
keep_default_na=False,
|
||||||
)
|
)
|
||||||
|
|
||||||
for _, row in df.iterrows():
|
for _, row in df.iterrows():
|
||||||
record = row.to_dict()
|
record = row.to_dict()
|
||||||
# Cast URN to int
|
|
||||||
try:
|
try:
|
||||||
record["URN"] = int(record["URN"])
|
record["URN"] = int(record["URN"])
|
||||||
except (ValueError, KeyError):
|
except (ValueError, KeyError):
|
||||||
@@ -85,18 +97,24 @@ class GIASLinksStream(Stream):
|
|||||||
import pandas as pd
|
import pandas as pd
|
||||||
import requests
|
import requests
|
||||||
|
|
||||||
url = (
|
today = date.today()
|
||||||
"https://ea-edubase-api-prod.azurewebsites.net"
|
url = GIAS_LINKS_URL_TEMPLATE.format(date=today.strftime("%Y%m%d"))
|
||||||
"/edubase/downloads/public/links_edubasealldata.csv"
|
|
||||||
)
|
|
||||||
|
|
||||||
self.logger.info("Downloading GIAS links CSV: %s", url)
|
self.logger.info("Downloading GIAS links CSV: %s", url)
|
||||||
resp = requests.get(url, timeout=120)
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
|
# Fall back to yesterday if today's file isn't available
|
||||||
|
if resp.status_code in (404, 500):
|
||||||
|
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
|
||||||
|
url = GIAS_LINKS_URL_TEMPLATE.format(date=yesterday)
|
||||||
|
self.logger.info("Today's links file not available, trying yesterday: %s", url)
|
||||||
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
df = pd.read_csv(
|
df = pd.read_csv(
|
||||||
io.StringIO(resp.text),
|
io.StringIO(resp.text),
|
||||||
encoding="utf-8-sig",
|
encoding="latin-1",
|
||||||
dtype=str,
|
dtype=str,
|
||||||
keep_default_na=False,
|
keep_default_na=False,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -103,17 +103,29 @@ class IDACIStream(Stream):
|
|||||||
|
|
||||||
def _get_school_postcodes(self) -> list[tuple[int, str]]:
|
def _get_school_postcodes(self) -> list[tuple[int, str]]:
|
||||||
"""Fetch URN + postcode pairs from GIAS."""
|
"""Fetch URN + postcode pairs from GIAS."""
|
||||||
url = (
|
from datetime import date, timedelta
|
||||||
|
|
||||||
|
today = date.today()
|
||||||
|
base = (
|
||||||
"https://ea-edubase-api-prod.azurewebsites.net"
|
"https://ea-edubase-api-prod.azurewebsites.net"
|
||||||
"/edubase/downloads/public/edubasealldata.csv"
|
"/edubase/downloads/public/edubasealldata{date}.csv"
|
||||||
)
|
)
|
||||||
|
url = base.format(date=today.strftime("%Y%m%d"))
|
||||||
self.logger.info("Fetching school postcodes from GIAS...")
|
self.logger.info("Fetching school postcodes from GIAS...")
|
||||||
resp = requests.get(url, timeout=120)
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
|
# Fall back to yesterday if today's file isn't available
|
||||||
|
if resp.status_code == 404:
|
||||||
|
yesterday = (today - timedelta(days=1)).strftime("%Y%m%d")
|
||||||
|
url = base.format(date=yesterday)
|
||||||
|
self.logger.info("Today's GIAS file not available, trying yesterday: %s", url)
|
||||||
|
resp = requests.get(url, timeout=300)
|
||||||
|
|
||||||
resp.raise_for_status()
|
resp.raise_for_status()
|
||||||
|
|
||||||
df = pd.read_csv(
|
df = pd.read_csv(
|
||||||
io.StringIO(resp.text),
|
io.StringIO(resp.text),
|
||||||
encoding="utf-8-sig",
|
encoding="latin-1",
|
||||||
usecols=["URN", "Postcode", "EstablishmentStatus (name)"],
|
usecols=["URN", "Postcode", "EstablishmentStatus (name)"],
|
||||||
dtype=str,
|
dtype=str,
|
||||||
)
|
)
|
||||||
|
|||||||
@@ -129,7 +129,11 @@ class OfstedInspectionsStream(Stream):
|
|||||||
df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
|
df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
|
||||||
else:
|
else:
|
||||||
# Detect header row (may not be row 0)
|
# Detect header row (may not be row 0)
|
||||||
text = resp.content.decode("utf-8-sig", errors="replace")
|
# Try utf-8-sig first, fall back to latin-1 (matches integrator)
|
||||||
|
try:
|
||||||
|
text = resp.content.decode("utf-8-sig")
|
||||||
|
except UnicodeDecodeError:
|
||||||
|
text = resp.content.decode("latin-1")
|
||||||
lines = text.split("\n")
|
lines = text.split("\n")
|
||||||
header_idx = 0
|
header_idx = 0
|
||||||
for i, line in enumerate(lines[:20]):
|
for i, line in enumerate(lines[:20]):
|
||||||
|
|||||||
Reference in New Issue
Block a user