feat(pipeline): add Meltano + dbt + Airflow ELT pipeline scaffold
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 35s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m9s
Build and Push Docker Images / Build Integrator (push) Successful in 56s
Build and Push Docker Images / Build Kestra Init (push) Successful in 32s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 1s
Replaces the hand-rolled integrator with a production-grade ELT pipeline using Meltano (Singer taps), dbt Core (medallion architecture), and Apache Airflow (orchestration). Adds Typesense for search and PostGIS for geospatial queries. - 6 custom Singer taps (GIAS, EES, Ofsted, Parent View, FBIT, IDACI) - dbt project: 12 staging, 5 intermediate, 12 mart models - 3 Airflow DAGs (daily/monthly/annual schedules) - Typesense sync + batch geocoding scripts - docker-compose: add Airflow, Typesense; upgrade to PostGIS - Portainer stack definition matching live deployment topology Co-Authored-By: Claude Opus 4.6 <noreply@anthropic.com>
This commit is contained in:
17
pipeline/plugins/extractors/tap-uk-ees/pyproject.toml
Normal file
17
pipeline/plugins/extractors/tap-uk-ees/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Packaging metadata for the tap-uk-ees Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-ees"
version = "0.1.0"
description = "Singer tap for UK Explore Education Statistics (KS2, KS4, Census, Admissions, Phonics)"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
]

# Console entry point so `tap-uk-ees` is runnable after install.
[project.scripts]
tap-uk-ees = "tap_uk_ees.tap:TapUKEES.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-ees: Singer tap for Explore Education Statistics API."""
|
||||
154
pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
Normal file
154
pipeline/plugins/extractors/tap-uk-ees/tap_uk_ees/tap.py
Normal file
@@ -0,0 +1,154 @@
|
||||
"""EES Singer tap — extracts KS2, KS4, Census, Admissions, Phonics data."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import zipfile
|
||||
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
CONTENT_API_BASE = (
|
||||
"https://content.explore-education-statistics.service.gov.uk/api"
|
||||
)
|
||||
STATS_API_BASE = "https://api.education.gov.uk/statistics/v1"
|
||||
TIMEOUT = 120
|
||||
|
||||
|
||||
def get_content_release_id(publication_slug: str) -> str:
    """Look up the most recent release ID for *publication_slug* via the EES content API."""
    response = requests.get(
        f"{CONTENT_API_BASE}/publications/{publication_slug}/releases/latest",
        timeout=TIMEOUT,
    )
    response.raise_for_status()
    payload = response.json()
    return payload["id"]
|
||||
|
||||
|
||||
def download_release_zip(release_id: str) -> zipfile.ZipFile:
    """Download all data files for a release and return them as an open ZipFile.

    The whole archive is buffered in memory (BytesIO); callers should close
    the returned ZipFile when finished with it.
    """
    url = f"{CONTENT_API_BASE}/releases/{release_id}/files"
    # The previous version passed stream=True but then read `resp.content`,
    # which buffers the full body anyway — a plain buffered download is
    # equivalent and avoids leaving a half-consumed streamed connection.
    resp = requests.get(url, timeout=300)
    resp.raise_for_status()
    return zipfile.ZipFile(io.BytesIO(resp.content))
|
||||
|
||||
|
||||
class EESDatasetStream(Stream):
    """Base stream for an EES dataset extracted from a release ZIP.

    Subclasses set ``_publication_slug`` (the EES publication to pull) and
    ``_file_keyword`` (a substring used to pick the right CSV in the ZIP).
    """

    replication_key = None  # full-table sync on every run
    _publication_slug: str = ""
    _file_keyword: str = ""

    def get_records(self, context):
        """Yield one dict per school-level row of the selected release CSV."""
        import pandas as pd  # deferred so tap discovery stays cheap

        release_id = get_content_release_id(self._publication_slug)
        self.logger.info(
            "Downloading release %s for %s",
            release_id,
            self._publication_slug,
        )
        # Close the ZIP (and its underlying BytesIO buffer) when done — it
        # was previously left open for the life of the process.
        with download_release_zip(release_id) as zf:
            csv_names = [n for n in zf.namelist() if n.endswith(".csv")]
            # Prefer the CSV whose name contains our keyword; otherwise fall
            # back to the first CSV in the archive.
            target = next(
                (n for n in csv_names if self._file_keyword.lower() in n.lower()),
                csv_names[0] if csv_names else None,
            )

            if not target:
                self.logger.warning("No CSV found in release ZIP")
                return

            self.logger.info("Reading %s from ZIP", target)
            with zf.open(target) as f:
                df = pd.read_csv(f, dtype=str, keep_default_na=False)

        # Filter to school-level data when the column is present.
        if "geographic_level" in df.columns:
            df = df[df["geographic_level"] == "School"]

        for _, row in df.iterrows():
            yield row.to_dict()
|
||||
|
||||
|
||||
class EESKS2Stream(EESDatasetStream):
    """Key Stage 2 attainment, school-level rows from the latest EES release."""

    name = "ees_ks2"
    primary_keys = ["urn", "time_period"]
    _publication_slug = "key-stage-2-attainment"
    _file_keyword = "school"
    # Only the key columns are declared in the schema.
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESKS4Stream(EESDatasetStream):
    """Key Stage 4 performance (revised), school-level rows from the latest EES release."""

    name = "ees_ks4"
    primary_keys = ["urn", "time_period"]
    _publication_slug = "key-stage-4-performance-revised"
    _file_keyword = "school"
    # Only the key columns are declared in the schema.
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESCensusStream(EESDatasetStream):
    """School census (pupils and their characteristics), school-level rows."""

    name = "ees_census"
    primary_keys = ["urn", "time_period"]
    _publication_slug = "school-pupils-and-their-characteristics"
    _file_keyword = "school"
    # Only the key columns are declared in the schema.
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESAdmissionsStream(EESDatasetStream):
    """School applications and offers (admissions), school-level rows."""

    name = "ees_admissions"
    primary_keys = ["urn", "time_period"]
    _publication_slug = "secondary-and-primary-school-applications-and-offers"
    _file_keyword = "school"
    # Only the key columns are declared in the schema.
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class EESPhonicsStream(EESDatasetStream):
    """Phonics screening check and KS1 assessments, school-level rows."""

    name = "ees_phonics"
    primary_keys = ["urn", "time_period"]
    _publication_slug = "phonics-screening-check-and-key-stage-1-assessments"
    _file_keyword = "school"
    # Only the key columns are declared in the schema.
    schema = th.PropertiesList(
        th.Property("urn", th.StringType, required=True),
        th.Property("time_period", th.StringType, required=True),
    ).to_dict()
|
||||
|
||||
|
||||
class TapUKEES(Tap):
    """Singer tap for UK Explore Education Statistics."""

    name = "tap-uk-ees"

    config_jsonschema = th.PropertiesList(
        th.Property("base_url", th.StringType, description="EES API base URL"),
    ).to_dict()

    def discover_streams(self):
        """Instantiate one stream per EES dataset."""
        stream_types = (
            EESKS2Stream,
            EESKS4Stream,
            EESCensusStream,
            EESAdmissionsStream,
            EESPhonicsStream,
        )
        return [stream_type(self) for stream_type in stream_types]


if __name__ == "__main__":
    TapUKEES.cli()
|
||||
16
pipeline/plugins/extractors/tap-uk-fbit/pyproject.toml
Normal file
16
pipeline/plugins/extractors/tap-uk-fbit/pyproject.toml
Normal file
@@ -0,0 +1,16 @@
|
||||
# Packaging metadata for the tap-uk-fbit Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-fbit"
version = "0.1.0"
description = "Singer tap for UK FBIT (Financial Benchmarking and Insights Tool)"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
]

# Console entry point so `tap-uk-fbit` is runnable after install.
[project.scripts]
tap-uk-fbit = "tap_uk_fbit.tap:TapUKFBIT.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-fbit: Singer tap for Financial Benchmarking and Insights Tool API."""
|
||||
53
pipeline/plugins/extractors/tap-uk-fbit/tap_uk_fbit/tap.py
Normal file
53
pipeline/plugins/extractors/tap-uk-fbit/tap_uk_fbit/tap.py
Normal file
@@ -0,0 +1,53 @@
|
||||
"""FBIT Singer tap — extracts financial data from the FBIT REST API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
|
||||
class FBITFinanceStream(Stream):
    """Stream: School financial benchmarking data."""

    name = "fbit_finance"
    primary_keys = ["urn", "year"]
    replication_key = None  # full refresh; no incremental bookmark

    # NOTE(review): the *_pct fields look like percentages of total
    # expenditure — confirm against the FBIT API documentation.
    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("year", th.IntegerType, required=True),
        th.Property("per_pupil_spend", th.NumberType),
        th.Property("staff_cost_pct", th.NumberType),
        th.Property("teacher_cost_pct", th.NumberType),
        th.Property("support_staff_cost_pct", th.NumberType),
        th.Property("premises_cost_pct", th.NumberType),
    ).to_dict()

    def get_records(self, context):
        """Placeholder: logs a warning and yields no records until implemented."""
        # TODO: Implement FBIT API extraction
        # The FBIT API requires per-URN requests with rate limiting.
        # Implementation will batch URNs from dim_school and request
        # financial data for each.
        self.logger.warning("FBIT extraction not yet implemented")
        return iter([])
|
||||
|
||||
|
||||
class TapUKFBIT(Tap):
    """Singer tap for UK FBIT financial data."""

    name = "tap-uk-fbit"

    config_jsonschema = th.PropertiesList(
        th.Property(
            "base_url",
            th.StringType,
            default="https://financial-benchmarking-and-insights-tool.education.gov.uk/api",
        ),
    ).to_dict()

    def discover_streams(self):
        """Expose the single finance stream."""
        finance = FBITFinanceStream(self)
        return [finance]


if __name__ == "__main__":
    TapUKFBIT.cli()
|
||||
17
pipeline/plugins/extractors/tap-uk-gias/pyproject.toml
Normal file
17
pipeline/plugins/extractors/tap-uk-gias/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Packaging metadata for the tap-uk-gias Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-gias"
version = "0.1.0"
description = "Singer tap for UK GIAS (Get Information About Schools) bulk data"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
]

# Console entry point so `tap-uk-gias` is runnable after install.
[project.scripts]
tap-uk-gias = "tap_uk_gias.tap:TapUKGIAS.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-gias: Singer tap for GIAS bulk establishment data."""
|
||||
135
pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
Normal file
135
pipeline/plugins/extractors/tap-uk-gias/tap_uk_gias/tap.py
Normal file
@@ -0,0 +1,135 @@
|
||||
"""GIAS Singer tap — extracts bulk establishment CSV from GIAS API."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from datetime import date
|
||||
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
GIAS_URL_TEMPLATE = (
|
||||
"https://ea-edubase-api-prod.azurewebsites.net"
|
||||
"/edubase/downloads/public/edubasealldata{date}.csv"
|
||||
)
|
||||
|
||||
|
||||
class GIASEstablishmentsStream(Stream):
    """Stream: GIAS establishments (one row per URN)."""

    name = "gias_establishments"
    primary_keys = ["URN"]
    replication_key = None  # full snapshot each run

    # Schema is wide (~250 columns); we declare key columns and pass through the rest
    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("EstablishmentName", th.StringType),
        th.Property("TypeOfEstablishment (name)", th.StringType),
        th.Property("PhaseOfEducation (name)", th.StringType),
        th.Property("LA (code)", th.IntegerType),
        th.Property("LA (name)", th.StringType),
        th.Property("EstablishmentNumber", th.IntegerType),
        th.Property("EstablishmentStatus (name)", th.StringType),
        th.Property("Postcode", th.StringType),
    ).to_dict()

    def get_records(self, context):
        """Download the GIAS bulk establishments CSV and yield one dict per row.

        Rows whose URN is missing or non-numeric are skipped.
        """
        import io

        import pandas as pd
        import requests

        # Honour the `download_url` config override (declared on TapUKGIAS
        # but previously ignored here); otherwise build today's dated URL.
        url = self.config.get("download_url")
        if not url:
            today = date.today().strftime("%Y%m%d")
            url = GIAS_URL_TEMPLATE.format(date=today)

        self.logger.info("Downloading GIAS bulk CSV: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()

        # Decode from raw bytes with utf-8-sig so the BOM is stripped; the
        # previous StringIO(resp.text) path ignored the encoding argument
        # (pandas cannot re-decode an already-decoded text buffer).
        df = pd.read_csv(
            io.BytesIO(resp.content),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )

        for _, row in df.iterrows():
            record = row.to_dict()
            # Cast URN to int; skip rows without a usable URN.
            try:
                record["URN"] = int(record["URN"])
            except (ValueError, KeyError):
                continue
            yield record
|
||||
|
||||
|
||||
class GIASLinksStream(Stream):
    """Stream: GIAS school links (predecessor/successor)."""

    name = "gias_links"
    primary_keys = ["URN", "LinkURN"]
    replication_key = None

    schema = th.PropertiesList(
        th.Property("URN", th.IntegerType, required=True),
        th.Property("LinkURN", th.IntegerType, required=True),
        th.Property("LinkType", th.StringType),
        th.Property("LinkEstablishedDate", th.StringType),
    ).to_dict()

    def get_records(self, context):
        """Fetch the GIAS links CSV and yield link records with integer URNs."""
        import io

        import pandas as pd
        import requests

        links_url = (
            "https://ea-edubase-api-prod.azurewebsites.net"
            "/edubase/downloads/public/links_edubasealldata.csv"
        )

        self.logger.info("Downloading GIAS links CSV: %s", links_url)
        response = requests.get(links_url, timeout=120)
        response.raise_for_status()

        frame = pd.read_csv(
            io.StringIO(response.text),
            encoding="utf-8-sig",
            dtype=str,
            keep_default_na=False,
        )

        for _, link_row in frame.iterrows():
            record = link_row.to_dict()
            # Skip rows where either end of the link lacks a numeric URN.
            try:
                record["URN"] = int(record["URN"])
                record["LinkURN"] = int(record["LinkURN"])
            except (ValueError, KeyError):
                continue
            yield record
|
||||
|
||||
|
||||
class TapUKGIAS(Tap):
    """Singer tap for UK GIAS data."""

    name = "tap-uk-gias"

    config_jsonschema = th.PropertiesList(
        th.Property(
            "download_url",
            th.StringType,
            description="Override GIAS CSV download URL",
        ),
    ).to_dict()

    def discover_streams(self):
        """Both GIAS streams: establishments plus predecessor/successor links."""
        stream_classes = (GIASEstablishmentsStream, GIASLinksStream)
        return [stream_class(self) for stream_class in stream_classes]


if __name__ == "__main__":
    TapUKGIAS.cli()
|
||||
17
pipeline/plugins/extractors/tap-uk-idaci/pyproject.toml
Normal file
17
pipeline/plugins/extractors/tap-uk-idaci/pyproject.toml
Normal file
@@ -0,0 +1,17 @@
|
||||
# Packaging metadata for the tap-uk-idaci Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-idaci"
version = "0.1.0"
description = "Singer tap for UK IDACI deprivation index"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
]

# Console entry point so `tap-uk-idaci` is runnable after install.
[project.scripts]
tap-uk-idaci = "tap_uk_idaci.tap:TapUKIDACI.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-idaci: Singer tap for IDACI (Income Deprivation Affecting Children Index)."""
|
||||
41
pipeline/plugins/extractors/tap-uk-idaci/tap_uk_idaci/tap.py
Normal file
41
pipeline/plugins/extractors/tap-uk-idaci/tap_uk_idaci/tap.py
Normal file
@@ -0,0 +1,41 @@
|
||||
"""IDACI Singer tap — extracts deprivation index lookup data."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
|
||||
class IDACIStream(Stream):
    """Stream: IDACI scores by LSOA."""

    name = "idaci"
    primary_keys = ["lsoa_code"]
    replication_key = None  # static lookup table; full refresh

    schema = th.PropertiesList(
        th.Property("lsoa_code", th.StringType, required=True),
        th.Property("idaci_score", th.NumberType),
        th.Property("idaci_decile", th.IntegerType),
    ).to_dict()

    def get_records(self, context):
        """Placeholder: logs a warning and yields no records until implemented."""
        # TODO: Implement IDACI extraction
        # Source: MHCLG IoD 2019 LSOA-level data
        # Available as a static CSV download
        self.logger.warning("IDACI extraction not yet implemented")
        return iter([])
|
||||
|
||||
|
||||
class TapUKIDACI(Tap):
    """Singer tap for UK IDACI data."""

    name = "tap-uk-idaci"
    # No configurable options yet.
    config_jsonschema = th.PropertiesList().to_dict()

    def discover_streams(self):
        """Single stream: the LSOA-level IDACI lookup."""
        idaci_stream = IDACIStream(self)
        return [idaci_stream]


if __name__ == "__main__":
    TapUKIDACI.cli()
|
||||
18
pipeline/plugins/extractors/tap-uk-ofsted/pyproject.toml
Normal file
18
pipeline/plugins/extractors/tap-uk-ofsted/pyproject.toml
Normal file
@@ -0,0 +1,18 @@
|
||||
# Packaging metadata for the tap-uk-ofsted Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-ofsted"
version = "0.1.0"
description = "Singer tap for UK Ofsted Management Information"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
    # odfpy backs pandas' engine="odf" used to read ODS downloads in tap.py.
    "odfpy>=1.4",
]

# Console entry point so `tap-uk-ofsted` is runnable after install.
[project.scripts]
tap-uk-ofsted = "tap_uk_ofsted.tap:TapUKOfsted.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-ofsted: Singer tap for Ofsted Management Information."""
|
||||
176
pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py
Normal file
176
pipeline/plugins/extractors/tap-uk-ofsted/tap_uk_ofsted/tap.py
Normal file
@@ -0,0 +1,176 @@
|
||||
"""Ofsted MI Singer tap — extracts inspection records from GOV.UK CSV/ODS."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import io
|
||||
import re
|
||||
|
||||
import requests
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
# GOV.UK landing page listing the monthly MI attachments; scraped by
# discover_csv_url() to find the latest download link.
GOV_UK_PAGE = (
    "https://www.gov.uk/government/statistical-data-sets/"
    "monthly-management-information-ofsteds-school-inspections-outcomes"
)

# Column name → internal field, in priority order (first match wins).
# Handles both current and older file formats.
COLUMN_PRIORITY = {
    "urn": ["URN", "Urn", "urn"],
    "inspection_date": [
        "Inspection start date of latest OEIF graded inspection",
        "Inspection start date",
        "Inspection date",
    ],
    "inspection_type": [
        "Inspection type of latest OEIF graded inspection",
        "Inspection type",
    ],
    "event_type_grouping": [
        "Event type grouping",
        "Inspection type grouping",
    ],
    "overall_effectiveness": [
        "Latest OEIF overall effectiveness",
        "Overall effectiveness",
    ],
    "quality_of_education": [
        "Latest OEIF quality of education",
        "Quality of education",
    ],
    "behaviour_and_attitudes": [
        "Latest OEIF behaviour and attitudes",
        "Behaviour and attitudes",
    ],
    "personal_development": [
        "Latest OEIF personal development",
        "Personal development",
    ],
    "effectiveness_of_leadership_and_management": [
        "Latest OEIF effectiveness of leadership and management",
        "Effectiveness of leadership and management",
    ],
    "early_years_provision": [
        "Latest OEIF early years provision",
        "Early years provision (where applicable)",
    ],
    "sixth_form_provision": [
        "Latest OEIF sixth form provision",
        "Sixth form provision (where applicable)",
    ],
}
|
||||
|
||||
|
||||
def discover_csv_url() -> str | None:
    """Scrape the GOV.UK page for the latest Ofsted MI download link.

    Returns the first CSV attachment URL, falling back to the first ODS
    attachment, or ``None`` when neither is present.
    """
    resp = requests.get(GOV_UK_PAGE, timeout=30)
    resp.raise_for_status()
    # CSV is preferred, ODS is the fallback — one loop replaces the two
    # near-duplicate regex blocks of the original.
    for extension in ("csv", "ods"):
        matches = re.findall(
            rf'href="(https://assets\.publishing\.service\.gov\.uk/[^"]+\.{extension})"',
            resp.text,
        )
        if matches:
            return matches[0]
    return None
|
||||
|
||||
|
||||
class OfstedInspectionsStream(Stream):
    """Stream: Ofsted inspection records."""

    name = "ofsted_inspections"
    primary_keys = ["urn", "inspection_date"]
    replication_key = None  # full refresh each run

    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("inspection_date", th.StringType),
        th.Property("inspection_type", th.StringType),
        th.Property("event_type_grouping", th.StringType),
        th.Property("overall_effectiveness", th.StringType),
        th.Property("quality_of_education", th.StringType),
        th.Property("behaviour_and_attitudes", th.StringType),
        th.Property("personal_development", th.StringType),
        th.Property("effectiveness_of_leadership_and_management", th.StringType),
        th.Property("early_years_provision", th.StringType),
        th.Property("sixth_form_provision", th.StringType),
        # NOTE(review): report_url is declared but never populated by
        # get_records (no COLUMN_PRIORITY entry) — confirm whether a source
        # column exists or drop the property.
        th.Property("report_url", th.StringType),
    ).to_dict()

    def _resolve_columns(self, df_columns: list[str]) -> dict[str, str]:
        """Map internal field names to actual CSV column names.

        For each field the first candidate present in *df_columns* wins;
        fields with no matching column are absent from the mapping.
        """
        mapping = {}
        for field, candidates in COLUMN_PRIORITY.items():
            for candidate in candidates:
                if candidate in df_columns:
                    mapping[field] = candidate
                    break
        return mapping

    def get_records(self, context):
        """Download the Ofsted MI file and yield one record per inspection row.

        Rows without a numeric URN are skipped. The source URL comes from the
        ``mi_url`` config option, or is discovered by scraping GOV.UK.
        """
        import pandas as pd

        url = self.config.get("mi_url") or discover_csv_url()
        if not url:
            self.logger.error("Could not discover Ofsted MI download URL")
            return

        self.logger.info("Downloading Ofsted MI: %s", url)
        resp = requests.get(url, timeout=120)
        resp.raise_for_status()

        if url.endswith(".ods"):
            df = pd.read_excel(io.BytesIO(resp.content), engine="odf", dtype=str)
        else:
            # The published CSV sometimes carries preamble rows before the
            # header, so scan the first 20 lines for one mentioning "urn".
            text = resp.content.decode("utf-8-sig", errors="replace")
            lines = text.split("\n")
            header_idx = 0
            for i, line in enumerate(lines[:20]):
                # The original also tested `"URN" in line`, which is subsumed
                # by the case-insensitive containment check below.
                if "urn" in line.lower():
                    header_idx = i
                    break
            df = pd.read_csv(
                io.StringIO(text),
                skiprows=header_idx,
                dtype=str,
                keep_default_na=False,
            )

        col_map = self._resolve_columns(list(df.columns))

        for _, row in df.iterrows():
            record = {field: row.get(col, None) for field, col in col_map.items()}

            # Cast URN; skip rows where it is missing or non-numeric.
            try:
                record["urn"] = int(record["urn"])
            except (ValueError, KeyError, TypeError):
                continue

            yield record
|
||||
|
||||
|
||||
class TapUKOfsted(Tap):
    """Singer tap for UK Ofsted Management Information."""

    name = "tap-uk-ofsted"

    config_jsonschema = th.PropertiesList(
        th.Property("mi_url", th.StringType, description="Direct URL to Ofsted MI file"),
    ).to_dict()

    def discover_streams(self):
        """Single stream of inspection outcomes."""
        inspections = OfstedInspectionsStream(self)
        return [inspections]


if __name__ == "__main__":
    TapUKOfsted.cli()
|
||||
@@ -0,0 +1,18 @@
|
||||
# Packaging metadata for the tap-uk-parent-view Singer extractor (PEP 621).

[build-system]
requires = ["setuptools>=68", "wheel"]
build-backend = "setuptools.build_meta"

[project]
name = "tap-uk-parent-view"
version = "0.1.0"
description = "Singer tap for UK Ofsted Parent View survey data"
requires-python = ">=3.10"
dependencies = [
    "singer-sdk~=0.39",
    "requests>=2.31",
    "pandas>=2.0",
    # openpyxl: presumably for reading the portal's XLSX download — extraction
    # is not implemented yet, so confirm once the client lands.
    "openpyxl>=3.1",
]

# Console entry point so `tap-uk-parent-view` is runnable after install.
[project.scripts]
tap-uk-parent-view = "tap_uk_parent_view.tap:TapUKParentView.cli"
|
||||
@@ -0,0 +1 @@
|
||||
"""tap-uk-parent-view: Singer tap for Ofsted Parent View survey data."""
|
||||
@@ -0,0 +1,49 @@
|
||||
"""Parent View Singer tap — extracts survey data from Ofsted Parent View portal."""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from singer_sdk import Stream, Tap
|
||||
from singer_sdk import typing as th
|
||||
|
||||
|
||||
class ParentViewStream(Stream):
    """Stream: Parent View survey responses per school."""

    name = "parent_view"
    primary_keys = ["urn"]
    replication_key = None  # full refresh; no incremental bookmark

    # NOTE(review): the q_*_pct fields look like per-question percentage
    # agreement — confirm the exact aggregation against the source file.
    schema = th.PropertiesList(
        th.Property("urn", th.IntegerType, required=True),
        th.Property("survey_date", th.StringType),
        th.Property("total_responses", th.IntegerType),
        th.Property("q_happy_pct", th.NumberType),
        th.Property("q_safe_pct", th.NumberType),
        th.Property("q_progress_pct", th.NumberType),
        th.Property("q_well_taught_pct", th.NumberType),
        th.Property("q_well_led_pct", th.NumberType),
        th.Property("q_behaviour_pct", th.NumberType),
        th.Property("q_bullying_pct", th.NumberType),
        th.Property("q_recommend_pct", th.NumberType),
    ).to_dict()

    def get_records(self, context):
        """Placeholder: logs a warning and yields no records until implemented."""
        # TODO: Implement Parent View data extraction
        # Source: Ofsted Parent View portal XLSX/CSV download
        # URL discovery requires scraping parentview.ofsted.gov.uk
        self.logger.warning("Parent View extraction not yet implemented")
        return iter([])
|
||||
|
||||
|
||||
class TapUKParentView(Tap):
    """Singer tap for UK Ofsted Parent View."""

    name = "tap-uk-parent-view"
    # No configurable options yet.
    config_jsonschema = th.PropertiesList().to_dict()

    def discover_streams(self):
        """Single stream: per-school survey aggregates."""
        survey_stream = ParentViewStream(self)
        return [survey_stream]


if __name__ == "__main__":
    TapUKParentView.cli()
|
||||
Reference in New Issue
Block a user