feat(seo): static sitemap generation job via Airflow
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 45s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m29s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 45s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m29s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
- Backend builds sitemap.xml from school data at startup (in-memory)
- POST /api/admin/regenerate-sitemap refreshes it after data updates
- New Airflow DAG (sitemap_generate) runs Sundays 05:00 and calls the endpoint
- Next.js proxies /sitemap.xml to the backend; removes the slow dynamic sitemap.ts
- docker-compose passes BACKEND_URL + ADMIN_API_KEY to Airflow env

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,66 @@ from .utils import clean_for_json
|
||||
# Values to exclude from filter dropdowns (empty strings, non-applicable labels)
|
||||
EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"}
|
||||
|
||||
BASE_URL = "https://schoolcompare.co.uk"
|
||||
MAX_SLUG_LENGTH = 60
|
||||
|
||||
# In-memory sitemap cache
|
||||
_sitemap_xml: str | None = None
|
||||
|
||||
|
||||
def _slugify(text: str) -> str:
|
||||
text = text.lower()
|
||||
text = re.sub(r"[^\w\s-]", "", text)
|
||||
text = re.sub(r"\s+", "-", text)
|
||||
text = re.sub(r"-+", "-", text)
|
||||
return text.strip("-")
|
||||
|
||||
|
||||
def _school_url(urn: int, school_name: str) -> str:
    """Build the canonical site path for a school, e.g. ``/school/100001-some-name``."""
    slug = _slugify(school_name)
    # Keep URLs short: truncate over-long slugs without leaving a dangling hyphen.
    if len(slug) > MAX_SLUG_LENGTH:
        slug = slug[:MAX_SLUG_LENGTH].rstrip("-")
    return f"/school/{urn}-{slug}"
|
||||
|
||||
|
||||
def build_sitemap() -> str:
    """Generate sitemap XML from in-memory school data. Returns the XML string."""
    df = load_school_data()

    # Hand-curated pages with their crawl hints: (absolute URL, changefreq, priority).
    static_pages = [
        (BASE_URL + "/", "daily", "1.0"),
        (BASE_URL + "/rankings", "weekly", "0.8"),
        (BASE_URL + "/compare", "weekly", "0.8"),
    ]

    parts = [
        '<?xml version="1.0" encoding="UTF-8"?>',
        '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">',
    ]
    parts.extend(
        f"  <url><loc>{loc}</loc><changefreq>{freq}</changefreq>"
        f"<priority>{prio}</priority></url>"
        for loc, freq, prio in static_pages
    )

    # One <url> entry per distinct school; guard against missing columns so an
    # incomplete mart still yields a valid (static-only) sitemap.
    if not df.empty and "urn" in df.columns and "school_name" in df.columns:
        emitted: set = set()
        unique_rows = df[["urn", "school_name"]].drop_duplicates(subset="urn")
        for _, record in unique_rows.iterrows():
            urn = int(record["urn"])
            name = str(record["school_name"])
            # Belt-and-braces: raw URN values may still collide after int() coercion.
            if urn in emitted:
                continue
            emitted.add(urn)
            path = _school_url(urn, name)
            parts.append(
                f"  <url><loc>{BASE_URL}{path}</loc>"
                f"<changefreq>monthly</changefreq>"
                f"<priority>0.6</priority></url>"
            )

    parts.append("</urlset>")
    return "\n".join(parts)
|
||||
|
||||
|
||||
def clean_filter_values(series: pd.Series) -> list[str]:
|
||||
"""Return sorted unique values from a Series, excluding NaN and junk labels."""
|
||||
@@ -148,12 +208,19 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]:
|
||||
@asynccontextmanager
|
||||
async def lifespan(app: FastAPI):
|
||||
"""Application lifespan - startup and shutdown events."""
|
||||
global _sitemap_xml
|
||||
print("Loading school data from marts...")
|
||||
df = load_school_data()
|
||||
if df.empty:
|
||||
print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
|
||||
else:
|
||||
print(f"Data loaded successfully: {len(df)} records.")
|
||||
try:
|
||||
_sitemap_xml = build_sitemap()
|
||||
n = _sitemap_xml.count("<url>")
|
||||
print(f"Sitemap built: {n} URLs.")
|
||||
except Exception as e:
|
||||
print(f"Warning: sitemap build failed on startup: {e}")
|
||||
|
||||
yield
|
||||
|
||||
@@ -805,7 +872,26 @@ async def robots_txt():
|
||||
@app.get("/sitemap.xml")
async def sitemap_xml():
    """Serve the in-memory sitemap.xml for search engine indexing.

    The XML is built at startup (see ``lifespan``) and refreshed via
    POST /api/admin/regenerate-sitemap; if neither has populated the cache
    yet, build it lazily here.

    Raises:
        HTTPException: 503 when the sitemap cannot be built on demand.
    """
    # BUG FIX: a stale `return FileResponse(settings.frontend_dir / "sitemap.xml", ...)`
    # line survived the migration to the in-memory sitemap, short-circuiting this
    # handler and making the cache logic below unreachable. Removed it.
    global _sitemap_xml
    if _sitemap_xml is None:
        # Startup build failed or never ran — try once more on demand.
        try:
            _sitemap_xml = build_sitemap()
        except Exception as e:
            raise HTTPException(status_code=503, detail=f"Sitemap unavailable: {e}") from e
    return Response(content=_sitemap_xml, media_type="application/xml")
|
||||
|
||||
|
||||
@app.post("/api/admin/regenerate-sitemap")
@limiter.limit("10/minute")
async def regenerate_sitemap(
    request: Request,
    _: bool = Depends(verify_admin_api_key),
):
    """Rebuild and cache the sitemap from current school data. Called by Airflow after data updates."""
    global _sitemap_xml
    fresh_xml = build_sitemap()
    _sitemap_xml = fresh_xml
    # Report how many <url> entries made it in, so the caller can sanity-check.
    return {"status": "ok", "urls": fresh_xml.count("<url>")}
|
||||
|
||||
|
||||
# Mount static files directly (must be after all routes to avoid catching API calls)
|
||||
|
||||
Reference in New Issue
Block a user