feat(seo): static sitemap generation job via Airflow
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 45s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m29s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
All checks were successful
Build and Push Docker Images / Build Backend (FastAPI) (push) Successful in 45s
Build and Push Docker Images / Build Frontend (Next.js) (push) Successful in 1m5s
Build and Push Docker Images / Build Pipeline (Meltano + dbt + Airflow) (push) Successful in 1m29s
Build and Push Docker Images / Trigger Portainer Update (push) Successful in 0s
- Backend builds sitemap.xml from school data at startup (in-memory)
- POST /api/admin/regenerate-sitemap refreshes it after data updates
- New Airflow DAG (sitemap_generate) runs Sundays 05:00 and calls the endpoint
- Next.js proxies /sitemap.xml to the backend; removes the slow dynamic sitemap.ts
- docker-compose passes BACKEND_URL + ADMIN_API_KEY to Airflow env

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
@@ -35,6 +35,66 @@ from .utils import clean_for_json
|
|||||||
# Values to exclude from filter dropdowns (empty strings, non-applicable labels)
|
# Values to exclude from filter dropdowns (empty strings, non-applicable labels)
|
||||||
EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"}
|
EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"}
|
||||||
|
|
||||||
|
BASE_URL = "https://schoolcompare.co.uk"
|
||||||
|
MAX_SLUG_LENGTH = 60
|
||||||
|
|
||||||
|
# In-memory sitemap cache
|
||||||
|
_sitemap_xml: str | None = None
|
||||||
|
|
||||||
|
|
||||||
|
def _slugify(text: str) -> str:
|
||||||
|
text = text.lower()
|
||||||
|
text = re.sub(r"[^\w\s-]", "", text)
|
||||||
|
text = re.sub(r"\s+", "-", text)
|
||||||
|
text = re.sub(r"-+", "-", text)
|
||||||
|
return text.strip("-")
|
||||||
|
|
||||||
|
|
||||||
|
def _school_url(urn: int, school_name: str) -> str:
    """Return the site-relative path for a school page, e.g. ``/school/123456-some-school``."""
    slug = _slugify(school_name)
    # Keep URLs bounded: cut at the limit and drop any dangling hyphen.
    trimmed = slug if len(slug) <= MAX_SLUG_LENGTH else slug[:MAX_SLUG_LENGTH].rstrip("-")
    return f"/school/{urn}-{trimmed}"
|
||||||
|
|
||||||
|
|
||||||
|
def build_sitemap() -> str:
    """Generate sitemap XML from in-memory school data. Returns the XML string.

    The sitemap lists the static pages plus one entry per unique school URN.
    School names are slugified before being embedded in <loc>, so the URLs
    contain only word characters and hyphens and need no XML escaping.
    """
    df = load_school_data()

    # Static pages: (absolute URL, change frequency, priority).
    static_urls = [
        (BASE_URL + "/", "daily", "1.0"),
        (BASE_URL + "/rankings", "weekly", "0.8"),
        (BASE_URL + "/compare", "weekly", "0.8"),
    ]

    lines = ['<?xml version="1.0" encoding="UTF-8"?>',
             '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">']

    for url, freq, priority in static_urls:
        lines.append(
            f" <url><loc>{url}</loc>"
            f"<changefreq>{freq}</changefreq>"
            f"<priority>{priority}</priority></url>"
        )

    if not df.empty and "urn" in df.columns and "school_name" in df.columns:
        # drop_duplicates(subset="urn") already keeps exactly one (first) row
        # per URN, so no extra seen-set bookkeeping is needed here.
        for _, row in df[["urn", "school_name"]].drop_duplicates(subset="urn").iterrows():
            path = _school_url(int(row["urn"]), str(row["school_name"]))
            lines.append(
                f" <url><loc>{BASE_URL}{path}</loc>"
                f"<changefreq>monthly</changefreq>"
                f"<priority>0.6</priority></url>"
            )

    lines.append("</urlset>")
    return "\n".join(lines)
|
||||||
|
|
||||||
|
|
||||||
def clean_filter_values(series: pd.Series) -> list[str]:
|
def clean_filter_values(series: pd.Series) -> list[str]:
|
||||||
"""Return sorted unique values from a Series, excluding NaN and junk labels."""
|
"""Return sorted unique values from a Series, excluding NaN and junk labels."""
|
||||||
@@ -148,12 +208,19 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]:
|
|||||||
@asynccontextmanager
async def lifespan(app: FastAPI):
    """Application lifespan - startup and shutdown events."""
    # The sitemap is cached in a module-level variable so the /sitemap.xml
    # route can serve it without rebuilding per request.
    global _sitemap_xml
    print("Loading school data from marts...")
    df = load_school_data()
    if df.empty:
        print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
    else:
        print(f"Data loaded successfully: {len(df)} records.")
    try:
        # Build eagerly at startup so the first crawler request is served fast.
        _sitemap_xml = build_sitemap()
        n = _sitemap_xml.count("<url>")
        print(f"Sitemap built: {n} URLs.")
    except Exception as e:
        # A sitemap failure must not prevent the API from starting; the
        # /sitemap.xml route rebuilds lazily if the cache is still empty.
        print(f"Warning: sitemap build failed on startup: {e}")

    yield
|
||||||
|
|
||||||
@@ -805,7 +872,26 @@ async def robots_txt():
|
|||||||
@app.get("/sitemap.xml")
async def sitemap_xml():
    """Serve sitemap.xml for search engine indexing.

    Serves the XML cached at startup; if the cache is empty (e.g. the startup
    build failed), rebuilds it lazily on first request.

    Raises:
        HTTPException: 503 when the sitemap cannot be built.
    """
    global _sitemap_xml
    if _sitemap_xml is None:
        try:
            _sitemap_xml = build_sitemap()
        except Exception as e:
            # Chain the cause so logs show *why* the sitemap could not be built.
            raise HTTPException(status_code=503, detail=f"Sitemap unavailable: {e}") from e
    return Response(content=_sitemap_xml, media_type="application/xml")
|
||||||
|
|
||||||
|
|
||||||
|
@app.post("/api/admin/regenerate-sitemap")
@limiter.limit("10/minute")
async def regenerate_sitemap(
    request: Request,
    _: bool = Depends(verify_admin_api_key),
):
    """Rebuild and cache the sitemap from current school data. Called by Airflow after data updates."""
    global _sitemap_xml
    # Build first, then swap the cache so a failed build leaves the old sitemap intact.
    fresh_xml = build_sitemap()
    _sitemap_xml = fresh_xml
    return {"status": "ok", "urls": fresh_xml.count("<url>")}
|
||||||
|
|
||||||
|
|
||||||
# Mount static files directly (must be after all routes to avoid catching API calls)
|
# Mount static files directly (must be after all routes to avoid catching API calls)
|
||||||
|
|||||||
@@ -119,6 +119,8 @@ services:
|
|||||||
PG_DATABASE: schoolcompare
|
PG_DATABASE: schoolcompare
|
||||||
TYPESENSE_URL: http://typesense:8108
|
TYPESENSE_URL: http://typesense:8108
|
||||||
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
|
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
|
||||||
|
BACKEND_URL: http://backend:80
|
||||||
|
ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
|
||||||
volumes:
|
volumes:
|
||||||
|
|
||||||
depends_on:
|
depends_on:
|
||||||
|
|||||||
@@ -1,55 +0,0 @@
|
|||||||
/**
|
|
||||||
* Dynamic Sitemap Generation
|
|
||||||
* Generates sitemap with all school pages and main routes
|
|
||||||
*/
|
|
||||||
|
|
||||||
import { MetadataRoute } from 'next';
|
|
||||||
import { fetchSchools } from '@/lib/api';
|
|
||||||
import { schoolUrl } from '@/lib/utils';
|
|
||||||
|
|
||||||
const BASE_URL = 'https://schoolcompare.co.uk';
|
|
||||||
|
|
||||||
export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
  // Static pages
  const staticPages: MetadataRoute.Sitemap = [
    {
      url: BASE_URL,
      lastModified: new Date(),
      changeFrequency: 'daily',
      priority: 1.0,
    },
    {
      url: `${BASE_URL}/compare`,
      lastModified: new Date(),
      changeFrequency: 'weekly',
      priority: 0.8,
    },
    {
      url: `${BASE_URL}/rankings`,
      lastModified: new Date(),
      changeFrequency: 'weekly',
      priority: 0.8,
    },
  ];

  // Fetch all schools (in batches if necessary)
  try {
    const schoolsData = await fetchSchools({
      page: 1,
      page_size: 10000, // Fetch all schools
    });

    // One sitemap entry per school page; URL built by the shared helper.
    const schoolPages: MetadataRoute.Sitemap = schoolsData.schools.map((school) => ({
      url: `${BASE_URL}${schoolUrl(school.urn, school.school_name)}`,
      lastModified: new Date(),
      changeFrequency: 'monthly',
      priority: 0.6,
    }));

    return [...staticPages, ...schoolPages];
  } catch (error) {
    console.error('Failed to generate sitemap:', error);
    // Return just static pages if school fetch fails
    return staticPages;
  }
}
|
|
||||||
@@ -6,11 +6,16 @@ const nextConfig = {
|
|||||||
// API Proxy to FastAPI backend
|
// API Proxy to FastAPI backend
|
||||||
  // API Proxy to FastAPI backend
  async rewrites() {
    const apiUrl = process.env.FASTAPI_URL || 'http://localhost:8000/api';
    // Strip the trailing /api segment to get the backend origin for
    // non-API routes such as /sitemap.xml.
    const backendUrl = apiUrl.replace(/\/api$/, '');
    return [
      {
        source: '/api/:path*',
        destination: `${apiUrl}/:path*`,
      },
      {
        // Serve the backend-generated sitemap instead of a Next.js route.
        source: '/sitemap.xml',
        destination: `${backendUrl}/sitemap.xml`,
      },
    ];
  },
|
||||||
|
|
||||||
|
|||||||
62
pipeline/dags/sitemap_dag.py
Normal file
62
pipeline/dags/sitemap_dag.py
Normal file
@@ -0,0 +1,62 @@
|
|||||||
|
"""
|
||||||
|
Sitemap Generation DAG
|
||||||
|
|
||||||
|
Rebuilds the sitemap.xml on the backend after data updates so search engines
|
||||||
|
get fresh school URLs. Runs weekly (Sunday 05:00) and can be triggered manually.
|
||||||
|
"""
|
||||||
|
|
||||||
|
from __future__ import annotations
|
||||||
|
|
||||||
|
from datetime import datetime, timedelta
|
||||||
|
|
||||||
|
from airflow import DAG
|
||||||
|
|
||||||
|
try:
|
||||||
|
from airflow.providers.standard.operators.bash import BashOperator
|
||||||
|
except ImportError:
|
||||||
|
from airflow.operators.bash import BashOperator
|
||||||
|
|
||||||
|
default_args = {
|
||||||
|
"owner": "school-compare",
|
||||||
|
"depends_on_past": False,
|
||||||
|
"email_on_failure": False,
|
||||||
|
"retries": 2,
|
||||||
|
"retry_delay": timedelta(minutes=5),
|
||||||
|
}
|
||||||
|
|
||||||
|
with DAG(
|
||||||
|
dag_id="sitemap_generate",
|
||||||
|
default_args=default_args,
|
||||||
|
description="Rebuild sitemap.xml on the backend after school data updates",
|
||||||
|
schedule="0 5 * * 0", # Sundays 05:00 — after the weekly data window
|
||||||
|
start_date=datetime(2025, 1, 1),
|
||||||
|
catchup=False,
|
||||||
|
tags=["school-compare", "seo"],
|
||||||
|
) as dag:
|
||||||
|
|
||||||
|
regenerate_sitemap = BashOperator(
|
||||||
|
task_id="call_regenerate_sitemap",
|
||||||
|
bash_command="""
|
||||||
|
set -e
|
||||||
|
BACKEND_URL="${BACKEND_URL:-http://backend:80}"
|
||||||
|
ADMIN_KEY="${ADMIN_API_KEY:-changeme}"
|
||||||
|
|
||||||
|
echo "Calling $BACKEND_URL/api/admin/regenerate-sitemap ..."
|
||||||
|
|
||||||
|
response=$(curl -s -o /tmp/sitemap_response.json -w "%{http_code}" \\
|
||||||
|
-X POST "$BACKEND_URL/api/admin/regenerate-sitemap" \\
|
||||||
|
-H "X-API-Key: $ADMIN_KEY" \\
|
||||||
|
-H "Content-Type: application/json")
|
||||||
|
|
||||||
|
echo "HTTP status: $response"
|
||||||
|
cat /tmp/sitemap_response.json
|
||||||
|
|
||||||
|
if [ "$response" != "200" ]; then
|
||||||
|
echo "ERROR: sitemap regeneration failed (HTTP $response)"
|
||||||
|
exit 1
|
||||||
|
fi
|
||||||
|
|
||||||
|
urls=$(python3 -c "import json,sys; d=json.load(open('/tmp/sitemap_response.json')); print(d.get('urls', '?'))")
|
||||||
|
echo "Sitemap regenerated: $urls URLs."
|
||||||
|
""",
|
||||||
|
)
|
||||||
Reference in New Issue
Block a user