diff --git a/backend/app.py b/backend/app.py
index ed421c5..c8894e0 100644
--- a/backend/app.py
+++ b/backend/app.py
@@ -35,6 +35,66 @@ from .utils import clean_for_json
# Values to exclude from filter dropdowns (empty strings, non-applicable labels)
EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"}
+BASE_URL = "https://schoolcompare.co.uk"  # canonical origin prefixed to every sitemap URL
+MAX_SLUG_LENGTH = 60  # cap slug length to keep school URLs short and readable
+
+# In-memory sitemap cache
+_sitemap_xml: str | None = None  # built at startup / on demand; rebuilt via the admin endpoint
+
+
+def _slugify(text: str) -> str:
+    text = text.lower()
+    text = re.sub(r"[^\w\s-]", "", text)  # drop everything except word chars, whitespace, hyphens
+    text = re.sub(r"\s+", "-", text)  # whitespace runs -> single hyphen
+    text = re.sub(r"-+", "-", text)  # collapse repeated hyphens
+    return text.strip("-")
+
+
+def _school_url(urn: int, school_name: str) -> str:
+    slug = _slugify(school_name)
+    if len(slug) > MAX_SLUG_LENGTH:
+        slug = slug[:MAX_SLUG_LENGTH].rstrip("-")  # truncate without leaving a dangling hyphen
+    return f"/school/{urn}-{slug}"
+
+
+def build_sitemap() -> str:
+    """Generate sitemap XML from in-memory school data. Returns the XML string."""
+    df = load_school_data()
+
+    static_urls = [
+        (BASE_URL + "/", "daily", "1.0"),
+        (BASE_URL + "/rankings", "weekly", "0.8"),
+        (BASE_URL + "/compare", "weekly", "0.8"),
+    ]
+
+    lines = ['<?xml version="1.0" encoding="UTF-8"?>',
+             '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">']
+
+    for url, freq, priority in static_urls:
+        lines.append(
+            f"  <url><loc>{url}</loc>"
+            f"<changefreq>{freq}</changefreq>"
+            f"<priority>{priority}</priority></url>"
+        )
+
+    if not df.empty and "urn" in df.columns and "school_name" in df.columns:
+        seen = set()  # belt-and-braces duplicate guard on top of drop_duplicates
+        for _, row in df[["urn", "school_name"]].drop_duplicates(subset="urn").iterrows():
+            urn = int(row["urn"])
+            name = str(row["school_name"])
+            if urn in seen:
+                continue
+            seen.add(urn)
+            path = _school_url(urn, name)  # slug is [\w-] only, so no XML escaping needed
+            lines.append(
+                f"  <url><loc>{BASE_URL}{path}</loc>"
+                f"<changefreq>monthly</changefreq>"
+                f"<priority>0.6</priority></url>"
+            )
+
+    lines.append("</urlset>")
+    return "\n".join(lines)
+
def clean_filter_values(series: pd.Series) -> list[str]:
"""Return sorted unique values from a Series, excluding NaN and junk labels."""
@@ -148,12 +208,19 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]:
@asynccontextmanager
async def lifespan(app: FastAPI):
"""Application lifespan - startup and shutdown events."""
+ global _sitemap_xml
print("Loading school data from marts...")
df = load_school_data()
if df.empty:
print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.")
else:
print(f"Data loaded successfully: {len(df)} records.")
+ try:
+ _sitemap_xml = build_sitemap()
+        n = _sitemap_xml.count("<loc>")
+ print(f"Sitemap built: {n} URLs.")
+ except Exception as e:
+ print(f"Warning: sitemap build failed on startup: {e}")
yield
@@ -805,7 +872,26 @@ async def robots_txt():
@app.get("/sitemap.xml")
async def sitemap_xml():
    """Serve sitemap.xml for search engine indexing."""
-    return FileResponse(settings.frontend_dir / "sitemap.xml", media_type="application/xml")
+    global _sitemap_xml
+    if _sitemap_xml is None:  # not built at startup (or startup build failed) — build lazily
+        try:
+            _sitemap_xml = build_sitemap()
+        except Exception as e:
+            raise HTTPException(status_code=503, detail=f"Sitemap unavailable: {e}")
+    return Response(content=_sitemap_xml, media_type="application/xml")
+
+
+@app.post("/api/admin/regenerate-sitemap")
+@limiter.limit("10/minute")
+async def regenerate_sitemap(
+    request: Request,
+    _: bool = Depends(verify_admin_api_key),
+):
+    """Rebuild and cache the sitemap from current school data. Called by Airflow after data updates."""
+    global _sitemap_xml
+    _sitemap_xml = build_sitemap()
+    n = _sitemap_xml.count("<loc>")  # one <loc> element per URL entry
+    return {"status": "ok", "urls": n}
# Mount static files directly (must be after all routes to avoid catching API calls)
diff --git a/docker-compose.yml b/docker-compose.yml
index 85c67cb..c411b4a 100644
--- a/docker-compose.yml
+++ b/docker-compose.yml
@@ -119,6 +119,8 @@ services:
PG_DATABASE: schoolcompare
TYPESENSE_URL: http://typesense:8108
TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme}
+ BACKEND_URL: http://backend:80
+ ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme}
volumes:
depends_on:
diff --git a/nextjs-app/app/sitemap.ts b/nextjs-app/app/sitemap.ts
deleted file mode 100644
index eeb4f8b..0000000
--- a/nextjs-app/app/sitemap.ts
+++ /dev/null
@@ -1,55 +0,0 @@
-/**
- * Dynamic Sitemap Generation
- * Generates sitemap with all school pages and main routes
- */
-
-import { MetadataRoute } from 'next';
-import { fetchSchools } from '@/lib/api';
-import { schoolUrl } from '@/lib/utils';
-
-const BASE_URL = 'https://schoolcompare.co.uk';
-
-export default async function sitemap(): Promise<MetadataRoute.Sitemap> {
- // Static pages
- const staticPages: MetadataRoute.Sitemap = [
- {
- url: BASE_URL,
- lastModified: new Date(),
- changeFrequency: 'daily',
- priority: 1.0,
- },
- {
- url: `${BASE_URL}/compare`,
- lastModified: new Date(),
- changeFrequency: 'weekly',
- priority: 0.8,
- },
- {
- url: `${BASE_URL}/rankings`,
- lastModified: new Date(),
- changeFrequency: 'weekly',
- priority: 0.8,
- },
- ];
-
- // Fetch all schools (in batches if necessary)
- try {
- const schoolsData = await fetchSchools({
- page: 1,
- page_size: 10000, // Fetch all schools
- });
-
- const schoolPages: MetadataRoute.Sitemap = schoolsData.schools.map((school) => ({
- url: `${BASE_URL}${schoolUrl(school.urn, school.school_name)}`,
- lastModified: new Date(),
- changeFrequency: 'monthly',
- priority: 0.6,
- }));
-
- return [...staticPages, ...schoolPages];
- } catch (error) {
- console.error('Failed to generate sitemap:', error);
- // Return just static pages if school fetch fails
- return staticPages;
- }
-}
diff --git a/nextjs-app/next.config.js b/nextjs-app/next.config.js
index 4a84256..325e074 100644
--- a/nextjs-app/next.config.js
+++ b/nextjs-app/next.config.js
@@ -6,11 +6,16 @@ const nextConfig = {
  // API Proxy to FastAPI backend
  async rewrites() {
    const apiUrl = process.env.FASTAPI_URL || 'http://localhost:8000/api';
+    const backendUrl = apiUrl.replace(/\/api$/, ''); // backend origin = API URL minus the /api suffix
    return [
      {
        source: '/api/:path*',
        destination: `${apiUrl}/:path*`,
      },
+      {
+        source: '/sitemap.xml', // proxy sitemap straight to the FastAPI backend
+        destination: `${backendUrl}/sitemap.xml`,
+      },
    ];
  },
diff --git a/pipeline/dags/sitemap_dag.py b/pipeline/dags/sitemap_dag.py
new file mode 100644
index 0000000..0dc88b2
--- /dev/null
+++ b/pipeline/dags/sitemap_dag.py
@@ -0,0 +1,62 @@
+"""
+Sitemap Generation DAG
+
+Rebuilds the sitemap.xml on the backend after data updates so search engines
+get fresh school URLs. Runs weekly (Sunday 05:00) and can be triggered manually.
+"""
+
+from __future__ import annotations
+
+from datetime import datetime, timedelta
+
+from airflow import DAG
+
+try:
+ from airflow.providers.standard.operators.bash import BashOperator
+except ImportError:
+ from airflow.operators.bash import BashOperator
+
+default_args = {
+    "owner": "school-compare",
+    "depends_on_past": False,
+    "email_on_failure": False,
+    "retries": 2,  # transient network failures reaching the backend are retried
+    "retry_delay": timedelta(minutes=5),
+}
+
+with DAG(
+    dag_id="sitemap_generate",
+    default_args=default_args,
+    description="Rebuild sitemap.xml on the backend after school data updates",
+    schedule="0 5 * * 0",  # Sundays 05:00 — after the weekly data window
+    start_date=datetime(2025, 1, 1),
+    catchup=False,
+    tags=["school-compare", "seo"],
+) as dag:
+
+    regenerate_sitemap = BashOperator(
+        task_id="call_regenerate_sitemap",
+        bash_command="""
+set -e
+BACKEND_URL="${BACKEND_URL:-http://backend:80}"  # injected via docker-compose environment
+ADMIN_KEY="${ADMIN_API_KEY:-changeme}"  # must match the backend's ADMIN_API_KEY
+
+echo "Calling $BACKEND_URL/api/admin/regenerate-sitemap ..."
+
+response=$(curl -s -o /tmp/sitemap_response.json -w "%{http_code}" \\
+    -X POST "$BACKEND_URL/api/admin/regenerate-sitemap" \\
+    -H "X-API-Key: $ADMIN_KEY" \\
+    -H "Content-Type: application/json")
+
+echo "HTTP status: $response"
+cat /tmp/sitemap_response.json
+
+if [ "$response" != "200" ]; then  # any non-200 (401, 503, ...) fails the task and triggers a retry
+    echo "ERROR: sitemap regeneration failed (HTTP $response)"
+    exit 1
+fi
+
+urls=$(python3 -c "import json,sys; d=json.load(open('/tmp/sitemap_response.json')); print(d.get('urls', '?'))")
+echo "Sitemap regenerated: $urls URLs."
+""",
+    )