diff --git a/backend/app.py b/backend/app.py index ed421c5..c8894e0 100644 --- a/backend/app.py +++ b/backend/app.py @@ -35,6 +35,66 @@ from .utils import clean_for_json # Values to exclude from filter dropdowns (empty strings, non-applicable labels) EXCLUDED_FILTER_VALUES = {"", "Not applicable", "Does not apply"} +BASE_URL = "https://schoolcompare.co.uk" +MAX_SLUG_LENGTH = 60 + +# In-memory sitemap cache +_sitemap_xml: str | None = None + + +def _slugify(text: str) -> str: + text = text.lower() + text = re.sub(r"[^\w\s-]", "", text) + text = re.sub(r"\s+", "-", text) + text = re.sub(r"-+", "-", text) + return text.strip("-") + + +def _school_url(urn: int, school_name: str) -> str: + slug = _slugify(school_name) + if len(slug) > MAX_SLUG_LENGTH: + slug = slug[:MAX_SLUG_LENGTH].rstrip("-") + return f"/school/{urn}-{slug}" + + +def build_sitemap() -> str: + """Generate sitemap XML from in-memory school data. Returns the XML string.""" + df = load_school_data() + + static_urls = [ + (BASE_URL + "/", "daily", "1.0"), + (BASE_URL + "/rankings", "weekly", "0.8"), + (BASE_URL + "/compare", "weekly", "0.8"), + ] + + lines = ['<?xml version="1.0" encoding="UTF-8"?>', + '<urlset xmlns="http://www.sitemaps.org/schemas/sitemap/0.9">'] + + for url, freq, priority in static_urls: + lines.append( + f" <url><loc>{url}</loc>" + f"<changefreq>{freq}</changefreq>" + f"<priority>{priority}</priority></url>" + ) + + if not df.empty and "urn" in df.columns and "school_name" in df.columns: + seen = set() + for _, row in df[["urn", "school_name"]].drop_duplicates(subset="urn").iterrows(): + urn = int(row["urn"]) + name = str(row["school_name"]) + if urn in seen: + continue + seen.add(urn) + path = _school_url(urn, name) + lines.append( + f" <url><loc>{BASE_URL}{path}</loc>" + f"<changefreq>monthly</changefreq>" + f"<priority>0.6</priority></url>" + ) + + lines.append("</urlset>") + return "\n".join(lines) + def clean_filter_values(series: pd.Series) -> list[str]: """Return sorted unique values from a Series, excluding NaN and junk labels.""" @@ -148,12 +208,19 @@ def validate_postcode(postcode: Optional[str]) -> Optional[str]: @asynccontextmanager async def lifespan(app: FastAPI): """Application lifespan - startup and 
shutdown events.""" + global _sitemap_xml print("Loading school data from marts...") df = load_school_data() if df.empty: print("Warning: No data in marts. Run the annual EES pipeline to populate KS2 data.") else: print(f"Data loaded successfully: {len(df)} records.") + try: + _sitemap_xml = build_sitemap() + n = _sitemap_xml.count("<loc>") + print(f"Sitemap built: {n} URLs.") + except Exception as e: + print(f"Warning: sitemap build failed on startup: {e}") yield @@ -805,7 +872,26 @@ async def robots_txt(): @app.get("/sitemap.xml") async def sitemap_xml(): """Serve sitemap.xml for search engine indexing.""" - return FileResponse(settings.frontend_dir / "sitemap.xml", media_type="application/xml") + global _sitemap_xml + if _sitemap_xml is None: + try: + _sitemap_xml = build_sitemap() + except Exception as e: + raise HTTPException(status_code=503, detail=f"Sitemap unavailable: {e}") + return Response(content=_sitemap_xml, media_type="application/xml") + + +@app.post("/api/admin/regenerate-sitemap") +@limiter.limit("10/minute") +async def regenerate_sitemap( + request: Request, + _: bool = Depends(verify_admin_api_key), +): + """Rebuild and cache the sitemap from current school data. 
Called by Airflow after data updates.""" + global _sitemap_xml + _sitemap_xml = build_sitemap() + n = _sitemap_xml.count("<loc>") + return {"status": "ok", "urls": n} # Mount static files directly (must be after all routes to avoid catching API calls) diff --git a/docker-compose.yml b/docker-compose.yml index 85c67cb..c411b4a 100644 --- a/docker-compose.yml +++ b/docker-compose.yml @@ -119,6 +119,8 @@ services: PG_DATABASE: schoolcompare TYPESENSE_URL: http://typesense:8108 TYPESENSE_API_KEY: ${TYPESENSE_API_KEY:-changeme} + BACKEND_URL: http://backend:80 + ADMIN_API_KEY: ${ADMIN_API_KEY:-changeme} volumes: depends_on: diff --git a/nextjs-app/app/sitemap.ts b/nextjs-app/app/sitemap.ts deleted file mode 100644 index eeb4f8b..0000000 --- a/nextjs-app/app/sitemap.ts +++ /dev/null @@ -1,55 +0,0 @@ -/** - * Dynamic Sitemap Generation - * Generates sitemap with all school pages and main routes - */ - -import { MetadataRoute } from 'next'; - import { fetchSchools } from '@/lib/api'; - import { schoolUrl } from '@/lib/utils'; - -const BASE_URL = 'https://schoolcompare.co.uk'; - -export default async function sitemap(): Promise<MetadataRoute.Sitemap> { - // Static pages - const staticPages: MetadataRoute.Sitemap = [ - { - url: BASE_URL, - lastModified: new Date(), - changeFrequency: 'daily', - priority: 1.0, - }, - { - url: `${BASE_URL}/compare`, - lastModified: new Date(), - changeFrequency: 'weekly', - priority: 0.8, - }, - { - url: `${BASE_URL}/rankings`, - lastModified: new Date(), - changeFrequency: 'weekly', - priority: 0.8, - }, - ]; - - // Fetch all schools (in batches if necessary) - try { - const schoolsData = await fetchSchools({ - page: 1, - page_size: 10000, // Fetch all schools - }); - - const schoolPages: MetadataRoute.Sitemap = schoolsData.schools.map((school) => ({ - url: `${BASE_URL}${schoolUrl(school.urn, school.school_name)}`, - lastModified: new Date(), - changeFrequency: 'monthly', - priority: 0.6, - })); - - return [...staticPages, ...schoolPages]; - } catch (error) { - 
console.error('Failed to generate sitemap:', error); - // Return just static pages if school fetch fails - return staticPages; - } -} diff --git a/nextjs-app/next.config.js b/nextjs-app/next.config.js index 4a84256..325e074 100644 --- a/nextjs-app/next.config.js +++ b/nextjs-app/next.config.js @@ -6,11 +6,16 @@ const nextConfig = { // API Proxy to FastAPI backend async rewrites() { const apiUrl = process.env.FASTAPI_URL || 'http://localhost:8000/api'; + const backendUrl = apiUrl.replace(/\/api$/, ''); return [ { source: '/api/:path*', destination: `${apiUrl}/:path*`, }, + { + source: '/sitemap.xml', + destination: `${backendUrl}/sitemap.xml`, + }, ]; }, diff --git a/pipeline/dags/sitemap_dag.py b/pipeline/dags/sitemap_dag.py new file mode 100644 index 0000000..0dc88b2 --- /dev/null +++ b/pipeline/dags/sitemap_dag.py @@ -0,0 +1,62 @@ +""" +Sitemap Generation DAG + +Rebuilds the sitemap.xml on the backend after data updates so search engines +get fresh school URLs. Runs weekly (Sunday 05:00) and can be triggered manually. 
+""" + +from __future__ import annotations + +from datetime import datetime, timedelta + +from airflow import DAG + +try: + from airflow.providers.standard.operators.bash import BashOperator +except ImportError: + from airflow.operators.bash import BashOperator + +default_args = { + "owner": "school-compare", + "depends_on_past": False, + "email_on_failure": False, + "retries": 2, + "retry_delay": timedelta(minutes=5), +} + +with DAG( + dag_id="sitemap_generate", + default_args=default_args, + description="Rebuild sitemap.xml on the backend after school data updates", + schedule="0 5 * * 0", # Sundays 05:00 — after the weekly data window + start_date=datetime(2025, 1, 1), + catchup=False, + tags=["school-compare", "seo"], +) as dag: + + regenerate_sitemap = BashOperator( + task_id="call_regenerate_sitemap", + bash_command=""" +set -e +BACKEND_URL="${BACKEND_URL:-http://backend:80}" +ADMIN_KEY="${ADMIN_API_KEY:-changeme}" + +echo "Calling $BACKEND_URL/api/admin/regenerate-sitemap ..." + +response=$(curl -s -o /tmp/sitemap_response.json -w "%{http_code}" \\ + -X POST "$BACKEND_URL/api/admin/regenerate-sitemap" \\ + -H "X-API-Key: $ADMIN_KEY" \\ + -H "Content-Type: application/json") + +echo "HTTP status: $response" +cat /tmp/sitemap_response.json + +if [ "$response" != "200" ]; then + echo "ERROR: sitemap regeneration failed (HTTP $response)" + exit 1 +fi + +urls=$(python3 -c "import json,sys; d=json.load(open('/tmp/sitemap_response.json')); print(d.get('urls', '?'))") +echo "Sitemap regenerated: $urls URLs." +""", + )