#!/usr/bin/env python3
"""
Fetch real school performance data from UK Government sources.

This script downloads KS2 (Key Stage 2) primary school data from:
- Compare School Performance service
- Get Information about Schools (GIAS)

Data is filtered to only include schools in Wandsworth and Merton.
"""

import os
import sys
from io import StringIO
from pathlib import Path
from typing import List, Optional

import pandas as pd
import requests

# Output directory
DATA_DIR = Path(__file__).parent.parent / "data"

# Local Authority codes for Wandsworth and Merton
LA_CODES = {
    "Wandsworth": "212",
    "Merton": "315",
}

# Academic years to fetch (last 5 years available)
YEARS = ["2023-2024", "2022-2023", "2021-2022", "2019-2020", "2018-2019"]
# Note: 2020-2021 had no SATs due to COVID
# NOTE(review): KS2 tests were presumably also cancelled in summer 2020, so
# "2019-2020" may have no data either - confirm before relying on this list.

# Default location of the GIAS "all establishments" extract
# (this may need to be updated periodically).
_DEFAULT_GIAS_URL = (
    "https://ea-edubase-api-prod.azurewebsites.net"
    "/edubase/downloads/public/edubasealldata.csv"
)

# GIAS columns kept in the output, in output order.
_GIAS_COLUMNS_TO_KEEP = [
    'URN',
    'EstablishmentName',
    'LA (name)',
    'TypeOfEstablishment (name)',
    'Street',
    'Locality',
    'Town',
    'Postcode',
    'SchoolCapacity',
    'NumberOfPupils',
    'OfstedRating (name)',
]

# GIAS column name -> output column name. 'Locality' and 'SchoolCapacity'
# are deliberately absent so they keep their original names in the output.
_GIAS_RENAMES = {
    'URN': 'urn',
    'EstablishmentName': 'school_name',
    'LA (name)': 'local_authority',
    'TypeOfEstablishment (name)': 'school_type',
    'Street': 'street',
    'Town': 'town',
    'Postcode': 'postcode',
    'NumberOfPupils': 'pupils',
    'OfstedRating (name)': 'ofsted_rating',
}


def _decode_csv_payload(raw: bytes) -> str:
    """Decode a downloaded CSV body, dropping a UTF-8 BOM if present.

    Falls back to cp1252 (with replacement characters) when the payload is
    not valid UTF-8, so one stray byte does not abort the whole download.
    """
    try:
        return raw.decode("utf-8-sig")
    except UnicodeDecodeError:
        return raw.decode("cp1252", errors="replace")


def _build_addresses(df: pd.DataFrame) -> List[str]:
    """Return one "street, postcode" string per row, skipping missing parts."""
    row_count = len(df)
    streets = df['street'] if 'street' in df.columns else [None] * row_count
    postcodes = df['postcode'] if 'postcode' in df.columns else [None] * row_count

    addresses = []
    for street, postcode in zip(streets, postcodes):
        # pd.notna filters both None and NaN so "nan" never leaks into output.
        parts = [
            str(part).strip()
            for part in (street, postcode)
            if pd.notna(part) and str(part).strip()
        ]
        addresses.append(", ".join(parts))
    return addresses


def _select_primary_schools(df: pd.DataFrame) -> pd.DataFrame:
    """Filter a raw GIAS extract to primary schools in the configured LAs.

    Keeps the columns in ``_GIAS_COLUMNS_TO_KEEP`` that exist in *df*,
    renames them per ``_GIAS_RENAMES`` and adds an ``address`` column.

    Raises:
        KeyError: if 'LA (code)' or 'PhaseOfEducation (name)' is missing.
    """
    # Normalise the LA code: a numeric column containing NaN is parsed as
    # float, which would stringify as "212.0" and never match "212".
    la_code = (
        df['LA (code)']
        .astype(str)
        .str.strip()
        .str.replace(r'\.0$', '', regex=True)
    )
    in_area = la_code.isin(set(LA_CODES.values()))

    # Case-insensitive so that "Middle deemed primary" is included as well
    # as "Primary".
    is_primary = df['PhaseOfEducation (name)'].str.contains(
        'Primary', case=False, na=False
    )

    available_cols = [c for c in _GIAS_COLUMNS_TO_KEEP if c in df.columns]
    # .copy() so that adding a column below never writes into a view of df.
    result = df.loc[in_area & is_primary, available_cols].copy()
    result = result.rename(columns=_GIAS_RENAMES)

    # Built as a plain list: DataFrame.apply(axis=1) returns a DataFrame for
    # an empty frame, which cannot be assigned to a single column.
    result['address'] = _build_addresses(result)
    return result


def fetch_gias_data(gias_url: Optional[str] = None) -> Optional[pd.DataFrame]:
    """
    Fetch school establishment data from Get Information About Schools.

    This gives us the list of schools with URN, name, address, type, etc.

    Args:
        gias_url: URL of the GIAS CSV extract. Defaults to the public
            "edubasealldata.csv" download.

    Returns:
        DataFrame of primary schools in Wandsworth and Merton, or None if
        the download or parsing failed (the error is printed).
    """
    print("Fetching school establishment data from GIAS...")

    url = gias_url or _DEFAULT_GIAS_URL

    try:
        response = requests.get(url, timeout=60)
        response.raise_for_status()

        # Decode the raw bytes ourselves: read_csv ignores its `encoding`
        # argument when handed an already-decoded text buffer.
        csv_text = _decode_csv_payload(response.content)
        raw_df = pd.read_csv(StringIO(csv_text), low_memory=False)

        df = _select_primary_schools(raw_df)
    except (
        requests.RequestException,
        ValueError,       # includes pandas parser / empty-data errors
        KeyError,         # expected GIAS column missing
        AttributeError,   # .str accessor on a non-string column
        TypeError,
    ) as e:
        print(f"Error fetching GIAS data: {e}")
        return None

    print(f"Found {len(df)} primary schools in Wandsworth and Merton")
    return df


def fetch_ks2_performance_data() -> None:
    """
    Fetch KS2 performance data from Compare School Performance.

    Note: The official download page requires form submission, so no
    download is attempted here. The function only prints manual download
    instructions and always returns None.
    """
    print("\nFetching KS2 performance data...")

    print("Note: For the most accurate data, please download manually from:")
    print("https://www.compare-school-performance.service.gov.uk/download-data")
    print("\nSteps:")
    print("1. Select 'Key Stage 2' for Data type")
    print("2. Select 'All data' for File type")
    print("3. Select desired academic year")
    print("4. Download and place CSV files in the 'data' folder")

    return None


def _is_ks2_title(title) -> bool:
    """Return True if *title* looks like a Key Stage 2 publication title."""
    lowered = (title or '').lower()
    return 'key stage 2' in lowered or 'ks2' in lowered


def download_from_explore_education_statistics() -> None:
    """
    Try to fetch data from the Explore Education Statistics API.

    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/

    This is a discovery helper: it prints the KS2 publications, latest
    releases and dataset names it finds, and always returns None.
    """
    print("\nAttempting to fetch from Explore Education Statistics API...")

    # NOTE(review): the published API docs may use a different base URL and
    # endpoint layout than the one below - confirm against the docs above.
    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"

    try:
        # First, list available publications
        response = requests.get(f"{api_base}/publications", timeout=30)
        if response.status_code != 200:
            print(f"API returned status {response.status_code}")
            return None

        publications = response.json()

        # Find KS2 related publications
        ks2_pubs = [
            p for p in publications.get('results', [])
            if _is_ks2_title(p.get('title'))
        ]
        if not ks2_pubs:
            print("No KS2 publications found via API")
            return None

        print(f"Found KS2 publications: {[p['title'] for p in ks2_pubs]}")

        # Get the latest release for each publication
        for pub in ks2_pubs:
            pub_id = pub.get('id')
            if not pub_id:
                continue

            release_url = f"{api_base}/publications/{pub_id}/releases/latest"
            release_response = requests.get(release_url, timeout=30)
            if release_response.status_code != 200:
                # Report instead of skipping silently.
                print(
                    f"Release lookup for {pub_id} returned status "
                    f"{release_response.status_code}"
                )
                continue

            release = release_response.json()
            print(f"Latest release: {release.get('title')}")

            # List data files
            for ds in release.get('dataSets', []):
                print(f"  - Dataset: {ds.get('name')}")

    except (
        requests.RequestException,
        ValueError,       # response body was not valid JSON
        AttributeError,   # JSON had an unexpected shape
        TypeError,
        KeyError,
    ) as e:
        print(f"Error accessing API: {e}")

    return None


def create_combined_dataset(schools_df, performance_data=None, *, year=2024):
    """
    Combine school information with performance data.

    If no performance data is available, returns school info only.

    Args:
        schools_df: DataFrame of schools, or None.
        performance_data: Optional performance data. Currently only checked
            for presence; no merge is performed.
        year: Value written to the ``year`` column (default 2024).

    Returns:
        A copy of *schools_df* with a ``year`` column added, or None when
        *schools_df* is None. The caller's DataFrame is not modified.
    """
    if schools_df is None:
        return None

    # Work on a copy so the caller's frame is not mutated as a side effect.
    combined = schools_df.copy()

    # Add year column for compatibility
    combined['year'] = year

    if performance_data is None:
        print("\nNo performance data available - school list saved without metrics")
        print("Download KS2 data manually and re-run to add performance metrics")

    return combined


def main():
    """Main entry point."""
    print("=" * 60)
    print("Fetching Real School Data for Wandsworth & Merton")
    print("=" * 60)

    # Create data directory (and any missing parents)
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Fetch school establishment data
    schools_df = fetch_gias_data()

    # Try to fetch performance data (both calls only print guidance today)
    fetch_ks2_performance_data()
    download_from_explore_education_statistics()

    # Save school data
    if schools_df is not None:
        output_file = DATA_DIR / "schools_wandsworth_merton.csv"
        schools_df.to_csv(output_file, index=False)
        print(f"\nSchool data saved to: {output_file}")
        print(f"Total schools: {len(schools_df)}")

        # Show breakdown
        print("\nBreakdown by Local Authority:")
        print(schools_df['local_authority'].value_counts())
    else:
        print("\nFailed to fetch school data")

    print("\n" + "=" * 60)
    print("NEXT STEPS:")
    print("=" * 60)
    print("""
To get complete performance data:

1. Go to: https://www.compare-school-performance.service.gov.uk/download-data

2. Download KS2 data for each year (2019-2024):
   - Select: Key Stage 2
   - Select: All data (or specific metrics)
   - Select: Academic year
   - Click: Download data

3. Place downloaded CSV files in the 'data' folder

4. Restart the application - it will automatically load the real data

The app will merge school info with performance metrics.
""")


if __name__ == "__main__":
    main()