254 lines
8.5 KiB
Python
254 lines
8.5 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Fetch real school data from UK Government sources.

This script works with KS2 (Key Stage 2) primary school data from:
- Get Information about Schools (GIAS): the establishment extract is
  downloaded automatically.
- Compare School Performance: performance data currently has to be
  downloaded manually; the script prints the steps to follow.

Data is filtered to only include schools in Wandsworth and Merton.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
import requests
|
|
import pandas as pd
|
|
from pathlib import Path
|
|
from io import StringIO
|
|
|
|
# Where downloaded/generated CSV files are written: the "data" folder that
# sits next to the directory containing this script.
DATA_DIR = Path(__file__).parent.parent.joinpath("data")

# Local Authority name -> code, kept as strings.
# These are matched against the "LA (code)" column of the GIAS extract.
LA_CODES = dict(
    Wandsworth="212",
    Merton="315",
)

# Academic years of interest, newest first.
# 2020-2021 is deliberately absent: no SATs were held that year due to COVID.
# NOTE(review): KS2 tests were also cancelled in summer 2020, so "2019-2020"
# may have no published results either - confirm before relying on it.
YEARS = [
    "2023-2024",
    "2022-2023",
    "2021-2022",
    "2019-2020",
    "2018-2019",
]
|
|
|
|
|
|
def _format_address(street, postcode):
    """Join the non-missing address parts with a comma and a space."""
    parts = []
    for value in (street, postcode):
        # Missing CSV cells arrive as NaN/None; formatting them directly
        # would put the literal text "nan" into the address.
        if pd.isna(value):
            continue
        text = str(value).strip()
        if text:
            parts.append(text)
    return ", ".join(parts).strip(', ')


def fetch_gias_data():
    """
    Fetch school establishment data from Get Information About Schools.

    Downloads the full GIAS CSV extract, keeps only primary-phase schools
    (including "Middle deemed primary") whose local authority code is in
    LA_CODES, trims the result to a handful of columns, renames them to
    snake_case and adds a combined ``address`` column (street + postcode).

    Returns:
        A pandas DataFrame with one row per matching school, or None if the
        download or parsing fails for any reason (the error is printed).
    """
    print("Fetching school establishment data from GIAS...")

    # GIAS provides downloadable extracts
    # Main extract URL (this may need to be updated periodically)
    gias_url = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata.csv"

    try:
        response = requests.get(gias_url, timeout=60)
        response.raise_for_status()

        # Parse CSV
        df = pd.read_csv(StringIO(response.text), encoding='utf-8-sig', low_memory=False)

        # Compare LA codes numerically: if pandas parses the column as float
        # (e.g. because of blank cells), a string comparison would see
        # "212.0" and silently match nothing.
        wanted_la_codes = {int(code) for code in LA_CODES.values()}
        la_numeric = pd.to_numeric(df['LA (code)'], errors='coerce')
        in_target_la = la_numeric.isin(wanted_la_codes)

        # Case-insensitive so that "Middle deemed primary" is kept as well
        # as "Primary"; missing phases never match.
        is_primary_phase = df['PhaseOfEducation (name)'].str.contains(
            'primary', case=False, na=False
        )

        df = df[in_target_la & is_primary_phase]

        # Select relevant columns (tolerating any that the extract lacks)
        columns_to_keep = [
            'URN', 'EstablishmentName', 'LA (name)', 'TypeOfEstablishment (name)',
            'Street', 'Locality', 'Town', 'Postcode',
            'SchoolCapacity', 'NumberOfPupils', 'OfstedRating (name)'
        ]
        available_cols = [c for c in columns_to_keep if c in df.columns]
        df = df[available_cols]

        # Rename columns ('Locality' and 'SchoolCapacity' keep their
        # original names, as before)
        df = df.rename(columns={
            'URN': 'urn',
            'EstablishmentName': 'school_name',
            'LA (name)': 'local_authority',
            'TypeOfEstablishment (name)': 'school_type',
            'Street': 'street',
            'Town': 'town',
            'Postcode': 'postcode',
            'NumberOfPupils': 'pupils',
            'OfstedRating (name)': 'ofsted_rating'
        })

        # Create address field. Built from plain lists rather than a
        # row-wise DataFrame.apply, which returns a DataFrame (not a Series)
        # when the filtered frame is empty.
        blank = [''] * len(df)
        streets = df['street'] if 'street' in df.columns else blank
        postcodes = df['postcode'] if 'postcode' in df.columns else blank
        df['address'] = [
            _format_address(street, postcode)
            for street, postcode in zip(streets, postcodes)
        ]

        print(f"Found {len(df)} primary schools in Wandsworth and Merton")
        return df

    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "no data",
        # so report the problem and carry on instead of crashing.
        print(f"Error fetching GIAS data: {e}")
        return None
|
|
|
|
|
|
def fetch_ks2_performance_data():
    """
    Explain how to obtain KS2 performance data from Compare School Performance.

    Note: The official download page requires form submission, so nothing is
    downloaded here. The function only prints the manual download steps.

    Returns:
        None, always; no performance data is fetched.
    """
    print("\nFetching KS2 performance data...")

    # Candidate direct-download endpoints, kept for whoever automates this
    # step (they may need updating when new data is released):
    #   https://content.explore-education-statistics.service.gov.uk/api/releases/
    #   https://content.explore-education-statistics.service.gov.uk/api/releases/b4cb82e3-6dca-4c98-a3b0-ba7d1d3ef555/files
    #     (2024 KS2 results, provisional)

    print("Note: For the most accurate data, please download manually from:")
    print("https://www.compare-school-performance.service.gov.uk/download-data")
    print("\nSteps:")
    print("1. Select 'Key Stage 2' for Data type")
    print("2. Select 'All data' for File type")
    print("3. Select desired academic year")
    print("4. Download and place CSV files in the 'data' folder")

    return None
|
|
|
|
|
|
def download_from_explore_education_statistics():
    """
    Probe the Explore Education Statistics API for KS2 publications.

    Lists the publications, keeps those whose title mentions "key stage 2"
    or "ks2", and for each one prints the title of its latest release and the
    names of that release's data sets. Nothing is downloaded or saved.

    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/

    Returns:
        None, always; any error is printed rather than raised.
    """
    print("\nAttempting to fetch from Explore Education Statistics API...")

    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"

    def _mentions_ks2(publication):
        # Title match is case-insensitive.
        title = publication.get('title', '').lower()
        return 'key stage 2' in title or 'ks2' in title

    try:
        listing = requests.get(f"{api_base}/publications", timeout=30)
        if listing.status_code != 200:
            print(f"API returned status {listing.status_code}")
            return None

        matches = [
            candidate
            for candidate in listing.json().get('results', [])
            if _mentions_ks2(candidate)
        ]
        if not matches:
            print("No KS2 publications found via API")
            return None

        print(f"Found KS2 publications: {[p['title'] for p in matches]}")

        # Look up the latest release of every matching publication.
        for publication in matches:
            publication_id = publication.get('id')
            if not publication_id:
                continue

            latest = requests.get(
                f"{api_base}/publications/{publication_id}/releases/latest",
                timeout=30,
            )
            if latest.status_code != 200:
                continue

            release = latest.json()
            print(f"Latest release: {release.get('title')}")

            # List the data sets attached to the release.
            for data_set in release.get('dataSets', []):
                print(f" - Dataset: {data_set.get('name')}")

    except Exception as e:
        print(f"Error accessing API: {e}")

    return None
|
|
|
|
|
|
def create_combined_dataset(schools_df, performance_data=None, *, year=2024):
    """
    Combine school information with performance data.

    The input frame is not modified; a new frame carrying an extra ``year``
    column is returned.

    Args:
        schools_df: DataFrame of school information, or None.
        performance_data: Optional performance data. When it is None a notice
            is printed explaining how to obtain it.
            NOTE(review): a non-None value is accepted but not merged into
            the result yet - confirm the intended join before relying on it.
        year: Value written to the ``year`` column (keyword-only,
            defaults to 2024).

    Returns:
        None if ``schools_df`` is None, otherwise a copy of ``schools_df``
        with the ``year`` column added.
    """
    if schools_df is None:
        return None

    # Add year column for compatibility. assign() returns a new frame, so the
    # caller's DataFrame is left untouched.
    combined = schools_df.assign(year=year)

    # No real metrics to attach: tell the user how to get them.
    if performance_data is None:
        print("\nNo performance data available - school list saved without metrics")
        print("Download KS2 data manually and re-run to add performance metrics")

    return combined
|
|
|
|
|
|
def main():
    """Run the full fetch: download the school list, print guidance, save the CSV."""
    banner = "=" * 60
    next_steps = """
To get complete performance data:

1. Go to: https://www.compare-school-performance.service.gov.uk/download-data

2. Download KS2 data for each year (2019-2024):
- Select: Key Stage 2
- Select: All data (or specific metrics)
- Select: Academic year
- Click: Download data

3. Place downloaded CSV files in the 'data' folder

4. Restart the application - it will automatically load the real data

The app will merge school info with performance metrics.
"""

    print(banner)
    print("Fetching Real School Data for Wandsworth & Merton")
    print(banner)

    # Make sure the output directory exists before anything is written.
    DATA_DIR.mkdir(exist_ok=True)

    # School establishment list (None on failure).
    schools_df = fetch_gias_data()

    # Performance data: these calls only print guidance / API findings.
    fetch_ks2_performance_data()
    download_from_explore_education_statistics()

    if schools_df is None:
        print("\nFailed to fetch school data")
    else:
        # Persist the school list and summarise it.
        output_file = DATA_DIR / "schools_wandsworth_merton.csv"
        schools_df.to_csv(output_file, index=False)
        print(f"\nSchool data saved to: {output_file}")
        print(f"Total schools: {len(schools_df)}")

        print("\nBreakdown by Local Authority:")
        print(schools_df['local_authority'].value_counts())

    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    print(next_steps)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
main()
|
|
|