initial commit

This commit is contained in:
Tudor Sitaru
2026-01-06 13:52:00 +00:00
commit c65eb1a00f
37 changed files with 402537 additions and 0 deletions

253
scripts/fetch_real_data.py Normal file
View File

@@ -0,0 +1,253 @@
#!/usr/bin/env python3
"""
Fetch real school performance data from UK Government sources.
This script downloads KS2 (Key Stage 2) primary school data from:
- Compare School Performance service
- Get Information about Schools (GIAS)
Data is filtered to only include schools in Wandsworth and Merton.
"""
import os
import sys
import requests
import pandas as pd
from pathlib import Path
from io import StringIO
# Directory the script writes into: "data" under the grandparent of this file's path
DATA_DIR = Path(__file__).parent.parent.joinpath("data")

# Local Authority codes for Wandsworth and Merton, keyed by borough name
LA_CODES = dict(Wandsworth="212", Merton="315")

# Academic years to fetch (last 5 years available), newest first.
# Note: 2020-2021 had no SATs due to COVID
# NOTE(review): KS2 SATs were also cancelled in summer 2020, so "2019-2020"
# may have no results either - confirm before relying on this entry.
YEARS = [
    "2023-2024",
    "2022-2023",
    "2021-2022",
    "2019-2020",
    "2018-2019",
]
def _address_part(value):
    """Return *value* as text, mapping missing values (None/NaN) to ''."""
    return '' if pd.isna(value) else str(value)


def fetch_gias_data():
    """
    Fetch school establishment data from Get Information About Schools.

    Downloads the GIAS "all data" CSV extract, keeps the rows whose LA code is
    one of ``LA_CODES`` and whose phase of education contains "Primary", trims
    the frame to a fixed set of columns (those that are present), renames most
    of them to snake_case and adds a combined ``address`` column.

    Returns:
        DataFrame of the matching schools, or None if the download or any
        processing step raised (the error is printed, not re-raised).
    """
    print("Fetching school establishment data from GIAS...")
    # GIAS provides downloadable extracts
    # Main extract URL (this may need to be updated periodically)
    gias_url = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata.csv"
    try:
        response = requests.get(gias_url, timeout=60)
        response.raise_for_status()

        # Parse CSV
        # NOTE(review): `encoding` is not applied to an already-decoded
        # StringIO; the decoding is whatever `response.text` chose. Confirm
        # the extract's real encoding before switching to response.content.
        df = pd.read_csv(StringIO(response.text), encoding='utf-8-sig', low_memory=False)

        # Filter to primary schools in Wandsworth and Merton
        # Phase of education: Primary, Middle deemed primary
        # LA codes: 212 (Wandsworth), 315 (Merton)
        in_target_la = df['LA (code)'].astype(str).isin(list(LA_CODES.values()))
        is_primary = df['PhaseOfEducation (name)'].str.contains('Primary', na=False)
        df = df[in_target_la & is_primary]

        # Select relevant columns (tolerating any the extract does not have)
        columns_to_keep = [
            'URN', 'EstablishmentName', 'LA (name)', 'TypeOfEstablishment (name)',
            'Street', 'Locality', 'Town', 'Postcode',
            'SchoolCapacity', 'NumberOfPupils', 'OfstedRating (name)',
        ]
        available_cols = [c for c in columns_to_keep if c in df.columns]
        df = df[available_cols]

        # Rename columns ('Locality' and 'SchoolCapacity' keep their names)
        df = df.rename(columns={
            'URN': 'urn',
            'EstablishmentName': 'school_name',
            'LA (name)': 'local_authority',
            'TypeOfEstablishment (name)': 'school_type',
            'Street': 'street',
            'Town': 'town',
            'Postcode': 'postcode',
            'NumberOfPupils': 'pupils',
            'OfstedRating (name)': 'ofsted_rating',
        })

        # Create address field as "street, postcode".
        # Built row by row from plain values rather than DataFrame.apply so
        # that (a) a missing street/postcode becomes '' instead of the text
        # "nan", and (b) an empty result still yields a valid empty column.
        blanks = [''] * len(df)
        streets = df['street'] if 'street' in df.columns else blanks
        postcodes = df['postcode'] if 'postcode' in df.columns else blanks
        df['address'] = [
            f"{_address_part(street)}, {_address_part(postcode)}".strip(', ')
            for street, postcode in zip(streets, postcodes)
        ]

        print(f"Found {len(df)} primary schools in Wandsworth and Merton")
        return df
    except Exception as e:
        # Deliberate best-effort boundary: callers treat None as "no data".
        print(f"Error fetching GIAS data: {e}")
        return None
def fetch_ks2_performance_data():
"""
Fetch KS2 performance data from Compare School Performance.
Note: The official download page requires form submission.
We'll try to access the underlying data files directly.
"""
print("\nFetching KS2 performance data...")
# The performance data is available at gov.uk statistics pages
# KS2 data URLs follow a pattern
base_urls = {
"2023-2024": "https://content.explore-education-statistics.service.gov.uk/api/releases/",
"2022-2023": "https://content.explore-education-statistics.service.gov.uk/api/releases/",
}
# Alternative: Direct download links from gov.uk (when available)
# These URLs may need to be updated when new data is released
data_urls = {
# 2024 KS2 results (provisional)
"2024": "https://content.explore-education-statistics.service.gov.uk/api/releases/b4cb82e3-6dca-4c98-a3b0-ba7d1d3ef555/files",
}
print("Note: For the most accurate data, please download manually from:")
print("https://www.compare-school-performance.service.gov.uk/download-data")
print("\nSteps:")
print("1. Select 'Key Stage 2' for Data type")
print("2. Select 'All data' for File type")
print("3. Select desired academic year")
print("4. Download and place CSV files in the 'data' folder")
return None
def download_from_explore_education_statistics():
    """
    Probe the Explore Education Statistics API for KS2 publications.

    Requests the publication list, keeps the entries whose title mentions
    "key stage 2" or "ks2", and for each one prints the title of its latest
    release and the names of that release's data sets. Nothing is saved;
    findings and any error are only printed.
    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/

    Returns:
        Always None.
    """
    print("\nAttempting to fetch from Explore Education Statistics API...")
    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"

    def _mentions_ks2(publication):
        # Case-insensitive title match on either spelling
        title = publication.get('title', '').lower()
        return 'key stage 2' in title or 'ks2' in title

    try:
        # Step 1: list available publications
        listing = requests.get(f"{api_base}/publications", timeout=30)
        if listing.status_code != 200:
            print(f"API returned status {listing.status_code}")
            return None

        # Step 2: narrow down to KS2-related publications
        matches = list(filter(_mentions_ks2, listing.json().get('results', [])))
        if not matches:
            print("No KS2 publications found via API")
            return None
        print(f"Found KS2 publications: {[p['title'] for p in matches]}")

        # Step 3: report the latest release of each match
        for publication in matches:
            publication_id = publication.get('id')
            if not publication_id:
                continue
            latest = requests.get(
                f"{api_base}/publications/{publication_id}/releases/latest",
                timeout=30,
            )
            if latest.status_code != 200:
                # A release that cannot be fetched is skipped silently
                continue
            release_info = latest.json()
            print(f"Latest release: {release_info.get('title')}")
            for data_set in release_info.get('dataSets', []):
                print(f" - Dataset: {data_set.get('name')}")
    except Exception as e:
        print(f"Error accessing API: {e}")
    return None
def create_combined_dataset(schools_df, performance_data=None):
    """
    Prepare the school list for combination with performance data.

    At present this only tags every school row with ``year = 2024``; the
    performance data itself is not merged in. When no performance data is
    given, a notice is printed explaining how to add it.

    Args:
        schools_df: DataFrame of schools, or None.
        performance_data: Performance data if available. Only checked for
            None; its contents are not read.

    Returns:
        None when ``schools_df`` is None; otherwise a new DataFrame holding
        the schools plus the ``year`` column. The input frame is not modified.
    """
    if schools_df is None:
        return None

    # Add year column for compatibility.
    # assign() returns a new frame, so the caller's DataFrame is not mutated.
    combined = schools_df.assign(year=2024)

    if performance_data is None:
        print("\nNo performance data available - school list saved without metrics")
        print("Download KS2 data manually and re-run to add performance metrics")
    return combined
def main():
    """
    Main entry point.

    Fetches the school list, runs the two performance-data steps (whose
    return values are not used), writes the school list to
    ``schools_wandsworth_merton.csv`` in DATA_DIR when it was fetched, and
    finally prints manual follow-up instructions.
    """
    banner = "=" * 60
    print(banner)
    print("Fetching Real School Data for Wandsworth & Merton")
    print(banner)

    # Make sure the output directory exists before anything is written
    DATA_DIR.mkdir(exist_ok=True)

    # School establishment data
    establishments = fetch_gias_data()

    # Performance data attempts
    fetch_ks2_performance_data()
    download_from_explore_education_statistics()

    # Persist the school list, or report that there is nothing to save
    if establishments is None:
        print("\nFailed to fetch school data")
    else:
        csv_path = DATA_DIR / "schools_wandsworth_merton.csv"
        establishments.to_csv(csv_path, index=False)
        print(f"\nSchool data saved to: {csv_path}")
        print(f"Total schools: {len(establishments)}")
        # Show breakdown
        print("\nBreakdown by Local Authority:")
        print(establishments['local_authority'].value_counts())

    next_steps = """
To get complete performance data:
1. Go to: https://www.compare-school-performance.service.gov.uk/download-data
2. Download KS2 data for each year (2019-2024):
- Select: Key Stage 2
- Select: All data (or specific metrics)
- Select: Academic year
- Click: Download data
3. Place downloaded CSV files in the 'data' folder
4. Restart the application - it will automatically load the real data
The app will merge school info with performance metrics.
"""
    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    print(next_steps)


# Run the pipeline only when executed as a script, not when imported
if __name__ == "__main__":
    main()