initial commit
This commit is contained in:
253
scripts/fetch_real_data.py
Normal file
253
scripts/fetch_real_data.py
Normal file
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch real school performance data from UK Government sources.
|
||||
|
||||
This script downloads KS2 (Key Stage 2) primary school data from:
|
||||
- Compare School Performance service
|
||||
- Get Information about Schools (GIAS)
|
||||
|
||||
Data is filtered to only include schools in Wandsworth and Merton.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from io import StringIO
|
||||
|
||||
# Output directory: the "data" folder that sits beside this script's folder.
# resolve() makes the location independent of how the script path was
# spelled when it was launched (e.g. a bare relative filename).
DATA_DIR = Path(__file__).resolve().parent.parent / "data"

# Local Authority codes for Wandsworth and Merton
LA_CODES = {
    "Wandsworth": "212",
    "Merton": "315",
}

# Academic years to fetch (five entries, newest first)
# Note: 2020-2021 had no SATs due to COVID
# NOTE(review): KS2 tests were also cancelled in summer 2020, so "2019-2020"
# is unlikely to have any results either - confirm before relying on it.
YEARS = ["2023-2024", "2022-2023", "2021-2022", "2019-2020", "2018-2019"]
|
||||
|
||||
|
||||
def _decode_gias_payload(payload):
    """Return CSV text from *payload* (``str`` or ``bytes``) without a BOM."""
    if isinstance(payload, bytes):
        try:
            # utf-8-sig also strips a leading byte-order mark.
            return payload.decode("utf-8-sig")
        except UnicodeDecodeError:
            # NOTE(review): fallback assumes a Windows-1252 extract - confirm
            # against a current GIAS download.
            return payload.decode("cp1252", errors="replace")
    # Already text: drop a BOM that survived an earlier decode so the first
    # header is not read as "\ufeffURN".
    return payload[1:] if payload.startswith("\ufeff") else payload


def _format_address(parts):
    """Join the non-missing, non-blank address *parts* with ", "."""
    cleaned = (str(part).strip() for part in parts if pd.notna(part))
    return ", ".join(piece for piece in cleaned if piece)


def fetch_gias_data(raw_csv=None, la_codes=None):
    """
    Fetch school establishment data from Get Information About Schools.

    Keeps primary-phase schools (including "Middle deemed primary") in the
    requested local authorities and returns them with URN, name, address,
    type, etc. under snake_case column names.

    Args:
        raw_csv: Optional contents of a GIAS extract obtained some other way
            (for example a manual download), as ``str`` or ``bytes``. When
            given, no network request is made.
        la_codes: Optional iterable of local-authority codes to keep.
            Defaults to the values of the module-level ``LA_CODES``.

    Returns:
        A DataFrame with one row per matching school (possibly empty), or
        ``None`` if downloading or parsing failed; the error is printed.
    """
    print("Fetching school establishment data from GIAS...")

    # GIAS provides downloadable extracts
    # Main extract URL (this may need to be updated periodically)
    # NOTE(review): GIAS extract filenames have historically carried a date
    # suffix - confirm that this undated URL still resolves.
    gias_url = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata.csv"

    columns_to_keep = [
        'URN', 'EstablishmentName', 'LA (name)', 'TypeOfEstablishment (name)',
        'Street', 'Locality', 'Town', 'Postcode',
        'SchoolCapacity', 'NumberOfPupils', 'OfstedRating (name)',
    ]
    rename_map = {
        'URN': 'urn',
        'EstablishmentName': 'school_name',
        'LA (name)': 'local_authority',
        'TypeOfEstablishment (name)': 'school_type',
        'Street': 'street',
        'Town': 'town',
        'Postcode': 'postcode',
        'NumberOfPupils': 'pupils',
        'OfstedRating (name)': 'ofsted_rating',
    }

    try:
        if raw_csv is None:
            response = requests.get(gias_url, timeout=60)
            response.raise_for_status()
            # Use the raw bytes: read_csv ignores ``encoding`` for text
            # buffers, so decoding (and BOM removal) has to happen here.
            raw_csv = response.content

        df = pd.read_csv(StringIO(_decode_gias_payload(raw_csv)), low_memory=False)

        wanted = [
            str(code).strip()
            for code in (LA_CODES.values() if la_codes is None else la_codes)
        ]

        # A single blank cell makes pandas read the code column as float
        # ("212.0"), so normalise back to plain digit strings before matching.
        la_code = (
            df['LA (code)'].astype(str).str.strip()
            .str.replace(r"\.0$", "", regex=True)
        )
        # Case-insensitive so that "Middle deemed primary" is kept too.
        is_primary = (
            df['PhaseOfEducation (name)'].astype(str)
            .str.contains('primary', case=False)
        )

        # Select whichever of the relevant columns this extract provides.
        available_cols = [c for c in columns_to_keep if c in df.columns]
        df = df.loc[la_code.isin(wanted) & is_primary, available_cols]
        df = df.rename(columns=rename_map)

        # Create address field from street + postcode, skipping missing
        # values. A list comprehension (rather than DataFrame.apply) also
        # behaves correctly when no school matched.
        address_cols = [c for c in ('street', 'postcode') if c in df.columns]
        if address_cols:
            df['address'] = [
                _format_address(parts)
                for parts in df[address_cols].itertuples(index=False, name=None)
            ]
        else:
            df['address'] = ""

        print(f"Found {len(df)} primary schools in Wandsworth and Merton")
        return df

    except Exception as e:
        # Deliberately best-effort: the caller treats None as "no school data".
        print(f"Error fetching GIAS data: {e}")
        return None
|
||||
|
||||
|
||||
def fetch_ks2_performance_data():
    """
    Print instructions for obtaining KS2 performance data manually.

    Note: The official Compare School Performance download page requires a
    form submission, so nothing is downloaded here. The function only prints
    the manual download steps.

    Returns:
        None, always - no performance data is fetched.
    """
    print("\nFetching KS2 performance data...")

    print("Note: For the most accurate data, please download manually from:")
    print("https://www.compare-school-performance.service.gov.uk/download-data")
    print("\nSteps:")
    print("1. Select 'Key Stage 2' for Data type")
    print("2. Select 'All data' for File type")
    print("3. Select desired academic year")
    print("4. Download and place CSV files in the 'data' folder")

    return None
|
||||
|
||||
|
||||
def download_from_explore_education_statistics():
    """
    Probe the Explore Education Statistics API for KS2 publications.

    Requests the publication list, keeps entries whose title mentions
    "key stage 2" or "ks2", and for each one prints the title of its latest
    release together with the names of that release's data sets. Nothing is
    downloaded; any error is printed and the function always returns None.

    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/
    """
    print("\nAttempting to fetch from Explore Education Statistics API...")

    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"

    def mentions_ks2(publication):
        # Title match is case-insensitive.
        title = publication.get('title', '').lower()
        return 'key stage 2' in title or 'ks2' in title

    try:
        # Step 1: list the available publications.
        listing = requests.get(f"{api_base}/publications", timeout=30)
        if listing.status_code != 200:
            print(f"API returned status {listing.status_code}")
            return None

        # Step 2: narrow the list down to KS2-related publications.
        matches = list(filter(mentions_ks2, listing.json().get('results', [])))
        if not matches:
            print("No KS2 publications found via API")
            return None

        print(f"Found KS2 publications: {[p['title'] for p in matches]}")

        # Step 3: report the latest release of each match.
        for publication in matches:
            publication_id = publication.get('id')
            if not publication_id:
                continue

            latest = requests.get(
                f"{api_base}/publications/{publication_id}/releases/latest",
                timeout=30,
            )
            if latest.status_code != 200:
                continue

            release = latest.json()
            print(f"Latest release: {release.get('title')}")

            for data_set in release.get('dataSets', []):
                print(f" - Dataset: {data_set.get('name')}")

    except Exception as e:
        print(f"Error accessing API: {e}")

    return None
|
||||
|
||||
|
||||
def create_combined_dataset(schools_df, performance_data=None, year=2024):
    """
    Build the combined dataset from school information.

    A ``year`` column is added to a copy of ``schools_df``; the caller's
    frame is left untouched.

    NOTE: ``performance_data`` is currently only checked for presence. It is
    not merged into the result, so the return value contains school
    information only in every case.

    Args:
        schools_df: DataFrame of schools, or ``None``.
        performance_data: Optional performance data; when ``None`` a hint
            about downloading it manually is printed.
        year: Value written to the ``year`` column (defaults to 2024).

    Returns:
        A new DataFrame with the ``year`` column added, or ``None`` when
        ``schools_df`` is ``None``.
    """
    if schools_df is None:
        return None

    # Work on a copy so the caller's DataFrame is not modified as a side effect.
    combined = schools_df.copy()

    # Add year column for compatibility
    combined['year'] = year

    if performance_data is None:
        print("\nNo performance data available - school list saved without metrics")
        print("Download KS2 data manually and re-run to add performance metrics")

    return combined
|
||||
|
||||
|
||||
def main():
    """Run the fetch steps in order and write the school list to DATA_DIR."""
    rule = "=" * 60

    print(rule)
    print("Fetching Real School Data for Wandsworth & Merton")
    print(rule)

    # Make sure the output folder exists before anything is written.
    DATA_DIR.mkdir(exist_ok=True)

    # School establishment data (None when the fetch failed).
    schools_df = fetch_gias_data()

    # Performance-data attempts; their return values are not used here.
    fetch_ks2_performance_data()
    download_from_explore_education_statistics()

    if schools_df is None:
        print("\nFailed to fetch school data")
    else:
        output_file = DATA_DIR / "schools_wandsworth_merton.csv"
        schools_df.to_csv(output_file, index=False)
        print(f"\nSchool data saved to: {output_file}")
        print(f"Total schools: {len(schools_df)}")

        # Per-authority row counts.
        print("\nBreakdown by Local Authority:")
        print(schools_df['local_authority'].value_counts())

    print("\n" + rule)
    print("NEXT STEPS:")
    print(rule)
    print("""
To get complete performance data:

1. Go to: https://www.compare-school-performance.service.gov.uk/download-data

2. Download KS2 data for each year (2019-2024):
- Select: Key Stage 2
- Select: All data (or specific metrics)
- Select: Academic year
- Click: Download data

3. Place downloaded CSV files in the 'data' folder

4. Restart the application - it will automatically load the real data

The app will merge school info with performance metrics.
""")
|
||||
|
||||
|
||||
# Run the fetch pipeline only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user