initial commit
This commit is contained in:
181
scripts/download_data.py
Normal file
181
scripts/download_data.py
Normal file
@@ -0,0 +1,181 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Data Download Helper Script
|
||||
|
||||
This script provides instructions and utilities for downloading
|
||||
UK school performance data from the official government source.
|
||||
|
||||
Data Source: https://www.compare-school-performance.service.gov.uk/download-data
|
||||
|
||||
Note: The actual CSV downloads require manual selection on the website
|
||||
as they use dynamic form submissions. This script helps prepare and
|
||||
organize the downloaded data.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
from pathlib import Path
|
||||
import pandas as pd
|
||||
|
||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||
|
||||
|
||||
def print_instructions():
    """Print step-by-step instructions for downloading the data manually.

    The instructions are rendered inside a box-drawing frame. The frame width
    is derived from the longest line, so the right-hand border stays aligned
    even if the text is edited later.

    Returns:
        None. The text is written to standard output.
    """
    title = "UK School Performance Data Download Instructions"
    url = "https://www.compare-school-performance.service.gov.uk/download-data"
    body = [
        "",
        f" 1. Visit: {url}",
        "",
        " 2. For each year (2019-2020 through 2023-2024), select:",
        "    • Year: Select the academic year",
        '    • Data type: "Key Stage 4" (for secondary school GCSE data)',
        '    • File type: "All data" or specific metrics you need',
        "",
        " 3. Key metrics available:",
        "    • Progress 8 - measures pupil progress from KS2 to KS4",
        "    • Attainment 8 - average attainment across 8 qualifications",
        "    • English & Maths Grade 5+ percentage",
        "    • EBacc entry and achievement percentages",
        "",
        " 4. Download the CSV files and place them in the 'data' folder",
        "",
        " 5. Rename files with the year for clarity, e.g.:",
        "    • ks4_2020.csv",
        "    • ks4_2021.csv",
        "    • ks4_2022.csv",
        "    • ks4_2023.csv",
        "    • ks4_2024.csv",
        "",
    ]

    # Never narrower than 78 columns, but grow if any line needs more room.
    width = max(78, len(title), *(len(line) for line in body))
    rule = "═" * width

    framed = [f"╔{rule}╗", f"║{title.center(width)}║", f"╠{rule}╣"]
    framed.extend(f"║{line.ljust(width)}║" for line in body)
    framed.append(f"╚{rule}╝")

    # Blank line before and after the box, as with the original literal.
    print("\n" + "\n".join(framed) + "\n")
|
||||
|
||||
|
||||
def check_data_files(data_dir=None):
    """Report and return the CSV files present in the data directory.

    Args:
        data_dir: Directory to scan. When omitted, the module-level
            ``DATA_DIR`` is used, so existing zero-argument calls behave as
            before.

    Returns:
        A list of ``Path`` objects for the ``*.csv`` files found directly in
        the directory, sorted by path. An empty list is returned when the
        directory does not exist or contains no CSV files. A short report is
        printed in every case.
    """
    directory = DATA_DIR if data_dir is None else Path(data_dir)

    if not directory.is_dir():
        print(f"Data directory not found: {directory}")
        return []

    # glob() yields entries in filesystem order; sort so the listing and any
    # later concatenation are reproducible from run to run.
    csv_files = sorted(p for p in directory.glob("*.csv") if p.is_file())

    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {directory}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_path in csv_files:
        size_mb = csv_path.stat().st_size / (1024 * 1024)
        print(f"  • {csv_path.name} ({size_mb:.2f} MB)")

    return csv_files
|
||||
|
||||
|
||||
def preview_data(file_path: Path, rows: int = 5):
    """Print the column names and the first few rows of a CSV file.

    Args:
        file_path: Location of the CSV file to preview.
        rows: Maximum number of data rows to read and display.

    Returns:
        None. The preview, or an error message if the file cannot be read,
        is written to standard output.
    """
    max_listed_columns = 20
    file_path = Path(file_path)

    try:
        df = pd.read_csv(file_path, nrows=rows)
    except Exception as e:
        # Deliberately broad: this is an interactive helper, so any read or
        # parse failure is reported instead of aborting the script.
        print(f"Error reading {file_path}: {e}")
        return

    print(f"\n--- Preview of {file_path.name} ---")
    print(f"Columns ({len(df.columns)}):")
    for col in df.columns[:max_listed_columns]:
        print(f"  • {col}")
    hidden = len(df.columns) - max_listed_columns
    if hidden > 0:
        print(f"  ... and {hidden} more columns")

    # Report how many rows were actually read; the file may hold fewer than
    # the requested number.
    print(f"\nFirst {len(df)} rows:")
    print(df.to_string())
|
||||
|
||||
|
||||
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with standardized column names.

    Column labels are stripped of surrounding whitespace and upper-cased.
    Labels that match a known official code are then replaced by a
    snake_case name. Unknown labels keep their stripped, upper-cased form,
    and non-string labels are left untouched.

    Args:
        df: Frame as loaded from a downloaded CSV file.

    Returns:
        A new DataFrame with renamed columns. The input frame is not
        modified.
    """
    # Common column mappings from the official data.
    # NOTE(review): RELDENOM looks like a religious-denomination code rather
    # than a school type -- confirm against the DfE metadata file.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }

    def _standard_name(label):
        # Integer or other non-string labels have nothing to normalize.
        if not isinstance(label, str):
            return label
        key = label.strip().upper()
        return column_mappings.get(key, key)

    # rename() builds a new frame, so the caller's columns stay as they were.
    return df.rename(columns=_standard_name)
|
||||
|
||||
|
||||
def process_and_combine_data():
    """Combine every downloaded CSV file into a single dataset.

    Each file reported by ``check_data_files`` is loaded, its columns are
    standardized, and a ``year`` column is added when a four-digit year
    starting with ``20`` appears in the file name. The frames are
    concatenated and written to ``combined_data.csv`` in ``DATA_DIR``.

    Returns:
        The combined DataFrame, or ``None`` when there is nothing to combine
        or no file could be loaded.
    """
    import re

    output_path = DATA_DIR / "combined_data.csv"

    # The output lives in the directory being scanned; skip it so a second
    # run does not feed the previous result back in and duplicate every row.
    csv_files = [f for f in check_data_files() if f.name != output_path.name]
    if not csv_files:
        return None

    year_pattern = re.compile(r'20\d{2}')
    all_data = []

    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)

            # Try to extract the year from the filename (first match wins).
            year_match = year_pattern.search(csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())

            all_data.append(df)
            print(f"  Loaded {len(df)} rows")
        except Exception as e:
            # Best effort: one unreadable file must not abort the batch.
            print(f"  Error: {e}")

    if not all_data:
        return None

    combined = pd.concat(all_data, ignore_index=True)
    combined.to_csv(output_path, index=False)
    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined)}")
    return combined
|
||||
|
||||
|
||||
def main():
    """Main entry point: dispatch on the first command-line argument.

    Supported commands are ``check``, ``preview <filename>`` and ``combine``.
    With no command, or an unrecognized one, the download instructions and
    the list of available commands are printed.
    """
    args = sys.argv[1:]
    command = args[0].lower() if args else None

    if command == "check":
        check_data_files()
        return

    if command == "combine":
        process_and_combine_data()
        return

    if command == "preview":
        if len(args) < 2:
            # Previously this fell through to the generic instructions with
            # no hint about what was missing.
            print("Usage: python download_data.py preview <filename>")
            return
        file_path = DATA_DIR / args[1]
        if file_path.exists():
            preview_data(file_path)
        else:
            print(f"File not found: {file_path}")
        return

    print_instructions()
    if command is not None:
        print(f"Unknown command: {args[0]}")
    print("\nAvailable commands:")
    print("  python download_data.py check              - Check for existing data files")
    print("  python download_data.py preview <filename> - Preview a CSV file")
    print("  python download_data.py combine            - Combine all CSV files")


if __name__ == "__main__":
    main()
|
||||
|
||||
253
scripts/fetch_real_data.py
Normal file
253
scripts/fetch_real_data.py
Normal file
@@ -0,0 +1,253 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Fetch real school performance data from UK Government sources.
|
||||
|
||||
This script downloads KS2 (Key Stage 2) primary school data from:
|
||||
- Compare School Performance service
|
||||
- Get Information about Schools (GIAS)
|
||||
|
||||
Data is filtered to only include schools in Wandsworth and Merton.
|
||||
"""
|
||||
|
||||
import os
|
||||
import sys
|
||||
import requests
|
||||
import pandas as pd
|
||||
from pathlib import Path
|
||||
from io import StringIO
|
||||
|
||||
# Output directory
|
||||
DATA_DIR = Path(__file__).parent.parent / "data"
|
||||
|
||||
# Local Authority codes for Wandsworth and Merton
|
||||
LA_CODES = {
|
||||
"Wandsworth": "212",
|
||||
"Merton": "315"
|
||||
}
|
||||
|
||||
# Academic years to fetch (last 5 years available)
|
||||
YEARS = ["2023-2024", "2022-2023", "2021-2022", "2019-2020", "2018-2019"]
|
||||
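# Note: 2020-2021 is omitted because KS2 SATs were cancelled due to COVID-19.
# NOTE(review): the summer 2020 tests were also cancelled -- confirm that
# school-level KS2 data actually exists for 2019-2020 before relying on it.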
|
||||
|
||||
|
||||
def fetch_gias_data():
    """Fetch primary-school establishment data for the configured LAs.

    Downloads the public "all data" extract from Get Information About
    Schools (GIAS), keeps rows whose LA code is listed in ``LA_CODES`` and
    whose phase of education contains "Primary", reduces the frame to a
    handful of descriptive columns, renames them to snake_case and adds a
    combined ``address`` column.

    Returns:
        A DataFrame with one row per matching school, or ``None`` if the
        download or processing fails (the error is printed).
    """
    # Function-scope import so this block does not depend on edits to the
    # file-level import list.
    from io import BytesIO

    print("Fetching school establishment data from GIAS...")

    # GIAS provides downloadable extracts.
    # NOTE(review): the main extract URL may need to be updated periodically;
    # confirm this path still resolves.
    gias_url = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata.csv"

    try:
        response = requests.get(gias_url, timeout=60)
        response.raise_for_status()

        # Parse the raw bytes so that `encoding` is honoured; it is ignored
        # when pandas is handed already-decoded text. Fall back to cp1252
        # if the payload is not valid UTF-8.
        payload = response.content
        try:
            df = pd.read_csv(BytesIO(payload), encoding='utf-8-sig', low_memory=False)
        except UnicodeDecodeError:
            df = pd.read_csv(BytesIO(payload), encoding='cp1252', low_memory=False)

        # Compare LA codes numerically: if the column was parsed as float,
        # a string comparison would see "212.0" and match nothing.
        wanted_codes = [float(code) for code in LA_CODES.values()]
        la_code = pd.to_numeric(df['LA (code)'], errors='coerce')
        in_area = la_code.isin(wanted_codes)
        is_primary = df['PhaseOfEducation (name)'].str.contains('Primary', na=False)

        # Select relevant columns, tolerating any that the extract lacks.
        columns_to_keep = [
            'URN', 'EstablishmentName', 'LA (name)', 'TypeOfEstablishment (name)',
            'Street', 'Locality', 'Town', 'Postcode',
            'SchoolCapacity', 'NumberOfPupils', 'OfstedRating (name)',
        ]
        available_cols = [c for c in columns_to_keep if c in df.columns]

        # copy() detaches the result from the filtered view so the column
        # assignment below acts on an independent frame.
        df = df.loc[in_area & is_primary, available_cols].copy()

        df = df.rename(columns={
            'URN': 'urn',
            'EstablishmentName': 'school_name',
            'LA (name)': 'local_authority',
            'TypeOfEstablishment (name)': 'school_type',
            'Street': 'street',
            'Town': 'town',
            'Postcode': 'postcode',
            'NumberOfPupils': 'pupils',
            'OfstedRating (name)': 'ofsted_rating',
        })

        def _clean(value):
            # Missing cells are NaN; never let them render as "nan".
            return "" if pd.isna(value) else str(value).strip()

        # Build the address without DataFrame.apply so an empty result set
        # (no matching schools) still yields a valid, empty column.
        blank = [""] * len(df)
        streets = df['street'] if 'street' in df.columns else blank
        postcodes = df['postcode'] if 'postcode' in df.columns else blank
        df['address'] = [
            ", ".join(part for part in (_clean(street), _clean(postcode)) if part)
            for street, postcode in zip(streets, postcodes)
        ]

        print(f"Found {len(df)} primary schools in Wandsworth and Merton")
        return df

    except Exception as e:
        # Best effort: the caller treats None as "no school data available".
        print(f"Error fetching GIAS data: {e}")
        return None
|
||||
|
||||
|
||||
def fetch_ks2_performance_data():
    """Print manual-download instructions for KS2 performance data.

    The official download page requires a form submission, so no download is
    attempted here: the function only tells the user where and how to obtain
    the files.

    Returns:
        None, always. Callers should treat this as "no performance data".
    """
    print("\nFetching KS2 performance data...")

    print("Note: For the most accurate data, please download manually from:")
    print("https://www.compare-school-performance.service.gov.uk/download-data")
    print("\nSteps:")
    steps = (
        "1. Select 'Key Stage 2' for Data type",
        "2. Select 'All data' for File type",
        "3. Select desired academic year",
        "4. Download and place CSV files in the 'data' folder",
    )
    for step in steps:
        print(step)

    return None
|
||||
|
||||
|
||||
def download_from_explore_education_statistics():
    """Probe the Explore Education Statistics API for KS2 publications.

    Lists the publications, keeps those whose title mentions "key stage 2"
    or "ks2", and for each prints the title of its latest release and the
    names of that release's data sets. Nothing is downloaded or saved.

    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/

    Returns:
        None, always. All findings are reported on standard output.
    """
    print("\nAttempting to fetch from Explore Education Statistics API...")

    # NOTE(review): confirm this base URL against the API docs linked above.
    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"
    search_terms = ('key stage 2', 'ks2')

    try:
        response = requests.get(f"{api_base}/publications", timeout=30)
        if response.status_code != 200:
            print(f"API returned status {response.status_code}")
            return None

        publications = response.json()

        # `or ''` guards against a title that is present but null.
        ks2_pubs = [
            pub for pub in publications.get('results', [])
            if any(term in (pub.get('title') or '').lower() for term in search_terms)
        ]
        if not ks2_pubs:
            print("No KS2 publications found via API")
            return None

        print(f"Found KS2 publications: {[p['title'] for p in ks2_pubs]}")

        # Look up the latest release of each matching publication.
        for pub in ks2_pubs:
            pub_id = pub.get('id')
            if not pub_id:
                continue

            release_url = f"{api_base}/publications/{pub_id}/releases/latest"
            release_response = requests.get(release_url, timeout=30)
            if release_response.status_code != 200:
                # Previously skipped silently, hiding why nothing was listed.
                print(
                    f"Could not fetch latest release for {pub['title']}: "
                    f"status {release_response.status_code}"
                )
                continue

            release = release_response.json()
            print(f"Latest release: {release.get('title')}")
            for ds in release.get('dataSets', []):
                print(f"  - Dataset: {ds.get('name')}")

    except Exception as e:
        # Best effort: this is an optional probe and must not stop the script.
        print(f"Error accessing API: {e}")

    return None
|
||||
|
||||
|
||||
def create_combined_dataset(schools_df, performance_data=None):
    """Prepare the school list for use alongside performance data.

    Args:
        schools_df: School information frame, or ``None``.
        performance_data: Performance metrics, if available. It is currently
            not merged into the result; when it is ``None`` a notice is
            printed explaining that the list carries no metrics.

    Returns:
        A copy of ``schools_df`` with a ``year`` column set to 2024, or
        ``None`` when ``schools_df`` is ``None``. The input frame is not
        modified.
    """
    if schools_df is None:
        return None

    # Work on a copy so the caller's frame does not silently gain a column.
    combined = schools_df.copy()

    # Add year column for compatibility.
    combined['year'] = 2024

    if performance_data is None:
        print("\nNo performance data available - school list saved without metrics")
        print("Download KS2 data manually and re-run to add performance metrics")

    return combined
|
||||
|
||||
|
||||
def main():
    """Main entry point: fetch the school list and print follow-up steps.

    Creates the data directory, fetches the establishment data, runs the two
    performance-data helpers (which only print guidance), saves the school
    list to ``schools_wandsworth_merton.csv`` and prints the manual steps
    needed to obtain performance data.
    """
    banner = "=" * 60
    print(banner)
    print("Fetching Real School Data for Wandsworth & Merton")
    print(banner)

    # Create the data directory, including any missing parents.
    DATA_DIR.mkdir(parents=True, exist_ok=True)

    # Fetch school establishment data.
    schools_df = fetch_gias_data()

    # Try to fetch performance data; both helpers currently return None and
    # only print guidance, so their results are not used.
    fetch_ks2_performance_data()
    download_from_explore_education_statistics()

    # Save school data.
    if schools_df is not None:
        output_file = DATA_DIR / "schools_wandsworth_merton.csv"
        schools_df.to_csv(output_file, index=False)
        print(f"\nSchool data saved to: {output_file}")
        print(f"Total schools: {len(schools_df)}")

        # The column is only present when the extract supplied the LA name,
        # so guard the breakdown instead of raising KeyError.
        if 'local_authority' in schools_df.columns:
            print("\nBreakdown by Local Authority:")
            print(schools_df['local_authority'].value_counts())
    else:
        print("\nFailed to fetch school data")

    print("\n" + banner)
    print("NEXT STEPS:")
    print(banner)
    print("""
To get complete performance data:

1. Go to: https://www.compare-school-performance.service.gov.uk/download-data

2. Download KS2 data for each year (2019-2024):
   - Select: Key Stage 2
   - Select: All data (or specific metrics)
   - Select: Academic year
   - Click: Download data

3. Place downloaded CSV files in the 'data' folder

4. Restart the application - it will automatically load the real data

The app will merge school info with performance metrics.
""")


if __name__ == "__main__":
    main()
|
||||
|
||||
Reference in New Issue
Block a user