182 lines
7.2 KiB
Python
182 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Data Download Helper Script
|
|
|
|
This script provides instructions and utilities for downloading
|
|
UK school performance data from the official government source.
|
|
|
|
Data Source: https://www.compare-school-performance.service.gov.uk/download-data
|
|
|
|
Note: The actual CSV downloads require manual selection on the website
|
|
as they use dynamic form submissions. This script helps prepare and
|
|
organize the downloaded data.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
DATA_DIR = Path(__file__).parent.parent / "data"
|
|
|
|
|
|
def print_instructions():
    """Print step-by-step instructions for manually downloading the data.

    The government site uses dynamic form submissions, so the CSV files
    cannot be fetched programmatically; this banner walks the user through
    downloading them by hand and naming them so the rest of this script
    (check/preview/combine) can find them.
    """
    # NOTE: the banner is one literal; keep the box borders aligned if the
    # text is ever edited.
    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║               UK School Performance Data Download Instructions               ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                                                                              ║
║ 1. Visit: https://www.compare-school-performance.service.gov.uk/download-data║
║                                                                              ║
║ 2. For each year (2019-2020 through 2023-2024), select:                      ║
║     • Year: Select the academic year                                         ║
║     • Data type: "Key Stage 4" (for secondary school GCSE data)              ║
║     • File type: "All data" or specific metrics you need                     ║
║                                                                              ║
║ 3. Key metrics available:                                                    ║
║     • Progress 8 - measures pupil progress from KS2 to KS4                   ║
║     • Attainment 8 - average attainment across 8 qualifications              ║
║     • English & Maths Grade 5+ percentage                                    ║
║     • EBacc entry and achievement percentages                                ║
║                                                                              ║
║ 4. Download the CSV files and place them in the 'data' folder                ║
║                                                                              ║
║ 5. Rename files with the year for clarity, e.g.:                             ║
║     • ks4_2020.csv                                                           ║
║     • ks4_2021.csv                                                           ║
║     • ks4_2022.csv                                                           ║
║     • ks4_2023.csv                                                           ║
║     • ks4_2024.csv                                                           ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")
|
|
|
|
|
|
def check_data_files():
    """Report which CSV files are present in the data directory.

    Prints guidance when the directory or the files are missing and
    returns the list of CSV paths found (empty list otherwise).
    """
    if not DATA_DIR.exists():
        print(f"Data directory not found: {DATA_DIR}")
        return []

    csv_files = list(DATA_DIR.glob("*.csv"))
    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_path in csv_files:
        # Report each file with its size in megabytes.
        megabytes = csv_path.stat().st_size / (1024 * 1024)
        print(f"  • {csv_path.name} ({megabytes:.2f} MB)")

    return csv_files
|
|
|
|
|
|
def preview_data(file_path: Path, rows: int = 5):
    """Print a short preview of a CSV file: its columns and first rows.

    At most the first 20 column names are listed; read errors are reported
    rather than raised so the CLI keeps running.
    """
    try:
        frame = pd.read_csv(file_path, nrows=rows)
        print(f"\n--- Preview of {file_path.name} ---")
        total = len(frame.columns)
        print(f"Columns ({total}):")
        for name in frame.columns[:20]:
            print(f"  • {name}")
        if total > 20:
            print(f"  ... and {total - 20} more columns")
        print(f"\nFirst {rows} rows:")
        print(frame.to_string())
    except Exception as exc:
        # Best-effort CLI helper: report the problem and return.
        print(f"Error reading {file_path}: {exc}")
|
|
|
|
|
|
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with standardized, readable column names.

    Column names are stripped and upper-cased, then well-known codes from
    the official KS4 download are mapped to snake_case names. Columns not
    in the mapping keep their stripped, upper-cased form.

    Args:
        df: Raw DataFrame as read from an official download CSV.

    Returns:
        A renamed copy of *df*; the input DataFrame is NOT modified
        (the previous implementation assigned ``df.columns`` in place,
        mutating the caller's frame as a side effect).
    """
    # Common column mappings from the official data.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }
    friendly = {k.upper(): v for k, v in column_mappings.items()}

    # Build a single original-name -> final-name map so the rename happens
    # in one pass and the input is left untouched. str(col) also tolerates
    # non-string column labels, which the old .str accessor did not.
    rename_map = {}
    for col in df.columns:
        normalized = str(col).strip().upper()
        rename_map[col] = friendly.get(normalized, normalized)

    return df.rename(columns=rename_map)
|
|
|
|
|
|
def process_and_combine_data():
    """Load every CSV in the data directory, standardize, and combine.

    Each file's columns are normalized via standardize_columns(), and a
    'year' column is added when a 20xx year can be parsed from the
    filename (e.g. ks4_2022.csv -> 2022). The combined frame is written
    to data/combined_data.csv.

    Returns:
        The combined DataFrame, or None if no files could be loaded.
    """
    # Hoisted out of the per-file loop, where the original re-imported it
    # on every iteration; kept function-local since only this helper uses it.
    import re

    csv_files = check_data_files()
    if not csv_files:
        return None

    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)

            # Tag rows with the year parsed from the filename, when present.
            year_match = re.search(r'20\d{2}', csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())

            all_data.append(df)
            print(f"  Loaded {len(df)} rows")
        except Exception as e:
            # Best-effort: report the bad file and continue with the rest.
            print(f"  Error: {e}")

    if not all_data:
        return None

    combined = pd.concat(all_data, ignore_index=True)
    output_path = DATA_DIR / "combined_data.csv"
    combined.to_csv(output_path, index=False)
    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined)}")
    return combined
|
|
|
|
|
|
def main():
    """CLI entry point: dispatch check / preview <file> / combine.

    With no recognized command, prints the download instructions; with no
    arguments at all, also prints the command summary.
    """
    args = sys.argv[1:]
    if not args:
        # No command given: show instructions plus the command summary.
        print_instructions()
        print("\nAvailable commands:")
        print("  python download_data.py check - Check for existing data files")
        print("  python download_data.py preview <filename> - Preview a CSV file")
        print("  python download_data.py combine - Combine all CSV files")
        return

    command = args[0].lower()
    if command == "check":
        check_data_files()
    elif command == "preview" and len(args) > 1:
        file_path = DATA_DIR / args[1]
        if file_path.exists():
            preview_data(file_path)
        else:
            print(f"File not found: {file_path}")
    elif command == "combine":
        process_and_combine_data()
    else:
        # Unknown command: fall back to the instructions.
        print_instructions()
|
|
|
|
|
|
# Run the CLI only when executed as a script, so the module can be
# imported (e.g. for its helpers) without side effects.
if __name__ == "__main__":
    main()
|
|
|