#!/usr/bin/env python3
"""
Data Download Helper Script

This script provides instructions and utilities for downloading UK school
performance data from the official government source.

Data Source: https://www.compare-school-performance.service.gov.uk/download-data

Note: The actual CSV downloads require manual selection on the website as they
use dynamic form submissions. This script helps prepare and organize the
downloaded data.
"""

import os
import re
import sys
from pathlib import Path

import pandas as pd

# Data files live in <repo root>/data, one level above this script's folder.
DATA_DIR = Path(__file__).parent.parent / "data"


def print_instructions():
    """Print step-by-step instructions for manually downloading the data."""
    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║               UK School Performance Data Download Instructions               ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                                                                              ║
║  1. Visit: https://www.compare-school-performance.service.gov.uk/download-data║
║                                                                              ║
║  2. For each year (2019-2020 through 2023-2024), select:                     ║
║     • Year: Select the academic year                                         ║
║     • Data type: "Key Stage 4" (for secondary school GCSE data)              ║
║     • File type: "All data" or specific metrics you need                     ║
║                                                                              ║
║  3. Key metrics available:                                                   ║
║     • Progress 8 - measures pupil progress from KS2 to KS4                   ║
║     • Attainment 8 - average attainment across 8 qualifications              ║
║     • English & Maths Grade 5+ percentage                                    ║
║     • EBacc entry and achievement percentages                                ║
║                                                                              ║
║  4. Download the CSV files and place them in the 'data' folder               ║
║                                                                              ║
║  5. Rename files with the year for clarity, e.g.:                            ║
║     • ks4_2020.csv                                                           ║
║     • ks4_2021.csv                                                           ║
║     • ks4_2022.csv                                                           ║
║     • ks4_2023.csv                                                           ║
║     • ks4_2024.csv                                                           ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")


def check_data_files():
    """Check what data files are present in the data directory.

    Prints a summary of the CSVs found (name and size in MB) and returns
    the list of matching Path objects; returns an empty list when the data
    directory is missing or contains no CSVs.
    """
    if not DATA_DIR.exists():
        print(f"Data directory not found: {DATA_DIR}")
        return []

    csv_files = list(DATA_DIR.glob("*.csv"))
    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for f in csv_files:
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"  • {f.name} ({size_mb:.2f} MB)")
    return csv_files


def preview_data(file_path: Path, rows: int = 5):
    """Preview a CSV file.

    Shows up to 20 column names (with a count of any remainder) and the
    first `rows` rows. Read errors are reported rather than raised so the
    CLI keeps going.
    """
    try:
        df = pd.read_csv(file_path, nrows=rows)
        print(f"\n--- Preview of {file_path.name} ---")
        print(f"Columns ({len(df.columns)}):")
        for col in df.columns[:20]:
            print(f"  • {col}")
        if len(df.columns) > 20:
            print(f"  ... and {len(df.columns) - 20} more columns")
        print(f"\nFirst {rows} rows:")
        print(df.to_string())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")


def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Standardize column names for consistency.

    Headers are stripped and upper-cased, then known official field codes
    are mapped to friendly snake_case names. Returns a renamed copy; the
    caller's DataFrame is NOT modified (the previous version normalized
    the argument's columns in place as a side effect).
    """
    # Common column mappings from the official data.
    # NOTE(review): RELDENOM is the DfE religious-denomination field;
    # mapping it to 'school_type' looks suspect — confirm against the
    # downloaded data's field guide before relying on this column.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }

    # Normalize headers via rename (returns a copy) instead of assigning
    # df.columns on the caller's frame.
    df = df.rename(columns=lambda c: c.strip().upper())

    # Apply the friendly-name mappings.
    return df.rename(columns={k.upper(): v for k, v in column_mappings.items()})


def process_and_combine_data():
    """Process and combine all CSV files into a single dataset.

    Loads every CSV in DATA_DIR, standardizes its columns, tags each row
    with the year parsed from the filename (first '20xx' match), and writes
    the concatenation to data/combined_data.csv. Returns the combined
    DataFrame, or None if no files were found or none loaded.
    """
    csv_files = check_data_files()
    if not csv_files:
        return None

    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            # low_memory=False: official files mix types within columns.
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)

            # Tag rows with the year extracted from the filename.
            year_match = re.search(r'20\d{2}', csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())

            all_data.append(df)
            print(f"  Loaded {len(df)} rows")
        except Exception as e:
            print(f"  Error: {e}")

    if all_data:
        combined = pd.concat(all_data, ignore_index=True)
        output_path = DATA_DIR / "combined_data.csv"
        combined.to_csv(output_path, index=False)
        print(f"\nCombined data saved to: {output_path}")
        print(f"Total rows: {len(combined)}")
        return combined
    return None


def main():
    """Main entry point: dispatch on the first CLI argument."""
    if len(sys.argv) > 1:
        command = sys.argv[1].lower()
        if command == "check":
            check_data_files()
        elif command == "preview":
            if len(sys.argv) > 2:
                file_path = DATA_DIR / sys.argv[2]
                if file_path.exists():
                    preview_data(file_path)
                else:
                    print(f"File not found: {file_path}")
            else:
                # Previously this fell through to the instructions screen,
                # which hid the real problem: a missing filename argument.
                print("Usage: python download_data.py preview <filename>")
        elif command == "combine":
            process_and_combine_data()
        else:
            print_instructions()
    else:
        print_instructions()
        print("\nAvailable commands:")
        print("  python download_data.py check              - Check for existing data files")
        print("  python download_data.py preview <filename> - Preview a CSV file")
        print("  python download_data.py combine            - Combine all CSV files")


if __name__ == "__main__":
    main()