182 lines
7.2 KiB
Python
182 lines
7.2 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Data Download Helper Script
|
|
|
|
This script provides instructions and utilities for downloading
|
|
UK school performance data from the official government source.
|
|
|
|
Data Source: https://www.compare-school-performance.service.gov.uk/download-data
|
|
|
|
Note: The actual CSV downloads require manual selection on the website
|
|
as they use dynamic form submissions. This script helps prepare and
|
|
organize the downloaded data.
|
|
"""
|
|
|
|
import os
|
|
import sys
|
|
from pathlib import Path
|
|
import pandas as pd
|
|
|
|
DATA_DIR = Path(__file__).parent.parent / "data"
|
|
|
|
|
|
def print_instructions():
    """Print step-by-step instructions for manually downloading the data.

    The government site uses dynamic form submissions, so the CSV files
    cannot be fetched programmatically; this banner walks the user through
    downloading them by hand and naming them so the rest of this script
    (check/preview/combine) can find them.
    """
    # NOTE: the banner is one literal; keep the box borders aligned if the
    # text is ever edited.
    print("""
╔══════════════════════════════════════════════════════════════════════════════╗
║               UK School Performance Data Download Instructions               ║
╠══════════════════════════════════════════════════════════════════════════════╣
║                                                                              ║
║ 1. Visit: https://www.compare-school-performance.service.gov.uk/download-data║
║                                                                              ║
║ 2. For each year (2019-2020 through 2023-2024), select:                      ║
║     • Year: Select the academic year                                         ║
║     • Data type: "Key Stage 4" (for secondary school GCSE data)              ║
║     • File type: "All data" or specific metrics you need                     ║
║                                                                              ║
║ 3. Key metrics available:                                                    ║
║     • Progress 8 - measures pupil progress from KS2 to KS4                   ║
║     • Attainment 8 - average attainment across 8 qualifications              ║
║     • English & Maths Grade 5+ percentage                                    ║
║     • EBacc entry and achievement percentages                                ║
║                                                                              ║
║ 4. Download the CSV files and place them in the 'data' folder                ║
║                                                                              ║
║ 5. Rename files with the year for clarity, e.g.:                             ║
║     • ks4_2020.csv                                                           ║
║     • ks4_2021.csv                                                           ║
║     • ks4_2022.csv                                                           ║
║     • ks4_2023.csv                                                           ║
║     • ks4_2024.csv                                                           ║
║                                                                              ║
╚══════════════════════════════════════════════════════════════════════════════╝
""")
|
|
|
|
|
|
def check_data_files():
    """Report which CSV files are present in the data directory.

    Prints guidance when the directory or the files are missing and
    returns the list of CSV paths found (empty list otherwise).
    """
    if not DATA_DIR.exists():
        print(f"Data directory not found: {DATA_DIR}")
        return []

    csv_files = list(DATA_DIR.glob("*.csv"))
    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for csv_path in csv_files:
        # Report each file with its size in megabytes.
        megabytes = csv_path.stat().st_size / (1024 * 1024)
        print(f"  • {csv_path.name} ({megabytes:.2f} MB)")

    return csv_files
|
|
|
|
|
|
def preview_data(file_path: Path, rows: int = 5):
    """Print a short preview of a CSV file: its columns and first rows.

    At most the first 20 column names are listed; read errors are reported
    rather than raised so the CLI keeps running.
    """
    try:
        frame = pd.read_csv(file_path, nrows=rows)
        print(f"\n--- Preview of {file_path.name} ---")
        total = len(frame.columns)
        print(f"Columns ({total}):")
        for name in frame.columns[:20]:
            print(f"  • {name}")
        if total > 20:
            print(f"  ... and {total - 20} more columns")
        print(f"\nFirst {rows} rows:")
        print(frame.to_string())
    except Exception as exc:
        # Best-effort CLI helper: report the problem and return.
        print(f"Error reading {file_path}: {exc}")
|
|
|
|
|
|
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with standardized, readable column names.

    Column names are stripped and upper-cased, then well-known codes from
    the official KS4 download are mapped to snake_case names. Columns not
    in the mapping keep their stripped, upper-cased form.

    Args:
        df: Raw DataFrame as read from an official download CSV.

    Returns:
        A renamed copy of *df*; the input DataFrame is NOT modified
        (the previous implementation assigned ``df.columns`` in place,
        mutating the caller's frame as a side effect).
    """
    # Common column mappings from the official data.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }
    friendly = {k.upper(): v for k, v in column_mappings.items()}

    # Build a single original-name -> final-name map so the rename happens
    # in one pass and the input is left untouched. str(col) also tolerates
    # non-string column labels, which the old .str accessor did not.
    rename_map = {}
    for col in df.columns:
        normalized = str(col).strip().upper()
        rename_map[col] = friendly.get(normalized, normalized)

    return df.rename(columns=rename_map)
|
|
|
|
|
|
def process_and_combine_data():
    """Load every CSV in the data directory, standardize, and combine.

    Each file's columns are normalized via standardize_columns(), and a
    'year' column is added when a 20xx year can be parsed from the
    filename (e.g. ks4_2022.csv -> 2022). The combined frame is written
    to data/combined_data.csv.

    Returns:
        The combined DataFrame, or None if no files could be loaded.
    """
    # Hoisted out of the per-file loop, where the original re-imported it
    # on every iteration; kept function-local since only this helper uses it.
    import re

    csv_files = check_data_files()
    if not csv_files:
        return None

    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)

            # Tag rows with the year parsed from the filename, when present.
            year_match = re.search(r'20\d{2}', csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())

            all_data.append(df)
            print(f"  Loaded {len(df)} rows")
        except Exception as e:
            # Best-effort: report the bad file and continue with the rest.
            print(f"  Error: {e}")

    if not all_data:
        return None

    combined = pd.concat(all_data, ignore_index=True)
    output_path = DATA_DIR / "combined_data.csv"
    combined.to_csv(output_path, index=False)
    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined)}")
    return combined
|
|
|
|
|
|
def main():
    """CLI entry point: dispatch check / preview <file> / combine.

    With no recognized command, prints the download instructions; with no
    arguments at all, also prints the command summary.
    """
    args = sys.argv[1:]
    if not args:
        # No command given: show instructions plus the command summary.
        print_instructions()
        print("\nAvailable commands:")
        print("  python download_data.py check - Check for existing data files")
        print("  python download_data.py preview <filename> - Preview a CSV file")
        print("  python download_data.py combine - Combine all CSV files")
        return

    command = args[0].lower()
    if command == "check":
        check_data_files()
    elif command == "preview" and len(args) > 1:
        file_path = DATA_DIR / args[1]
        if file_path.exists():
            preview_data(file_path)
        else:
            print(f"File not found: {file_path}")
    elif command == "combine":
        process_and_combine_data()
    else:
        # Unknown command: fall back to the instructions.
        print_instructions()
|
|
|
|
|
|
# Run the CLI only when executed as a script, so the module can be
# imported (e.g. for its helpers) without side effects.
if __name__ == "__main__":
    main()
|
|
|