Files
school_compare/scripts/download_data.py
Tudor Sitaru c65eb1a00f initial commit
2026-01-06 13:52:00 +00:00

182 lines
7.2 KiB
Python

#!/usr/bin/env python3
"""
Data Download Helper Script
This script provides instructions and utilities for downloading
UK school performance data from the official government source.
Data Source: https://www.compare-school-performance.service.gov.uk/download-data
Note: The actual CSV downloads require manual selection on the website
as they use dynamic form submissions. This script helps prepare and
organize the downloaded data.
"""
import os
import sys
from pathlib import Path
import pandas as pd
# Directory holding the downloaded CSV files: the "data" folder that sits next
# to this script's parent directory. The path is resolved to an absolute one
# first, because a relative __file__ (e.g. "download_data.py" when launched
# from inside the scripts folder on older interpreters) would otherwise make
# parent.parent collapse to "." and point at the wrong directory.
DATA_DIR = Path(__file__).resolve().parent.parent / "data"
def print_instructions():
    """Write the manual download walkthrough to stdout.

    The text is a fixed, box-drawn banner that tells the user which site to
    visit, which options to pick, and how to name the downloaded CSV files.
    Nothing is returned.
    """
    # Kept flush-left on purpose: every character between the triple quotes
    # (including the leading and trailing newline) is part of the output.
    banner = """
╔══════════════════════════════════════════════════════════════════════════════╗
║ UK School Performance Data Download Instructions ║
╠══════════════════════════════════════════════════════════════════════════════╣
║ ║
║ 1. Visit: https://www.compare-school-performance.service.gov.uk/download-data║
║ ║
║ 2. For each year (2019-2020 through 2023-2024), select: ║
║ • Year: Select the academic year ║
║ • Data type: "Key Stage 4" (for secondary school GCSE data) ║
║ • File type: "All data" or specific metrics you need ║
║ ║
║ 3. Key metrics available: ║
║ • Progress 8 - measures pupil progress from KS2 to KS4 ║
║ • Attainment 8 - average attainment across 8 qualifications ║
║ • English & Maths Grade 5+ percentage ║
║ • EBacc entry and achievement percentages ║
║ ║
║ 4. Download the CSV files and place them in the 'data' folder ║
║ ║
║ 5. Rename files with the year for clarity, e.g.: ║
║ • ks4_2020.csv ║
║ • ks4_2021.csv ║
║ • ks4_2022.csv ║
║ • ks4_2023.csv ║
║ • ks4_2024.csv ║
║ ║
╚══════════════════════════════════════════════════════════════════════════════╝
"""
    print(banner)
def check_data_files():
    """Report which CSV files are present in the data directory.

    Prints one line per file (name and size in MB) to stdout, or a hint
    about where to put the files when none are found.

    Returns:
        A list of ``Path`` objects, sorted by path, for every regular
        ``*.csv`` file directly inside ``DATA_DIR``. The list is empty when
        the directory is missing or holds no CSV files.
    """
    if not DATA_DIR.exists():
        print(f"Data directory not found: {DATA_DIR}")
        return []

    # glob() yields entries in arbitrary, filesystem-dependent order; sorting
    # keeps the report (and anything built from the returned list) stable.
    # is_file() skips directories that merely happen to be named "*.csv".
    csv_files = sorted(p for p in DATA_DIR.glob("*.csv") if p.is_file())
    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for f in csv_files:
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"{f.name} ({size_mb:.2f} MB)")
    return csv_files
def preview_data(file_path: Path, rows: int = 5, max_columns: int = 20):
    """Print a short preview of a CSV file to stdout.

    Args:
        file_path: Path of the CSV file to read.
        rows: Maximum number of data rows to load and display.
        max_columns: Maximum number of column names to list individually;
            any remaining columns are summarised as a count.

    Returns:
        None. Any error raised while reading or printing is reported on
        stdout rather than propagated, since this is a best-effort
        inspection aid for the command line.
    """
    try:
        df = pd.read_csv(file_path, nrows=rows)
        print(f"\n--- Preview of {file_path.name} ---")
        print(f"Columns ({len(df.columns)}):")

        shown = max(max_columns, 0)  # a negative limit would slice from the end
        for col in df.columns[:shown]:
            print(f"{col}")
        hidden = len(df.columns) - shown
        if hidden > 0:
            print(f" ... and {hidden} more columns")

        # Report the number of rows actually loaded: the file may contain
        # fewer rows than were requested.
        print(f"\nFirst {len(df)} rows:")
        print(df.to_string())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of *df* with normalised, project-friendly column names.

    Every column label is stripped of surrounding whitespace and upper-cased.
    Labels that then match a known source code (e.g. ``URN``, ``P8MEA``) are
    replaced by their snake_case project name; all other labels stay in
    their stripped, upper-cased form.

    Args:
        df: Frame whose column labels should be normalised. It is not
            modified.

    Returns:
        A new DataFrame with the renamed columns.
    """
    # Source column code -> project column name.
    # NOTE(review): RELDENOM looks like a religious-denomination field rather
    # than a school type - confirm against the published data dictionary
    # before relying on the "school_type" column.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }

    def _canonical(label):
        # str() keeps non-string labels (e.g. integers) from raising.
        key = str(label).strip().upper()
        return column_mappings.get(key, key)

    # A single rename() with a callable normalises and maps in one pass and
    # leaves the caller's frame untouched (assigning to df.columns would
    # mutate it in place).
    return df.rename(columns=_canonical)
def process_and_combine_data():
    """Merge every source CSV in the data directory into one dataset.

    Each file is loaded, its columns are standardised, and a ``year`` column
    is added from the first ``20xx`` number found in the file name. The
    concatenated result is written to ``combined_data.csv`` inside
    ``DATA_DIR``.

    Returns:
        The combined DataFrame, or ``None`` when there is nothing to combine
        (no source files, or none of them could be loaded).
    """
    # Function-scope import: the module's top-level imports do not include re.
    import re

    output_name = "combined_data.csv"

    # The output lives in the same directory that is scanned for inputs, so a
    # previous run's result must be skipped or its rows would be duplicated.
    csv_files = [f for f in check_data_files() if f.name.lower() != output_name]
    if not csv_files:
        return None

    year_pattern = re.compile(r'20\d{2}')
    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        # Best effort per file: one unreadable file must not abort the rest.
        try:
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)
            year_match = year_pattern.search(csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())
            else:
                # Without a year in the name these rows end up with an empty
                # 'year' after concatenation; make that visible.
                print(" Warning: no year found in file name; 'year' not set")
            all_data.append(df)
            print(f" Loaded {len(df)} rows")
        except Exception as e:
            print(f" Error: {e}")

    if not all_data:
        return None

    combined = pd.concat(all_data, ignore_index=True)
    output_path = DATA_DIR / output_name
    combined.to_csv(output_path, index=False)
    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined)}")
    return combined
def main():
    """Command-line entry point.

    Reads ``sys.argv`` and dispatches:

    * no arguments: print the download instructions and the command list;
    * ``check``: list the CSV files in the data directory;
    * ``preview <filename>``: preview a CSV file from the data directory;
    * ``combine``: merge all CSV files into one dataset;
    * anything else: print the download instructions.
    """
    args = sys.argv[1:]

    if not args:
        print_instructions()
        print("\nAvailable commands:")
        print(" python download_data.py check - Check for existing data files")
        print(" python download_data.py preview <filename> - Preview a CSV file")
        print(" python download_data.py combine - Combine all CSV files")
        return

    command = args[0].lower()
    if command == "check":
        check_data_files()
    elif command == "preview":
        if len(args) < 2:
            # Previously this fell through to the download instructions,
            # which gave no hint that a file name was missing.
            print("Usage: python download_data.py preview <filename>")
            return
        file_path = DATA_DIR / args[1]
        if file_path.exists():
            preview_data(file_path)
        else:
            print(f"File not found: {file_path}")
    elif command == "combine":
        process_and_combine_data()
    else:
        print_instructions()
# Run the CLI only when executed as a script, not when imported as a module.
if __name__ == "__main__":
    main()