Files
school_compare/scripts/download_data.py

182 lines
7.2 KiB
Python
Raw Normal View History

2026-01-06 13:52:00 +00:00
#!/usr/bin/env python3
"""
Data Download Helper Script
This script provides instructions and utilities for downloading
UK school performance data from the official government source.
Data Source: https://www.compare-school-performance.service.gov.uk/download-data
Note: The actual CSV downloads require manual selection on the website
as they use dynamic form submissions. This script helps prepare and
organize the downloaded data.
"""
import os
import sys
from pathlib import Path
import pandas as pd
# Directory scanned for CSV files: the "data" folder located in the parent
# of the directory that contains this script.
DATA_DIR = Path(__file__).parent.parent / "data"
def print_instructions():
    """Write the manual download walkthrough to standard output.

    The text covers where to get the data, which options to select, the
    metrics available and the suggested file naming. Nothing is returned.
    """
    instructions = """
UK School Performance Data Download Instructions
1. Visit: https://www.compare-school-performance.service.gov.uk/download-data
2. For each year (2019-2020 through 2023-2024), select:
Year: Select the academic year
Data type: "Key Stage 4" (for secondary school GCSE data)
File type: "All data" or specific metrics you need
3. Key metrics available:
Progress 8 - measures pupil progress from KS2 to KS4
Attainment 8 - average attainment across 8 qualifications
English & Maths Grade 5+ percentage
EBacc entry and achievement percentages
4. Download the CSV files and place them in the 'data' folder
5. Rename files with the year for clarity, e.g.:
ks4_2020.csv
ks4_2021.csv
ks4_2022.csv
ks4_2023.csv
ks4_2024.csv
"""
    print(instructions)
def check_data_files():
    """List the CSV files present in the data directory.

    Prints a short report (file name and size in MB) for every ``*.csv``
    file directly inside ``DATA_DIR``.

    Returns:
        A list of ``Path`` objects sorted by filename, or an empty list when
        the directory is missing or contains no CSV files.
    """
    if not DATA_DIR.exists():
        print(f"Data directory not found: {DATA_DIR}")
        return []

    # glob() yields entries in filesystem order, which differs between
    # machines; sort so the report and any downstream processing are stable.
    csv_files = sorted(DATA_DIR.glob("*.csv"), key=lambda p: p.name)
    if not csv_files:
        print("No CSV files found in the data directory.")
        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
        return []

    print(f"\nFound {len(csv_files)} CSV file(s):")
    for f in csv_files:
        size_mb = f.stat().st_size / (1024 * 1024)
        print(f"{f.name} ({size_mb:.2f} MB)")
    return csv_files
def preview_data(file_path: Path, rows: int = 5):
    """Print a short summary of a CSV file.

    Reads at most ``rows`` data rows, then prints up to the first 20 column
    names, a count of any remaining columns, and the loaded rows as a table.
    Any error raised while reading or printing is reported on stdout instead
    of being propagated.

    Args:
        file_path: Location of the CSV file to inspect.
        rows: Maximum number of data rows to load and display.
    """
    max_listed = 20
    try:
        frame = pd.read_csv(file_path, nrows=rows)
        columns = frame.columns
        total = len(columns)

        print(f"\n--- Preview of {file_path.name} ---")
        print(f"Columns ({total}):")
        for col in columns[:max_listed]:
            print(f"{col}")

        hidden = total - max_listed
        if hidden > 0:
            print(f" ... and {hidden} more columns")

        print(f"\nFirst {rows} rows:")
        print(frame.to_string())
    except Exception as e:
        print(f"Error reading {file_path}: {e}")
def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
    """Return a copy of ``df`` with standardized column names.

    Every column label is stripped of surrounding whitespace and upper-cased.
    Labels that then match a known source column are replaced by the
    snake_case name from the mapping below; all other labels keep their
    stripped, upper-cased form.

    The input DataFrame is left unmodified.

    Args:
        df: DataFrame whose column labels should be normalized.

    Returns:
        A new DataFrame with the renamed columns.
    """
    # Common column mappings from the official data
    # NOTE(review): RELDENOM looks like a religious-denomination field rather
    # than a school type - confirm against the source metadata before
    # relying on the 'school_type' column.
    column_mappings = {
        'URN': 'urn',
        'SCHNAME': 'school_name',
        'TOWN': 'town',
        'REGION': 'region',
        'RELDENOM': 'school_type',
        'P8MEA': 'progress_8',
        'ATT8SCR': 'attainment_8',
        'PTAC5EM': 'grade_5_eng_maths_pct',
        'PTEBACCEG': 'ebacc_entry_pct',
        'TPUP': 'pupils',
    }

    def _standard_name(label):
        # str() lets non-string labels (e.g. integer headers) pass through
        # instead of failing on .strip().
        key = str(label).strip().upper()
        return column_mappings.get(key, key)

    # rename() builds a new DataFrame, so the caller's frame keeps its
    # original labels (assigning to df.columns would change it in place).
    return df.rename(columns=_standard_name)
def process_and_combine_data():
    """Process and combine all CSV files into a single dataset.

    Each CSV reported by ``check_data_files()`` is loaded, passed through
    ``standardize_columns()`` and, when its filename contains a four-digit
    year starting with ``20``, tagged with a ``year`` column. The frames are
    concatenated and written to ``combined_data.csv`` inside ``DATA_DIR``.

    Files that fail to load are reported and skipped so that one bad file
    does not abort the whole run.

    Returns:
        The combined DataFrame, or ``None`` when there is nothing to combine
        or every input file failed to load.
    """
    import re

    output_path = DATA_DIR / "combined_data.csv"

    # The output lives in the same directory that is scanned for inputs, so a
    # previous run's combined_data.csv must be excluded; otherwise every
    # re-run would append the earlier result to itself and duplicate rows.
    csv_files = [f for f in check_data_files() if f.name != output_path.name]
    if not csv_files:
        return None

    # Compiled once rather than per file.
    year_pattern = re.compile(r'20\d{2}')

    all_data = []
    for csv_file in csv_files:
        print(f"\nProcessing: {csv_file.name}")
        try:
            df = pd.read_csv(csv_file, low_memory=False)
            df = standardize_columns(df)
            # Try to extract year from filename
            year_match = year_pattern.search(csv_file.stem)
            if year_match:
                df['year'] = int(year_match.group())
            all_data.append(df)
            print(f" Loaded {len(df)} rows")
        except Exception as e:
            # Deliberately best-effort: report the failure and continue with
            # the remaining files.
            print(f" Error: {e}")

    if not all_data:
        return None

    combined = pd.concat(all_data, ignore_index=True)
    combined.to_csv(output_path, index=False)
    print(f"\nCombined data saved to: {output_path}")
    print(f"Total rows: {len(combined)}")
    return combined
def main():
    """Dispatch the command given on the command line.

    With no arguments, prints the download instructions followed by the list
    of available commands. ``check`` lists the data files, ``preview <name>``
    previews a file inside ``DATA_DIR`` and ``combine`` merges all CSV files.
    Anything else (including ``preview`` without a filename) prints the
    download instructions only.
    """
    args = sys.argv[1:]

    if not args:
        print_instructions()
        print("\nAvailable commands:")
        print(" python download_data.py check - Check for existing data files")
        print(" python download_data.py preview <filename> - Preview a CSV file")
        print(" python download_data.py combine - Combine all CSV files")
        return

    command = args[0].lower()

    if command == "check":
        check_data_files()
        return

    if command == "combine":
        process_and_combine_data()
        return

    if command == "preview" and len(args) > 1:
        file_path = DATA_DIR / args[1]
        if file_path.exists():
            preview_data(file_path)
        else:
            print(f"File not found: {file_path}")
        return

    # Unknown command, or "preview" without a filename.
    print_instructions()
# Run the command-line entry point only when this file is executed as a
# script, not when it is imported as a module.
if __name__ == "__main__":
    main()