initial commit

2026-01-06 13:52:00 +00:00
commit c65eb1a00f
37 changed files with 402537 additions and 0 deletions
@@ -0,0 +1,181 @@
+#!/usr/bin/env python3
+"""
+Data Download Helper Script
+
+This script provides instructions and utilities for downloading
+UK school performance data from the official government source.
+
+Data Source: https://www.compare-school-performance.service.gov.uk/download-data
+
+Note: The actual CSV downloads require manual selection on the website
+as they use dynamic form submissions. This script helps prepare and
+organize the downloaded data.
+"""
+
+import os
+import sys
+from pathlib import Path
+import pandas as pd
+
+# Folder holding the downloaded CSV files: the "data" directory that sits next
+# to this script's own directory (two levels up from this file, then "data").
+DATA_DIR = Path(__file__).parent.parent / "data"
+
+
+def print_instructions():
+    """Print a boxed, step-by-step guide for downloading the data manually.
+
+    The frame is generated from the content lines instead of being drawn by
+    hand, so the right-hand border lines up on every row even when a row
+    (such as the download URL) is longer than the minimum box width.
+    """
+    title = "UK School Performance Data Download Instructions"
+    body = (
+        "",
+        "  1. Visit: https://www.compare-school-performance.service.gov.uk/download-data",
+        "",
+        "  2. For each year (2019-2020 through 2023-2024), select:",
+        "     • Year: Select the academic year",
+        '     • Data type: "Key Stage 4" (for secondary school GCSE data)',
+        '     • File type: "All data" or specific metrics you need',
+        "",
+        "  3. Key metrics available:",
+        "     • Progress 8 - measures pupil progress from KS2 to KS4",
+        "     • Attainment 8 - average attainment across 8 qualifications",
+        "     • English & Maths Grade 5+ percentage",
+        "     • EBacc entry and achievement percentages",
+        "",
+        "  4. Download the CSV files and place them in the 'data' folder",
+        "",
+        "  5. Rename files with the year for clarity, e.g.:",
+        "     • ks4_2020.csv",
+        "     • ks4_2021.csv",
+        "     • ks4_2022.csv",
+        "     • ks4_2023.csv",
+        "     • ks4_2024.csv",
+        "",
+    )
+
+    # Keep a comfortable minimum width, but grow the box whenever a row (plus
+    # a two-column right margin) would otherwise overflow the border.
+    min_inner_width = 78
+    inner = max(min_inner_width, len(title) + 2, max(len(row) for row in body) + 2)
+
+    rule = "═" * inner
+    framed = [f"╔{rule}╗", f"║{title.center(inner)}║", f"╠{rule}╣"]
+    framed.extend(f"║{row.ljust(inner)}║" for row in body)
+    framed.append(f"╚{rule}╝")
+
+    # One blank line before and after the box, as in the original banner.
+    print("\n" + "\n".join(framed) + "\n")
+
+
+def check_data_files():
+    """List the CSV files currently present in the data directory.
+
+    Prints each file's name and size in MB, or a hint about where to place
+    the downloads when nothing is found.
+
+    Returns:
+        list[Path]: the matching files sorted by path, or an empty list when
+        the data directory is missing or holds no ``*.csv`` files.
+    """
+    if not DATA_DIR.exists():
+        print(f"Data directory not found: {DATA_DIR}")
+        return []
+
+    # glob() yields entries in whatever order the filesystem returns them,
+    # which varies between platforms; sorting keeps the listing (and anything
+    # built from the returned list) reproducible from run to run.
+    csv_files = sorted(DATA_DIR.glob("*.csv"))
+
+    if not csv_files:
+        print("No CSV files found in the data directory.")
+        print(f"Please place your downloaded CSV files in: {DATA_DIR}")
+        return []
+
+    print(f"\nFound {len(csv_files)} CSV file(s):")
+    for f in csv_files:
+        size_mb = f.stat().st_size / (1024 * 1024)
+        print(f"  • {f.name} ({size_mb:.2f} MB)")
+
+    return csv_files
+
+
+def preview_data(file_path: Path, rows: int = 5):
+    """Show the column names and the leading rows of a CSV file.
+
+    Args:
+        file_path: CSV file to read.
+        rows: Maximum number of data rows to load and display.
+
+    At most 20 column names are listed individually; any remainder is
+    summarised as a count. A failure while reading or printing is reported
+    on stdout rather than raised.
+    """
+    max_listed = 20
+    try:
+        sample = pd.read_csv(file_path, nrows=rows)
+        names = list(sample.columns)
+        hidden = len(names) - max_listed
+
+        print(f"\n--- Preview of {file_path.name} ---")
+        print(f"Columns ({len(names)}):")
+        for name in names[:max_listed]:
+            print(f"  • {name}")
+        if hidden > 0:
+            print(f"  ... and {hidden} more columns")
+
+        print(f"\nFirst {rows} rows:")
+        print(sample.to_string())
+    except Exception as e:
+        print(f"Error reading {file_path}: {e}")
+
+
+def standardize_columns(df: pd.DataFrame) -> pd.DataFrame:
+    """Return a new frame whose column names are normalised and friendly.
+
+    Each string column label is stripped of surrounding whitespace and
+    upper-cased; labels that match a known official code are then replaced
+    by a snake_case name. Labels without a mapping keep their upper-cased
+    form.
+
+    Args:
+        df: Frame as loaded from an official CSV. It is left untouched; the
+            previous implementation overwrote its column labels in place.
+
+    Returns:
+        A renamed copy of ``df``.
+    """
+    # Official column code -> name used by the rest of the project.
+    # NOTE(review): RELDENOM reads like a religious-denomination field rather
+    # than a school type — confirm this mapping against the DfE metadata.
+    column_mappings = {
+        'URN': 'urn',
+        'SCHNAME': 'school_name',
+        'TOWN': 'town',
+        'REGION': 'region',
+        'RELDENOM': 'school_type',
+        'P8MEA': 'progress_8',
+        'ATT8SCR': 'attainment_8',
+        'PTAC5EM': 'grade_5_eng_maths_pct',
+        'PTEBACCEG': 'ebacc_entry_pct',
+        'TPUP': 'pupils',
+    }
+
+    def _rename(label):
+        # Only string labels can be normalised; anything else passes through.
+        key = label.strip().upper() if isinstance(label, str) else label
+        return column_mappings.get(key, key)
+
+    # A single rename() normalises and maps in one pass and never touches
+    # the caller's frame.
+    return df.rename(columns=_rename)
+
+
+def process_and_combine_data():
+    """Merge every source CSV in the data directory into ``combined_data.csv``.
+
+    Each file is loaded, passed through ``standardize_columns`` and, when its
+    file name contains a four-digit year starting with ``20``, tagged with a
+    ``year`` column. Files that cannot be read are reported and skipped.
+
+    Returns:
+        The combined DataFrame, or ``None`` when there is nothing to combine.
+    """
+    import re  # local import: this module's header does not import re
+
+    output_path = DATA_DIR / "combined_data.csv"
+
+    # A previous run leaves combined_data.csv in the same folder. Feeding it
+    # back in would duplicate every row, so it is never treated as an input.
+    csv_files = []
+    for candidate in check_data_files():
+        if candidate.name == output_path.name:
+            print(f"\nSkipping previous output: {candidate.name}")
+        else:
+            csv_files.append(candidate)
+    if not csv_files:
+        return None
+
+    year_pattern = re.compile(r'20\d{2}')
+    all_data = []
+
+    for csv_file in csv_files:
+        print(f"\nProcessing: {csv_file.name}")
+        try:
+            df = standardize_columns(pd.read_csv(csv_file, low_memory=False))
+        except Exception as e:
+            # Best effort: one unreadable file must not abort the whole run.
+            print(f"  Error: {e}")
+            continue
+
+        # Try to extract the year from the file name, e.g. ks4_2023.csv
+        year_match = year_pattern.search(csv_file.stem)
+        if year_match:
+            df['year'] = int(year_match.group())
+
+        all_data.append(df)
+        print(f"  Loaded {len(df)} rows")
+
+    if not all_data:
+        return None
+
+    combined = pd.concat(all_data, ignore_index=True)
+    combined.to_csv(output_path, index=False)
+    print(f"\nCombined data saved to: {output_path}")
+    print(f"Total rows: {len(combined)}")
+    return combined
+
+
+def main():
+    """Run the sub-command named by the first command-line argument.
+
+    ``check`` lists the data files, ``preview <filename>`` previews one file
+    from the data directory and ``combine`` merges all CSV files. With no
+    argument the instructions and the command list are printed; any other
+    input (including ``preview`` without a file name) prints the
+    instructions only.
+    """
+    args = sys.argv[1:]
+
+    if not args:
+        print_instructions()
+        print("\nAvailable commands:")
+        print("  python download_data.py check    - Check for existing data files")
+        print("  python download_data.py preview <filename> - Preview a CSV file")
+        print("  python download_data.py combine  - Combine all CSV files")
+        return
+
+    command = args[0].lower()
+
+    if command == "check":
+        check_data_files()
+    elif command == "combine":
+        process_and_combine_data()
+    elif command == "preview" and len(args) > 1:
+        target = DATA_DIR / args[1]
+        if not target.exists():
+            print(f"File not found: {target}")
+        else:
+            preview_data(target)
+    else:
+        print_instructions()
+
+
+# Run the command-line interface only when this file is executed as a script,
+# not when it is imported as a module.
+if __name__ == "__main__":
+    main()
+
@@ -0,0 +1,253 @@
+#!/usr/bin/env python3
+"""
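+Fetch real school data from UK Government sources.
+
+This script downloads the school establishment list from Get Information
+about Schools (GIAS) and queries the Explore Education Statistics API for
+KS2 (Key Stage 2) publications. The KS2 performance CSV files themselves are
+not downloaded automatically: the script prints instructions for fetching
+them manually from the Compare School Performance service.
+
+Data is filtered to only include primary schools in Wandsworth and Merton.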
+"""
+
+import os
+import sys
+import requests
+import pandas as pd
+from pathlib import Path
+from io import StringIO
+
+# Output directory: the "data" folder next to this script's own directory
+# (two levels up from this file, then "data").
+DATA_DIR = Path(__file__).parent.parent / "data"
+
+# Local Authority codes for Wandsworth and Merton, keyed by authority name.
+# The codes are kept as strings because they are compared against the
+# stringified "LA (code)" column of the GIAS extract.
+LA_CODES = {
+    "Wandsworth": "212",
+    "Merton": "315"
+}
+
+# Academic years to fetch (last 5 years available).
+# This list is not referenced anywhere else in this script.
+YEARS = ["2023-2024", "2022-2023", "2021-2022", "2019-2020", "2018-2019"]
+# Note: 2020-2021 had no SATs due to COVID
+# NOTE(review): the summer 2020 tests may have been cancelled as well — confirm
+# that KS2 data actually exists for 2019-2020 before relying on this list.
+
+
+def _decode_gias_payload(raw):
+    """Decode the downloaded bytes to text, dropping a UTF-8 BOM if present."""
+    try:
+        return raw.decode('utf-8-sig')
+    except UnicodeDecodeError:
+        # NOTE(review): assumes a non-UTF-8 extract is Windows-1252 — confirm
+        # against the current GIAS download.
+        return raw.decode('cp1252', errors='replace')
+
+
+def _build_address(df):
+    """Join street and postcode into one string per row, skipping blanks."""
+    parts = []
+    for column in ('street', 'postcode'):
+        if column in df.columns:
+            # fillna keeps missing values from being rendered as "nan"
+            parts.append(df[column].fillna('').astype(str).str.strip())
+        else:
+            parts.append(pd.Series('', index=df.index, dtype=object))
+    # Stripping ", " removes the separator left over when one side is empty
+    return (parts[0] + ', ' + parts[1]).str.strip(', ')
+
+
+def fetch_gias_data():
+    """
+    Fetch school establishment data from Get Information About Schools.
+
+    Downloads the full establishment extract, keeps the rows whose LA code is
+    listed in LA_CODES and whose phase of education contains "Primary",
+    reduces them to the columns of interest (those that are present), renames
+    most of them to snake_case and adds a combined ``address`` column.
+
+    Returns:
+        A DataFrame of matching schools, or None when the download or the
+        parsing fails (the error is printed).
+    """
+    print("Fetching school establishment data from GIAS...")
+
+    # GIAS provides downloadable extracts
+    # Main extract URL (this may need to be updated periodically)
+    gias_url = "https://ea-edubase-api-prod.azurewebsites.net/edubase/downloads/public/edubasealldata.csv"
+
+    try:
+        response = requests.get(gias_url, timeout=60)
+        response.raise_for_status()
+
+        # Decode the raw bytes explicitly: read_csv ignores ``encoding`` for
+        # an already-decoded text buffer, and response.text depends on the
+        # charset requests guesses, which can leave a BOM glued to the first
+        # header (so that 'URN' is no longer found).
+        text = _decode_gias_payload(response.content)
+        df = pd.read_csv(StringIO(text), low_memory=False)
+
+        # Filter to primary schools in Wandsworth and Merton
+        # Phase of education: Primary, Middle deemed primary
+        wanted_codes = list(LA_CODES.values())
+        in_area = df['LA (code)'].astype(str).str.strip().isin(wanted_codes)
+        is_primary = df['PhaseOfEducation (name)'].str.contains('Primary', na=False)
+        df = df[in_area & is_primary]
+
+        # Select relevant columns, tolerating any that the extract lacks
+        columns_to_keep = [
+            'URN', 'EstablishmentName', 'LA (name)', 'TypeOfEstablishment (name)',
+            'Street', 'Locality', 'Town', 'Postcode',
+            'SchoolCapacity', 'NumberOfPupils', 'OfstedRating (name)'
+        ]
+        available_cols = [c for c in columns_to_keep if c in df.columns]
+        df = df[available_cols]
+
+        # Rename columns ('Locality' and 'SchoolCapacity' keep their names)
+        df = df.rename(columns={
+            'URN': 'urn',
+            'EstablishmentName': 'school_name',
+            'LA (name)': 'local_authority',
+            'TypeOfEstablishment (name)': 'school_type',
+            'Street': 'street',
+            'Town': 'town',
+            'Postcode': 'postcode',
+            'NumberOfPupils': 'pupils',
+            'OfstedRating (name)': 'ofsted_rating'
+        })
+
+        # Vectorised address building also works when no school matched;
+        # a row-wise apply() on an empty frame does not yield a column.
+        df['address'] = _build_address(df)
+
+        print(f"Found {len(df)} primary schools in Wandsworth and Merton")
+        return df
+
+    except Exception as e:
+        # Best effort by design: callers treat None as "no school data".
+        print(f"Error fetching GIAS data: {e}")
+        return None
+
+
+def fetch_ks2_performance_data():
+    """
+    Print instructions for obtaining KS2 performance data manually.
+
+    No download is attempted: the official download page requires a form
+    submission, so this function only explains the manual steps.
+
+    Returns:
+        None, always (there is no performance data to hand back).
+    """
+    print("\nFetching KS2 performance data...")
+
+    # Candidate for future automation: release files are served under
+    # https://content.explore-education-statistics.service.gov.uk/api/releases/<release-id>/files
+    # The release id changes whenever new data is published.
+
+    print("Note: For the most accurate data, please download manually from:")
+    print("https://www.compare-school-performance.service.gov.uk/download-data")
+    print("\nSteps:")
+    print("1. Select 'Key Stage 2' for Data type")
+    print("2. Select 'All data' for File type")
+    print("3. Select desired academic year")
+    print("4. Download and place CSV files in the 'data' folder")
+
+    return None
+
+
+def download_from_explore_education_statistics():
+    """
+    Query the Explore Education Statistics API for KS2 publications.
+    API docs: https://dfe-analytical-services.github.io/explore-education-statistics-api-docs/
+
+    Prints the titles of publications mentioning "key stage 2" or "ks2",
+    then, for each one with an id, the title of its latest release and the
+    names of that release's data sets. Nothing is downloaded and any error
+    is printed rather than raised.
+
+    Returns:
+        None, always.
+    """
+    print("\nAttempting to fetch from Explore Education Statistics API...")
+
+    api_base = "https://explore-education-statistics.service.gov.uk/api/v1"
+
+    try:
+        # Step 1: list the available publications
+        response = requests.get(f"{api_base}/publications", timeout=30)
+        if response.status_code != 200:
+            print(f"API returned status {response.status_code}")
+            return None
+
+        # Step 2: keep only the KS2-related ones
+        ks2_pubs = []
+        for entry in response.json().get('results', []):
+            if ('key stage 2' in entry.get('title', '').lower()
+                    or 'ks2' in entry.get('title', '').lower()):
+                ks2_pubs.append(entry)
+
+        if not ks2_pubs:
+            print("No KS2 publications found via API")
+            return None
+
+        print(f"Found KS2 publications: {[p['title'] for p in ks2_pubs]}")
+
+        # Step 3: describe the latest release of each publication
+        for pub in ks2_pubs:
+            pub_id = pub.get('id')
+            if not pub_id:
+                continue
+
+            release_url = f"{api_base}/publications/{pub_id}/releases/latest"
+            release_response = requests.get(release_url, timeout=30)
+            if release_response.status_code != 200:
+                # Releases that cannot be fetched are skipped without comment
+                continue
+
+            release = release_response.json()
+            print(f"Latest release: {release.get('title')}")
+
+            for ds in release.get('dataSets', []):
+                print(f"  - Dataset: {ds.get('name')}")
+
+    except Exception as e:
+        print(f"Error accessing API: {e}")
+
+    return None
+
+
+def create_combined_dataset(schools_df, performance_data=None, year=2024):
+    """
+    Tag the school list with a year so it is compatible with yearly data.
+
+    Args:
+        schools_df: School information frame, or None.
+        performance_data: Optional performance data. It is not merged yet;
+            when it is None a reminder to download it manually is printed.
+        year: Value written to the ``year`` column (defaults to 2024, the
+            value that used to be hard-coded).
+
+    Returns:
+        A new DataFrame with the ``year`` column added, or None when
+        ``schools_df`` is None. The caller's frame is left unmodified.
+    """
+    if schools_df is None:
+        return None
+
+    if performance_data is None:
+        print("\nNo performance data available - school list saved without metrics")
+        print("Download KS2 data manually and re-run to add performance metrics")
+
+    # assign() returns a new frame instead of adding the column to the
+    # caller's object behind its back.
+    return schools_df.assign(year=year)
+
+
+def main():
+    """Fetch the school list, save it as CSV and print the follow-up steps.
+
+    The data directory is created if needed. The school list is written to
+    ``schools_wandsworth_merton.csv`` together with a per-authority
+    breakdown; when the fetch fails, a failure message is printed instead.
+    """
+    rule = "=" * 60
+    next_steps = """
+To get complete performance data:
+
+1. Go to: https://www.compare-school-performance.service.gov.uk/download-data
+
+2. Download KS2 data for each year (2019-2024):
+   - Select: Key Stage 2
+   - Select: All data (or specific metrics)
+   - Select: Academic year
+   - Click: Download data
+
+3. Place downloaded CSV files in the 'data' folder
+
+4. Restart the application - it will automatically load the real data
+
+The app will merge school info with performance metrics.
+"""
+
+    print(rule)
+    print("Fetching Real School Data for Wandsworth & Merton")
+    print(rule)
+
+    # Make sure the output folder exists before anything is written
+    DATA_DIR.mkdir(exist_ok=True)
+
+    # School establishment data first, then the performance-data helpers
+    schools_df = fetch_gias_data()
+    fetch_ks2_performance_data()
+    download_from_explore_education_statistics()
+
+    if schools_df is None:
+        print("\nFailed to fetch school data")
+    else:
+        output_file = DATA_DIR / "schools_wandsworth_merton.csv"
+        schools_df.to_csv(output_file, index=False)
+        print(f"\nSchool data saved to: {output_file}")
+        print(f"Total schools: {len(schools_df)}")
+
+        # Number of schools per local authority
+        print("\nBreakdown by Local Authority:")
+        print(schools_df['local_authority'].value_counts())
+
+    print("\n" + rule)
+    print("NEXT STEPS:")
+    print(rule)
+    print(next_steps)
+
+
+# Run the fetch only when this file is executed as a script, not when it is
+# imported as a module.
+if __name__ == "__main__":
+    main()
+