Introducing PostgreSQL for persistence
Some checks failed
Build and Push Docker Image / build-and-push (push) Failing after 32s
@@ -1,24 +1,19 @@
"""
Data loading module with optimized pandas operations.
Uses vectorized operations instead of .apply() for performance.
Data loading module that queries from PostgreSQL database.
Provides efficient queries with caching and lazy loading.
"""

import pandas as pd
import numpy as np
from pathlib import Path
from functools import lru_cache
import re
from typing import Optional, Dict, Tuple, List
import requests
from typing import Optional, Dict, Tuple
from sqlalchemy import select, func, and_, or_
from sqlalchemy.orm import joinedload, Session

from .config import settings
from .schemas import (
    COLUMN_MAPPINGS,
    NUMERIC_COLUMNS,
    SCHOOL_TYPE_MAP,
    NULL_VALUES,
    LA_CODE_TO_NAME,
)
from .database import SessionLocal, get_db_session
from .models import School, SchoolResult

# Cache for postcode geocoding
_postcode_cache: Dict[str, Tuple[float, float]] = {}
@@ -31,17 +26,25 @@ def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
    """
    results = {}

    # Remove invalid postcodes and deduplicate
    valid_postcodes = [p.strip().upper() for p in postcodes if p and isinstance(p, str) and len(p.strip()) >= 5]
    valid_postcodes = list(set(valid_postcodes))
    # Check cache first
    uncached = []
    for pc in postcodes:
        if pc and isinstance(pc, str):
            pc_upper = pc.strip().upper()
            if pc_upper in _postcode_cache:
                results[pc_upper] = _postcode_cache[pc_upper]
            elif len(pc_upper) >= 5:
                uncached.append(pc_upper)

    if not valid_postcodes:
    if not uncached:
        return results

    uncached = list(set(uncached))

    # postcodes.io allows max 100 postcodes per request
    batch_size = 100
    for i in range(0, len(valid_postcodes), batch_size):
        batch = valid_postcodes[i:i + batch_size]
    for i in range(0, len(uncached), batch_size):
        batch = uncached[i:i + batch_size]
        try:
            response = requests.post(
                'https://api.postcodes.io/postcodes',
@@ -57,6 +60,7 @@ def geocode_postcodes_bulk(postcodes: list) -> Dict[str, Tuple[float, float]]:
                    lon = item['result'].get('longitude')
                    if lat and lon:
                        results[pc] = (lat, lon)
                        _postcode_cache[pc] = (lat, lon)
        except Exception as e:
            print(f" Warning: Geocoding batch failed: {e}")

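A quick sketch of how the batched geocoder behaves once this change lands (the postcodes and module path are illustrative assumptions, not taken from the dataset):

# Hypothetical usage sketch - module path and inputs are assumptions.
from app.data_loader import geocode_postcodes_bulk

coords = geocode_postcodes_bulk(['SW1A 1AA', ' sw1a 1aa ', None, 'BAD'])
# Invalid or too-short entries are dropped; duplicates collapse after strip/upper.
coords = geocode_postcodes_bulk(['SW1A 1AA'])  # now served from _postcode_cache, no HTTP call
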
@@ -93,189 +97,6 @@ def geocode_single_postcode(postcode: str) -> Optional[Tuple[float, float]]:
    return None


def extract_year_from_folder(folder_name: str) -> Optional[int]:
    """Extract the end year from folder name like '2023-2024' -> 2024."""
    match = re.search(r'(\d{4})-(\d{4})', folder_name)
    if match:
        return int(match.group(2))
    return None


def parse_numeric_vectorized(series: pd.Series) -> pd.Series:
    """
    Vectorized numeric parsing - much faster than .apply().
    Handles SUPP, NE, NA, NP, %, etc.
    """
    # Convert to string first
    str_series = series.astype(str)

    # Replace null values with NaN
    for null_val in NULL_VALUES:
        str_series = str_series.replace(null_val, np.nan)

    # Remove % signs
    str_series = str_series.str.rstrip('%')

    # Convert to numeric
    return pd.to_numeric(str_series, errors='coerce')


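For context, parse_numeric_vectorized folds the DfE suppression codes and percentage strings into floats in a single vectorized pass. A small illustration (made-up values, assuming NULL_VALUES contains codes such as 'SUPP' and 'NE'):

s = pd.Series(['85%', 'SUPP', '72', 'NE'])
parse_numeric_vectorized(s)  # -> [85.0, NaN, 72.0, NaN]
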
def create_address_vectorized(df: pd.DataFrame) -> pd.Series:
    """
    Vectorized address creation - much faster than .apply().
    """
    parts = []

    if 'address1' in df.columns:
        parts.append(df['address1'].fillna('').astype(str))
    if 'town' in df.columns:
        parts.append(df['town'].fillna('').astype(str))
    if 'postcode' in df.columns:
        parts.append(df['postcode'].fillna('').astype(str))

    if not parts:
        return pd.Series([''] * len(df), index=df.index)

    # Combine parts with comma separator, filtering empty strings
    result = pd.Series([''] * len(df), index=df.index)
    for i, row_idx in enumerate(df.index):
        row_parts = [p.iloc[i] if hasattr(p, 'iloc') else p[i] for p in parts]
        row_parts = [p for p in row_parts if p and p.strip()]
        result.iloc[i] = ', '.join(row_parts)

    return result


def create_address_fast(df: pd.DataFrame) -> pd.Series:
    """
    Fast vectorized address creation using string concatenation.
    """
    addr1 = df.get('address1', pd.Series([''] * len(df))).fillna('').astype(str)
    town = df.get('town', pd.Series([''] * len(df))).fillna('').astype(str)
    postcode = df.get('postcode', pd.Series([''] * len(df))).fillna('').astype(str)

    # Build address with proper separators
    result = addr1.str.strip()

    # Add town if not empty
    town_mask = town.str.strip() != ''
    result = result.where(~town_mask, result + ', ' + town.str.strip())

    # Add postcode if not empty
    postcode_mask = postcode.str.strip() != ''
    result = result.where(~postcode_mask, result + ', ' + postcode.str.strip())

    # Clean up leading commas
    result = result.str.lstrip(', ')

    return result


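The .where masking above appends each component only for rows where it is non-empty. For example (two made-up rows, one missing its town):

df = pd.DataFrame({
    'address1': ['1 High St', '2 Low Rd'],
    'town': ['York', ''],
    'postcode': ['YO1 1AA', 'LS1 2BB'],
})
create_address_fast(df)
# -> ['1 High St, York, YO1 1AA', '2 Low Rd, LS1 2BB']
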
def load_year_data(year_folder: Path, year: int) -> Optional[pd.DataFrame]:
    """Load and process data for a single year."""
    ks2_file = year_folder / "england_ks2final.csv"
    if not ks2_file.exists():
        return None

    try:
        print(f"Loading data from {ks2_file}")
        df = pd.read_csv(ks2_file, low_memory=False)

        # Handle column types
        if 'LEA' in df.columns and df['LEA'].dtype == 'object':
            df['LEA'] = pd.to_numeric(df['LEA'], errors='coerce')
        if 'URN' in df.columns and df['URN'].dtype == 'object':
            df['URN'] = pd.to_numeric(df['URN'], errors='coerce')

        # Filter to schools only (RECTYPE == 1 means school level data)
        if 'RECTYPE' in df.columns:
            df = df[df['RECTYPE'] == 1].copy()

        # Add year and local authority name
        df['year'] = year

        # Try different column names for LA name
        la_name_cols = ['LANAME', 'LA (name)', 'LA_NAME', 'LA NAME']
        la_col_found = None
        for col in la_name_cols:
            if col in df.columns:
                la_col_found = col
                break

        if la_col_found:
            df['local_authority'] = df[la_col_found]
        elif 'LEA' in df.columns:
            # Map LEA codes to names using our mapping
            df['local_authority'] = df['LEA'].map(LA_CODE_TO_NAME).fillna(df['LEA'].astype(str))

        # Rename columns using mapping
        rename_dict = {k: v for k, v in COLUMN_MAPPINGS.items() if k in df.columns}
        df = df.rename(columns=rename_dict)

        # Create address field (vectorized)
        df['address'] = create_address_fast(df)

        # Map school type codes to names (vectorized)
        if 'school_type_code' in df.columns:
            df['school_type'] = df['school_type_code'].map(SCHOOL_TYPE_MAP).fillna('Other')

        # Parse numeric columns (vectorized - much faster than .apply())
        for col in NUMERIC_COLUMNS:
            if col in df.columns:
                df[col] = parse_numeric_vectorized(df[col])

        # Initialize lat/long columns
        df['latitude'] = None
        df['longitude'] = None

        print(f" Loaded {len(df)} schools for year {year}")
        return df

    except Exception as e:
        print(f"Error loading {ks2_file}: {e}")
        return None


@lru_cache(maxsize=1)
def load_school_data() -> pd.DataFrame:
    """
    Load and combine all school data from CSV files in year folders.
    Uses lru_cache for singleton-like behavior.
    """
    all_data = []

    data_dir = settings.data_dir
    if data_dir.exists():
        for year_folder in data_dir.iterdir():
            if year_folder.is_dir() and re.match(r'\d{4}-\d{4}', year_folder.name):
                year = extract_year_from_folder(year_folder.name)
                if year is None:
                    continue

                df = load_year_data(year_folder, year)
                if df is not None:
                    all_data.append(df)

    if all_data:
        result = pd.concat(all_data, ignore_index=True)
        print(f"\nTotal records loaded: {len(result)}")
        print(f"Unique schools: {result['urn'].nunique()}")
        print(f"Years: {sorted(result['year'].unique())}")

        # Note: Geocoding is done lazily when location search is used
        # This keeps startup fast

        return result
    else:
        print("No data files found. Creating empty DataFrame.")
        return pd.DataFrame()


def clear_cache():
    """Clear the data cache to force reload."""
    load_school_data.cache_clear()


def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:
    """
    Calculate the great circle distance between two points on Earth (in miles).
@@ -296,3 +117,402 @@ def haversine_distance(lat1: float, lon1: float, lat2: float, lon2: float) -> float:

    return c * r


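The hunk elides the function body; for reference, a standard haversine implementation consistent with the `c * r` return shown above would look roughly like this (a sketch, not necessarily the committed code):

# Sketch only - the actual body is hidden by the diff context.
from math import radians, sin, cos, asin, sqrt

def _haversine_sketch(lat1, lon1, lat2, lon2):
    lat1, lon1, lat2, lon2 = map(radians, (lat1, lon1, lat2, lon2))
    a = sin((lat2 - lat1) / 2) ** 2 + cos(lat1) * cos(lat2) * sin((lon2 - lon1) / 2) ** 2
    c = 2 * asin(sqrt(a))
    r = 3956  # Earth radius in miles
    return c * r
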
# =============================================================================
# DATABASE QUERY FUNCTIONS
# =============================================================================

def get_db():
    """Get a database session."""
    return SessionLocal()


def get_available_years(db: Session = None) -> List[int]:
    """Get list of available years in the database."""
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        result = db.query(SchoolResult.year).distinct().order_by(SchoolResult.year).all()
        return [r[0] for r in result]
    finally:
        if close_db:
            db.close()


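All of the read helpers in this block share one pattern: accept an optional Session, open one if none was passed, and close it only if they opened it. That lets callers batch several lookups on a single session; a sketch of both call styles:

years = get_available_years()        # standalone: opens and closes its own session

db = get_db()
try:
    years = get_available_years(db)  # shared session, reused by the next call
    las = get_available_local_authorities(db)
finally:
    db.close()                       # caller-owned session, caller closes it
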
def get_available_local_authorities(db: Session = None) -> List[str]:
    """Get list of available local authorities."""
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        result = db.query(School.local_authority)\
            .filter(School.local_authority.isnot(None))\
            .distinct()\
            .order_by(School.local_authority)\
            .all()
        return [r[0] for r in result if r[0]]
    finally:
        if close_db:
            db.close()


def get_available_school_types(db: Session = None) -> List[str]:
    """Get list of available school types."""
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        result = db.query(School.school_type)\
            .filter(School.school_type.isnot(None))\
            .distinct()\
            .order_by(School.school_type)\
            .all()
        return [r[0] for r in result if r[0]]
    finally:
        if close_db:
            db.close()


def get_schools_count(db: Session = None) -> int:
    """Get total number of schools."""
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        return db.query(School).count()
    finally:
        if close_db:
            db.close()


def get_schools(
    db: Session,
    search: Optional[str] = None,
    local_authority: Optional[str] = None,
    school_type: Optional[str] = None,
    page: int = 1,
    page_size: int = 50,
) -> Tuple[List[School], int]:
    """
    Get paginated list of schools with optional filters.
    Returns (schools, total_count).
    """
    query = db.query(School)

    # Apply filters
    if search:
        search_lower = f"%{search.lower()}%"
        query = query.filter(
            or_(
                func.lower(School.school_name).like(search_lower),
                func.lower(School.postcode).like(search_lower),
                func.lower(School.town).like(search_lower),
            )
        )

    if local_authority:
        query = query.filter(func.lower(School.local_authority) == local_authority.lower())

    if school_type:
        query = query.filter(func.lower(School.school_type) == school_type.lower())

    # Get total count
    total = query.count()

    # Apply pagination
    offset = (page - 1) * page_size
    schools = query.order_by(School.school_name).offset(offset).limit(page_size).all()

    return schools, total


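A typical call, fetching a filtered page (the search text, authority name, and paging values are illustrative):

db = get_db()
try:
    schools, total = get_schools(db, search='st mary', local_authority='Leeds',
                                 page=2, page_size=25)
    print(f'{total} matches, showing {len(schools)}')
finally:
    db.close()
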
def get_schools_near_location(
    db: Session,
    latitude: float,
    longitude: float,
    radius_miles: float = 5.0,
    search: Optional[str] = None,
    local_authority: Optional[str] = None,
    school_type: Optional[str] = None,
    page: int = 1,
    page_size: int = 50,
) -> Tuple[List[Tuple[School, float]], int]:
    """
    Get schools near a location, sorted by distance.
    Returns list of (school, distance) tuples and total count.
    """
    # Get all schools with coordinates
    query = db.query(School).filter(
        School.latitude.isnot(None),
        School.longitude.isnot(None)
    )

    # Apply text filters
    if search:
        search_lower = f"%{search.lower()}%"
        query = query.filter(
            or_(
                func.lower(School.school_name).like(search_lower),
                func.lower(School.postcode).like(search_lower),
                func.lower(School.town).like(search_lower),
            )
        )

    if local_authority:
        query = query.filter(func.lower(School.local_authority) == local_authority.lower())

    if school_type:
        query = query.filter(func.lower(School.school_type) == school_type.lower())

    # Get all matching schools and calculate distances
    all_schools = query.all()

    schools_with_distance = []
    for school in all_schools:
        if school.latitude and school.longitude:
            dist = haversine_distance(latitude, longitude, school.latitude, school.longitude)
            if dist <= radius_miles:
                schools_with_distance.append((school, dist))

    # Sort by distance
    schools_with_distance.sort(key=lambda x: x[1])

    total = len(schools_with_distance)

    # Paginate
    offset = (page - 1) * page_size
    paginated = schools_with_distance[offset:offset + page_size]

    return paginated, total


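Note that the radius search above loads every geocoded school and computes haversine in Python, which is simple and fine at this scale. If it ever becomes a hotspot, a coarse SQL bounding-box prefilter could shrink the candidate set before the exact distance pass; a sketch (not part of this commit; the conversion factors are rough UK-latitude approximations):

deg_lat = radius_miles / 69.0   # ~69 miles per degree of latitude
deg_lon = radius_miles / 54.6   # rough miles per degree of longitude at UK latitudes
query = query.filter(
    School.latitude.between(latitude - deg_lat, latitude + deg_lat),
    School.longitude.between(longitude - deg_lon, longitude + deg_lon),
)
# The exact haversine check then runs over far fewer rows.
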
def get_school_by_urn(db: Session, urn: int) -> Optional[School]:
    """Get a single school by URN."""
    return db.query(School).filter(School.urn == urn).first()


def get_school_results(
    db: Session,
    urn: int,
    years: Optional[List[int]] = None
) -> List[SchoolResult]:
    """Get all results for a school, optionally filtered by years."""
    query = db.query(SchoolResult)\
        .join(School)\
        .filter(School.urn == urn)\
        .order_by(SchoolResult.year)

    if years:
        query = query.filter(SchoolResult.year.in_(years))

    return query.all()


def get_rankings(
    db: Session,
    metric: str,
    year: int,
    local_authority: Optional[str] = None,
    limit: int = 20,
    ascending: bool = False,
) -> List[Tuple[School, SchoolResult]]:
    """
    Get school rankings for a specific metric and year.
    Returns list of (school, result) tuples.
    """
    # Build the query
    query = db.query(School, SchoolResult)\
        .join(SchoolResult)\
        .filter(SchoolResult.year == year)

    # Filter by local authority
    if local_authority:
        query = query.filter(func.lower(School.local_authority) == local_authority.lower())

    # Get the metric column
    metric_column = getattr(SchoolResult, metric, None)
    if metric_column is None:
        return []

    # Filter out nulls and order
    query = query.filter(metric_column.isnot(None))

    if ascending:
        query = query.order_by(metric_column.asc())
    else:
        query = query.order_by(metric_column.desc())

    return query.limit(limit).all()


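A typical ranking query, asking for the top 10 schools on the combined reading/writing/maths measure (the metric and year are illustrative; the metric name must match a SchoolResult column):

db = get_db()
try:
    top = get_rankings(db, metric='rwm_expected_pct', year=2024, limit=10)
    for school, result in top:
        print(school.school_name, result.rwm_expected_pct)
finally:
    db.close()
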
def get_data_info(db: Session = None) -> dict:
    """Get information about the data in the database."""
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        school_count = db.query(School).count()
        result_count = db.query(SchoolResult).count()
        years = get_available_years(db)
        local_authorities = get_available_local_authorities(db)

        return {
            "total_schools": school_count,
            "total_results": result_count,
            "years_available": years,
            "local_authorities_count": len(local_authorities),
            "data_source": "PostgreSQL",
        }
    finally:
        if close_db:
            db.close()


def school_to_dict(school: School, include_results: bool = False) -> dict:
    """Convert a School model to dictionary."""
    data = {
        "urn": school.urn,
        "school_name": school.school_name,
        "local_authority": school.local_authority,
        "school_type": school.school_type,
        "address": school.address,
        "town": school.town,
        "postcode": school.postcode,
        "latitude": school.latitude,
        "longitude": school.longitude,
    }

    if include_results and school.results:
        data["results"] = [result_to_dict(r) for r in school.results]

    return data


def result_to_dict(result: SchoolResult) -> dict:
    """Convert a SchoolResult model to dictionary."""
    return {
        "year": result.year,
        "total_pupils": result.total_pupils,
        "eligible_pupils": result.eligible_pupils,
        # Expected Standard
        "rwm_expected_pct": result.rwm_expected_pct,
        "reading_expected_pct": result.reading_expected_pct,
        "writing_expected_pct": result.writing_expected_pct,
        "maths_expected_pct": result.maths_expected_pct,
        "gps_expected_pct": result.gps_expected_pct,
        "science_expected_pct": result.science_expected_pct,
        # Higher Standard
        "rwm_high_pct": result.rwm_high_pct,
        "reading_high_pct": result.reading_high_pct,
        "writing_high_pct": result.writing_high_pct,
        "maths_high_pct": result.maths_high_pct,
        "gps_high_pct": result.gps_high_pct,
        # Progress
        "reading_progress": result.reading_progress,
        "writing_progress": result.writing_progress,
        "maths_progress": result.maths_progress,
        # Averages
        "reading_avg_score": result.reading_avg_score,
        "maths_avg_score": result.maths_avg_score,
        "gps_avg_score": result.gps_avg_score,
        # Context
        "disadvantaged_pct": result.disadvantaged_pct,
        "eal_pct": result.eal_pct,
        "sen_support_pct": result.sen_support_pct,
        "sen_ehcp_pct": result.sen_ehcp_pct,
        "stability_pct": result.stability_pct,
        # Gender
        "rwm_expected_boys_pct": result.rwm_expected_boys_pct,
        "rwm_expected_girls_pct": result.rwm_expected_girls_pct,
        "rwm_high_boys_pct": result.rwm_high_boys_pct,
        "rwm_high_girls_pct": result.rwm_high_girls_pct,
        # Disadvantaged
        "rwm_expected_disadvantaged_pct": result.rwm_expected_disadvantaged_pct,
        "rwm_expected_non_disadvantaged_pct": result.rwm_expected_non_disadvantaged_pct,
        "disadvantaged_gap": result.disadvantaged_gap,
        # 3-Year
        "rwm_expected_3yr_pct": result.rwm_expected_3yr_pct,
        "reading_avg_3yr": result.reading_avg_3yr,
        "maths_avg_3yr": result.maths_avg_3yr,
    }


# =============================================================================
# LEGACY COMPATIBILITY - DataFrame-based functions
# =============================================================================

def load_school_data_as_dataframe(db: Session = None) -> pd.DataFrame:
    """
    Load all school data as a pandas DataFrame.
    For compatibility with existing code that expects DataFrames.
    """
    close_db = db is None
    if db is None:
        db = get_db()

    try:
        # Query all schools with their results
        schools = db.query(School).options(joinedload(School.results)).all()

        rows = []
        for school in schools:
            for result in school.results:
                row = {
                    "urn": school.urn,
                    "school_name": school.school_name,
                    "local_authority": school.local_authority,
                    "school_type": school.school_type,
                    "address": school.address,
                    "town": school.town,
                    "postcode": school.postcode,
                    "latitude": school.latitude,
                    "longitude": school.longitude,
                    **result_to_dict(result)
                }
                rows.append(row)

        if rows:
            return pd.DataFrame(rows)
        return pd.DataFrame()
    finally:
        if close_db:
            db.close()


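The joinedload(School.results) option above matters: it fetches each school's results in the same round trip, so the nested loop does not issue one query per school (the classic N+1 problem). For contrast, the lazy-loading version it avoids:

# Anti-pattern avoided above: each .results access would lazy-load separately.
schools = db.query(School).all()
for school in schools:
    _ = school.results  # one extra SELECT per school
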
# Cache for DataFrame (legacy compatibility)
_df_cache: Optional[pd.DataFrame] = None


def load_school_data() -> pd.DataFrame:
    """
    Legacy function to load school data as DataFrame.
    Uses caching for performance.
    """
    global _df_cache

    if _df_cache is not None:
        return _df_cache

    print("Loading school data from database...")
    _df_cache = load_school_data_as_dataframe()

    if not _df_cache.empty:
        print(f"Total records loaded: {len(_df_cache)}")
        print(f"Unique schools: {_df_cache['urn'].nunique()}")
        print(f"Years: {sorted(_df_cache['year'].unique())}")
    else:
        print("No data found in database")

    return _df_cache


def clear_cache():
    """Clear all caches."""
    global _df_cache
    _df_cache = None
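Together, this legacy pair gives simple module-level memoisation: the first load_school_data() call hits PostgreSQL, later calls return the cached frame, and clear_cache() forces a refresh (for example, after re-running an import):

df = load_school_data()   # queries the database and caches the DataFrame
df = load_school_data()   # returned from _df_cache, no round trip
clear_cache()
df = load_school_data()   # reloads from the database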