repo restructure
All checks were successful
Build Docker Image / build (push) Successful in 1m3s

This commit is contained in:
Tudor Sitaru
2025-10-14 21:58:54 +01:00
parent e062b51b4b
commit d8637ac2ea
69 changed files with 781 additions and 4710 deletions

44
src/__init__.py Normal file
View File

@@ -0,0 +1,44 @@
"""
ParentZone Downloader - Source Package
This package contains the core application modules for the ParentZone Downloader.
Modules:
- asset_tracker: Track downloaded assets to avoid re-downloads
- auth_manager: Handle authentication with ParentZone API
- config_downloader: Configuration-based image downloader
- config_snapshot_downloader: Configuration-based snapshot downloader
- image_downloader: Download images from ParentZone API
- snapshot_downloader: Download snapshots from ParentZone API
- webserver: Web server to serve downloaded snapshots
"""
__version__ = "1.0.0"
__author__ = "ParentZone Downloader Team"
# Import main classes for easier access
try:
from .asset_tracker import AssetTracker
from .auth_manager import AuthManager
from .config_downloader import ConfigImageDownloader
from .config_snapshot_downloader import ConfigSnapshotDownloader
from .image_downloader import ImageDownloader
from .snapshot_downloader import SnapshotDownloader
from .webserver import SnapshotsWebServer
__all__ = [
"AssetTracker",
"AuthManager",
"ConfigImageDownloader",
"ConfigSnapshotDownloader",
"ImageDownloader",
"SnapshotDownloader",
"SnapshotsWebServer",
]
except ImportError as e:
# Handle case where dependencies might not be available
__all__ = []
import warnings
warnings.warn(f"Some modules could not be imported: {e}")

313
src/asset_tracker.py Normal file
View File

@@ -0,0 +1,313 @@
#!/usr/bin/env python3
"""
Asset Tracker for ParentZone Downloader
This module handles tracking of downloaded assets to avoid re-downloading
and to identify new assets that need to be downloaded.
"""
import json
import logging
import os
from datetime import datetime
from pathlib import Path
from typing import Dict, List, Set, Any, Optional
import hashlib
class AssetTracker:
    """
    Tracks downloaded assets and identifies new ones.

    A JSON metadata file stored alongside the downloaded files records, per
    asset key, the download result and a content fingerprint; later runs use
    it to skip unchanged assets and re-fetch modified ones.
    """

    def __init__(self, storage_dir: str = "downloaded_images", metadata_file: str = "asset_metadata.json"):
        """
        Initialize the asset tracker.

        Args:
            storage_dir: Directory where downloaded assets are stored
            metadata_file: JSON file to store asset metadata
        """
        self.storage_dir = Path(storage_dir)
        # NOTE(review): no parents=True — mkdir raises FileNotFoundError when
        # storage_dir is a nested path whose parent is missing; confirm callers
        # only pass single-level paths.
        self.storage_dir.mkdir(exist_ok=True)
        self.metadata_file = self.storage_dir / metadata_file
        self.logger = logging.getLogger(__name__)
        # Load existing metadata
        self.metadata = self._load_metadata()

    def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
        """
        Load asset metadata from the JSON file.

        Returns:
            Dictionary of asset metadata keyed by asset ID.
            An unreadable/corrupt file is logged and treated as empty
            (fresh start) rather than raising.
        """
        if self.metadata_file.exists():
            try:
                with open(self.metadata_file, 'r', encoding='utf-8') as f:
                    data = json.load(f)
                self.logger.info(f"Loaded metadata for {len(data)} assets")
                return data
            except Exception as e:
                self.logger.error(f"Failed to load metadata file: {e}")
                return {}
        else:
            self.logger.info("No existing metadata file found, starting fresh")
            return {}

    def _save_metadata(self):
        """Save asset metadata to the JSON file.

        Failures are logged, not raised, so a disk error never aborts a
        download run. ``default=str`` stringifies non-JSON values (e.g.
        datetimes) embedded in the stored API data.
        """
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(self.metadata, f, indent=2, default=str)
            self.logger.debug(f"Saved metadata for {len(self.metadata)} assets")
        except Exception as e:
            self.logger.error(f"Failed to save metadata file: {e}")

    def _get_asset_key(self, asset: Dict[str, Any]) -> str:
        """
        Generate a unique key for an asset.

        Args:
            asset: Asset dictionary from API

        Returns:
            Unique key for the asset: the first present of id/assetId/uuid,
            otherwise an MD5 digest of the whole asset dict (used purely as
            a fingerprint, not for security).
        """
        # Try different ID fields
        if 'id' in asset:
            return str(asset['id'])
        elif 'assetId' in asset:
            return str(asset['assetId'])
        elif 'uuid' in asset:
            return str(asset['uuid'])
        else:
            # Generate hash from asset data (sort_keys makes it stable)
            asset_str = json.dumps(asset, sort_keys=True, default=str)
            return hashlib.md5(asset_str.encode()).hexdigest()

    def _get_asset_hash(self, asset: Dict[str, Any]) -> str:
        """
        Generate a hash for asset content to detect changes.

        Args:
            asset: Asset dictionary from API

        Returns:
            MD5 digest (fingerprint only) over the change-indicating fields,
            or over the entire asset when none of those fields are present.
        """
        # Fields that indicate content changes
        content_fields = ['updated', 'modified', 'lastModified', 'size', 'checksum', 'etag']
        content_data = {}
        for field in content_fields:
            if field in asset:
                content_data[field] = asset[field]
        # If no content fields, use entire asset
        if not content_data:
            content_data = asset
        content_str = json.dumps(content_data, sort_keys=True, default=str)
        return hashlib.md5(content_str.encode()).hexdigest()

    def is_asset_downloaded(self, asset: Dict[str, Any]) -> bool:
        """
        Check if an asset has already been downloaded.

        Args:
            asset: Asset dictionary from API

        Returns:
            True if asset is already downloaded, False otherwise.
            Note: a tracked-but-failed download also counts as "downloaded"
            here; only metadata presence is checked.
        """
        asset_key = self._get_asset_key(asset)
        return asset_key in self.metadata

    def is_asset_modified(self, asset: Dict[str, Any]) -> bool:
        """
        Check if an asset has been modified since last download.

        Args:
            asset: Asset dictionary from API

        Returns:
            True if asset has been modified (or never tracked), False otherwise
        """
        asset_key = self._get_asset_key(asset)
        if asset_key not in self.metadata:
            return True  # New asset
        current_hash = self._get_asset_hash(asset)
        stored_hash = self.metadata[asset_key].get('content_hash', '')
        return current_hash != stored_hash

    def get_new_assets(self, api_assets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Identify new or modified assets that need to be downloaded.

        Args:
            api_assets: List of assets from API response

        Returns:
            List of assets that need to be downloaded (new or changed hash)
        """
        new_assets = []
        for asset in api_assets:
            asset_key = self._get_asset_key(asset)
            if not self.is_asset_downloaded(asset):
                self.logger.info(f"New asset found: {asset_key}")
                new_assets.append(asset)
            elif self.is_asset_modified(asset):
                self.logger.info(f"Modified asset found: {asset_key}")
                new_assets.append(asset)
            else:
                self.logger.debug(f"Asset unchanged: {asset_key}")
        self.logger.info(f"Found {len(new_assets)} new/modified assets out of {len(api_assets)} total")
        return new_assets

    def mark_asset_downloaded(self, asset: Dict[str, Any], filepath: Path, success: bool = True):
        """
        Mark an asset as downloaded in the metadata.

        The full API payload is stored under 'api_data' for later inspection;
        metadata is persisted to disk immediately after every update.

        Args:
            asset: Asset dictionary from API
            filepath: Path where asset was saved
            success: Whether download was successful
        """
        asset_key = self._get_asset_key(asset)
        metadata_entry = {
            'asset_id': asset_key,
            'filename': filepath.name,
            'filepath': str(filepath),
            'download_date': datetime.now().isoformat(),
            'success': success,
            'content_hash': self._get_asset_hash(asset),
            'api_data': asset
        }
        # Add file info if download was successful and file exists
        if success and filepath.exists():
            stat = filepath.stat()
            metadata_entry.update({
                'file_size': stat.st_size,
                'file_modified': datetime.fromtimestamp(stat.st_mtime).isoformat()
            })
        self.metadata[asset_key] = metadata_entry
        self._save_metadata()
        self.logger.debug(f"Marked asset as downloaded: {asset_key}")

    def get_downloaded_assets(self) -> Dict[str, Dict[str, Any]]:
        """
        Get all downloaded asset metadata.

        Returns:
            Dictionary of downloaded asset metadata (shallow copy, so callers
            can't accidentally mutate the tracker's state through the top level)
        """
        return self.metadata.copy()

    def cleanup_missing_files(self):
        """
        Remove metadata entries for files that no longer exist on disk.

        Saves the metadata file only when at least one entry was removed.
        """
        removed_count = 0
        assets_to_remove = []
        # Collect first, then delete: never mutate the dict while iterating it.
        for asset_key, metadata_entry in self.metadata.items():
            filepath = Path(metadata_entry.get('filepath', ''))
            if not filepath.exists():
                assets_to_remove.append(asset_key)
                self.logger.warning(f"File missing, removing from metadata: {filepath}")
        for asset_key in assets_to_remove:
            del self.metadata[asset_key]
            removed_count += 1
        if removed_count > 0:
            self._save_metadata()
        self.logger.info(f"Cleaned up {removed_count} missing file entries from metadata")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about tracked assets.

        Returns:
            Dictionary with statistics (counts, missing files, total size in
            bytes and MB)
        """
        total_assets = len(self.metadata)
        successful_downloads = sum(1 for entry in self.metadata.values() if entry.get('success', False))
        failed_downloads = total_assets - successful_downloads
        total_size = 0
        existing_files = 0
        for entry in self.metadata.values():
            if 'file_size' in entry:
                total_size += entry['file_size']
            filepath = Path(entry.get('filepath', ''))
            if filepath.exists():
                existing_files += 1
        return {
            'total_tracked_assets': total_assets,
            'successful_downloads': successful_downloads,
            'failed_downloads': failed_downloads,
            'existing_files': existing_files,
            'missing_files': total_assets - existing_files,
            'total_size_bytes': total_size,
            'total_size_mb': round(total_size / (1024 * 1024), 2)
        }

    def print_stats(self):
        """Print statistics about tracked assets to stdout."""
        stats = self.get_stats()
        print("=" * 60)
        print("ASSET TRACKER STATISTICS")
        print("=" * 60)
        print(f"Total tracked assets: {stats['total_tracked_assets']}")
        print(f"Successful downloads: {stats['successful_downloads']}")
        print(f"Failed downloads: {stats['failed_downloads']}")
        print(f"Existing files: {stats['existing_files']}")
        print(f"Missing files: {stats['missing_files']}")
        print(f"Total size: {stats['total_size_mb']} MB ({stats['total_size_bytes']} bytes)")
        print("=" * 60)
def main():
    """CLI entry point: print tracker statistics, and with ``--cleanup``
    also prune metadata entries whose files are missing on disk.

    Fix: the original ran ``cleanup_missing_files()`` unconditionally but
    only *reported* the result when ``--cleanup`` was passed — so a plain
    stats invocation silently mutated the metadata file. Cleanup is now
    gated on the flag, matching the help text of the sibling downloader
    CLIs.
    """
    import sys

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )
    # Create tracker and show current state
    tracker = AssetTracker()
    tracker.print_stats()
    # Only mutate metadata when cleanup was explicitly requested
    if len(sys.argv) > 1 and sys.argv[1] == '--cleanup':
        tracker.cleanup_missing_files()
        print("\nAfter cleanup:")
        tracker.print_stats()


if __name__ == "__main__":
    main()

229
src/auth_manager.py Normal file
View File

@@ -0,0 +1,229 @@
#!/usr/bin/env python3
"""
Authentication Manager for ParentZone API
This module handles authentication against the ParentZone login API
and manages session tokens for API requests.
"""
import asyncio
import aiohttp
import json
import logging
from typing import Optional, Dict, Any
from urllib.parse import urljoin
class AuthManager:
    """Handles the ParentZone two-step login flow and holds session state.

    Step 1 (``login``) posts email/password and receives a list of user
    accounts; step 2 (``_create_session``) posts the chosen account id plus
    the password and receives the API key used for all later requests.
    """

    def __init__(self, api_url: str = "https://api.parentzone.me"):
        """
        Initialize the authentication manager.

        Args:
            api_url: Base URL of the API (trailing slash is stripped)
        """
        self.api_url = api_url.rstrip('/')
        self.login_url = urljoin(self.api_url, "/v1/auth/login")
        self.create_session_url = urljoin(self.api_url, "/v1/auth/create-session")
        # Session state; populated by login()/_create_session(), cleared by logout()
        self.session_token: Optional[str] = None
        self.api_key: Optional[str] = None
        self.user_id: Optional[str] = None
        self.user_name: Optional[str] = None
        self.provider_name: Optional[str] = None
        self.logger = logging.getLogger(__name__)
        # Standard headers for login requests — presumably mimicking the web
        # client (browser UA, sec-* hints); TODO confirm the API requires them.
        self.headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ro;q=0.7',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.parentzone.me',
            'priority': 'u=1, i',
            'sec-ch-ua': '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
        }

    async def login(self, email: str, password: str) -> bool:
        """
        Login to the ParentZone API using two-step authentication.

        Step 1: Login with email/password to get user accounts
        Step 2: Create session with first account ID and password to get API key

        Args:
            email: User email
            password: User password

        Returns:
            True if login successful, False otherwise (all failures are
            logged and swallowed; this method never raises)
        """
        self.logger.info(f"Attempting login for {email}")
        # Step 1: Login to get user accounts
        login_data = {
            "email": email,
            "password": password
        }
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            try:
                async with session.post(
                    self.login_url,
                    headers=self.headers,
                    json=login_data
                ) as response:
                    self.logger.info(f"Login response status: {response.status}")
                    if response.status == 200:
                        data = await response.json()
                        self.logger.info("Login successful")
                        self.logger.debug(f"Response data type: {type(data)}")
                        self.logger.debug(f"Full response data: {data}")
                        # Handle list response with user accounts
                        if isinstance(data, list) and len(data) > 0:
                            # Use the first account — multi-account users
                            # always get their first listed account.
                            first_account = data[0]
                            self.user_id = first_account.get('id')
                            self.user_name = first_account.get('name')
                            self.provider_name = first_account.get('providerName')
                            self.logger.info(f"Selected account: {self.user_name} at {self.provider_name} (ID: {self.user_id})")
                            # Step 2: Create session with the account ID
                            return await self._create_session(password)
                        else:
                            self.logger.error(f"Unexpected login response format: {data}")
                            return False
                    else:
                        error_text = await response.text()
                        self.logger.error(f"Login failed with status {response.status}: {error_text}")
                        return False
            except Exception as e:
                self.logger.error(f"Login request failed: {e}")
                return False

    async def _create_session(self, password: str) -> bool:
        """
        Create a session using the user ID from login.

        On success stores the returned key in ``self.api_key``.

        Args:
            password: User password

        Returns:
            True if session creation successful, False otherwise
        """
        if not self.user_id:
            self.logger.error("No user ID available for session creation")
            return False
        self.logger.info(f"Creating session for user ID: {self.user_id}")
        session_data = {
            "id": self.user_id,
            "password": password
        }
        # Add x-api-product header for session creation (required by this
        # endpoint, unlike the login endpoint above)
        session_headers = self.headers.copy()
        session_headers['x-api-product'] = 'iConnect'
        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            try:
                async with session.post(
                    self.create_session_url,
                    headers=session_headers,
                    json=session_data
                ) as response:
                    self.logger.info(f"Create session response status: {response.status}")
                    if response.status == 200:
                        data = await response.json()
                        self.logger.info("Session creation successful")
                        self.logger.debug(f"Session response data: {data}")
                        # Extract API key from response
                        if isinstance(data, dict) and 'key' in data:
                            self.api_key = data['key']
                            self.logger.info("API key obtained successfully")
                            return True
                        else:
                            self.logger.error(f"No 'key' field in session response: {data}")
                            return False
                    else:
                        error_text = await response.text()
                        self.logger.error(f"Session creation failed with status {response.status}: {error_text}")
                        return False
            except Exception as e:
                self.logger.error(f"Session creation request failed: {e}")
                return False

    def get_auth_headers(self) -> Dict[str, str]:
        """
        Get headers with authentication token.

        Returns:
            Copy of the standard headers, plus x-api-key / x-api-product
            when an API key is available
        """
        headers = self.headers.copy()
        if self.api_key:
            # Use x-api-key header for authenticated requests
            headers['x-api-key'] = self.api_key
            headers['x-api-product'] = 'iConnect'
        return headers

    def is_authenticated(self) -> bool:
        """
        Check if currently authenticated.

        Returns:
            True if an API key has been obtained, False otherwise
        """
        return self.api_key is not None

    def logout(self):
        """Clear the session data (local state only; no API call is made)."""
        self.api_key = None
        self.session_token = None
        self.user_id = None
        self.user_name = None
        self.provider_name = None
        self.logger.info("Logged out - session data cleared")
async def test_login():
    """Manual smoke test for the two-step login flow.

    Security fix: the original committed real-looking credentials to source
    control. Credentials are now read from the PARENTZONE_EMAIL and
    PARENTZONE_PASSWORD environment variables; if either is unset the test
    explains what to do and exits without calling the API.
    """
    import os

    email = os.environ.get("PARENTZONE_EMAIL")
    password = os.environ.get("PARENTZONE_PASSWORD")
    if not email or not password:
        print("Set PARENTZONE_EMAIL and PARENTZONE_PASSWORD to run this test.")
        return

    auth_manager = AuthManager()
    print("Testing ParentZone Login...")
    success = await auth_manager.login(email, password)
    if success:
        print("✅ Login successful!")
        print(f"User: {auth_manager.user_name} at {auth_manager.provider_name}")
        print(f"User ID: {auth_manager.user_id}")
        # Only show a truncated key, never the full secret
        if auth_manager.api_key:
            print(f"API Key: {auth_manager.api_key[:20]}...")
        else:
            print("No API key found")
        # Test getting auth headers
        headers = auth_manager.get_auth_headers()
        print(f"Auth headers: {list(headers.keys())}")
    else:
        print("❌ Login failed!")


if __name__ == "__main__":
    asyncio.run(test_login())

517
src/config_downloader.py Normal file
View File

@@ -0,0 +1,517 @@
#!/usr/bin/env python3
"""
Configuration-based Image Downloader
This script reads configuration from a JSON file and downloads images from a REST API.
It's a simplified version of the main downloader for easier use.
Usage:
python config_downloader.py --config config.json
"""
import argparse
import json
import asyncio
import aiohttp
import aiofiles
import os
import logging
from pathlib import Path
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Any, Optional
import time
from tqdm import tqdm
# Import the auth manager and asset tracker.
# Both are optional: when unavailable (e.g. run outside the package) the
# names are set to None and the downloader degrades gracefully —
# ConfigImageDownloader checks their truthiness before use.
try:
    from src.auth_manager import AuthManager
except ImportError:
    AuthManager = None
try:
    from src.asset_tracker import AssetTracker
except ImportError:
    AssetTracker = None
class ConfigImageDownloader:
    """Download images listed by a REST API, driven by a JSON config file.

    The config supplies API endpoints, output directory and authentication
    (API key and/or email+password login). Downloads run concurrently over
    one aiohttp session, bounded by a semaphore, with optional asset
    tracking to skip already-downloaded/unchanged assets.
    """

    def __init__(self, config_file: str):
        """
        Initialize the downloader with configuration from a JSON file.

        Args:
            config_file: Path to the JSON configuration file

        Raises:
            FileNotFoundError / ValueError: propagated from load_config
        """
        self.config = self.load_config(config_file)
        self.setup_logging()
        # Create output directory
        self.output_dir = Path(self.config["output_dir"])
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Track download statistics
        self.stats = {"total": 0, "successful": 0, "failed": 0, "skipped": 0}
        # Authentication manager (created lazily in authenticate())
        self.auth_manager = None
        # Initialize asset tracker if enabled and available
        track_assets = self.config.get("track_assets", True)
        self.asset_tracker = None
        if track_assets and AssetTracker:
            self.asset_tracker = AssetTracker(storage_dir=str(self.output_dir))
            self.logger.info("Asset tracking enabled")
        elif track_assets:
            self.logger.warning(
                "Asset tracking requested but AssetTracker not available"
            )
        else:
            self.logger.info("Asset tracking disabled")

    def load_config(self, config_file: str) -> Dict[str, Any]:
        """Load configuration from JSON file.

        Validates required fields, applies defaults for optional ones, and
        copies the API key into the headers used for the list endpoint.

        Raises:
            FileNotFoundError: when the config file does not exist
            ValueError: on invalid JSON or a missing required field
        """
        try:
            with open(config_file, "r") as f:
                config = json.load(f)
            # Validate required fields
            required_fields = [
                "api_url",
                "list_endpoint",
                "download_endpoint",
                "output_dir",
            ]
            for field in required_fields:
                if field not in config:
                    raise ValueError(f"Missing required field: {field}")
            # Set defaults for optional fields
            config.setdefault("max_concurrent", 5)
            config.setdefault("timeout", 30)
            config.setdefault("headers", {})
            # Note: API key is now passed as URL parameter, not header
            # The x-api-key header is only used for the list endpoint
            # Add API key to headers for list endpoint authentication
            if "api_key" in config and config["api_key"]:
                config["headers"]["x-api-key"] = config["api_key"]
            return config
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found: {config_file}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in configuration file: {e}")

    def setup_logging(self):
        """Setup logging configuration (file in the output dir + console)."""
        log_file = Path(self.config["output_dir"]) / "download.log"
        # Create output directory if it doesn't exist
        log_file.parent.mkdir(parents=True, exist_ok=True)
        # Create log file if it doesn't exist
        log_file.touch(exist_ok=True)
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
        )
        self.logger = logging.getLogger(__name__)

    async def authenticate(self):
        """Perform login authentication if credentials are provided in config.

        Raises:
            Exception: when login fails, or when only one of email/password
            is configured (both are required together)
        """
        if "email" in self.config and "password" in self.config and AuthManager:
            self.logger.info("Attempting login authentication...")
            self.auth_manager = AuthManager(self.config["api_url"])
            success = await self.auth_manager.login(
                self.config["email"], self.config["password"]
            )
            if success:
                self.logger.info("Login authentication successful")
            else:
                self.logger.error("Login authentication failed")
                raise Exception("Login authentication failed")
        elif "email" in self.config or "password" in self.config:
            self.logger.warning(
                "Both email and password must be provided in config for login authentication"
            )
            raise Exception(
                "Both email and password must be provided in config for login authentication"
            )

    async def get_asset_list(
        self, session: aiohttp.ClientSession
    ) -> List[Dict[str, Any]]:
        """Fetch the list of assets from the API.

        Handles both bare-list responses and dict responses that wrap the
        list under a common key (data/results/items/assets/images); a dict
        matching none of those is treated as a single asset.

        Raises:
            Any network/HTTP error, or ValueError for an unexpected
            top-level response type.
        """
        url = urljoin(self.config["api_url"], self.config["list_endpoint"])
        self.logger.info(f"Fetching asset list from: {url}")
        headers = self.config.get("headers", {})
        # Use API key if provided
        if "api_key" in self.config and self.config["api_key"]:
            headers["x-api-key"] = self.config["api_key"]
        # Use login authentication if available
        elif self.auth_manager and self.auth_manager.is_authenticated():
            headers.update(self.auth_manager.get_auth_headers())
        try:
            async with session.get(
                url, headers=headers, timeout=self.config["timeout"]
            ) as response:
                response.raise_for_status()
                data = await response.json()
                # Handle different response formats
                if isinstance(data, list):
                    assets = data
                elif isinstance(data, dict):
                    # Common patterns for API responses
                    for key in ["data", "results", "items", "assets", "images"]:
                        if key in data and isinstance(data[key], list):
                            assets = data[key]
                            break
                    else:
                        assets = [data]  # Single asset
                else:
                    raise ValueError(f"Unexpected response format: {type(data)}")
                self.logger.info(f"Found {len(assets)} assets")
                return assets
        except Exception as e:
            self.logger.error(f"Failed to fetch asset list: {e}")
            raise

    def get_download_url(self, asset: Dict[str, Any]) -> str:
        """Generate the download URL for an asset.

        Builds ``/v1/media/<id>/full`` with the API key and the asset's
        'updated' stamp as query parameters.
        """
        # Try different common patterns for asset IDs
        asset_id = None
        # Common field names for asset identifiers
        id_fields = ["id", "asset_id", "image_id", "file_id", "uuid", "key"]
        for field in id_fields:
            if field in asset:
                asset_id = asset[field]
                break
        if asset_id is None:
            # If no ID field found, try to use the asset itself as the ID
            # NOTE(review): str(dict) is unlikely to be a valid media id —
            # confirm this fallback is ever hit.
            asset_id = str(asset)
        # Build download URL with required parameters
        from urllib.parse import urlencode

        params = {"key": self.config.get("api_key", ""), "u": asset.get("updated", "")}
        download_url = urljoin(
            self.config["api_url"], f"/v1/media/{asset_id}/full?{urlencode(params)}"
        )
        return download_url

    def get_filename(self, asset: Dict[str, Any], url: str) -> str:
        """Generate a filename for the downloaded asset.

        Prefers metadata fields, falls back to the URL path; guarantees an
        extension, sanitizes invalid characters, and appends ``_N`` until
        the name is unique in the output directory.
        """
        # Try to get filename from asset metadata
        if "fileName" in asset:
            filename = asset["fileName"]
        elif "filename" in asset:
            filename = asset["filename"]
        elif "name" in asset:
            filename = asset["name"]
        elif "title" in asset:
            filename = asset["title"]
        else:
            # Extract filename from URL
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
        # If no extension, try to get it from content-type or add default
        if "." not in filename:
            if "mimeType" in asset:
                ext = self._get_extension_from_mime(asset["mimeType"])
            elif "content_type" in asset:
                ext = self._get_extension_from_mime(asset["content_type"])
            else:
                ext = ".jpg"  # Default extension
            filename += ext
        # Sanitize filename
        filename = self._sanitize_filename(filename)
        # Ensure unique filename
        counter = 1
        original_filename = filename
        while (self.output_dir / filename).exists():
            name, ext = os.path.splitext(original_filename)
            filename = f"{name}_{counter}{ext}"
            counter += 1
        return filename

    def _get_extension_from_mime(self, mime_type: str) -> str:
        """Get file extension from MIME type (defaults to .jpg)."""
        mime_to_ext = {
            "image/jpeg": ".jpg",
            "image/jpg": ".jpg",
            "image/png": ".png",
            "image/gif": ".gif",
            "image/webp": ".webp",
            "image/bmp": ".bmp",
            "image/tiff": ".tiff",
            "image/svg+xml": ".svg",
        }
        return mime_to_ext.get(mime_type.lower(), ".jpg")

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename by removing invalid characters."""
        # Remove or replace invalid characters
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            filename = filename.replace(char, "_")
        # Remove leading/trailing spaces and dots
        filename = filename.strip(". ")
        # Ensure filename is not empty
        if not filename:
            filename = "image"
        return filename

    async def download_asset(
        self,
        session: aiohttp.ClientSession,
        asset: Dict[str, Any],
        semaphore: asyncio.Semaphore,
    ) -> bool:
        """Download a single asset.

        Returns True on success or skip, False on failure; never raises —
        errors are recorded in stats (and the tracker, when enabled).
        """
        async with semaphore:
            try:
                download_url = self.get_download_url(asset)
                filename = self.get_filename(asset, download_url)
                filepath = self.output_dir / filename
                # Check if file already exists and we're not tracking assets
                if filepath.exists() and not self.asset_tracker:
                    # NOTE(review): the literal "(unknown)" in these three log
                    # messages looks like a lost f-string field (probably
                    # {filename}) — confirm and restore.
                    self.logger.info(f"Skipping (unknown) (already exists)")
                    self.stats["skipped"] += 1
                    return True
                self.logger.info(f"Downloading (unknown) from {download_url}")
                headers = self.config.get("headers", {})
                async with session.get(
                    download_url, headers=headers, timeout=self.config["timeout"]
                ) as response:
                    response.raise_for_status()
                    # Get content type to verify it's an image (warn only;
                    # the body is saved regardless)
                    content_type = response.headers.get("content-type", "")
                    if not content_type.startswith("image/"):
                        self.logger.warning(
                            f"Content type is not an image: {content_type}"
                        )
                    # Download the file in chunks to bound memory use
                    async with aiofiles.open(filepath, "wb") as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                # Set file modification time to match the updated timestamp
                if "updated" in asset:
                    try:
                        from datetime import datetime
                        import os

                        # Parse the ISO timestamp ("Z" suffix → UTC offset)
                        updated_time = datetime.fromisoformat(
                            asset["updated"].replace("Z", "+00:00")
                        )
                        # Set file modification time
                        os.utime(
                            filepath,
                            (updated_time.timestamp(), updated_time.timestamp()),
                        )
                        self.logger.info(
                            f"Set file modification time to {asset['updated']}"
                        )
                    except Exception as e:
                        self.logger.warning(
                            f"Failed to set file modification time: {e}"
                        )
                # Mark asset as downloaded in tracker
                if self.asset_tracker:
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, True)
                self.logger.info(f"Successfully downloaded (unknown)")
                self.stats["successful"] += 1
                return True
            except Exception as e:
                # Mark asset as failed in tracker
                if self.asset_tracker:
                    download_url = self.get_download_url(asset)
                    filename = self.get_filename(asset, download_url)
                    filepath = self.output_dir / filename
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, False)
                self.logger.error(
                    f"Failed to download asset {asset.get('id', 'unknown')}: {e}"
                )
                self.stats["failed"] += 1
                return False

    async def download_all_assets(self, force_redownload: bool = False):
        """
        Download all assets from the API.

        Authenticates (if configured), fetches the asset list, optionally
        filters it through the asset tracker, then downloads concurrently
        (bounded by config["max_concurrent"]) with a tqdm progress bar.

        Args:
            force_redownload: If True, download all assets regardless of tracking
        """
        start_time = time.time()
        # Create aiohttp session with connection pooling
        connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
        timeout = aiohttp.ClientTimeout(total=self.config["timeout"])
        async with aiohttp.ClientSession(
            connector=connector, timeout=timeout
        ) as session:
            try:
                # Perform authentication if needed
                await self.authenticate()
                # Get asset list
                all_assets = await self.get_asset_list(session)
                self.logger.info(f"Retrieved {len(all_assets)} total assets from API")
                if not all_assets:
                    self.logger.warning("No assets found to download")
                    return
                # Filter for new/modified assets if tracking is enabled
                if self.asset_tracker and not force_redownload:
                    assets = self.asset_tracker.get_new_assets(all_assets)
                    self.logger.info(
                        f"Found {len(assets)} new/modified assets to download"
                    )
                    if len(assets) == 0:
                        self.logger.info("All assets are up to date!")
                        return
                else:
                    assets = all_assets
                    if force_redownload:
                        self.logger.info(
                            "Force redownload enabled - downloading all assets"
                        )
                self.stats["total"] = len(assets)
                # Create semaphore to limit concurrent downloads
                semaphore = asyncio.Semaphore(self.config["max_concurrent"])
                # Create tasks for all downloads
                tasks = [
                    self.download_asset(session, asset, semaphore) for asset in assets
                ]
                # Download all assets with progress bar
                with tqdm(total=len(tasks), desc="Downloading assets") as pbar:
                    for coro in asyncio.as_completed(tasks):
                        # Result is unused here; per-asset outcomes are
                        # accumulated in self.stats by download_asset.
                        result = await coro
                        pbar.update(1)
                        pbar.set_postfix(
                            {
                                "Success": self.stats["successful"],
                                "Failed": self.stats["failed"],
                                "Skipped": self.stats["skipped"],
                            }
                        )
            except Exception as e:
                self.logger.error(f"Error during download process: {e}")
                raise
        # Print final statistics
        elapsed_time = time.time() - start_time
        self.logger.info(f"Download completed in {elapsed_time:.2f} seconds")
        self.logger.info(f"Statistics: {self.stats}")
def main():
    """CLI entry point for the config-driven image downloader.

    Supports three modes: normal download, --show-stats / --cleanup
    (tracker maintenance, handled first and exiting early), and
    --force-redownload. Returns a process exit code (0 success, 1 error).
    """
    parser = argparse.ArgumentParser(
        description="Download images using configuration file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python config_downloader.py --config config.json
# Create a config file first:
cp config/config_example.json config/my_config.json
# Edit config/my_config.json with your API details
python config_downloader.py --config my_config.json
""",
    )
    parser.add_argument(
        "--config", required=True, help="Path to the JSON configuration file"
    )
    parser.add_argument(
        "--force-redownload",
        action="store_true",
        help="Force re-download of all assets, even if already tracked",
    )
    parser.add_argument(
        "--show-stats",
        action="store_true",
        help="Show asset tracking statistics and exit",
    )
    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Clean up metadata for missing files and exit",
    )
    args = parser.parse_args()
    # Handle special commands first (maintenance modes exit without downloading)
    if args.show_stats or args.cleanup:
        try:
            downloader = ConfigImageDownloader(args.config)
            if downloader.asset_tracker:
                if args.cleanup:
                    downloader.asset_tracker.cleanup_missing_files()
                if args.show_stats:
                    downloader.asset_tracker.print_stats()
            else:
                print("Asset tracking is not available")
        except Exception as e:
            print(f"Error: {e}")
            return 1
        return 0
    try:
        downloader = ConfigImageDownloader(args.config)
        asyncio.run(
            downloader.download_all_assets(force_redownload=args.force_redownload)
        )
    except KeyboardInterrupt:
        # NOTE(review): this path falls through and returns 0 — confirm
        # Ctrl-C is meant to be a "success" exit code.
        print("\nDownload interrupted by user")
    except Exception as e:
        print(f"Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())

View File

@@ -0,0 +1,297 @@
#!/usr/bin/env python3
"""
Configuration-based Snapshot Downloader for ParentZone
This script reads configuration from a JSON file and downloads snapshots (daily events)
from the ParentZone API with pagination support, generating a comprehensive HTML report.
"""
import argparse
import asyncio
import json
import logging
import os
from datetime import datetime, timedelta
from pathlib import Path
# Import the snapshot downloader.
# Unlike the optional imports in config_downloader, this dependency is
# mandatory: without it this module cannot do anything, so fail fast with
# a clear message instead of a traceback.
try:
    from src.snapshot_downloader import SnapshotDownloader
except ImportError:
    print(
        "Error: snapshot_downloader.py not found. Please ensure it's in the same directory."
    )
    exit(1)
class ConfigSnapshotDownloader:
    """Config-file wrapper around SnapshotDownloader.

    Loads and validates a JSON config (authentication, date range, type
    ids), then delegates the actual paginated download / HTML report
    generation to the underlying SnapshotDownloader instance.
    """

    def __init__(self, config_file: str):
        """
        Initialize the downloader with configuration from a JSON file.

        Args:
            config_file: Path to the JSON configuration file

        Raises:
            FileNotFoundError / ValueError: propagated from load_config
        """
        self.config = self.load_config(config_file)
        self.setup_logging()
        # Create the underlying snapshot downloader
        self.downloader = SnapshotDownloader(
            api_url=self.config.get("api_url", "https://api.parentzone.me"),
            output_dir=self.config.get("output_dir", "snapshots"),
            api_key=self.config.get("api_key"),
            email=self.config.get("email"),
            password=self.config.get("password"),
        )

    def load_config(self, config_file: str) -> dict:
        """Load configuration from JSON file.

        Requires either api_key or both email and password; defaults the
        date range to the last 365 days when not specified.

        Raises:
            FileNotFoundError: when the config file does not exist
            ValueError: on invalid JSON or missing authentication
        """
        try:
            with open(config_file, "r") as f:
                config = json.load(f)
            # Validate required authentication
            has_api_key = "api_key" in config and config["api_key"]
            has_credentials = (
                "email" in config
                and "password" in config
                and config["email"]
                and config["password"]
            )
            if not has_api_key and not has_credentials:
                raise ValueError(
                    "Either 'api_key' or both 'email' and 'password' must be provided in config"
                )
            # Set defaults for optional fields
            config.setdefault("api_url", "https://api.parentzone.me")
            config.setdefault("output_dir", "snapshots")
            config.setdefault("type_ids", [15])
            config.setdefault("max_pages", None)
            # Set default date range (last year) if not specified
            if "date_from" not in config or not config["date_from"]:
                config["date_from"] = (datetime.now() - timedelta(days=365)).strftime(
                    "%Y-%m-%d"
                )
            if "date_to" not in config or not config["date_to"]:
                config["date_to"] = datetime.now().strftime("%Y-%m-%d")
            return config
        except FileNotFoundError:
            raise FileNotFoundError(f"Configuration file not found: {config_file}")
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in configuration file: {e}")

    def setup_logging(self):
        """Setup logging configuration (file in the output dir + console)."""
        output_dir = Path(self.config["output_dir"])
        output_dir.mkdir(exist_ok=True)
        log_file = output_dir / "snapshots.log"
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
        )
        self.logger = logging.getLogger(__name__)

    async def download_snapshots(self) -> Path:
        """
        Download snapshots using the configuration settings.

        Returns:
            Path to the generated HTML file

        Raises:
            Any error from the underlying downloader (logged, then re-raised)
        """
        self.logger.info("Starting snapshot download with configuration")
        self.logger.info(
            f"Date range: {self.config['date_from']} to {self.config['date_to']}"
        )
        self.logger.info(f"Type IDs: {self.config['type_ids']}")
        self.logger.info(f"Output directory: {self.config['output_dir']}")
        if self.config.get("max_pages"):
            self.logger.info(f"Max pages limit: {self.config['max_pages']}")
        try:
            html_file = await self.downloader.download_snapshots(
                type_ids=self.config["type_ids"],
                date_from=self.config["date_from"],
                date_to=self.config["date_to"],
                max_pages=self.config.get("max_pages"),
            )
            return html_file
        except Exception as e:
            self.logger.error(f"Error during snapshot download: {e}")
            raise

    def print_config_summary(self):
        """Print a summary of the current configuration to stdout."""
        print("=" * 60)
        print("SNAPSHOT DOWNLOADER CONFIGURATION")
        print("=" * 60)
        print(f"API URL: {self.config['api_url']}")
        print(f"Output Directory: {self.config['output_dir']}")
        print(f"Date From: {self.config['date_from']}")
        print(f"Date To: {self.config['date_to']}")
        print(f"Type IDs: {self.config['type_ids']}")
        auth_method = "API Key" if self.config.get("api_key") else "Email/Password"
        print(f"Authentication: {auth_method}")
        if self.config.get("max_pages"):
            print(f"Max Pages: {self.config['max_pages']}")
        print("=" * 60)
def create_example_config():
    """Write an example configuration file to the current directory.

    Returns:
        Path to the freshly written example config file.
    """
    example_config = {
        "api_url": "https://api.parentzone.me",
        "output_dir": "./snapshots",
        "type_ids": [15],
        "date_from": "2024-01-01",
        "date_to": "2024-12-31",
        # None serializes to JSON null; the original bare `null` was a
        # NameError at call time.
        "max_pages": None,
        "api_key": "your-api-key-here",
        "email": "your-email@example.com",
        "password": "your-password-here",
    }
    config_file = Path("snapshot_config_example.json")
    with open(config_file, "w") as f:
        json.dump(example_config, f, indent=2)
    print(f"✅ Example configuration created: {config_file}")
    print("📝 Edit the file with your credentials and settings")
    return config_file
def main():
    """CLI entry point for the config-driven snapshot downloader.

    Parses command-line options, optionally writes an example config, then
    runs the async download and prints a human-readable summary.

    Returns:
        int: 0 on success, 1 on any error (missing/invalid config,
        interruption, or download failure).
    """
    parser = argparse.ArgumentParser(
        description="Download ParentZone snapshots using configuration file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Use existing config file
    python3 config_snapshot_downloader.py --config snapshot_config.json

    # Create example config file
    python3 config_snapshot_downloader.py --create-example

    # Show config summary before downloading
    python3 config_snapshot_downloader.py --config snapshot_config.json --show-config

Configuration file format:
    {
        "api_url": "https://api.parentzone.me",
        "output_dir": "./snapshots",
        "type_ids": [15],
        "date_from": "2024-01-01",
        "date_to": "2024-12-31",
        "max_pages": null,
        "api_key": "your-api-key-here",
        "email": "your-email@example.com",
        "password": "your-password-here"
    }

Notes:
    - Either 'api_key' OR both 'email' and 'password' are required
    - 'date_from' and 'date_to' default to last year if not specified
    - 'type_ids' defaults to [15] (snapshot type)
    - 'max_pages' limits pages fetched (useful for testing)
        """,
    )
    parser.add_argument("--config", help="Path to the JSON configuration file")
    parser.add_argument(
        "--create-example",
        action="store_true",
        help="Create an example configuration file and exit",
    )
    parser.add_argument(
        "--show-config",
        action="store_true",
        help="Show configuration summary before downloading",
    )
    parser.add_argument(
        "--debug",
        action="store_true",
        help="Enable debug mode with detailed server response logging",
    )
    args = parser.parse_args()
    # Handle create example: write the template and exit successfully.
    if args.create_example:
        create_example_config()
        return 0
    # Validate config argument: --config is mandatory otherwise.
    if not args.config:
        print("Error: --config argument is required (or use --create-example)")
        print("Run with --help for more information")
        return 1
    try:
        # Create downloader (loads + validates the config file).
        downloader = ConfigSnapshotDownloader(args.config)
        # Show configuration if requested
        if args.show_config:
            downloader.print_config_summary()
            print()
        # Enable debug mode if requested
        if args.debug:
            print("🔍 DEBUG MODE ENABLED - Detailed server responses will be printed")
            # Set debug flag on the underlying downloader
            downloader.downloader.debug_mode = True
        # Download snapshots (async entry point run to completion here).
        html_file = asyncio.run(downloader.download_snapshots())
        if html_file:
            print("\n" + "=" * 60)
            print("✅ SUCCESS!")
            print("=" * 60)
            print(f"📄 HTML Report: {html_file}")
            print(f"📁 Open the file in your browser to view the snapshots")
            print("🎯 The report includes:")
            print("    • All snapshots with descriptions and metadata")
            print("    • Images and attachments (if any)")
            print("    • Search and filtering capabilities")
            print("    • Interactive collapsible sections")
            print("=" * 60)
        else:
            print("⚠️  No snapshots were found for the specified period")
            print("💡 Try adjusting the date range in your configuration")
    except KeyboardInterrupt:
        print("\n⚠️  Download interrupted by user")
        return 1
    except FileNotFoundError as e:
        print(f"❌ Configuration file error: {e}")
        print("💡 Use --create-example to generate a template")
        return 1
    except ValueError as e:
        # Raised for invalid JSON or missing authentication settings.
        print(f"❌ Configuration error: {e}")
        return 1
    except Exception as e:
        print(f"❌ Download failed: {e}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())

596
src/image_downloader.py Normal file
View File

@@ -0,0 +1,596 @@
#!/usr/bin/env python3
"""
Image Downloader Script
This script downloads images from a REST API that provides:
1. An endpoint to list all assets
2. An endpoint to download individual assets in full resolution
Usage:
python image_downloader.py --api-url <base_url> --list-endpoint <endpoint> --download-endpoint <endpoint> --output-dir <directory>
"""
import argparse
import asyncio
import aiohttp
import aiofiles
import os
import json
import logging
from pathlib import Path
from urllib.parse import urljoin, urlparse
from typing import List, Dict, Any, Optional
import time
from tqdm import tqdm
import hashlib
# Import the auth manager and asset tracker
try:
from src.auth_manager import AuthManager
except ImportError:
AuthManager = None
try:
from src.asset_tracker import AssetTracker
except ImportError:
AssetTracker = None
class ImageDownloader:
    """Concurrent downloader for image assets exposed by a REST API.

    Fetches the asset list from the configured list endpoint, then downloads
    each asset (bounded by ``max_concurrent``) into ``output_dir``.
    Optionally records downloads in an :class:`AssetTracker` so subsequent
    runs only fetch new or modified assets.
    """

    def __init__(
        self,
        api_url: str,
        list_endpoint: str,
        download_endpoint: str,
        output_dir: str,
        max_concurrent: int = 5,
        timeout: int = 30,
        api_key: Optional[str] = None,
        email: Optional[str] = None,
        password: Optional[str] = None,
        track_assets: bool = True,
    ):
        """
        Initialize the image downloader.

        Args:
            api_url: Base URL of the API
            list_endpoint: Endpoint to get the list of assets
            download_endpoint: Endpoint to download individual assets
            output_dir: Directory to save downloaded images
            max_concurrent: Maximum number of concurrent downloads
            timeout: Request timeout in seconds
            api_key: API key for authentication
            email: Email for login authentication
            password: Password for login authentication
            track_assets: Whether to enable asset tracking to avoid re-downloads
        """
        self.api_url = api_url.rstrip("/")
        self.list_endpoint = list_endpoint.lstrip("/")
        self.download_endpoint = download_endpoint.lstrip("/")
        self.output_dir = Path(output_dir)
        self.max_concurrent = max_concurrent
        self.timeout = timeout
        self.api_key = api_key
        self.email = email
        self.password = password
        self.auth_manager = None
        # Create output directory if it doesn't exist
        self.output_dir.mkdir(parents=True, exist_ok=True)
        # Log to both a file in the output directory and the console.
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[
                logging.FileHandler(self.output_dir / "download.log"),
                logging.StreamHandler(),
            ],
        )
        self.logger = logging.getLogger(__name__)
        # Initialize asset tracker if enabled and available (AssetTracker may
        # be None when the optional module failed to import).
        self.asset_tracker = None
        if track_assets and AssetTracker:
            self.asset_tracker = AssetTracker(storage_dir=str(self.output_dir))
            self.logger.info("Asset tracking enabled")
        elif track_assets:
            self.logger.warning(
                "Asset tracking requested but AssetTracker not available"
            )
        else:
            self.logger.info("Asset tracking disabled")
        # Track download statistics
        self.stats = {"total": 0, "successful": 0, "failed": 0, "skipped": 0}

    async def authenticate(self):
        """Perform login authentication if credentials are provided.

        Raises:
            Exception: If login fails, or only one of email/password is given.
        """
        if self.email and self.password and AuthManager:
            self.logger.info("Attempting login authentication...")
            self.auth_manager = AuthManager(self.api_url)
            success = await self.auth_manager.login(self.email, self.password)
            if success:
                self.logger.info("Login authentication successful")
            else:
                self.logger.error("Login authentication failed")
                raise Exception("Login authentication failed")
        elif self.email or self.password:
            self.logger.warning(
                "Both email and password must be provided for login authentication"
            )
            raise Exception(
                "Both email and password must be provided for login authentication"
            )

    async def get_asset_list(
        self, session: aiohttp.ClientSession
    ) -> List[Dict[str, Any]]:
        """
        Fetch the list of assets from the API.

        Args:
            session: aiohttp session for making requests

        Returns:
            List of asset dictionaries

        Raises:
            ValueError: If the response body is neither a list nor a dict.
        """
        url = urljoin(self.api_url, self.list_endpoint)
        self.logger.info(f"Fetching asset list from: {url}")
        try:
            headers = {}
            # Use API key if provided
            if self.api_key:
                headers["x-api-key"] = self.api_key
            # Otherwise fall back to token auth from a prior login
            elif self.auth_manager and self.auth_manager.is_authenticated():
                headers.update(self.auth_manager.get_auth_headers())
            async with session.get(
                url, headers=headers, timeout=self.timeout
            ) as response:
                response.raise_for_status()
                data = await response.json()
                # Handle different response formats
                if isinstance(data, list):
                    assets = data
                elif isinstance(data, dict):
                    # Common envelope keys for API responses
                    if "data" in data:
                        assets = data["data"]
                    elif "results" in data:
                        assets = data["results"]
                    elif "items" in data:
                        assets = data["items"]
                    else:
                        assets = [data]  # Single asset
                else:
                    raise ValueError(f"Unexpected response format: {type(data)}")
                self.logger.info(f"Found {len(assets)} assets")
                return assets
        except Exception as e:
            self.logger.error(f"Failed to fetch asset list: {e}")
            raise

    def get_download_url(self, asset: Dict[str, Any]) -> str:
        """
        Generate the download URL for an asset.

        Args:
            asset: Asset dictionary from the API

        Returns:
            Download URL for the asset
        """
        # Try different common patterns for asset IDs
        asset_id = None
        # Common field names for asset identifiers
        id_fields = ["id", "asset_id", "image_id", "file_id", "uuid", "key"]
        for field in id_fields:
            if field in asset:
                asset_id = asset[field]
                break
        if asset_id is None:
            # If no ID field found, try to use the asset itself as the ID
            asset_id = str(asset)
        # Build download URL with required query parameters. Only include the
        # API key when one is configured; previously a literal "key=None" was
        # sent when login (token) authentication was in use.
        from urllib.parse import urlencode

        params = {}
        if self.api_key:
            params["key"] = self.api_key
        params["u"] = asset.get("updated", "")
        download_url = urljoin(
            self.api_url, f"/v1/media/{asset_id}/full?{urlencode(params)}"
        )
        return download_url

    def get_filename(self, asset: Dict[str, Any], url: str) -> str:
        """
        Generate a filename for the downloaded asset.

        Args:
            asset: Asset dictionary from the API
            url: Download URL

        Returns:
            Filename for the asset (unique within the output directory)
        """
        # Try to get filename from asset metadata
        if "fileName" in asset:
            filename = asset["fileName"]
        elif "filename" in asset:
            filename = asset["filename"]
        elif "name" in asset:
            filename = asset["name"]
        elif "title" in asset:
            filename = asset["title"]
        else:
            # Extract filename from URL
            parsed_url = urlparse(url)
            filename = os.path.basename(parsed_url.path)
        # If no extension, try to get it from content-type or add default
        if "." not in filename:
            if "mimeType" in asset:
                ext = self._get_extension_from_mime(asset["mimeType"])
            elif "content_type" in asset:
                ext = self._get_extension_from_mime(asset["content_type"])
            else:
                ext = ".jpg"  # Default extension
            filename += ext
        # Sanitize filename
        filename = self._sanitize_filename(filename)
        # Ensure unique filename by appending a numeric suffix on collision.
        counter = 1
        original_filename = filename
        while (self.output_dir / filename).exists():
            name, ext = os.path.splitext(original_filename)
            filename = f"{name}_{counter}{ext}"
            counter += 1
        return filename

    def _get_extension_from_mime(self, mime_type: str) -> str:
        """Get file extension from MIME type (defaults to .jpg for unknowns)."""
        mime_to_ext = {
            "image/jpeg": ".jpg",
            "image/jpg": ".jpg",
            "image/png": ".png",
            "image/gif": ".gif",
            "image/webp": ".webp",
            "image/bmp": ".bmp",
            "image/tiff": ".tiff",
            "image/svg+xml": ".svg",
        }
        return mime_to_ext.get(mime_type.lower(), ".jpg")

    def _sanitize_filename(self, filename: str) -> str:
        """Sanitize filename by removing invalid characters."""
        # Remove or replace invalid characters
        invalid_chars = '<>:"/\\|?*'
        for char in invalid_chars:
            filename = filename.replace(char, "_")
        # Remove leading/trailing spaces and dots
        filename = filename.strip(". ")
        # Ensure filename is not empty
        if not filename:
            filename = "image"
        return filename

    async def download_asset(
        self,
        session: aiohttp.ClientSession,
        asset: Dict[str, Any],
        semaphore: asyncio.Semaphore,
    ) -> bool:
        """
        Download a single asset.

        Args:
            session: aiohttp session for making requests
            asset: Asset dictionary from the API
            semaphore: Semaphore to limit concurrent downloads

        Returns:
            True if download was successful (or skipped), False otherwise
        """
        async with semaphore:
            try:
                download_url = self.get_download_url(asset)
                filename = self.get_filename(asset, download_url)
                filepath = self.output_dir / filename
                # Check if file already exists and we're not tracking assets
                if filepath.exists() and not self.asset_tracker:
                    self.logger.info(f"Skipping {filename} (already exists)")
                    self.stats["skipped"] += 1
                    return True
                self.logger.info(f"Downloading {filename} from {download_url}")
                async with session.get(download_url, timeout=self.timeout) as response:
                    response.raise_for_status()
                    # Get content type to verify it's an image
                    content_type = response.headers.get("content-type", "")
                    if not content_type.startswith("image/"):
                        self.logger.warning(
                            f"Content type is not an image: {content_type}"
                        )
                    # Download the file in chunks to keep memory bounded.
                    async with aiofiles.open(filepath, "wb") as f:
                        async for chunk in response.content.iter_chunked(8192):
                            await f.write(chunk)
                # Set file modification time to match the updated timestamp
                if "updated" in asset:
                    try:
                        from datetime import datetime

                        # Parse the ISO timestamp (API uses a trailing "Z")
                        updated_time = datetime.fromisoformat(
                            asset["updated"].replace("Z", "+00:00")
                        )
                        # Set file modification time
                        os.utime(
                            filepath,
                            (updated_time.timestamp(), updated_time.timestamp()),
                        )
                        self.logger.info(
                            f"Set file modification time to {asset['updated']}"
                        )
                    except Exception as e:
                        self.logger.warning(
                            f"Failed to set file modification time: {e}"
                        )
                # Mark asset as downloaded in tracker
                if self.asset_tracker:
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, True)
                self.logger.info(f"Successfully downloaded {filename}")
                self.stats["successful"] += 1
                return True
            except Exception as e:
                # Mark asset as failed in tracker; never let the bookkeeping
                # itself raise and mask the original download error.
                if self.asset_tracker:
                    try:
                        download_url = self.get_download_url(asset)
                        filename = self.get_filename(asset, download_url)
                        filepath = self.output_dir / filename
                        self.asset_tracker.mark_asset_downloaded(
                            asset, filepath, False
                        )
                    except Exception:
                        pass
                self.logger.error(
                    f"Failed to download asset {asset.get('id', 'unknown')}: {e}"
                )
                self.stats["failed"] += 1
                return False

    async def download_all_assets(self, force_redownload: bool = False):
        """
        Download all assets from the API.

        Args:
            force_redownload: If True, download all assets regardless of tracking
        """
        start_time = time.time()
        # Create aiohttp session with connection pooling
        connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
        timeout = aiohttp.ClientTimeout(total=self.timeout)
        async with aiohttp.ClientSession(
            connector=connector, timeout=timeout
        ) as session:
            try:
                # Perform authentication if needed
                await self.authenticate()
                # Get asset list
                all_assets = await self.get_asset_list(session)
                self.logger.info(f"Retrieved {len(all_assets)} total assets from API")
                if not all_assets:
                    self.logger.warning("No assets found to download")
                    return
                # Filter for new/modified assets if tracking is enabled
                if self.asset_tracker and not force_redownload:
                    assets = self.asset_tracker.get_new_assets(all_assets)
                    self.logger.info(
                        f"Found {len(assets)} new/modified assets to download"
                    )
                    if len(assets) == 0:
                        self.logger.info("All assets are up to date!")
                        return
                else:
                    assets = all_assets
                    if force_redownload:
                        self.logger.info(
                            "Force redownload enabled - downloading all assets"
                        )
                self.stats["total"] = len(assets)
                # Create semaphore to limit concurrent downloads
                semaphore = asyncio.Semaphore(self.max_concurrent)
                # Create tasks for all downloads
                tasks = [
                    self.download_asset(session, asset, semaphore) for asset in assets
                ]
                # Download all assets with progress bar
                with tqdm(total=len(tasks), desc="Downloading assets") as pbar:
                    for coro in asyncio.as_completed(tasks):
                        await coro
                        pbar.update(1)
                        pbar.set_postfix(
                            {
                                "Success": self.stats["successful"],
                                "Failed": self.stats["failed"],
                                "Skipped": self.stats["skipped"],
                            }
                        )
            except Exception as e:
                self.logger.error(f"Error during download process: {e}")
                raise
        # Print final statistics
        elapsed_time = time.time() - start_time
        self.logger.info(f"Download completed in {elapsed_time:.2f} seconds")
        self.logger.info(f"Statistics: {self.stats}")
def main():
    """Main function to run the image downloader.

    Parses CLI arguments, handles the maintenance commands
    (--show-stats / --cleanup), then runs the async download loop.

    Returns:
        int | None: 1 on failure, 0 on success, None (exit code 0) after
        a maintenance command or a user interrupt.
    """
    parser = argparse.ArgumentParser(
        description="Download images from a REST API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Basic usage
    python image_downloader.py --api-url "https://api.example.com" \\
        --list-endpoint "/assets" \\
        --download-endpoint "/download" \\
        --output-dir "./images"

    # With custom concurrent downloads and timeout
    python image_downloader.py --api-url "https://api.example.com" \\
        --list-endpoint "/assets" \\
        --download-endpoint "/download" \\
        --output-dir "./images" \\
        --max-concurrent 10 \\
        --timeout 60
        """,
    )
    parser.add_argument(
        "--api-url",
        required=True,
        help="Base URL of the API (e.g., https://api.example.com)",
    )
    parser.add_argument(
        "--list-endpoint",
        required=True,
        help="Endpoint to get the list of assets (e.g., /assets or /images)",
    )
    parser.add_argument(
        "--download-endpoint",
        required=True,
        help="Endpoint to download individual assets (e.g., /download or /assets)",
    )
    parser.add_argument(
        "--output-dir", required=True, help="Directory to save downloaded images"
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent downloads (default: 5)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)",
    )
    parser.add_argument(
        "--api-key", help="API key for authentication (x-api-key header)"
    )
    parser.add_argument("--email", help="Email for login authentication")
    parser.add_argument("--password", help="Password for login authentication")
    parser.add_argument(
        "--no-tracking",
        action="store_true",
        help="Disable asset tracking (will re-download all assets)",
    )
    parser.add_argument(
        "--force-redownload",
        action="store_true",
        help="Force re-download of all assets, even if already tracked",
    )
    parser.add_argument(
        "--show-stats",
        action="store_true",
        help="Show asset tracking statistics and exit",
    )
    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Clean up metadata for missing files and exit",
    )
    args = parser.parse_args()
    # Handle special commands first: these run against the tracker only and
    # exit without downloading anything.
    if args.show_stats or args.cleanup:
        if AssetTracker:
            tracker = AssetTracker(storage_dir=args.output_dir)
            if args.cleanup:
                tracker.cleanup_missing_files()
            if args.show_stats:
                tracker.print_stats()
        else:
            # AssetTracker import failed at module load time.
            print("Asset tracking is not available")
        return
    # Create the image downloader
    downloader = ImageDownloader(
        api_url=args.api_url,
        list_endpoint=args.list_endpoint,
        download_endpoint=args.download_endpoint,
        output_dir=args.output_dir,
        max_concurrent=args.max_concurrent,
        timeout=args.timeout,
        api_key=args.api_key,
        email=args.email,
        password=args.password,
        track_assets=not args.no_tracking,
    )
    try:
        # Run the async download pipeline to completion.
        asyncio.run(
            downloader.download_all_assets(force_redownload=args.force_redownload)
        )
    except KeyboardInterrupt:
        print("\nDownload interrupted by user")
    except Exception as e:
        print(f"Error: {e}")
        return 1
    return 0


if __name__ == "__main__":
    exit(main())

1248
src/snapshot_downloader.py Normal file

File diff suppressed because it is too large Load Diff

523
src/webserver.py Normal file
View File

@@ -0,0 +1,523 @@
#!/usr/bin/env python3
"""
ParentZone Snapshots Web Server
A simple web server that serves HTML snapshot files and their assets.
Provides a directory listing and serves static files from the snapshots folder.
"""
import os
import asyncio
import argparse
import logging
from pathlib import Path
from urllib.parse import unquote
from datetime import datetime
import aiohttp
from aiohttp import web, hdrs
from aiohttp.web_response import Response
class SnapshotsWebServer:
    """Small aiohttp server that lists and serves downloaded snapshot files.

    Serves a styled index of ``*.html`` files in ``snapshots_dir``, the
    files themselves, and their assets from the ``assets/`` subdirectory,
    with path-traversal protection and per-request logging.
    """

    def __init__(
        self,
        snapshots_dir: str = "./snapshots",
        port: int = 8080,
        host: str = "0.0.0.0",
    ):
        """
        Args:
            snapshots_dir: Directory containing snapshot HTML files.
            port: TCP port to listen on.
            host: Interface to bind to.
        """
        self.snapshots_dir = Path(snapshots_dir).resolve()
        self.port = port
        self.host = host
        # Setup logging
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
        )
        self.logger = logging.getLogger(__name__)
        # Ensure snapshots directory exists
        self.snapshots_dir.mkdir(parents=True, exist_ok=True)
        self.logger.info(f"Serving snapshots from: {self.snapshots_dir}")

    async def index_handler(self, request):
        """Serve the main directory listing page."""
        try:
            html_files = []
            # Find all HTML files in the snapshots directory
            for file_path in self.snapshots_dir.glob("*.html"):
                stat = file_path.stat()
                html_files.append(
                    {
                        "name": file_path.name,
                        "size": stat.st_size,
                        "modified": datetime.fromtimestamp(stat.st_mtime),
                        "path": file_path.name,
                    }
                )
            # Sort by modification time (newest first)
            html_files.sort(key=lambda x: x["modified"], reverse=True)
            # Generate HTML page
            html_content = self._generate_index_html(html_files)
            return web.Response(text=html_content, content_type="text/html")
        except Exception as e:
            self.logger.error(f"Error generating index: {e}")
            return web.Response(
                text=f"<h1>Error</h1><p>Could not generate directory listing: {e}</p>",
                status=500,
                content_type="text/html",
            )

    def _generate_index_html(self, html_files):
        """Generate the HTML directory listing page.

        Args:
            html_files: List of dicts with ``name``, ``size``, ``modified``
                and ``path`` keys (as built by :meth:`index_handler`).
        """
        files_list = ""
        if not html_files:
            files_list = "<p class='no-files'>No snapshot files found.</p>"
        else:
            for file_info in html_files:
                size_mb = file_info["size"] / (1024 * 1024)
                files_list += f"""
                <div class="file-item">
                    <div class="file-info">
                        <h3><a href="/{file_info["path"]}" class="file-link">{file_info["name"]}</a></h3>
                        <div class="file-meta">
                            <span class="file-size">{size_mb:.2f} MB</span>
                            <span class="file-date">{file_info["modified"].strftime("%Y-%m-%d %H:%M:%S")}</span>
                        </div>
                    </div>
                </div>
                """
        return f"""
<!DOCTYPE html>
<html lang="en">
<head>
    <meta charset="UTF-8">
    <meta name="viewport" content="width=device-width, initial-scale=1.0">
    <title>ParentZone Snapshots</title>
    <style>
        * {{
            margin: 0;
            padding: 0;
            box-sizing: border-box;
        }}
        body {{
            font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
            line-height: 1.6;
            color: #333;
            background-color: #f5f5f5;
            padding: 20px;
        }}
        .container {{
            max-width: 1000px;
            margin: 0 auto;
        }}
        .header {{
            background: white;
            padding: 30px;
            border-radius: 10px;
            margin-bottom: 30px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            text-align: center;
        }}
        .header h1 {{
            color: #2c3e50;
            margin-bottom: 10px;
            font-size: 2.5em;
        }}
        .header p {{
            color: #7f8c8d;
            font-size: 1.1em;
        }}
        .files-container {{
            background: white;
            border-radius: 10px;
            box-shadow: 0 2px 10px rgba(0,0,0,0.1);
            overflow: hidden;
        }}
        .files-header {{
            background: #3498db;
            color: white;
            padding: 20px;
            font-size: 1.2em;
            font-weight: bold;
        }}
        .file-item {{
            border-bottom: 1px solid #ecf0f1;
            padding: 20px;
            transition: background-color 0.2s;
        }}
        .file-item:last-child {{
            border-bottom: none;
        }}
        .file-item:hover {{
            background-color: #f8f9fa;
        }}
        .file-link {{
            color: #2c3e50;
            text-decoration: none;
            font-size: 1.1em;
            font-weight: 500;
        }}
        .file-link:hover {{
            color: #3498db;
            text-decoration: underline;
        }}
        .file-meta {{
            margin-top: 8px;
            display: flex;
            gap: 20px;
            color: #7f8c8d;
            font-size: 0.9em;
        }}
        .no-files {{
            padding: 40px;
            text-align: center;
            color: #7f8c8d;
            font-size: 1.1em;
        }}
        .footer {{
            margin-top: 30px;
            text-align: center;
            color: #7f8c8d;
            font-size: 0.9em;
        }}
        @media (max-width: 600px) {{
            .file-meta {{
                flex-direction: column;
                gap: 5px;
            }}
            .header h1 {{
                font-size: 2em;
            }}
        }}
    </style>
</head>
<body>
    <div class="container">
        <div class="header">
            <h1>📸 ParentZone Snapshots</h1>
            <p>Browse and view your downloaded snapshot files</p>
        </div>
        <div class="files-container">
            <div class="files-header">
                📁 Available Snapshot Files ({len(html_files)} files)
            </div>
            {files_list}
        </div>
        <div class="footer">
            <p>Served from: {self.snapshots_dir}</p>
            <p>Server running on {self.host}:{self.port}</p>
        </div>
    </div>
</body>
</html>
        """

    async def file_handler(self, request):
        """Serve individual HTML files and their assets."""
        try:
            # Get the requested file path
            file_path = unquote(request.match_info["filename"])
            requested_file = self.snapshots_dir / file_path
            # Security check: ensure the file is within the snapshots directory
            try:
                requested_file.resolve().relative_to(self.snapshots_dir.resolve())
            except ValueError:
                self.logger.warning(f"Attempted path traversal: {file_path}")
                return web.Response(
                    text="<h1>403 Forbidden</h1><p>Access denied.</p>",
                    status=403,
                    content_type="text/html",
                )
            # Check if file exists
            if not requested_file.exists():
                return web.Response(
                    text="<h1>404 Not Found</h1><p>The requested file was not found.</p>",
                    status=404,
                    content_type="text/html",
                )
            # Determine content type
            content_type = self._get_content_type(requested_file)
            # Read and serve the file
            with open(requested_file, "rb") as f:
                content = f.read()
            return web.Response(
                body=content,
                content_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=3600",
                    # NOTE(review): st_mtime is formatted in local time but
                    # labeled GMT; consider email.utils.formatdate(usegmt=True).
                    "Last-Modified": datetime.fromtimestamp(
                        requested_file.stat().st_mtime
                    ).strftime("%a, %d %b %Y %H:%M:%S GMT"),
                },
            )
        except Exception as e:
            self.logger.error(
                f"Error serving file {request.match_info.get('filename', 'unknown')}: {e}"
            )
            return web.Response(
                text=f"<h1>500 Internal Server Error</h1><p>Could not serve file: {e}</p>",
                status=500,
                content_type="text/html",
            )

    async def assets_handler(self, request):
        """Serve asset files (images, CSS, JS, etc.) from assets subdirectories."""
        try:
            # Get the requested asset path
            asset_path = unquote(request.match_info["path"])
            requested_file = self.snapshots_dir / "assets" / asset_path
            # Security check
            try:
                requested_file.resolve().relative_to(self.snapshots_dir.resolve())
            except ValueError:
                self.logger.warning(f"Attempted path traversal in assets: {asset_path}")
                return web.Response(text="403 Forbidden", status=403)
            # Check if file exists
            if not requested_file.exists():
                return web.Response(text="404 Not Found", status=404)
            # Determine content type
            content_type = self._get_content_type(requested_file)
            # Read and serve the file
            with open(requested_file, "rb") as f:
                content = f.read()
            return web.Response(
                body=content,
                content_type=content_type,
                headers={
                    "Cache-Control": "public, max-age=86400",  # Cache assets for 24 hours
                    "Last-Modified": datetime.fromtimestamp(
                        requested_file.stat().st_mtime
                    ).strftime("%a, %d %b %Y %H:%M:%S GMT"),
                },
            )
        except Exception as e:
            self.logger.error(
                f"Error serving asset {request.match_info.get('path', 'unknown')}: {e}"
            )
            return web.Response(text="500 Internal Server Error", status=500)

    def _get_content_type(self, file_path: Path) -> str:
        """Determine the content type based on file extension."""
        suffix = file_path.suffix.lower()
        content_types = {
            ".html": "text/html; charset=utf-8",
            ".css": "text/css; charset=utf-8",
            ".js": "application/javascript; charset=utf-8",
            ".json": "application/json; charset=utf-8",
            ".jpg": "image/jpeg",
            ".jpeg": "image/jpeg",
            ".png": "image/png",
            ".gif": "image/gif",
            ".webp": "image/webp",
            ".svg": "image/svg+xml",
            ".ico": "image/x-icon",
            ".pdf": "application/pdf",
            ".txt": "text/plain; charset=utf-8",
            ".log": "text/plain; charset=utf-8",
        }
        return content_types.get(suffix, "application/octet-stream")

    def setup_routes(self, app):
        """Configure the web application routes."""
        # Main index page
        app.router.add_get("/", self.index_handler)
        # Serve HTML files directly. Raw strings: "\." is a regex escape for
        # the route pattern, not a Python string escape (non-raw form raises
        # SyntaxWarning on Python 3.12+).
        app.router.add_get(r"/{filename:.+\.html}", self.file_handler)
        # Serve assets (images, CSS, JS, etc.)
        app.router.add_get("/assets/{path:.+}", self.assets_handler)
        # Serve other static files (logs, etc.)
        app.router.add_get(
            r"/{filename:.+\.(css|js|json|txt|log|ico)}", self.file_handler
        )

    async def create_app(self):
        """Create and configure the web application."""
        app = web.Application()
        # Setup routes
        self.setup_routes(app)

        # Request-logging middleware. The @web.middleware decorator is
        # required: without it aiohttp treats the plain coroutine as a
        # deprecated old-style middleware *factory* and calls it with
        # (app, handler), which breaks at runtime.
        @web.middleware
        async def logging_middleware(request, handler):
            start_time = datetime.now()

            # Get client IP address
            def get_client_ip():
                # Check for forwarded header first (proxy setups)
                forwarded = request.headers.get("X-Forwarded-For")
                if forwarded:
                    return forwarded.split(",")[0].strip()
                # Try to get from transport
                try:
                    if request.transport:
                        peername = request.transport.get_extra_info("peername")
                        if peername:
                            return peername[0]
                except Exception:
                    # Narrowed from a bare except so KeyboardInterrupt /
                    # SystemExit are not swallowed.
                    pass
                return "unknown"

            try:
                response = await handler(request)
                # Log the request
                duration = (datetime.now() - start_time).total_seconds()
                remote_addr = get_client_ip()
                self.logger.info(
                    f"{remote_addr} - {request.method} {request.path} - "
                    f"{response.status} - {duration:.3f}s"
                )
                return response
            except Exception as e:
                duration = (datetime.now() - start_time).total_seconds()
                remote_addr = get_client_ip()
                self.logger.error(
                    f"{remote_addr} - {request.method} {request.path} - "
                    f"ERROR: {e} - {duration:.3f}s"
                )
                raise

        app.middlewares.append(logging_middleware)
        return app

    async def start_server(self):
        """Start the web server.

        Returns:
            The aiohttp AppRunner (caller is responsible for cleanup()).
        """
        app = await self.create_app()
        runner = web.AppRunner(app)
        await runner.setup()
        site = web.TCPSite(runner, self.host, self.port)
        await site.start()
        self.logger.info(f"🚀 ParentZone Snapshots Web Server started!")
        self.logger.info(f"📂 Serving files from: {self.snapshots_dir}")
        self.logger.info(f"🌐 Server running at: http://{self.host}:{self.port}")
        self.logger.info(f"🔗 Open in browser: http://localhost:{self.port}")
        self.logger.info("Press Ctrl+C to stop the server")
        return runner
def main():
    """Parse CLI options and run the snapshots web server until interrupted."""
    arg_parser = argparse.ArgumentParser(
        description="ParentZone Snapshots Web Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
    # Start server with default settings
    python webserver.py

    # Start server on custom port
    python webserver.py --port 3000

    # Serve from custom directory
    python webserver.py --snapshots-dir /path/to/snapshots

    # Start server on all interfaces
    python webserver.py --host 0.0.0.0 --port 8080
        """,
    )
    arg_parser.add_argument(
        "--snapshots-dir",
        default="./snapshots",
        help="Directory containing snapshot files (default: ./snapshots)",
    )
    arg_parser.add_argument(
        "--port",
        type=int,
        default=8080,
        help="Port to run the server on (default: 8080)",
    )
    arg_parser.add_argument(
        "--host",
        default="0.0.0.0",
        help="Host to bind the server to (default: 0.0.0.0)",
    )
    options = arg_parser.parse_args()

    # Build the server from the parsed options.
    srv = SnapshotsWebServer(
        snapshots_dir=options.snapshots_dir, port=options.port, host=options.host
    )

    async def serve_forever():
        app_runner = None
        try:
            app_runner = await srv.start_server()
            # Idle loop: keeps the event loop alive while aiohttp serves.
            while True:
                await asyncio.sleep(1)
        except KeyboardInterrupt:
            print("\n👋 Shutting down server...")
        except Exception as e:
            print(f"❌ Server error: {e}")
        finally:
            if app_runner:
                await app_runner.cleanup()

    try:
        asyncio.run(serve_forever())
    except KeyboardInterrupt:
        print("\n✅ Server stopped")


if __name__ == "__main__":
    main()