This commit is contained in:
44
src/__init__.py
Normal file
44
src/__init__.py
Normal file
@@ -0,0 +1,44 @@
|
||||
"""
|
||||
ParentZone Downloader - Source Package
|
||||
|
||||
This package contains the core application modules for the ParentZone Downloader.
|
||||
|
||||
Modules:
|
||||
- asset_tracker: Track downloaded assets to avoid re-downloads
|
||||
- auth_manager: Handle authentication with ParentZone API
|
||||
- config_downloader: Configuration-based image downloader
|
||||
- config_snapshot_downloader: Configuration-based snapshot downloader
|
||||
- image_downloader: Download images from ParentZone API
|
||||
- snapshot_downloader: Download snapshots from ParentZone API
|
||||
- webserver: Web server to serve downloaded snapshots
|
||||
"""
|
||||
|
||||
__version__ = "1.0.0"
|
||||
__author__ = "ParentZone Downloader Team"
|
||||
|
||||
# Import main classes for easier access
|
||||
try:
    from .asset_tracker import AssetTracker
    from .auth_manager import AuthManager
    from .config_downloader import ConfigImageDownloader
    from .config_snapshot_downloader import ConfigSnapshotDownloader
    from .image_downloader import ImageDownloader
    from .snapshot_downloader import SnapshotDownloader
    from .webserver import SnapshotsWebServer
except ImportError as e:
    # A dependency of one of the submodules is missing: export nothing and
    # tell the user why instead of failing the whole package import.
    import warnings

    __all__ = []
    warnings.warn(f"Some modules could not be imported: {e}")
else:
    # Every submodule imported cleanly, so publish the full public API.
    __all__ = [
        "AssetTracker",
        "AuthManager",
        "ConfigImageDownloader",
        "ConfigSnapshotDownloader",
        "ImageDownloader",
        "SnapshotDownloader",
        "SnapshotsWebServer",
    ]
|
||||
313
src/asset_tracker.py
Normal file
313
src/asset_tracker.py
Normal file
@@ -0,0 +1,313 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Asset Tracker for ParentZone Downloader
|
||||
|
||||
This module handles tracking of downloaded assets to avoid re-downloading
|
||||
and to identify new assets that need to be downloaded.
|
||||
"""
|
||||
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime
|
||||
from pathlib import Path
|
||||
from typing import Dict, List, Set, Any, Optional
|
||||
import hashlib
|
||||
|
||||
|
||||
class AssetTracker:
    """
    Tracks downloaded assets and identifies new ones.

    Metadata is held in memory as a dictionary keyed by asset key and is
    persisted as JSON inside the storage directory after every change.
    """

    def __init__(self, storage_dir: str = "downloaded_images", metadata_file: str = "asset_metadata.json"):
        """
        Initialize the asset tracker.

        Args:
            storage_dir: Directory where downloaded assets are stored
            metadata_file: JSON file to store asset metadata
        """
        self.storage_dir = Path(storage_dir)
        # parents=True so a nested storage directory can be created in one go.
        self.storage_dir.mkdir(parents=True, exist_ok=True)

        self.metadata_file = self.storage_dir / metadata_file
        self.logger = logging.getLogger(__name__)

        # Load existing metadata
        self.metadata = self._load_metadata()

    def _load_metadata(self) -> Dict[str, Dict[str, Any]]:
        """
        Load asset metadata from the JSON file.

        Returns:
            Dictionary of asset metadata keyed by asset ID; empty when the
            file is missing, unreadable, or does not contain a JSON object
        """
        if not self.metadata_file.exists():
            self.logger.info("No existing metadata file found, starting fresh")
            return {}

        try:
            with open(self.metadata_file, 'r', encoding='utf-8') as f:
                data = json.load(f)
        except Exception as e:
            self.logger.error(f"Failed to load metadata file: {e}")
            return {}

        # Every other method indexes the metadata by key, so anything but a
        # JSON object would only fail later in a less obvious place.
        if not isinstance(data, dict):
            self.logger.error("Failed to load metadata file: expected a JSON object")
            return {}

        self.logger.info(f"Loaded metadata for {len(data)} assets")
        return data

    def _save_metadata(self):
        """Save asset metadata to the JSON file (errors are logged, not raised)."""
        try:
            with open(self.metadata_file, 'w', encoding='utf-8') as f:
                json.dump(self.metadata, f, indent=2, default=str)
            self.logger.debug(f"Saved metadata for {len(self.metadata)} assets")
        except Exception as e:
            self.logger.error(f"Failed to save metadata file: {e}")

    @staticmethod
    def _entry_file_exists(entry: Dict[str, Any]) -> bool:
        """Return True when the metadata entry points at a file present on disk."""
        filepath = entry.get('filepath')
        # An empty path must not be wrapped in Path(): Path('') is the
        # current directory, which always exists.
        return bool(filepath) and Path(filepath).is_file()

    def _get_asset_key(self, asset: Dict[str, Any]) -> str:
        """
        Generate a unique key for an asset.

        Args:
            asset: Asset dictionary from API

        Returns:
            The first available of 'id', 'assetId', 'uuid' as a string,
            otherwise an MD5 digest of the whole asset
        """
        for id_field in ('id', 'assetId', 'uuid'):
            if id_field in asset:
                return str(asset[id_field])

        # No identifier field: derive a stable key from the asset data itself.
        asset_str = json.dumps(asset, sort_keys=True, default=str)
        return hashlib.md5(asset_str.encode()).hexdigest()

    def _get_asset_hash(self, asset: Dict[str, Any]) -> str:
        """
        Generate a hash for asset content to detect changes.

        Args:
            asset: Asset dictionary from API

        Returns:
            MD5 digest of the change-indicating fields, or of the entire
            asset when none of those fields is present
        """
        # Fields that indicate content changes
        content_fields = ['updated', 'modified', 'lastModified', 'size', 'checksum', 'etag']
        content_data = {field: asset[field] for field in content_fields if field in asset}

        # If no content fields, use entire asset
        if not content_data:
            content_data = asset

        content_str = json.dumps(content_data, sort_keys=True, default=str)
        return hashlib.md5(content_str.encode()).hexdigest()

    def is_asset_downloaded(self, asset: Dict[str, Any]) -> bool:
        """
        Check if an asset has already been downloaded successfully.

        Args:
            asset: Asset dictionary from API

        Returns:
            True if a successful download is recorded, False otherwise
        """
        entry = self.metadata.get(self._get_asset_key(asset))
        # Failed attempts are recorded too (success=False); they must not
        # count as downloaded, otherwise they would never be retried.
        return entry is not None and bool(entry.get('success', False))

    def is_asset_modified(self, asset: Dict[str, Any]) -> bool:
        """
        Check if an asset has been modified since last download.

        Args:
            asset: Asset dictionary from API

        Returns:
            True if the asset is unknown or its content hash changed
        """
        asset_key = self._get_asset_key(asset)

        if asset_key not in self.metadata:
            return True  # New asset

        current_hash = self._get_asset_hash(asset)
        stored_hash = self.metadata[asset_key].get('content_hash', '')
        return current_hash != stored_hash

    def get_new_assets(self, api_assets: List[Dict[str, Any]]) -> List[Dict[str, Any]]:
        """
        Identify new or modified assets that need to be downloaded.

        Args:
            api_assets: List of assets from API response

        Returns:
            List of assets that need to be downloaded
        """
        new_assets = []

        for asset in api_assets:
            asset_key = self._get_asset_key(asset)

            if not self.is_asset_downloaded(asset):
                self.logger.info(f"New asset found: {asset_key}")
                new_assets.append(asset)
            elif self.is_asset_modified(asset):
                self.logger.info(f"Modified asset found: {asset_key}")
                new_assets.append(asset)
            else:
                self.logger.debug(f"Asset unchanged: {asset_key}")

        self.logger.info(f"Found {len(new_assets)} new/modified assets out of {len(api_assets)} total")
        return new_assets

    def mark_asset_downloaded(self, asset: Dict[str, Any], filepath: Path, success: bool = True):
        """
        Record a download attempt for an asset and persist the metadata.

        Args:
            asset: Asset dictionary from API
            filepath: Path where asset was saved
            success: Whether download was successful
        """
        asset_key = self._get_asset_key(asset)

        metadata_entry = {
            'asset_id': asset_key,
            'filename': filepath.name,
            'filepath': str(filepath),
            'download_date': datetime.now().isoformat(),
            'success': success,
            'content_hash': self._get_asset_hash(asset),
            'api_data': asset
        }

        # Add file info if download was successful and file exists
        if success and filepath.exists():
            stat = filepath.stat()
            metadata_entry.update({
                'file_size': stat.st_size,
                'file_modified': datetime.fromtimestamp(stat.st_mtime).isoformat()
            })

        self.metadata[asset_key] = metadata_entry
        self._save_metadata()

        self.logger.debug(f"Marked asset as downloaded: {asset_key}")

    def get_downloaded_assets(self) -> Dict[str, Dict[str, Any]]:
        """
        Get all downloaded asset metadata.

        Returns:
            Shallow copy of the metadata dictionary
        """
        return self.metadata.copy()

    def cleanup_missing_files(self):
        """
        Remove metadata entries for files that no longer exist on disk.
        """
        # Collect first: the dict must not change size while it is iterated.
        assets_to_remove = []
        for asset_key, metadata_entry in self.metadata.items():
            if not self._entry_file_exists(metadata_entry):
                assets_to_remove.append(asset_key)
                self.logger.warning(
                    f"File missing, removing from metadata: {metadata_entry.get('filepath', '')}"
                )

        for asset_key in assets_to_remove:
            del self.metadata[asset_key]

        removed_count = len(assets_to_remove)
        if removed_count > 0:
            self._save_metadata()
            self.logger.info(f"Cleaned up {removed_count} missing file entries from metadata")

    def get_stats(self) -> Dict[str, Any]:
        """
        Get statistics about tracked assets.

        Returns:
            Dictionary with counts of tracked/successful/failed downloads,
            existing/missing files and the total recorded size
        """
        total_assets = len(self.metadata)
        successful_downloads = sum(1 for entry in self.metadata.values() if entry.get('success', False))
        failed_downloads = total_assets - successful_downloads

        total_size = 0
        existing_files = 0

        for entry in self.metadata.values():
            if 'file_size' in entry:
                total_size += entry['file_size']
            if self._entry_file_exists(entry):
                existing_files += 1

        return {
            'total_tracked_assets': total_assets,
            'successful_downloads': successful_downloads,
            'failed_downloads': failed_downloads,
            'existing_files': existing_files,
            'missing_files': total_assets - existing_files,
            'total_size_bytes': total_size,
            'total_size_mb': round(total_size / (1024 * 1024), 2)
        }

    def print_stats(self):
        """Print statistics about tracked assets."""
        stats = self.get_stats()

        print("=" * 60)
        print("ASSET TRACKER STATISTICS")
        print("=" * 60)
        print(f"Total tracked assets: {stats['total_tracked_assets']}")
        print(f"Successful downloads: {stats['successful_downloads']}")
        print(f"Failed downloads: {stats['failed_downloads']}")
        print(f"Existing files: {stats['existing_files']}")
        print(f"Missing files: {stats['missing_files']}")
        print(f"Total size: {stats['total_size_mb']} MB ({stats['total_size_bytes']} bytes)")
        print("=" * 60)
|
||||
|
||||
def main():
    """
    Print asset tracker statistics for the default storage directory.

    Pass ``--cleanup`` as the first argument to also drop metadata entries
    whose files are missing and print the statistics again afterwards.
    """
    import sys

    # Setup logging
    logging.basicConfig(
        level=logging.INFO,
        format='%(asctime)s - %(name)s - %(levelname)s - %(message)s'
    )

    # Create tracker
    tracker = AssetTracker()

    # Print current stats
    tracker.print_stats()

    # Cleanup rewrites the metadata file, so it only runs when explicitly
    # requested rather than on every invocation.
    if len(sys.argv) > 1 and sys.argv[1] == '--cleanup':
        tracker.cleanup_missing_files()

        print("\nAfter cleanup:")
        tracker.print_stats()


if __name__ == "__main__":
    main()
|
||||
229
src/auth_manager.py
Normal file
229
src/auth_manager.py
Normal file
@@ -0,0 +1,229 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Authentication Manager for ParentZone API
|
||||
|
||||
This module handles authentication against the ParentZone login API
|
||||
and manages session tokens for API requests.
|
||||
"""
|
||||
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import json
|
||||
import logging
|
||||
from typing import Optional, Dict, Any
|
||||
from urllib.parse import urljoin
|
||||
|
||||
|
||||
class AuthManager:
    """
    Authenticates against the ParentZone API and holds the resulting credentials.

    Login is a two-step flow: ``login`` posts the email/password to obtain the
    user's accounts, then a session is created for the first account, which
    yields the API key returned by ``get_auth_headers``.
    """

    def __init__(self, api_url: str = "https://api.parentzone.me"):
        """
        Initialize the authentication manager.

        Args:
            api_url: Base URL of the API
        """
        self.api_url = api_url.rstrip('/')
        self.login_url = urljoin(self.api_url, "/v1/auth/login")
        self.create_session_url = urljoin(self.api_url, "/v1/auth/create-session")
        self.session_token: Optional[str] = None
        self.api_key: Optional[str] = None
        self.user_id: Optional[str] = None
        self.user_name: Optional[str] = None
        self.provider_name: Optional[str] = None
        self.logger = logging.getLogger(__name__)

        # Standard headers for login requests
        self.headers = {
            'accept': 'application/json, text/plain, */*',
            'accept-language': 'en-GB,en-US;q=0.9,en;q=0.8,ro;q=0.7',
            'content-type': 'application/json;charset=UTF-8',
            'origin': 'https://www.parentzone.me',
            'priority': 'u=1, i',
            'sec-ch-ua': '"Not;A=Brand";v="99", "Google Chrome";v="139", "Chromium";v="139"',
            'sec-ch-ua-mobile': '?0',
            'sec-ch-ua-platform': '"macOS"',
            'sec-fetch-dest': 'empty',
            'sec-fetch-mode': 'cors',
            'sec-fetch-site': 'same-site',
            'user-agent': 'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_15_7) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/139.0.0.0 Safari/537.36'
        }

    def _clear_credentials(self):
        """Forget the API key, session token and selected account details."""
        self.api_key = None
        self.session_token = None
        self.user_id = None
        self.user_name = None
        self.provider_name = None

    async def login(self, email: str, password: str) -> bool:
        """
        Login to the ParentZone API using two-step authentication.
        Step 1: Login with email/password to get user accounts
        Step 2: Create session with first account ID and password to get API key

        Args:
            email: User email
            password: User password

        Returns:
            True if login successful, False otherwise
        """
        self.logger.info(f"Attempting login for {email}")

        # Drop credentials from any earlier login so a failed attempt cannot
        # leave is_authenticated() reporting a stale session.
        self._clear_credentials()

        # Step 1: Login to get user accounts
        login_data = {
            "email": email,
            "password": password
        }

        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            try:
                async with session.post(
                    self.login_url,
                    headers=self.headers,
                    json=login_data
                ) as response:
                    self.logger.info(f"Login response status: {response.status}")

                    if response.status != 200:
                        error_text = await response.text()
                        self.logger.error(f"Login failed with status {response.status}: {error_text}")
                        return False

                    data = await response.json()
                    self.logger.info("Login successful")
                    self.logger.debug(f"Response data type: {type(data)}")
                    self.logger.debug(f"Full response data: {data}")

                    # The account list must be non-empty and its first item
                    # an object, because the account fields are read from it.
                    if not (isinstance(data, list) and data and isinstance(data[0], dict)):
                        self.logger.error(f"Unexpected login response format: {data}")
                        return False

                    # Use the first account
                    first_account = data[0]
                    self.user_id = first_account.get('id')
                    self.user_name = first_account.get('name')
                    self.provider_name = first_account.get('providerName')

                    self.logger.info(f"Selected account: {self.user_name} at {self.provider_name} (ID: {self.user_id})")

                    # Step 2: Create session with the account ID
                    return await self._create_session(password)

            except Exception as e:
                self.logger.error(f"Login request failed: {e}")
                return False

    async def _create_session(self, password: str) -> bool:
        """
        Create a session using the user ID from login.

        Args:
            password: User password

        Returns:
            True if session creation successful, False otherwise
        """
        if not self.user_id:
            self.logger.error("No user ID available for session creation")
            return False

        self.logger.info(f"Creating session for user ID: {self.user_id}")

        session_data = {
            "id": self.user_id,
            "password": password
        }

        # Add x-api-product header for session creation
        session_headers = self.headers.copy()
        session_headers['x-api-product'] = 'iConnect'

        timeout = aiohttp.ClientTimeout(total=30)
        async with aiohttp.ClientSession(timeout=timeout) as session:
            try:
                async with session.post(
                    self.create_session_url,
                    headers=session_headers,
                    json=session_data
                ) as response:
                    self.logger.info(f"Create session response status: {response.status}")

                    if response.status != 200:
                        error_text = await response.text()
                        self.logger.error(f"Session creation failed with status {response.status}: {error_text}")
                        return False

                    data = await response.json()
                    self.logger.info("Session creation successful")
                    # Only the field names are logged: the payload carries
                    # the API key, which must not end up in log files.
                    if isinstance(data, dict):
                        self.logger.debug(f"Session response fields: {sorted(data)}")

                    # Extract API key from response
                    if isinstance(data, dict) and 'key' in data:
                        self.api_key = data['key']
                        self.logger.info("API key obtained successfully")
                        return True

                    self.logger.error(f"No 'key' field in session response: {data}")
                    return False

            except Exception as e:
                self.logger.error(f"Session creation request failed: {e}")
                return False

    def get_auth_headers(self) -> Dict[str, str]:
        """
        Get headers with authentication token.

        Returns:
            Copy of the standard headers, plus 'x-api-key' and
            'x-api-product' when an API key is available
        """
        headers = self.headers.copy()

        if self.api_key:
            # Use x-api-key header for authenticated requests
            headers['x-api-key'] = self.api_key
            headers['x-api-product'] = 'iConnect'

        return headers

    def is_authenticated(self) -> bool:
        """
        Check if currently authenticated.

        Returns:
            True if an API key is held, False otherwise
        """
        return self.api_key is not None

    def logout(self):
        """Clear the session data."""
        self._clear_credentials()
        self.logger.info("Logged out - session data cleared")
|
||||
|
||||
|
||||
async def test_login():
    """
    Manually exercise the login flow against the real API.

    Credentials are read from the PARENTZONE_EMAIL and PARENTZONE_PASSWORD
    environment variables so that no secrets live in source control; when
    either is missing the function prints a hint and returns without
    contacting the API.
    """
    import os

    email = os.environ.get("PARENTZONE_EMAIL")
    password = os.environ.get("PARENTZONE_PASSWORD")
    if not email or not password:
        print("Set PARENTZONE_EMAIL and PARENTZONE_PASSWORD to run the login test.")
        return

    auth_manager = AuthManager()

    print("Testing ParentZone Login...")
    success = await auth_manager.login(email, password)

    if success:
        print("✅ Login successful!")
        print(f"User: {auth_manager.user_name} at {auth_manager.provider_name}")
        print(f"User ID: {auth_manager.user_id}")
        print(f"API Key: {auth_manager.api_key[:20]}..." if auth_manager.api_key else "No API key found")

        # Test getting auth headers
        headers = auth_manager.get_auth_headers()
        print(f"Auth headers: {list(headers.keys())}")
    else:
        print("❌ Login failed!")


if __name__ == "__main__":
    asyncio.run(test_login())
|
||||
517
src/config_downloader.py
Normal file
517
src/config_downloader.py
Normal file
@@ -0,0 +1,517 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Configuration-based Image Downloader
|
||||
|
||||
This script reads configuration from a JSON file and downloads images from a REST API.
|
||||
It's a simplified version of the main downloader for easier use.
|
||||
|
||||
Usage:
|
||||
python config_downloader.py --config config.json
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import json
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
import os
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import List, Dict, Any, Optional
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
|
||||
# Import the auth manager and asset tracker
|
||||
try:
|
||||
from src.auth_manager import AuthManager
|
||||
except ImportError:
|
||||
AuthManager = None
|
||||
|
||||
try:
|
||||
from src.asset_tracker import AssetTracker
|
||||
except ImportError:
|
||||
AssetTracker = None
|
||||
|
||||
|
||||
class ConfigImageDownloader:
|
||||
def __init__(self, config_file: str):
    """
    Build a downloader from a JSON configuration file.

    Loads the configuration, configures logging, ensures the output
    directory exists and, unless disabled via ``track_assets``, attaches
    an asset tracker rooted at the output directory.

    Args:
        config_file: Path to the JSON configuration file
    """
    self.config = self.load_config(config_file)
    # Logging must be set up before anything below uses self.logger.
    self.setup_logging()

    # Output directory for downloaded files
    self.output_dir = Path(self.config["output_dir"])
    self.output_dir.mkdir(parents=True, exist_ok=True)

    # Download counters, all starting at zero
    self.stats = dict.fromkeys(("total", "successful", "failed", "skipped"), 0)

    # Set by authenticate() when login credentials are configured
    self.auth_manager = None

    # Asset tracking is on by default but needs the optional AssetTracker import
    self.asset_tracker = None
    tracking_requested = self.config.get("track_assets", True)
    if not tracking_requested:
        self.logger.info("Asset tracking disabled")
    elif AssetTracker:
        self.asset_tracker = AssetTracker(storage_dir=str(self.output_dir))
        self.logger.info("Asset tracking enabled")
    else:
        self.logger.warning(
            "Asset tracking requested but AssetTracker not available"
        )
|
||||
|
||||
def load_config(self, config_file: str) -> Dict[str, Any]:
    """
    Load and validate the configuration from a JSON file.

    Args:
        config_file: Path to the JSON configuration file

    Returns:
        The configuration dictionary with defaults filled in for
        ``max_concurrent``, ``timeout`` and ``headers``

    Raises:
        FileNotFoundError: If the configuration file does not exist
        ValueError: If the file is not valid JSON, is not a JSON object,
            or lacks a required field
    """
    try:
        with open(config_file, "r", encoding="utf-8") as f:
            config = json.load(f)
    except FileNotFoundError as e:
        raise FileNotFoundError(f"Configuration file not found: {config_file}") from e
    except json.JSONDecodeError as e:
        raise ValueError(f"Invalid JSON in configuration file: {e}") from e

    # Everything below treats the configuration as a mapping.
    if not isinstance(config, dict):
        raise ValueError("Configuration file must contain a JSON object")

    # Validate required fields
    required_fields = (
        "api_url",
        "list_endpoint",
        "download_endpoint",
        "output_dir",
    )
    for field in required_fields:
        if field not in config:
            raise ValueError(f"Missing required field: {field}")

    # Set defaults for optional fields
    config.setdefault("max_concurrent", 5)
    config.setdefault("timeout", 30)
    config.setdefault("headers", {})

    # An explicit `"headers": null` would otherwise break the item
    # assignment below.
    if config["headers"] is None:
        config["headers"] = {}

    # A configured API key is also sent as the x-api-key request header.
    if config.get("api_key"):
        config["headers"]["x-api-key"] = config["api_key"]

    return config
|
||||
|
||||
def setup_logging(self):
    """
    Configure logging to ``<output_dir>/download.log`` and the console.

    Creates the output directory and the log file when they do not exist,
    then stores a module-level logger on ``self.logger``.
    """
    log_file = Path(self.config["output_dir"]) / "download.log"

    # Create output directory if it doesn't exist
    log_file.parent.mkdir(parents=True, exist_ok=True)

    # Create log file if it doesn't exist
    log_file.touch(exist_ok=True)

    # basicConfig() does nothing once the root logger has handlers, so the
    # handlers are only built when they will be used; otherwise the file
    # handler would be opened and never closed.
    if not logging.getLogger().handlers:
        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[
                # Explicit encoding: logged file names may be non-ASCII.
                logging.FileHandler(log_file, encoding="utf-8"),
                logging.StreamHandler(),
            ],
        )
    self.logger = logging.getLogger(__name__)
|
||||
|
||||
async def authenticate(self):
    """
    Perform login authentication if credentials are provided in config.

    Does nothing when neither ``email`` nor ``password`` is configured.

    Raises:
        Exception: If only one of email/password is configured, if the
            AuthManager module is unavailable, or if the login fails
    """
    has_email = "email" in self.config
    has_password = "password" in self.config

    if not has_email and not has_password:
        return

    if not (has_email and has_password):
        self.logger.warning(
            "Both email and password must be provided in config for login authentication"
        )
        raise Exception(
            "Both email and password must be provided in config for login authentication"
        )

    # AuthManager is None when its import failed; report that directly
    # instead of blaming the (complete) credentials.
    if not AuthManager:
        self.logger.error("Login credentials provided but AuthManager is not available")
        raise Exception("Login credentials provided but AuthManager is not available")

    self.logger.info("Attempting login authentication...")
    self.auth_manager = AuthManager(self.config["api_url"])
    success = await self.auth_manager.login(
        self.config["email"], self.config["password"]
    )

    if not success:
        self.logger.error("Login authentication failed")
        raise Exception("Login authentication failed")

    self.logger.info("Login authentication successful")
|
||||
|
||||
async def get_asset_list(
    self, session: aiohttp.ClientSession
) -> List[Dict[str, Any]]:
    """
    Fetch the list of assets from the API.

    Args:
        session: Open aiohttp client session used for the request

    Returns:
        The asset list: the response itself when it is a JSON array, the
        first list found under "data", "results", "items", "assets" or
        "images" when it is an object, or the object wrapped in a list

    Raises:
        Exception: Any request, HTTP status or parsing error is logged and
            re-raised; an unexpected JSON type raises ValueError
    """
    url = urljoin(self.config["api_url"], self.config["list_endpoint"])
    self.logger.info(f"Fetching asset list from: {url}")

    # NOTE(review): this is the shared config dict, so the auth headers
    # added below persist for later requests (download_asset reads the
    # same dict) - confirm that is intended before copying it here.
    headers = self.config.get("headers", {})

    # Use API key if provided
    if "api_key" in self.config and self.config["api_key"]:
        headers["x-api-key"] = self.config["api_key"]

    # Use login authentication if available
    elif self.auth_manager and self.auth_manager.is_authenticated():
        headers.update(self.auth_manager.get_auth_headers())

    # aiohttp expects a ClientTimeout; bare numbers are deprecated.
    request_timeout = aiohttp.ClientTimeout(total=self.config["timeout"])

    try:
        async with session.get(
            url, headers=headers, timeout=request_timeout
        ) as response:
            response.raise_for_status()
            data = await response.json()

            # Handle different response formats
            if isinstance(data, list):
                assets = data
            elif isinstance(data, dict):
                # Common patterns for API responses
                for key in ["data", "results", "items", "assets", "images"]:
                    if key in data and isinstance(data[key], list):
                        assets = data[key]
                        break
                else:
                    assets = [data]  # Single asset
            else:
                raise ValueError(f"Unexpected response format: {type(data)}")

            self.logger.info(f"Found {len(assets)} assets")
            return assets

    except Exception as e:
        self.logger.error(f"Failed to fetch asset list: {e}")
        raise
|
||||
|
||||
def get_download_url(self, asset: Dict[str, Any]) -> str:
    """
    Generate the download URL for an asset.

    The URL has the form ``<api_url>/v1/media/<asset id>/full`` with the
    API key (``key``) and the asset's ``updated`` value (``u``) as query
    parameters.

    Args:
        asset: Asset dictionary from the API

    Returns:
        Absolute download URL for the asset
    """
    from urllib.parse import quote, urlencode

    # Common field names for asset identifiers, in priority order
    id_fields = ["id", "asset_id", "image_id", "file_id", "uuid", "key"]
    asset_id = next((asset[field] for field in id_fields if field in asset), None)

    if asset_id is None:
        # If no ID field found, try to use the asset itself as the ID
        asset_id = str(asset)

    # Prefer the configured key; fall back to the key obtained by login.
    # `or ""` keeps a JSON null from being rendered as the text "None".
    api_key = self.config.get("api_key") or ""
    if not api_key:
        auth_manager = getattr(self, "auth_manager", None)
        api_key = getattr(auth_manager, "api_key", None) or ""

    params = {"key": api_key, "u": asset.get("updated") or ""}

    # Quote the ID so characters such as "/" or "?" cannot alter the path.
    asset_path = quote(str(asset_id), safe="")

    return urljoin(
        self.config["api_url"], f"/v1/media/{asset_path}/full?{urlencode(params)}"
    )
|
||||
|
||||
def get_filename(self, asset: Dict[str, Any], url: str) -> str:
    """
    Generate a filename for the downloaded asset.

    The name comes from the first non-empty of the asset's "fileName",
    "filename", "name" or "title" fields, else from the last path segment
    of the URL. A missing extension is derived from the asset's MIME type
    (default ".jpg"), the name is sanitized, and a numeric suffix is added
    until no file of that name exists in the output directory.

    Args:
        asset: Asset dictionary from the API
        url: Download URL, used as the fallback source of the name

    Returns:
        A sanitized filename that does not yet exist in the output directory
    """
    # Empty or null name fields are skipped so they fall through to the
    # next candidate instead of producing a nameless file.
    filename = ""
    for name_field in ("fileName", "filename", "name", "title"):
        candidate = asset.get(name_field)
        if candidate:
            filename = str(candidate)
            break

    if not filename:
        # Extract filename from URL
        filename = os.path.basename(urlparse(url).path)

    # If no extension, try to get it from content-type or add default
    if "." not in filename:
        if "mimeType" in asset:
            ext = self._get_extension_from_mime(asset["mimeType"])
        elif "content_type" in asset:
            ext = self._get_extension_from_mime(asset["content_type"])
        else:
            ext = ".jpg"  # Default extension
        filename += ext

    # Sanitize filename
    filename = self._sanitize_filename(filename)

    # Ensure unique filename by appending _1, _2, ... before the extension
    stem, ext = os.path.splitext(filename)
    counter = 1
    while (self.output_dir / filename).exists():
        filename = f"{stem}_{counter}{ext}"
        counter += 1

    return filename
|
||||
|
||||
def _get_extension_from_mime(self, mime_type: str) -> str:
    """
    Get file extension from MIME type.

    Args:
        mime_type: MIME type, optionally followed by parameters such as
            "; charset=binary"

    Returns:
        The matching extension including the leading dot, or ".jpg" when
        the type is unknown, empty or not a string
    """
    mime_to_ext = {
        "image/jpeg": ".jpg",
        "image/jpg": ".jpg",
        "image/png": ".png",
        "image/gif": ".gif",
        "image/webp": ".webp",
        "image/bmp": ".bmp",
        "image/tiff": ".tiff",
        "image/svg+xml": ".svg",
    }
    if not isinstance(mime_type, str):
        return ".jpg"

    # Only the type/subtype part identifies the format; drop parameters
    # and surrounding whitespace before the lookup.
    essence = mime_type.split(";", 1)[0].strip().lower()
    return mime_to_ext.get(essence, ".jpg")
|
||||
|
||||
def _sanitize_filename(self, filename: str) -> str:
    """
    Sanitize filename by replacing invalid characters.

    Each of ``<>:"/\\|?*`` and every ASCII control character (including
    NUL, which makes ``open()`` raise) is replaced by an underscore, then
    leading/trailing spaces and dots are stripped.

    Args:
        filename: Proposed filename

    Returns:
        The sanitized filename, or "image" when nothing is left
    """
    invalid_chars = '<>:"/\\|?*' + "".join(map(chr, range(32))) + "\x7f"

    # One translate pass instead of a replace() call per character
    filename = filename.translate(str.maketrans(dict.fromkeys(invalid_chars, "_")))

    # Remove leading/trailing spaces and dots
    filename = filename.strip(". ")

    # Ensure filename is not empty
    return filename or "image"
|
||||
|
||||
async def download_asset(
    self,
    session: aiohttp.ClientSession,
    asset: Dict[str, Any],
    semaphore: asyncio.Semaphore,
) -> bool:
    """
    Download a single asset into the output directory.

    On success the file's modification time is set from the asset's
    "updated" timestamp (best effort) and the asset is recorded in the
    tracker. On failure any partially written file is removed and, when a
    target path had been determined, the failure is recorded in the tracker.

    Args:
        session: Open aiohttp client session used for the request
        asset: Asset dictionary from the API
        semaphore: Limits how many downloads run concurrently

    Returns:
        True if the asset was downloaded or skipped, False on failure
    """
    async with semaphore:
        # Remembered outside the try block so the failure handler reports
        # the path actually used instead of computing a new one.
        filepath = None
        try:
            download_url = self.get_download_url(asset)
            filename = self.get_filename(asset, download_url)
            filepath = self.output_dir / filename

            # Check if file already exists and we're not tracking assets
            if filepath.exists() and not self.asset_tracker:
                self.logger.info(f"Skipping {filename} (already exists)")
                self.stats["skipped"] += 1
                return True

            self.logger.info(f"Downloading {filename} from {download_url}")

            headers = self.config.get("headers", {})
            # aiohttp expects a ClientTimeout; bare numbers are deprecated.
            request_timeout = aiohttp.ClientTimeout(total=self.config["timeout"])
            async with session.get(
                download_url, headers=headers, timeout=request_timeout
            ) as response:
                response.raise_for_status()

                # Get content type to verify it's an image
                content_type = response.headers.get("content-type", "")
                if not content_type.startswith("image/"):
                    self.logger.warning(
                        f"Content type is not an image: {content_type}"
                    )

                # Download the file
                async with aiofiles.open(filepath, "wb") as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)

                # Set file modification time to match the updated timestamp
                if "updated" in asset:
                    try:
                        from datetime import datetime

                        # Parse the ISO timestamp ("Z" suffix means UTC)
                        updated_time = datetime.fromisoformat(
                            asset["updated"].replace("Z", "+00:00")
                        )
                        updated_ts = updated_time.timestamp()
                        # Set file access and modification time
                        os.utime(filepath, (updated_ts, updated_ts))
                        self.logger.info(
                            f"Set file modification time to {asset['updated']}"
                        )
                    except Exception as e:
                        self.logger.warning(
                            f"Failed to set file modification time: {e}"
                        )

                # Mark asset as downloaded in tracker
                if self.asset_tracker:
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, True)

                self.logger.info(f"Successfully downloaded {filename}")
                self.stats["successful"] += 1
                return True

        except Exception as e:
            if filepath is not None:
                # Remove whatever was written so a truncated file is never
                # mistaken for a finished download.
                try:
                    filepath.unlink()
                except OSError:
                    pass  # nothing was written, or it cannot be removed

                # Mark asset as failed in tracker
                if self.asset_tracker:
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, False)

            self.logger.error(
                f"Failed to download asset {asset.get('id', 'unknown')}: {e}"
            )
            self.stats["failed"] += 1
            return False
|
||||
|
||||
async def download_all_assets(self, force_redownload: bool = False):
    """
    Fetch the asset list and download every asset that still needs it.

    Authenticates, retrieves the asset list, optionally narrows it to the
    entries returned by ``asset_tracker.get_new_assets`` and then runs the
    downloads concurrently while a progress bar reports the counters.

    Args:
        force_redownload: If True, download all assets regardless of tracking
    """
    started = time.time()

    # A single pooled session is shared by every request of this run.
    pool = aiohttp.TCPConnector(limit=100, limit_per_host=30)
    session_timeout = aiohttp.ClientTimeout(total=self.config["timeout"])

    async with aiohttp.ClientSession(
        connector=pool, timeout=session_timeout
    ) as session:
        try:
            await self.authenticate()

            listed = await self.get_asset_list(session)
            self.logger.info(f"Retrieved {len(listed)} total assets from API")
            if not listed:
                self.logger.warning("No assets found to download")
                return

            if force_redownload or not self.asset_tracker:
                # No filtering: either tracking is off or the caller asked
                # for everything to be fetched again.
                pending = listed
                if force_redownload:
                    self.logger.info(
                        "Force redownload enabled - downloading all assets"
                    )
            else:
                pending = self.asset_tracker.get_new_assets(listed)
                self.logger.info(
                    f"Found {len(pending)} new/modified assets to download"
                )
                if len(pending) == 0:
                    self.logger.info("All assets are up to date!")
                    return

            self.stats["total"] = len(pending)

            # The semaphore caps how many downloads are in flight at once.
            gate = asyncio.Semaphore(self.config["max_concurrent"])
            jobs = [self.download_asset(session, item, gate) for item in pending]

            with tqdm(total=len(jobs), desc="Downloading assets") as progress:
                for finished in asyncio.as_completed(jobs):
                    await finished
                    progress.update(1)
                    progress.set_postfix(
                        {
                            label: self.stats[counter]
                            for label, counter in (
                                ("Success", "successful"),
                                ("Failed", "failed"),
                                ("Skipped", "skipped"),
                            )
                        }
                    )

        except Exception as err:
            self.logger.error(f"Error during download process: {err}")
            raise

    # Final summary (not reached when one of the early returns above fires).
    elapsed = time.time() - started
    self.logger.info(f"Download completed in {elapsed:.2f} seconds")
    self.logger.info(f"Statistics: {self.stats}")
|
||||
|
||||
|
||||
def main():
    """
    Command-line entry point for the configuration-based image downloader.

    ``--show-stats`` and ``--cleanup`` act on the asset tracker only and exit
    without downloading. Otherwise every asset is downloaded, honouring
    ``--force-redownload``.

    Returns:
        Process exit code: 0 on success, 1 on error or user interruption.
    """
    parser = argparse.ArgumentParser(
        description="Download images using configuration file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
python config_downloader.py --config config.json

# Create a config file first:
cp config/config_example.json config/my_config.json
# Edit config/my_config.json with your API details
python config_downloader.py --config config/my_config.json
""",
    )

    parser.add_argument(
        "--config", required=True, help="Path to the JSON configuration file"
    )
    parser.add_argument(
        "--force-redownload",
        action="store_true",
        help="Force re-download of all assets, even if already tracked",
    )
    parser.add_argument(
        "--show-stats",
        action="store_true",
        help="Show asset tracking statistics and exit",
    )
    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Clean up metadata for missing files and exit",
    )

    args = parser.parse_args()

    # Maintenance commands: work on the tracker and exit without downloading.
    if args.show_stats or args.cleanup:
        try:
            downloader = ConfigImageDownloader(args.config)
            if downloader.asset_tracker:
                # Clean up first so the printed statistics reflect the result.
                if args.cleanup:
                    downloader.asset_tracker.cleanup_missing_files()
                if args.show_stats:
                    downloader.asset_tracker.print_stats()
            else:
                print("Asset tracking is not available")
        except Exception as e:
            print(f"Error: {e}")
            return 1
        return 0

    try:
        downloader = ConfigImageDownloader(args.config)
        asyncio.run(
            downloader.download_all_assets(force_redownload=args.force_redownload)
        )
    except KeyboardInterrupt:
        print("\nDownload interrupted by user")
        # An interrupted run is incomplete, so do not report success.
        return 1
    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0
|
||||
|
||||
|
||||
# Script entry point. ``exit`` is a helper injected by the ``site`` module for
# interactive use and may be absent (e.g. ``python -S``), so raise SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
297
src/config_snapshot_downloader.py
Normal file
297
src/config_snapshot_downloader.py
Normal file
@@ -0,0 +1,297 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Configuration-based Snapshot Downloader for ParentZone
|
||||
|
||||
This script reads configuration from a JSON file and downloads snapshots (daily events)
|
||||
from the ParentZone API with pagination support, generating a comprehensive HTML report.
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import json
|
||||
import logging
|
||||
import os
|
||||
from datetime import datetime, timedelta
|
||||
from pathlib import Path
|
||||
|
||||
# Import the snapshot downloader
|
||||
try:
|
||||
from src.snapshot_downloader import SnapshotDownloader
|
||||
except ImportError:
|
||||
print(
|
||||
"Error: snapshot_downloader.py not found. Please ensure it's in the same directory."
|
||||
)
|
||||
exit(1)
|
||||
|
||||
|
||||
class ConfigSnapshotDownloader:
    """Run a ``SnapshotDownloader`` using settings read from a JSON file."""

    def __init__(self, config_file: str):
        """
        Initialize the downloader with configuration from a JSON file.

        Args:
            config_file: Path to the JSON configuration file

        Raises:
            FileNotFoundError: If the configuration file does not exist.
            ValueError: If the file is not valid JSON, is not a JSON object,
                or lacks authentication settings.
        """
        self.config = self.load_config(config_file)
        self.setup_logging()

        # Create the underlying snapshot downloader
        self.downloader = SnapshotDownloader(
            api_url=self.config.get("api_url", "https://api.parentzone.me"),
            output_dir=self.config.get("output_dir", "snapshots"),
            api_key=self.config.get("api_key"),
            email=self.config.get("email"),
            password=self.config.get("password"),
        )

    def load_config(self, config_file: str) -> dict:
        """
        Load configuration from JSON file, validate it and fill in defaults.

        Args:
            config_file: Path to the JSON configuration file

        Returns:
            The configuration dictionary with defaults applied for
            ``api_url``, ``output_dir``, ``type_ids``, ``max_pages``,
            ``date_from`` (365 days ago) and ``date_to`` (today).

        Raises:
            FileNotFoundError: If the configuration file does not exist.
            ValueError: If the file is not valid JSON, is not a JSON object,
                or provides neither an API key nor email and password.
        """
        try:
            with open(config_file, "r", encoding="utf-8") as f:
                config = json.load(f)
        except FileNotFoundError as e:
            raise FileNotFoundError(
                f"Configuration file not found: {config_file}"
            ) from e
        except json.JSONDecodeError as e:
            raise ValueError(f"Invalid JSON in configuration file: {e}") from e

        # A list or scalar would otherwise fail later with AttributeError.
        if not isinstance(config, dict):
            raise ValueError("Configuration file must contain a JSON object")

        # Validate required authentication
        has_api_key = bool(config.get("api_key"))
        has_credentials = bool(config.get("email") and config.get("password"))
        if not has_api_key and not has_credentials:
            raise ValueError(
                "Either 'api_key' or both 'email' and 'password' must be provided in config"
            )

        # Set defaults for optional fields
        config.setdefault("api_url", "https://api.parentzone.me")
        config.setdefault("output_dir", "snapshots")
        config.setdefault("type_ids", [15])
        config.setdefault("max_pages", None)

        # Set default date range (last year) if not specified; empty values
        # count as unspecified.
        now = datetime.now()
        if not config.get("date_from"):
            config["date_from"] = (now - timedelta(days=365)).strftime("%Y-%m-%d")
        if not config.get("date_to"):
            config["date_to"] = now.strftime("%Y-%m-%d")

        return config

    def setup_logging(self):
        """Create the output directory and log to ``snapshots.log`` and the console."""
        output_dir = Path(self.config["output_dir"])
        # parents=True so a nested output directory can be created in one go.
        output_dir.mkdir(parents=True, exist_ok=True)
        log_file = output_dir / "snapshots.log"

        logging.basicConfig(
            level=logging.INFO,
            format="%(asctime)s - %(levelname)s - %(message)s",
            handlers=[logging.FileHandler(log_file), logging.StreamHandler()],
        )
        self.logger = logging.getLogger(__name__)

    async def download_snapshots(self) -> Path:
        """
        Download snapshots using the configuration settings.

        Returns:
            Path to the generated HTML file, as returned by the underlying
            downloader.

        Raises:
            Exception: Any error from the underlying downloader is logged and
                re-raised unchanged.
        """
        self.logger.info("Starting snapshot download with configuration")
        self.logger.info(
            f"Date range: {self.config['date_from']} to {self.config['date_to']}"
        )
        self.logger.info(f"Type IDs: {self.config['type_ids']}")
        self.logger.info(f"Output directory: {self.config['output_dir']}")

        if self.config.get("max_pages"):
            self.logger.info(f"Max pages limit: {self.config['max_pages']}")

        try:
            return await self.downloader.download_snapshots(
                type_ids=self.config["type_ids"],
                date_from=self.config["date_from"],
                date_to=self.config["date_to"],
                max_pages=self.config.get("max_pages"),
            )
        except Exception as e:
            self.logger.error(f"Error during snapshot download: {e}")
            raise

    def print_config_summary(self):
        """Print a summary of the current configuration (secrets are not shown)."""
        print("=" * 60)
        print("SNAPSHOT DOWNLOADER CONFIGURATION")
        print("=" * 60)
        print(f"API URL: {self.config['api_url']}")
        print(f"Output Directory: {self.config['output_dir']}")
        print(f"Date From: {self.config['date_from']}")
        print(f"Date To: {self.config['date_to']}")
        print(f"Type IDs: {self.config['type_ids']}")

        auth_method = "API Key" if self.config.get("api_key") else "Email/Password"
        print(f"Authentication: {auth_method}")

        if self.config.get("max_pages"):
            print(f"Max Pages: {self.config['max_pages']}")

        print("=" * 60)
|
||||
|
||||
|
||||
def create_example_config():
    """
    Create an example configuration file in the current working directory.

    Writes ``snapshot_config_example.json`` with placeholder values and prints
    a short hint for the user.

    Returns:
        Path of the file that was written.
    """
    example_config = {
        "api_url": "https://api.parentzone.me",
        "output_dir": "./snapshots",
        "type_ids": [15],
        "date_from": "2024-01-01",
        "date_to": "2024-12-31",
        # Python's None is serialized as JSON null; the bare name ``null``
        # does not exist in Python and raised NameError here.
        "max_pages": None,
        "api_key": "your-api-key-here",
        "email": "your-email@example.com",
        "password": "your-password-here",
    }

    config_file = Path("snapshot_config_example.json")
    with open(config_file, "w", encoding="utf-8") as f:
        json.dump(example_config, f, indent=2)

    print(f"✅ Example configuration created: {config_file}")
    print("📝 Edit the file with your credentials and settings")
    return config_file
|
||||
|
||||
|
||||
def main():
    """
    Run the configuration-driven snapshot downloader from the command line.

    Returns:
        0 when the run completes (including ``--create-example`` and runs
        that find no snapshots); 1 when ``--config`` is missing, the run is
        interrupted, or an error occurs.
    """
    usage_notes = """
Examples:
# Use existing config file
python3 config_snapshot_downloader.py --config snapshot_config.json

# Create example config file
python3 config_snapshot_downloader.py --create-example

# Show config summary before downloading
python3 config_snapshot_downloader.py --config snapshot_config.json --show-config

Configuration file format:
{
"api_url": "https://api.parentzone.me",
"output_dir": "./snapshots",
"type_ids": [15],
"date_from": "2024-01-01",
"date_to": "2024-12-31",
"max_pages": null,
"api_key": "your-api-key-here",
"email": "your-email@example.com",
"password": "your-password-here"
}

Notes:
- Either 'api_key' OR both 'email' and 'password' are required
- 'date_from' and 'date_to' default to last year if not specified
- 'type_ids' defaults to [15] (snapshot type)
- 'max_pages' limits pages fetched (useful for testing)
"""
    cli = argparse.ArgumentParser(
        description="Download ParentZone snapshots using configuration file",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog=usage_notes,
    )
    cli.add_argument("--config", help="Path to the JSON configuration file")
    # The remaining options are plain on/off switches.
    for flag, description in (
        ("--create-example", "Create an example configuration file and exit"),
        ("--show-config", "Show configuration summary before downloading"),
        ("--debug", "Enable debug mode with detailed server response logging"),
    ):
        cli.add_argument(flag, action="store_true", help=description)

    options = cli.parse_args()

    # Writing the template needs no configuration, so handle it first.
    if options.create_example:
        create_example_config()
        return 0

    # --config is optional for the parser only so --create-example can be
    # used alone; every other mode needs it.
    if not options.config:
        print("Error: --config argument is required (or use --create-example)")
        print("Run with --help for more information")
        return 1

    try:
        runner = ConfigSnapshotDownloader(options.config)

        if options.show_config:
            runner.print_config_summary()
            print()

        if options.debug:
            print("🔍 DEBUG MODE ENABLED - Detailed server responses will be printed")
            # The flag lives on the wrapped downloader, not on the wrapper.
            runner.downloader.debug_mode = True

        report = asyncio.run(runner.download_snapshots())

        if not report:
            print("⚠️ No snapshots were found for the specified period")
            print("💡 Try adjusting the date range in your configuration")
        else:
            rule = "=" * 60
            for line in (
                "\n" + rule,
                "✅ SUCCESS!",
                rule,
                f"📄 HTML Report: {report}",
                "📁 Open the file in your browser to view the snapshots",
                "🎯 The report includes:",
                " • All snapshots with descriptions and metadata",
                " • Images and attachments (if any)",
                " • Search and filtering capabilities",
                " • Interactive collapsible sections",
                rule,
            ):
                print(line)

    except KeyboardInterrupt:
        print("\n⚠️ Download interrupted by user")
        return 1
    except FileNotFoundError as err:
        print(f"❌ Configuration file error: {err}")
        print("💡 Use --create-example to generate a template")
        return 1
    except ValueError as err:
        print(f"❌ Configuration error: {err}")
        return 1
    except Exception as err:
        print(f"❌ Download failed: {err}")
        return 1

    return 0
|
||||
|
||||
|
||||
# Script entry point. ``exit`` is a helper injected by the ``site`` module for
# interactive use and may be absent (e.g. ``python -S``), so raise SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
596
src/image_downloader.py
Normal file
596
src/image_downloader.py
Normal file
@@ -0,0 +1,596 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
Image Downloader Script
|
||||
|
||||
This script downloads images from a REST API that provides:
|
||||
1. An endpoint to list all assets
|
||||
2. An endpoint to download individual assets in full resolution
|
||||
|
||||
Usage:
|
||||
python image_downloader.py --api-url <base_url> --list-endpoint <endpoint> --download-endpoint <endpoint> --output-dir <directory>
|
||||
"""
|
||||
|
||||
import argparse
|
||||
import asyncio
|
||||
import aiohttp
|
||||
import aiofiles
|
||||
import os
|
||||
import json
|
||||
import logging
|
||||
from pathlib import Path
|
||||
from urllib.parse import urljoin, urlparse
|
||||
from typing import List, Dict, Any, Optional
|
||||
import time
|
||||
from tqdm import tqdm
|
||||
import hashlib
|
||||
|
||||
# Import the auth manager and asset tracker
|
||||
try:
|
||||
from src.auth_manager import AuthManager
|
||||
except ImportError:
|
||||
AuthManager = None
|
||||
|
||||
try:
|
||||
from src.asset_tracker import AssetTracker
|
||||
except ImportError:
|
||||
AssetTracker = None
|
||||
|
||||
|
||||
class ImageDownloader:
|
||||
def __init__(
|
||||
self,
|
||||
api_url: str,
|
||||
list_endpoint: str,
|
||||
download_endpoint: str,
|
||||
output_dir: str,
|
||||
max_concurrent: int = 5,
|
||||
timeout: int = 30,
|
||||
api_key: str = None,
|
||||
email: str = None,
|
||||
password: str = None,
|
||||
track_assets: bool = True,
|
||||
):
|
||||
"""
|
||||
Initialize the image downloader.
|
||||
|
||||
Args:
|
||||
api_url: Base URL of the API
|
||||
list_endpoint: Endpoint to get the list of assets
|
||||
download_endpoint: Endpoint to download individual assets
|
||||
output_dir: Directory to save downloaded images
|
||||
max_concurrent: Maximum number of concurrent downloads
|
||||
timeout: Request timeout in seconds
|
||||
api_key: API key for authentication
|
||||
email: Email for login authentication
|
||||
password: Password for login authentication
|
||||
track_assets: Whether to enable asset tracking to avoid re-downloads
|
||||
"""
|
||||
self.api_url = api_url.rstrip("/")
|
||||
self.list_endpoint = list_endpoint.lstrip("/")
|
||||
self.download_endpoint = download_endpoint.lstrip("/")
|
||||
self.output_dir = Path(output_dir)
|
||||
self.max_concurrent = max_concurrent
|
||||
self.timeout = timeout
|
||||
self.api_key = api_key
|
||||
self.email = email
|
||||
self.password = password
|
||||
self.auth_manager = None
|
||||
|
||||
# Create output directory if it doesn't exist
|
||||
self.output_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(levelname)s - %(message)s",
|
||||
handlers=[
|
||||
logging.FileHandler(self.output_dir / "download.log"),
|
||||
logging.StreamHandler(),
|
||||
],
|
||||
)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
# Initialize asset tracker if enabled and available
|
||||
self.asset_tracker = None
|
||||
if track_assets and AssetTracker:
|
||||
self.asset_tracker = AssetTracker(storage_dir=str(self.output_dir))
|
||||
self.logger.info("Asset tracking enabled")
|
||||
elif track_assets:
|
||||
self.logger.warning(
|
||||
"Asset tracking requested but AssetTracker not available"
|
||||
)
|
||||
else:
|
||||
self.logger.info("Asset tracking disabled")
|
||||
|
||||
# Track download statistics
|
||||
self.stats = {"total": 0, "successful": 0, "failed": 0, "skipped": 0}
|
||||
|
||||
async def authenticate(self):
    """
    Perform login authentication if credentials are provided.

    Does nothing when neither email nor password is set. On success the
    logged-in manager is stored in ``self.auth_manager``.

    Raises:
        Exception: If only one of email/password is set, if the optional
            AuthManager import was unavailable, or if the login fails.
    """
    if not (self.email or self.password):
        # No login credentials configured; nothing to do.
        return

    if not (self.email and self.password):
        self.logger.warning(
            "Both email and password must be provided for login authentication"
        )
        raise Exception(
            "Both email and password must be provided for login authentication"
        )

    if not AuthManager:
        # Both credentials are present, so report the real cause rather than
        # the "both must be provided" message.
        self.logger.error(
            "Login authentication requested but AuthManager is not available"
        )
        raise Exception(
            "Login authentication requested but AuthManager is not available"
        )

    self.logger.info("Attempting login authentication...")
    self.auth_manager = AuthManager(self.api_url)
    success = await self.auth_manager.login(self.email, self.password)

    if not success:
        self.logger.error("Login authentication failed")
        raise Exception("Login authentication failed")

    self.logger.info("Login authentication successful")
|
||||
|
||||
async def get_asset_list(
    self, session: aiohttp.ClientSession
) -> List[Dict[str, Any]]:
    """
    Fetch the list of assets from the API.

    Args:
        session: aiohttp session for making requests

    Returns:
        List of asset dictionaries. A JSON list is returned as is; for a
        JSON object the value under ``data``, ``results`` or ``items`` is
        used, otherwise the object is treated as a single asset.

    Raises:
        ValueError: If the response is neither a JSON list nor object.
        Exception: Request and decoding errors are logged and re-raised.
    """
    # api_url has no trailing slash, and urljoin would replace its last path
    # segment ("https://host/v1" + "assets" -> "https://host/assets"), so add
    # the slash back before joining.
    url = urljoin(self.api_url + "/", self.list_endpoint)
    self.logger.info(f"Fetching asset list from: {url}")

    try:
        headers = {}

        # The API key takes precedence; login headers are used only without it.
        if self.api_key:
            headers["x-api-key"] = self.api_key
        elif self.auth_manager and self.auth_manager.is_authenticated():
            headers.update(self.auth_manager.get_auth_headers())

        # aiohttp expects a ClientTimeout; a bare number is the deprecated form.
        request_timeout = aiohttp.ClientTimeout(total=self.timeout)

        async with session.get(
            url, headers=headers, timeout=request_timeout
        ) as response:
            response.raise_for_status()
            data = await response.json()

        # Handle different response formats
        if isinstance(data, list):
            assets = data
        elif isinstance(data, dict):
            # Common wrapper keys for API responses, in priority order.
            for wrapper in ("data", "results", "items"):
                if wrapper in data:
                    assets = data[wrapper]
                    break
            else:
                assets = [data]  # Single asset
        else:
            raise ValueError(f"Unexpected response format: {type(data)}")

        self.logger.info(f"Found {len(assets)} assets")
        return assets

    except Exception as e:
        self.logger.error(f"Failed to fetch asset list: {e}")
        raise
|
||||
|
||||
def get_download_url(self, asset: Dict[str, Any]) -> str:
    """
    Generate the download URL for an asset.

    The identifier is taken from the first of ``id``, ``asset_id``,
    ``image_id``, ``file_id``, ``uuid`` or ``key`` present in the asset; if
    none yields a value, ``str(asset)`` is used instead.

    Args:
        asset: Asset dictionary from the API

    Returns:
        Download URL for the asset, of the form
        ``<api_url>/v1/media/<id>/full?key=<api_key>&u=<updated>``. The
        ``key`` parameter is omitted when no API key is configured.
    """
    from urllib.parse import urlencode

    # Common field names for asset identifiers, in priority order.
    id_fields = ["id", "asset_id", "image_id", "file_id", "uuid", "key"]
    asset_id = next((asset[field] for field in id_fields if field in asset), None)

    if asset_id is None:
        # If no ID field found, try to use the asset itself as the ID
        asset_id = str(asset)

    # urlencode would turn None into the literal text "None", so leave the
    # key out when there is none and send an empty "u" for a missing timestamp.
    params = {}
    if self.api_key is not None:
        params["key"] = self.api_key
    params["u"] = asset.get("updated") or ""

    return urljoin(self.api_url, f"/v1/media/{asset_id}/full?{urlencode(params)}")
|
||||
|
||||
def get_filename(self, asset: Dict[str, Any], url: str) -> str:
    """
    Pick a file name for an asset that is not yet used in the output directory.

    The name comes from the first of ``fileName``, ``filename``, ``name`` or
    ``title`` found in the asset, else from the last path segment of the URL.
    A name without a dot gets an extension derived from ``mimeType`` or
    ``content_type`` (default ``.jpg``). The name is sanitized and, if a file
    with that name exists, suffixed with ``_1``, ``_2``, ...

    Args:
        asset: Asset dictionary from the API
        url: Download URL

    Returns:
        Filename for the asset
    """
    # Prefer a name from the asset metadata, then fall back to the URL path.
    for name_key in ("fileName", "filename", "name", "title"):
        if name_key in asset:
            filename = asset[name_key]
            break
    else:
        filename = os.path.basename(urlparse(url).path)

    # No dot means no extension: derive one from the MIME type if known.
    if "." not in filename:
        if "mimeType" in asset:
            filename += self._get_extension_from_mime(asset["mimeType"])
        elif "content_type" in asset:
            filename += self._get_extension_from_mime(asset["content_type"])
        else:
            filename += ".jpg"  # Default extension

    filename = self._sanitize_filename(filename)

    # Append an increasing counter until the name is free on disk.
    stem, suffix = os.path.splitext(filename)
    candidate = filename
    attempt = 1
    while (self.output_dir / candidate).exists():
        candidate = f"{stem}_{attempt}{suffix}"
        attempt += 1

    return candidate
|
||||
|
||||
def _get_extension_from_mime(self, mime_type: str) -> str:
|
||||
"""Get file extension from MIME type."""
|
||||
mime_to_ext = {
|
||||
"image/jpeg": ".jpg",
|
||||
"image/jpg": ".jpg",
|
||||
"image/png": ".png",
|
||||
"image/gif": ".gif",
|
||||
"image/webp": ".webp",
|
||||
"image/bmp": ".bmp",
|
||||
"image/tiff": ".tiff",
|
||||
"image/svg+xml": ".svg",
|
||||
}
|
||||
return mime_to_ext.get(mime_type.lower(), ".jpg")
|
||||
|
||||
def _sanitize_filename(self, filename: str) -> str:
|
||||
"""Sanitize filename by removing invalid characters."""
|
||||
# Remove or replace invalid characters
|
||||
invalid_chars = '<>:"/\\|?*'
|
||||
for char in invalid_chars:
|
||||
filename = filename.replace(char, "_")
|
||||
|
||||
# Remove leading/trailing spaces and dots
|
||||
filename = filename.strip(". ")
|
||||
|
||||
# Ensure filename is not empty
|
||||
if not filename:
|
||||
filename = "image"
|
||||
|
||||
return filename
|
||||
|
||||
async def download_asset(
    self,
    session: aiohttp.ClientSession,
    asset: Dict[str, Any],
    semaphore: asyncio.Semaphore,
) -> bool:
    """
    Download a single asset.

    Streams the asset to the output directory, sets the file's timestamps
    from the asset's ``updated`` value when present, updates ``self.stats``
    and records the outcome in the asset tracker when one is configured.
    Errors are logged and reported through the return value, not raised.

    Args:
        session: aiohttp session for making requests
        asset: Asset dictionary from the API
        semaphore: Semaphore to limit concurrent downloads

    Returns:
        True if download was successful (or skipped), False otherwise
    """
    async with semaphore:
        # Remember the target path so the failure handler reports the same
        # file instead of recomputing (and possibly changing) it.
        filepath = None
        try:
            download_url = self.get_download_url(asset)
            filename = self.get_filename(asset, download_url)
            filepath = self.output_dir / filename

            # Check if file already exists and we're not tracking assets.
            # NOTE(review): get_filename appears to always return a name that
            # does not exist yet, so this branch may never trigger - confirm.
            if filepath.exists() and not self.asset_tracker:
                self.logger.info(f"Skipping {filename} (already exists)")
                self.stats["skipped"] += 1
                return True

            # Log the URL without its query string, which carries the API key.
            self.logger.info(
                f"Downloading {filename} from {download_url.split('?', 1)[0]}"
            )

            # Send the same authentication headers as the asset list request.
            headers = {}
            if self.api_key:
                headers["x-api-key"] = self.api_key
            elif self.auth_manager and self.auth_manager.is_authenticated():
                headers.update(self.auth_manager.get_auth_headers())

            # aiohttp expects a ClientTimeout; a bare number is the
            # deprecated form.
            request_timeout = aiohttp.ClientTimeout(total=self.timeout)

            async with session.get(
                download_url, headers=headers, timeout=request_timeout
            ) as response:
                response.raise_for_status()

                # A non-image content type is only warned about; the body is
                # still saved.
                content_type = response.headers.get("content-type", "")
                if not content_type.startswith("image/"):
                    self.logger.warning(
                        f"Content type is not an image: {content_type}"
                    )

                # Stream to disk in chunks so large files stay out of memory.
                async with aiofiles.open(filepath, "wb") as f:
                    async for chunk in response.content.iter_chunked(8192):
                        await f.write(chunk)

                # Set file modification time to match the updated timestamp
                if "updated" in asset:
                    try:
                        from datetime import datetime

                        # fromisoformat does not accept a trailing "Z" on
                        # older Python versions; use an explicit offset.
                        updated_time = datetime.fromisoformat(
                            asset["updated"].replace("Z", "+00:00")
                        )
                        stamp = updated_time.timestamp()
                        os.utime(filepath, (stamp, stamp))
                        self.logger.info(
                            f"Set file modification time to {asset['updated']}"
                        )
                    except Exception as e:
                        # Best effort: a bad timestamp must not fail the download.
                        self.logger.warning(
                            f"Failed to set file modification time: {e}"
                        )

                # Mark asset as downloaded in tracker
                if self.asset_tracker:
                    self.asset_tracker.mark_asset_downloaded(asset, filepath, True)

                self.logger.info(f"Successfully downloaded {filename}")
                self.stats["successful"] += 1
                return True

        except Exception as e:
            # Mark asset as failed in tracker. If the failure happened before
            # a path could be determined there is nothing to record.
            if self.asset_tracker and filepath is not None:
                self.asset_tracker.mark_asset_downloaded(asset, filepath, False)

            self.logger.error(
                f"Failed to download asset {asset.get('id', 'unknown')}: {e}"
            )
            self.stats["failed"] += 1
            return False
|
||||
|
||||
async def download_all_assets(self, force_redownload: bool = False):
    """
    Fetch the asset list and download every asset that still needs it.

    Authenticates, retrieves the asset list, optionally narrows it to the
    entries returned by ``asset_tracker.get_new_assets`` and then runs the
    downloads concurrently while a progress bar reports the counters.

    Args:
        force_redownload: If True, download all assets regardless of tracking
    """
    started = time.time()

    # A single pooled session is shared by every request of this run.
    pool = aiohttp.TCPConnector(limit=100, limit_per_host=30)
    session_timeout = aiohttp.ClientTimeout(total=self.timeout)

    async with aiohttp.ClientSession(
        connector=pool, timeout=session_timeout
    ) as session:
        try:
            await self.authenticate()

            listed = await self.get_asset_list(session)
            self.logger.info(f"Retrieved {len(listed)} total assets from API")
            if not listed:
                self.logger.warning("No assets found to download")
                return

            if force_redownload or not self.asset_tracker:
                # No filtering: either tracking is off or the caller asked
                # for everything to be fetched again.
                pending = listed
                if force_redownload:
                    self.logger.info(
                        "Force redownload enabled - downloading all assets"
                    )
            else:
                pending = self.asset_tracker.get_new_assets(listed)
                self.logger.info(
                    f"Found {len(pending)} new/modified assets to download"
                )
                if len(pending) == 0:
                    self.logger.info("All assets are up to date!")
                    return

            self.stats["total"] = len(pending)

            # The semaphore caps how many downloads are in flight at once.
            gate = asyncio.Semaphore(self.max_concurrent)
            jobs = [self.download_asset(session, item, gate) for item in pending]

            with tqdm(total=len(jobs), desc="Downloading assets") as progress:
                for finished in asyncio.as_completed(jobs):
                    await finished
                    progress.update(1)
                    progress.set_postfix(
                        {
                            label: self.stats[counter]
                            for label, counter in (
                                ("Success", "successful"),
                                ("Failed", "failed"),
                                ("Skipped", "skipped"),
                            )
                        }
                    )

        except Exception as err:
            self.logger.error(f"Error during download process: {err}")
            raise

    # Final summary (not reached when one of the early returns above fires).
    elapsed = time.time() - started
    self.logger.info(f"Download completed in {elapsed:.2f} seconds")
    self.logger.info(f"Statistics: {self.stats}")
|
||||
|
||||
|
||||
def main():
    """
    Main function to run the image downloader.

    ``--show-stats`` and ``--cleanup`` act on the asset tracker stored in the
    output directory and exit without downloading. Otherwise an
    ``ImageDownloader`` is built from the arguments and all assets are
    downloaded.

    Returns:
        Process exit code: 0 on success, 1 on error or user interruption.
    """
    parser = argparse.ArgumentParser(
        description="Download images from a REST API",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Basic usage
python image_downloader.py --api-url "https://api.example.com" \\
--list-endpoint "/assets" \\
--download-endpoint "/download" \\
--output-dir "./images"

# With custom concurrent downloads and timeout
python image_downloader.py --api-url "https://api.example.com" \\
--list-endpoint "/assets" \\
--download-endpoint "/download" \\
--output-dir "./images" \\
--max-concurrent 10 \\
--timeout 60
""",
    )

    parser.add_argument(
        "--api-url",
        required=True,
        help="Base URL of the API (e.g., https://api.example.com)",
    )
    parser.add_argument(
        "--list-endpoint",
        required=True,
        help="Endpoint to get the list of assets (e.g., /assets or /images)",
    )
    parser.add_argument(
        "--download-endpoint",
        required=True,
        help="Endpoint to download individual assets (e.g., /download or /assets)",
    )
    parser.add_argument(
        "--output-dir", required=True, help="Directory to save downloaded images"
    )
    parser.add_argument(
        "--max-concurrent",
        type=int,
        default=5,
        help="Maximum number of concurrent downloads (default: 5)",
    )
    parser.add_argument(
        "--timeout",
        type=int,
        default=30,
        help="Request timeout in seconds (default: 30)",
    )
    parser.add_argument(
        "--api-key", help="API key for authentication (x-api-key header)"
    )
    parser.add_argument("--email", help="Email for login authentication")
    parser.add_argument("--password", help="Password for login authentication")
    parser.add_argument(
        "--no-tracking",
        action="store_true",
        help="Disable asset tracking (will re-download all assets)",
    )
    parser.add_argument(
        "--force-redownload",
        action="store_true",
        help="Force re-download of all assets, even if already tracked",
    )
    parser.add_argument(
        "--show-stats",
        action="store_true",
        help="Show asset tracking statistics and exit",
    )
    parser.add_argument(
        "--cleanup",
        action="store_true",
        help="Clean up metadata for missing files and exit",
    )

    args = parser.parse_args()

    # Maintenance commands: work on the tracker and exit without downloading.
    if args.show_stats or args.cleanup:
        if AssetTracker:
            tracker = AssetTracker(storage_dir=args.output_dir)
            # Clean up first so the printed statistics reflect the result.
            if args.cleanup:
                tracker.cleanup_missing_files()
            if args.show_stats:
                tracker.print_stats()
        else:
            print("Asset tracking is not available")
        # Explicit code, consistent with the other return paths.
        return 0

    # Create the image downloader
    downloader = ImageDownloader(
        api_url=args.api_url,
        list_endpoint=args.list_endpoint,
        download_endpoint=args.download_endpoint,
        output_dir=args.output_dir,
        max_concurrent=args.max_concurrent,
        timeout=args.timeout,
        api_key=args.api_key,
        email=args.email,
        password=args.password,
        track_assets=not args.no_tracking,
    )

    try:
        asyncio.run(
            downloader.download_all_assets(force_redownload=args.force_redownload)
        )
    except KeyboardInterrupt:
        print("\nDownload interrupted by user")
        # An interrupted run is incomplete, so do not report success.
        return 1
    except Exception as e:
        print(f"Error: {e}")
        return 1

    return 0
|
||||
|
||||
|
||||
# Script entry point. ``exit`` is a helper injected by the ``site`` module for
# interactive use and may be absent (e.g. ``python -S``), so raise SystemExit.
if __name__ == "__main__":
    raise SystemExit(main())
|
||||
1248
src/snapshot_downloader.py
Normal file
1248
src/snapshot_downloader.py
Normal file
File diff suppressed because it is too large
Load Diff
523
src/webserver.py
Normal file
523
src/webserver.py
Normal file
@@ -0,0 +1,523 @@
|
||||
#!/usr/bin/env python3
|
||||
"""
|
||||
ParentZone Snapshots Web Server
|
||||
|
||||
A simple web server that serves HTML snapshot files and their assets.
|
||||
Provides a directory listing and serves static files from the snapshots folder.
|
||||
"""
|
||||
|
||||
import argparse
import asyncio
import html
import logging
import os
from datetime import datetime
from email.utils import formatdate
from pathlib import Path
from urllib.parse import quote, unquote

import aiohttp
from aiohttp import web, hdrs
from aiohttp.web_response import Response
|
||||
|
||||
|
||||
class SnapshotsWebServer:
|
||||
def __init__(
|
||||
self,
|
||||
snapshots_dir: str = "./snapshots",
|
||||
port: int = 8080,
|
||||
host: str = "0.0.0.0",
|
||||
):
|
||||
self.snapshots_dir = Path(snapshots_dir).resolve()
|
||||
self.port = port
|
||||
self.host = host
|
||||
|
||||
# Setup logging
|
||||
logging.basicConfig(
|
||||
level=logging.INFO,
|
||||
format="%(asctime)s - %(name)s - %(levelname)s - %(message)s",
|
||||
)
|
||||
self.logger = logging.getLogger(__name__)
|
||||
|
||||
# Ensure snapshots directory exists
|
||||
self.snapshots_dir.mkdir(parents=True, exist_ok=True)
|
||||
|
||||
self.logger.info(f"Serving snapshots from: {self.snapshots_dir}")
|
||||
|
||||
async def index_handler(self, request):
|
||||
"""Serve the main directory listing page."""
|
||||
try:
|
||||
html_files = []
|
||||
|
||||
# Find all HTML files in the snapshots directory
|
||||
for file_path in self.snapshots_dir.glob("*.html"):
|
||||
stat = file_path.stat()
|
||||
html_files.append(
|
||||
{
|
||||
"name": file_path.name,
|
||||
"size": stat.st_size,
|
||||
"modified": datetime.fromtimestamp(stat.st_mtime),
|
||||
"path": file_path.name,
|
||||
}
|
||||
)
|
||||
|
||||
# Sort by modification time (newest first)
|
||||
html_files.sort(key=lambda x: x["modified"], reverse=True)
|
||||
|
||||
# Generate HTML page
|
||||
html_content = self._generate_index_html(html_files)
|
||||
|
||||
return web.Response(text=html_content, content_type="text/html")
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error generating index: {e}")
|
||||
return web.Response(
|
||||
text=f"<h1>Error</h1><p>Could not generate directory listing: {e}</p>",
|
||||
status=500,
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
def _generate_index_html(self, html_files):
|
||||
"""Generate the HTML directory listing page."""
|
||||
files_list = ""
|
||||
|
||||
if not html_files:
|
||||
files_list = "<p class='no-files'>No snapshot files found.</p>"
|
||||
else:
|
||||
for file_info in html_files:
|
||||
size_mb = file_info["size"] / (1024 * 1024)
|
||||
files_list += f"""
|
||||
<div class="file-item">
|
||||
<div class="file-info">
|
||||
<h3><a href="/{file_info["path"]}" class="file-link">{file_info["name"]}</a></h3>
|
||||
<div class="file-meta">
|
||||
<span class="file-size">{size_mb:.2f} MB</span>
|
||||
<span class="file-date">{file_info["modified"].strftime("%Y-%m-%d %H:%M:%S")}</span>
|
||||
</div>
|
||||
</div>
|
||||
</div>
|
||||
"""
|
||||
|
||||
return f"""
|
||||
<!DOCTYPE html>
|
||||
<html lang="en">
|
||||
<head>
|
||||
<meta charset="UTF-8">
|
||||
<meta name="viewport" content="width=device-width, initial-scale=1.0">
|
||||
<title>ParentZone Snapshots</title>
|
||||
<style>
|
||||
* {{
|
||||
margin: 0;
|
||||
padding: 0;
|
||||
box-sizing: border-box;
|
||||
}}
|
||||
|
||||
body {{
|
||||
font-family: -apple-system, BlinkMacSystemFont, 'Segoe UI', Roboto, sans-serif;
|
||||
line-height: 1.6;
|
||||
color: #333;
|
||||
background-color: #f5f5f5;
|
||||
padding: 20px;
|
||||
}}
|
||||
|
||||
.container {{
|
||||
max-width: 1000px;
|
||||
margin: 0 auto;
|
||||
}}
|
||||
|
||||
.header {{
|
||||
background: white;
|
||||
padding: 30px;
|
||||
border-radius: 10px;
|
||||
margin-bottom: 30px;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
text-align: center;
|
||||
}}
|
||||
|
||||
.header h1 {{
|
||||
color: #2c3e50;
|
||||
margin-bottom: 10px;
|
||||
font-size: 2.5em;
|
||||
}}
|
||||
|
||||
.header p {{
|
||||
color: #7f8c8d;
|
||||
font-size: 1.1em;
|
||||
}}
|
||||
|
||||
.files-container {{
|
||||
background: white;
|
||||
border-radius: 10px;
|
||||
box-shadow: 0 2px 10px rgba(0,0,0,0.1);
|
||||
overflow: hidden;
|
||||
}}
|
||||
|
||||
.files-header {{
|
||||
background: #3498db;
|
||||
color: white;
|
||||
padding: 20px;
|
||||
font-size: 1.2em;
|
||||
font-weight: bold;
|
||||
}}
|
||||
|
||||
.file-item {{
|
||||
border-bottom: 1px solid #ecf0f1;
|
||||
padding: 20px;
|
||||
transition: background-color 0.2s;
|
||||
}}
|
||||
|
||||
.file-item:last-child {{
|
||||
border-bottom: none;
|
||||
}}
|
||||
|
||||
.file-item:hover {{
|
||||
background-color: #f8f9fa;
|
||||
}}
|
||||
|
||||
.file-link {{
|
||||
color: #2c3e50;
|
||||
text-decoration: none;
|
||||
font-size: 1.1em;
|
||||
font-weight: 500;
|
||||
}}
|
||||
|
||||
.file-link:hover {{
|
||||
color: #3498db;
|
||||
text-decoration: underline;
|
||||
}}
|
||||
|
||||
.file-meta {{
|
||||
margin-top: 8px;
|
||||
display: flex;
|
||||
gap: 20px;
|
||||
color: #7f8c8d;
|
||||
font-size: 0.9em;
|
||||
}}
|
||||
|
||||
.no-files {{
|
||||
padding: 40px;
|
||||
text-align: center;
|
||||
color: #7f8c8d;
|
||||
font-size: 1.1em;
|
||||
}}
|
||||
|
||||
.footer {{
|
||||
margin-top: 30px;
|
||||
text-align: center;
|
||||
color: #7f8c8d;
|
||||
font-size: 0.9em;
|
||||
}}
|
||||
|
||||
@media (max-width: 600px) {{
|
||||
.file-meta {{
|
||||
flex-direction: column;
|
||||
gap: 5px;
|
||||
}}
|
||||
|
||||
.header h1 {{
|
||||
font-size: 2em;
|
||||
}}
|
||||
}}
|
||||
</style>
|
||||
</head>
|
||||
<body>
|
||||
<div class="container">
|
||||
<div class="header">
|
||||
<h1>📸 ParentZone Snapshots</h1>
|
||||
<p>Browse and view your downloaded snapshot files</p>
|
||||
</div>
|
||||
|
||||
<div class="files-container">
|
||||
<div class="files-header">
|
||||
📁 Available Snapshot Files ({len(html_files)} files)
|
||||
</div>
|
||||
{files_list}
|
||||
</div>
|
||||
|
||||
<div class="footer">
|
||||
<p>Served from: {self.snapshots_dir}</p>
|
||||
<p>Server running on {self.host}:{self.port}</p>
|
||||
</div>
|
||||
</div>
|
||||
</body>
|
||||
</html>
|
||||
"""
|
||||
|
||||
async def file_handler(self, request):
|
||||
"""Serve individual HTML files and their assets."""
|
||||
try:
|
||||
# Get the requested file path
|
||||
file_path = unquote(request.match_info["filename"])
|
||||
requested_file = self.snapshots_dir / file_path
|
||||
|
||||
# Security check: ensure the file is within the snapshots directory
|
||||
try:
|
||||
requested_file.resolve().relative_to(self.snapshots_dir.resolve())
|
||||
except ValueError:
|
||||
self.logger.warning(f"Attempted path traversal: {file_path}")
|
||||
return web.Response(
|
||||
text="<h1>403 Forbidden</h1><p>Access denied.</p>",
|
||||
status=403,
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
# Check if file exists
|
||||
if not requested_file.exists():
|
||||
return web.Response(
|
||||
text="<h1>404 Not Found</h1><p>The requested file was not found.</p>",
|
||||
status=404,
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
# Determine content type
|
||||
content_type = self._get_content_type(requested_file)
|
||||
|
||||
# Read and serve the file
|
||||
with open(requested_file, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
return web.Response(
|
||||
body=content,
|
||||
content_type=content_type,
|
||||
headers={
|
||||
"Cache-Control": "public, max-age=3600",
|
||||
"Last-Modified": datetime.fromtimestamp(
|
||||
requested_file.stat().st_mtime
|
||||
).strftime("%a, %d %b %Y %H:%M:%S GMT"),
|
||||
},
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Error serving file {request.match_info.get('filename', 'unknown')}: {e}"
|
||||
)
|
||||
return web.Response(
|
||||
text=f"<h1>500 Internal Server Error</h1><p>Could not serve file: {e}</p>",
|
||||
status=500,
|
||||
content_type="text/html",
|
||||
)
|
||||
|
||||
async def assets_handler(self, request):
|
||||
"""Serve asset files (images, CSS, JS, etc.) from assets subdirectories."""
|
||||
try:
|
||||
# Get the requested asset path
|
||||
asset_path = unquote(request.match_info["path"])
|
||||
requested_file = self.snapshots_dir / "assets" / asset_path
|
||||
|
||||
# Security check
|
||||
try:
|
||||
requested_file.resolve().relative_to(self.snapshots_dir.resolve())
|
||||
except ValueError:
|
||||
self.logger.warning(f"Attempted path traversal in assets: {asset_path}")
|
||||
return web.Response(text="403 Forbidden", status=403)
|
||||
|
||||
# Check if file exists
|
||||
if not requested_file.exists():
|
||||
return web.Response(text="404 Not Found", status=404)
|
||||
|
||||
# Determine content type
|
||||
content_type = self._get_content_type(requested_file)
|
||||
|
||||
# Read and serve the file
|
||||
with open(requested_file, "rb") as f:
|
||||
content = f.read()
|
||||
|
||||
return web.Response(
|
||||
body=content,
|
||||
content_type=content_type,
|
||||
headers={
|
||||
"Cache-Control": "public, max-age=86400", # Cache assets for 24 hours
|
||||
"Last-Modified": datetime.fromtimestamp(
|
||||
requested_file.stat().st_mtime
|
||||
).strftime("%a, %d %b %Y %H:%M:%S GMT"),
|
||||
},
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(
|
||||
f"Error serving asset {request.match_info.get('path', 'unknown')}: {e}"
|
||||
)
|
||||
return web.Response(text="500 Internal Server Error", status=500)
|
||||
|
||||
def _get_content_type(self, file_path: Path) -> str:
|
||||
"""Determine the content type based on file extension."""
|
||||
suffix = file_path.suffix.lower()
|
||||
|
||||
content_types = {
|
||||
".html": "text/html; charset=utf-8",
|
||||
".css": "text/css; charset=utf-8",
|
||||
".js": "application/javascript; charset=utf-8",
|
||||
".json": "application/json; charset=utf-8",
|
||||
".jpg": "image/jpeg",
|
||||
".jpeg": "image/jpeg",
|
||||
".png": "image/png",
|
||||
".gif": "image/gif",
|
||||
".webp": "image/webp",
|
||||
".svg": "image/svg+xml",
|
||||
".ico": "image/x-icon",
|
||||
".pdf": "application/pdf",
|
||||
".txt": "text/plain; charset=utf-8",
|
||||
".log": "text/plain; charset=utf-8",
|
||||
}
|
||||
|
||||
return content_types.get(suffix, "application/octet-stream")
|
||||
|
||||
def setup_routes(self, app):
|
||||
"""Configure the web application routes."""
|
||||
# Main index page
|
||||
app.router.add_get("/", self.index_handler)
|
||||
|
||||
# Serve HTML files directly
|
||||
app.router.add_get("/{filename:.+\.html}", self.file_handler)
|
||||
|
||||
# Serve assets (images, CSS, JS, etc.)
|
||||
app.router.add_get("/assets/{path:.+}", self.assets_handler)
|
||||
|
||||
# Serve other static files (logs, etc.)
|
||||
app.router.add_get(
|
||||
"/{filename:.+\.(css|js|json|txt|log|ico)}", self.file_handler
|
||||
)
|
||||
|
||||
async def create_app(self):
|
||||
"""Create and configure the web application."""
|
||||
app = web.Application()
|
||||
|
||||
# Setup routes
|
||||
self.setup_routes(app)
|
||||
|
||||
# Add middleware for logging
|
||||
async def logging_middleware(request, handler):
|
||||
start_time = datetime.now()
|
||||
|
||||
# Get client IP address
|
||||
def get_client_ip():
|
||||
# Check for forwarded header first
|
||||
forwarded = request.headers.get("X-Forwarded-For")
|
||||
if forwarded:
|
||||
return forwarded.split(",")[0].strip()
|
||||
|
||||
# Try to get from transport
|
||||
try:
|
||||
if request.transport:
|
||||
peername = request.transport.get_extra_info("peername")
|
||||
if peername:
|
||||
return peername[0]
|
||||
except:
|
||||
pass
|
||||
|
||||
return "unknown"
|
||||
|
||||
try:
|
||||
response = await handler(request)
|
||||
|
||||
# Log the request
|
||||
duration = (datetime.now() - start_time).total_seconds()
|
||||
remote_addr = get_client_ip()
|
||||
self.logger.info(
|
||||
f"{remote_addr} - {request.method} {request.path} - "
|
||||
f"{response.status} - {duration:.3f}s"
|
||||
)
|
||||
|
||||
return response
|
||||
|
||||
except Exception as e:
|
||||
duration = (datetime.now() - start_time).total_seconds()
|
||||
remote_addr = get_client_ip()
|
||||
self.logger.error(
|
||||
f"{remote_addr} - {request.method} {request.path} - "
|
||||
f"ERROR: {e} - {duration:.3f}s"
|
||||
)
|
||||
raise
|
||||
|
||||
app.middlewares.append(logging_middleware)
|
||||
|
||||
return app
|
||||
|
||||
async def start_server(self):
|
||||
"""Start the web server."""
|
||||
app = await self.create_app()
|
||||
|
||||
runner = web.AppRunner(app)
|
||||
await runner.setup()
|
||||
|
||||
site = web.TCPSite(runner, self.host, self.port)
|
||||
await site.start()
|
||||
|
||||
self.logger.info(f"🚀 ParentZone Snapshots Web Server started!")
|
||||
self.logger.info(f"📂 Serving files from: {self.snapshots_dir}")
|
||||
self.logger.info(f"🌐 Server running at: http://{self.host}:{self.port}")
|
||||
self.logger.info(f"🔗 Open in browser: http://localhost:{self.port}")
|
||||
self.logger.info("Press Ctrl+C to stop the server")
|
||||
|
||||
return runner
|
||||
|
||||
|
||||
def main():
    """Command-line entry point for the snapshots web server.

    Parses ``--snapshots-dir``, ``--port`` and ``--host``, builds a
    ``SnapshotsWebServer`` from them and keeps it running until the process
    is interrupted; the runner is cleaned up on the way out.
    """
    cli = argparse.ArgumentParser(
        description="ParentZone Snapshots Web Server",
        formatter_class=argparse.RawDescriptionHelpFormatter,
        epilog="""
Examples:
# Start server with default settings
python webserver.py

# Start server on custom port
python webserver.py --port 3000

# Serve from custom directory
python webserver.py --snapshots-dir /path/to/snapshots

# Start server on all interfaces
python webserver.py --host 0.0.0.0 --port 8080
""",
    )

    # Each option is declared as (flag, keyword arguments for add_argument).
    option_specs = (
        (
            "--snapshots-dir",
            {
                "default": "./snapshots",
                "help": "Directory containing snapshot files (default: ./snapshots)",
            },
        ),
        (
            "--port",
            {
                "type": int,
                "default": 8080,
                "help": "Port to run the server on (default: 8080)",
            },
        ),
        (
            "--host",
            {
                "default": "0.0.0.0",
                "help": "Host to bind the server to (default: 0.0.0.0)",
            },
        ),
    )
    for flag, spec in option_specs:
        cli.add_argument(flag, **spec)

    options = cli.parse_args()

    web_server = SnapshotsWebServer(
        snapshots_dir=options.snapshots_dir,
        port=options.port,
        host=options.host,
    )

    async def serve_until_stopped():
        """Start the server, idle in one-second sleeps, always clean up."""
        active_runner = None
        try:
            active_runner = await web_server.start_server()
            while True:
                await asyncio.sleep(1)
        except KeyboardInterrupt:
            print("\n👋 Shutting down server...")
        except Exception as exc:
            print(f"❌ Server error: {exc}")
        finally:
            if active_runner:
                await active_runner.cleanup()

    try:
        asyncio.run(serve_until_stopped())
    except KeyboardInterrupt:
        print("\n✅ Server stopped")
|
||||
|
||||
|
||||
# Start the web server only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    main()
|
||||
Reference in New Issue
Block a user