feat: incremental snapshot fetch with JSON cache and state file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
Tudor Sitaru
2026-05-16 20:22:54 +01:00
parent 68d5996165
commit 5524373e78
2 changed files with 115 additions and 44 deletions
+44 -44
View File
@@ -587,8 +587,8 @@ class SnapshotDownloader:
Args:
snapshots: List of snapshot dictionaries
date_from: Start date
date_to: End date
date_from: Start date shown in the HTML page title (does not affect the filename)
date_to: End date shown in the HTML page title (does not affect the filename)
Returns:
Path to the generated HTML file
@@ -598,7 +598,7 @@ class SnapshotDownloader:
snapshots, key=lambda x: x.get("startTime", ""), reverse=True
)
# Generate filename
# Fixed filename — overwritten on each run
filename = "snapshots.html"
filepath = self.output_dir / filename
@@ -1021,59 +1021,59 @@ class SnapshotDownloader:
max_pages: int = None,
) -> Path:
"""
Download all snapshots and generate HTML file.
Download new snapshots incrementally and regenerate snapshots.html.
Args:
type_ids: List of type IDs to filter by (default: [15])
date_from: Start date in YYYY-MM-DD format
date_to: End date in YYYY-MM-DD format
max_pages: Maximum number of pages to fetch
Returns:
Path to generated HTML file
date_from is used only on the first run (no last_run.json).
date_to is always today regardless of what is passed.
"""
# Set default dates if not provided
if date_from is None:
# Default to 1 year ago
date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
if date_to is None:
date_to = datetime.now().strftime("%Y-%m-%d")
date_to = datetime.now().strftime("%Y-%m-%d")
self.logger.info(
f"Starting snapshot download for period {date_from} to {date_to}"
)
# Determine fetch window start
last_run_date = self.load_last_run_date()
if last_run_date:
fetch_from = last_run_date
self.logger.info(f"Incremental run: fetching from {fetch_from}")
else:
if date_from is None:
date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
fetch_from = date_from
self.logger.info(f"First run: fetching all snapshots from {fetch_from}")
self.logger.info(f"Fetch window: {fetch_from} to {date_to}")
# Load accumulated snapshot data
existing_snapshots = self.load_snapshot_cache()
self.logger.info(f"Loaded {len(existing_snapshots)} snapshots from cache")
# Create aiohttp session
connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
timeout = aiohttp.ClientTimeout(total=30)
async with aiohttp.ClientSession(
connector=connector, timeout=timeout
) as session:
try:
# Authenticate if needed
await self.authenticate()
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
# Authenticate if needed
await self.authenticate()
# Fetch all snapshots
snapshots = await self.fetch_all_snapshots(
session, type_ids, date_from, date_to, max_pages
)
# Fetch only new snapshots
new_snapshots = await self.fetch_all_snapshots(
session, type_ids, fetch_from, date_to, max_pages
)
if not snapshots:
self.logger.warning("No snapshots found for the specified period")
return None
# Merge: deduplicate by id
existing_ids = {s.get("id") for s in existing_snapshots}
added = [s for s in new_snapshots if s.get("id") not in existing_ids]
merged = existing_snapshots + added
self.logger.info(f"Added {len(added)} new snapshots (total: {len(merged)})")
# Generate HTML file
html_file = await self.generate_html_file(snapshots, date_from, date_to)
if not merged:
self.logger.warning("No snapshots found")
return None
# Print statistics
self.print_statistics()
# Persist updated cache and state
self.save_snapshot_cache(merged)
html_file = await self.generate_html_file(merged, date_from or fetch_from, date_to)
self.save_last_run_date(date_to)
return html_file
except Exception as e:
self.logger.error(f"Error during snapshot download: {e}")
raise
self.print_statistics()
return html_file
def print_statistics(self):
"""Print download statistics."""