feat: incremental snapshot fetch with JSON cache and state file
Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
This commit is contained in:
+44
-44
@@ -587,8 +587,8 @@ class SnapshotDownloader:
|
||||
|
||||
Args:
|
||||
snapshots: List of snapshot dictionaries
|
||||
date_from: Start date
|
||||
date_to: End date
|
||||
date_from: Start date shown in the HTML page title (does not affect the filename)
|
||||
date_to: End date shown in the HTML page title (does not affect the filename)
|
||||
|
||||
Returns:
|
||||
Path to the generated HTML file
|
||||
@@ -598,7 +598,7 @@ class SnapshotDownloader:
|
||||
snapshots, key=lambda x: x.get("startTime", ""), reverse=True
|
||||
)
|
||||
|
||||
# Generate filename
|
||||
# Fixed filename — overwritten on each run
|
||||
filename = "snapshots.html"
|
||||
filepath = self.output_dir / filename
|
||||
|
||||
@@ -1021,59 +1021,59 @@ class SnapshotDownloader:
|
||||
max_pages: int = None,
|
||||
) -> Path:
|
||||
"""
|
||||
Download all snapshots and generate HTML file.
|
||||
Download new snapshots incrementally and regenerate snapshots.html.
|
||||
|
||||
Args:
|
||||
type_ids: List of type IDs to filter by (default: [15])
|
||||
date_from: Start date in YYYY-MM-DD format
|
||||
date_to: End date in YYYY-MM-DD format
|
||||
max_pages: Maximum number of pages to fetch
|
||||
|
||||
Returns:
|
||||
Path to generated HTML file
|
||||
date_from is used only on the first run (no last_run.json).
|
||||
date_to is always today regardless of what is passed.
|
||||
"""
|
||||
# Set default dates if not provided
|
||||
if date_from is None:
|
||||
# Default to 1 year ago
|
||||
date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
|
||||
if date_to is None:
|
||||
date_to = datetime.now().strftime("%Y-%m-%d")
|
||||
date_to = datetime.now().strftime("%Y-%m-%d")
|
||||
|
||||
self.logger.info(
|
||||
f"Starting snapshot download for period {date_from} to {date_to}"
|
||||
)
|
||||
# Determine fetch window start
|
||||
last_run_date = self.load_last_run_date()
|
||||
if last_run_date:
|
||||
fetch_from = last_run_date
|
||||
self.logger.info(f"Incremental run: fetching from {fetch_from}")
|
||||
else:
|
||||
if date_from is None:
|
||||
date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
|
||||
fetch_from = date_from
|
||||
self.logger.info(f"First run: fetching all snapshots from {fetch_from}")
|
||||
|
||||
self.logger.info(f"Fetch window: {fetch_from} to {date_to}")
|
||||
|
||||
# Load accumulated snapshot data
|
||||
existing_snapshots = self.load_snapshot_cache()
|
||||
self.logger.info(f"Loaded {len(existing_snapshots)} snapshots from cache")
|
||||
|
||||
# Create aiohttp session
|
||||
connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
|
||||
timeout = aiohttp.ClientTimeout(total=30)
|
||||
|
||||
async with aiohttp.ClientSession(
|
||||
connector=connector, timeout=timeout
|
||||
) as session:
|
||||
try:
|
||||
# Authenticate if needed
|
||||
await self.authenticate()
|
||||
async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
|
||||
# Authenticate if needed
|
||||
await self.authenticate()
|
||||
|
||||
# Fetch all snapshots
|
||||
snapshots = await self.fetch_all_snapshots(
|
||||
session, type_ids, date_from, date_to, max_pages
|
||||
)
|
||||
# Fetch only new snapshots
|
||||
new_snapshots = await self.fetch_all_snapshots(
|
||||
session, type_ids, fetch_from, date_to, max_pages
|
||||
)
|
||||
|
||||
if not snapshots:
|
||||
self.logger.warning("No snapshots found for the specified period")
|
||||
return None
|
||||
# Merge: deduplicate by id
|
||||
existing_ids = {s.get("id") for s in existing_snapshots}
|
||||
added = [s for s in new_snapshots if s.get("id") not in existing_ids]
|
||||
merged = existing_snapshots + added
|
||||
self.logger.info(f"Added {len(added)} new snapshots (total: {len(merged)})")
|
||||
|
||||
# Generate HTML file
|
||||
html_file = await self.generate_html_file(snapshots, date_from, date_to)
|
||||
if not merged:
|
||||
self.logger.warning("No snapshots found")
|
||||
return None
|
||||
|
||||
# Print statistics
|
||||
self.print_statistics()
|
||||
# Persist updated cache and state
|
||||
self.save_snapshot_cache(merged)
|
||||
html_file = await self.generate_html_file(merged, date_from or fetch_from, date_to)
|
||||
self.save_last_run_date(date_to)
|
||||
|
||||
return html_file
|
||||
|
||||
except Exception as e:
|
||||
self.logger.error(f"Error during snapshot download: {e}")
|
||||
raise
|
||||
self.print_statistics()
|
||||
return html_file
|
||||
|
||||
def print_statistics(self):
|
||||
"""Print download statistics."""
|
||||
|
||||
@@ -86,3 +86,74 @@ def test_generate_html_file_uses_fixed_filename(tmp_path):
|
||||
result = asyncio.run(d.generate_html_file([], "2024-01-01", "2025-01-01"))
|
||||
assert result.name == "snapshots.html"
|
||||
assert (tmp_path / "snapshots.html").exists()
|
||||
|
||||
|
||||
# --- incremental download_snapshots ---
|
||||
|
||||
def _run_download(d, **kwargs):
|
||||
"""Run download_snapshots with mocked API calls."""
|
||||
new_snapshots = kwargs.pop("new_snapshots", [])
|
||||
mock_fetch = AsyncMock(return_value=new_snapshots)
|
||||
with patch.object(d, "authenticate", new_callable=AsyncMock):
|
||||
with patch.object(d, "fetch_all_snapshots", mock_fetch):
|
||||
with patch.object(d, "generate_html_file", new_callable=AsyncMock,
|
||||
return_value=d.output_dir / "snapshots.html"):
|
||||
asyncio.run(d.download_snapshots(**kwargs))
|
||||
return mock_fetch
|
||||
|
||||
|
||||
def test_first_run_saves_cache_and_state(tmp_path):
|
||||
d = _downloader(tmp_path)
|
||||
new_snapshots = [{"id": "abc", "startTime": "2025-01-15T10:00:00Z"}]
|
||||
_run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
|
||||
|
||||
assert d.load_snapshot_cache() == new_snapshots
|
||||
assert d.load_last_run_date() is not None
|
||||
|
||||
|
||||
def test_subsequent_run_uses_last_run_date_as_fetch_from(tmp_path):
|
||||
d = _downloader(tmp_path)
|
||||
d.save_last_run_date("2025-03-01")
|
||||
d.save_snapshot_cache([{"id": "old", "startTime": "2025-02-01T00:00:00Z"}])
|
||||
|
||||
new_snapshots = [{"id": "new", "startTime": "2025-03-15T00:00:00Z"}]
|
||||
mock_fetch = _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
|
||||
|
||||
# Third positional arg to fetch_all_snapshots is date_from (after session, type_ids)
|
||||
assert mock_fetch.call_args.args[2] == "2025-03-01"
|
||||
|
||||
ids = {s["id"] for s in d.load_snapshot_cache()}
|
||||
assert ids == {"old", "new"}
|
||||
|
||||
|
||||
def test_deduplication_by_id(tmp_path):
|
||||
d = _downloader(tmp_path)
|
||||
d.save_last_run_date("2025-01-01")
|
||||
d.save_snapshot_cache([{"id": "dup", "startTime": "2025-01-01T00:00:00Z"}])
|
||||
|
||||
# API returns the boundary snapshot again plus one new one
|
||||
new_snapshots = [
|
||||
{"id": "dup", "startTime": "2025-01-01T00:00:00Z"},
|
||||
{"id": "fresh", "startTime": "2025-01-02T00:00:00Z"},
|
||||
]
|
||||
_run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
|
||||
|
||||
cache = d.load_snapshot_cache()
|
||||
ids = [s["id"] for s in cache]
|
||||
assert ids.count("dup") == 1
|
||||
assert "fresh" in ids
|
||||
|
||||
|
||||
def test_fetch_failure_does_not_update_state(tmp_path):
|
||||
d = _downloader(tmp_path)
|
||||
d.save_last_run_date("2025-01-01")
|
||||
d.save_snapshot_cache([{"id": "existing"}])
|
||||
|
||||
with patch.object(d, "authenticate", new_callable=AsyncMock):
|
||||
with patch.object(d, "fetch_all_snapshots", new_callable=AsyncMock,
|
||||
side_effect=Exception("network error")):
|
||||
with pytest.raises(Exception, match="network error"):
|
||||
asyncio.run(d.download_snapshots(date_from="2024-01-01"))
|
||||
|
||||
assert d.load_last_run_date() == "2025-01-01"
|
||||
assert d.load_snapshot_cache() == [{"id": "existing"}]
|
||||
|
||||
Reference in New Issue
Block a user