From 5524373e78af5405fefcd20e4b74fa66ca0684fc Mon Sep 17 00:00:00 2001 From: Tudor Sitaru Date: Sat, 16 May 2026 20:22:54 +0100 Subject: [PATCH] feat: incremental snapshot fetch with JSON cache and state file Co-Authored-By: Claude Sonnet 4.6 --- src/snapshot_downloader.py | 88 +++++++++++++++--------------- tests/test_incremental_snapshot.py | 71 ++++++++++++++++++++++++ 2 files changed, 115 insertions(+), 44 deletions(-) diff --git a/src/snapshot_downloader.py b/src/snapshot_downloader.py index dce422c..5ba2d82 100644 --- a/src/snapshot_downloader.py +++ b/src/snapshot_downloader.py @@ -587,8 +587,8 @@ class SnapshotDownloader: Args: snapshots: List of snapshot dictionaries - date_from: Start date - date_to: End date + date_from: Start date shown in the HTML page title (does not affect the filename) + date_to: End date shown in the HTML page title (does not affect the filename) Returns: Path to the generated HTML file @@ -598,7 +598,7 @@ class SnapshotDownloader: snapshots, key=lambda x: x.get("startTime", ""), reverse=True ) - # Generate filename + # Fixed filename — overwritten on each run filename = "snapshots.html" filepath = self.output_dir / filename @@ -1021,59 +1021,59 @@ class SnapshotDownloader: max_pages: int = None, ) -> Path: """ - Download all snapshots and generate HTML file. + Download new snapshots incrementally and regenerate snapshots.html. - Args: - type_ids: List of type IDs to filter by (default: [15]) - date_from: Start date in YYYY-MM-DD format - date_to: End date in YYYY-MM-DD format - max_pages: Maximum number of pages to fetch - - Returns: - Path to generated HTML file + date_from is used only on the first run (no last_run.json). + date_to is always today regardless of what is passed. """ - # Set default dates if not provided - if date_from is None: - # Default to 1 year ago - date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d") - if date_to is None: - date_to = datetime.now().strftime("%Y-%m-%d") + date_to = datetime.now().strftime("%Y-%m-%d") - self.logger.info( - f"Starting snapshot download for period {date_from} to {date_to}" - ) + # Determine fetch window start + last_run_date = self.load_last_run_date() + if last_run_date: + fetch_from = last_run_date + self.logger.info(f"Incremental run: fetching from {fetch_from}") + else: + if date_from is None: + date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d") + fetch_from = date_from + self.logger.info(f"First run: fetching all snapshots from {fetch_from}") + + self.logger.info(f"Fetch window: {fetch_from} to {date_to}") + + # Load accumulated snapshot data + existing_snapshots = self.load_snapshot_cache() + self.logger.info(f"Loaded {len(existing_snapshots)} snapshots from cache") - # Create aiohttp session connector = aiohttp.TCPConnector(limit=100, limit_per_host=30) timeout = aiohttp.ClientTimeout(total=30) - async with aiohttp.ClientSession( - connector=connector, timeout=timeout - ) as session: - try: - # Authenticate if needed - await self.authenticate() + async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session: + # Authenticate if needed + await self.authenticate() - # Fetch all snapshots - snapshots = await self.fetch_all_snapshots( - session, type_ids, date_from, date_to, max_pages - ) + # Fetch only new snapshots + new_snapshots = await self.fetch_all_snapshots( + session, type_ids, fetch_from, date_to, max_pages + ) - if not snapshots: - self.logger.warning("No snapshots found for the specified period") - return None + # Merge: deduplicate by id + existing_ids = {s.get("id") for s in existing_snapshots} + added = [s for s in new_snapshots if s.get("id") not in existing_ids] + merged = existing_snapshots + added + self.logger.info(f"Added {len(added)} new snapshots (total: {len(merged)})") - # Generate HTML file - html_file = await self.generate_html_file(snapshots, date_from, date_to) + if not merged: + self.logger.warning("No snapshots found") + return None - # Print statistics - self.print_statistics() + # Persist updated cache and state + self.save_snapshot_cache(merged) + html_file = await self.generate_html_file(merged, date_from or fetch_from, date_to) + self.save_last_run_date(date_to) - return html_file - - except Exception as e: - self.logger.error(f"Error during snapshot download: {e}") - raise + self.print_statistics() + return html_file def print_statistics(self): """Print download statistics.""" diff --git a/tests/test_incremental_snapshot.py b/tests/test_incremental_snapshot.py index 857e654..c7076e9 100644 --- a/tests/test_incremental_snapshot.py +++ b/tests/test_incremental_snapshot.py @@ -86,3 +86,74 @@ def test_generate_html_file_uses_fixed_filename(tmp_path): result = asyncio.run(d.generate_html_file([], "2024-01-01", "2025-01-01")) assert result.name == "snapshots.html" assert (tmp_path / "snapshots.html").exists() + + +# --- incremental download_snapshots --- + +def _run_download(d, **kwargs): + """Run download_snapshots with mocked API calls.""" + new_snapshots = kwargs.pop("new_snapshots", []) + mock_fetch = AsyncMock(return_value=new_snapshots) + with patch.object(d, "authenticate", new_callable=AsyncMock): + with patch.object(d, "fetch_all_snapshots", mock_fetch): + with patch.object(d, "generate_html_file", new_callable=AsyncMock, + return_value=d.output_dir / "snapshots.html"): + asyncio.run(d.download_snapshots(**kwargs)) + return mock_fetch + + +def test_first_run_saves_cache_and_state(tmp_path): + d = _downloader(tmp_path) + new_snapshots = [{"id": "abc", "startTime": "2025-01-15T10:00:00Z"}] + _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots) + + assert d.load_snapshot_cache() == new_snapshots + assert d.load_last_run_date() is not None + + +def test_subsequent_run_uses_last_run_date_as_fetch_from(tmp_path): + d = _downloader(tmp_path) + d.save_last_run_date("2025-03-01") + d.save_snapshot_cache([{"id": "old", "startTime": "2025-02-01T00:00:00Z"}]) + + new_snapshots = [{"id": "new", "startTime": "2025-03-15T00:00:00Z"}] + mock_fetch = _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots) + + # Third positional arg to fetch_all_snapshots is date_from (after session, type_ids) + assert mock_fetch.call_args.args[2] == "2025-03-01" + + ids = {s["id"] for s in d.load_snapshot_cache()} + assert ids == {"old", "new"} + + +def test_deduplication_by_id(tmp_path): + d = _downloader(tmp_path) + d.save_last_run_date("2025-01-01") + d.save_snapshot_cache([{"id": "dup", "startTime": "2025-01-01T00:00:00Z"}]) + + # API returns the boundary snapshot again plus one new one + new_snapshots = [ + {"id": "dup", "startTime": "2025-01-01T00:00:00Z"}, + {"id": "fresh", "startTime": "2025-01-02T00:00:00Z"}, + ] + _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots) + + cache = d.load_snapshot_cache() + ids = [s["id"] for s in cache] + assert ids.count("dup") == 1 + assert "fresh" in ids + + +def test_fetch_failure_does_not_update_state(tmp_path): + d = _downloader(tmp_path) + d.save_last_run_date("2025-01-01") + d.save_snapshot_cache([{"id": "existing"}]) + + with patch.object(d, "authenticate", new_callable=AsyncMock): + with patch.object(d, "fetch_all_snapshots", new_callable=AsyncMock, + side_effect=Exception("network error")): + with pytest.raises(Exception, match="network error"): + asyncio.run(d.download_snapshots(date_from="2024-01-01")) + + assert d.load_last_run_date() == "2025-01-01" + assert d.load_snapshot_cache() == [{"id": "existing"}]