From 5524373e78af5405fefcd20e4b74fa66ca0684fc Mon Sep 17 00:00:00 2001
From: Tudor Sitaru <tudor.sitaru@gmail.com>
Date: Sat, 16 May 2026 20:22:54 +0100
Subject: [PATCH] feat: incremental snapshot fetch with JSON cache and state
 file

Co-Authored-By: Claude Sonnet 4.6 <noreply@anthropic.com>
---
 src/snapshot_downloader.py         | 88 +++++++++++++++---------------
 tests/test_incremental_snapshot.py | 71 ++++++++++++++++++++++++
 2 files changed, 115 insertions(+), 44 deletions(-)

diff --git a/src/snapshot_downloader.py b/src/snapshot_downloader.py
index dce422c..5ba2d82 100644
--- a/src/snapshot_downloader.py
+++ b/src/snapshot_downloader.py
@@ -587,8 +587,8 @@ class SnapshotDownloader:
 
         Args:
             snapshots: List of snapshot dictionaries
-            date_from: Start date
-            date_to: End date
+            date_from: Start date shown in the HTML page title (does not affect the filename)
+            date_to: End date shown in the HTML page title (does not affect the filename)
 
         Returns:
             Path to the generated HTML file
@@ -598,7 +598,7 @@ class SnapshotDownloader:
             snapshots, key=lambda x: x.get("startTime", ""), reverse=True
         )
 
-        # Generate filename
+        # Fixed filename — overwritten on each run
         filename = "snapshots.html"
         filepath = self.output_dir / filename
 
@@ -1021,59 +1021,59 @@ class SnapshotDownloader:
         max_pages: int = None,
     ) -> Path:
         """
-        Download all snapshots and generate HTML file.
+        Download new snapshots incrementally and regenerate snapshots.html.
 
-        Args:
-            type_ids: List of type IDs to filter by (default: [15])
-            date_from: Start date in YYYY-MM-DD format
-            date_to: End date in YYYY-MM-DD format
-            max_pages: Maximum number of pages to fetch
-
-        Returns:
-            Path to generated HTML file
+        date_from is used only on the first run (no last_run.json).
+        date_to is always today regardless of what is passed.
         """
-        # Set default dates if not provided
-        if date_from is None:
-            # Default to 1 year ago
-            date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
-        if date_to is None:
-            date_to = datetime.now().strftime("%Y-%m-%d")
+        date_to = datetime.now().strftime("%Y-%m-%d")
 
-        self.logger.info(
-            f"Starting snapshot download for period {date_from} to {date_to}"
-        )
+        # Determine fetch window start
+        last_run_date = self.load_last_run_date()
+        if last_run_date:
+            fetch_from = last_run_date
+            self.logger.info(f"Incremental run: fetching from {fetch_from}")
+        else:
+            if date_from is None:
+                date_from = (datetime.now() - timedelta(days=365)).strftime("%Y-%m-%d")
+            fetch_from = date_from
+            self.logger.info(f"First run: fetching all snapshots from {fetch_from}")
+
+        self.logger.info(f"Fetch window: {fetch_from} to {date_to}")
+
+        # Load accumulated snapshot data
+        existing_snapshots = self.load_snapshot_cache()
+        self.logger.info(f"Loaded {len(existing_snapshots)} snapshots from cache")
 
-        # Create aiohttp session
         connector = aiohttp.TCPConnector(limit=100, limit_per_host=30)
         timeout = aiohttp.ClientTimeout(total=30)
 
-        async with aiohttp.ClientSession(
-            connector=connector, timeout=timeout
-        ) as session:
-            try:
-                # Authenticate if needed
-                await self.authenticate()
+        async with aiohttp.ClientSession(connector=connector, timeout=timeout) as session:
+            # Authenticate if needed
+            await self.authenticate()
 
-                # Fetch all snapshots
-                snapshots = await self.fetch_all_snapshots(
-                    session, type_ids, date_from, date_to, max_pages
-                )
+            # Fetch only new snapshots
+            new_snapshots = await self.fetch_all_snapshots(
+                session, type_ids, fetch_from, date_to, max_pages
+            )
 
-                if not snapshots:
-                    self.logger.warning("No snapshots found for the specified period")
-                    return None
+        # Merge: deduplicate by id
+        existing_ids = {s.get("id") for s in existing_snapshots}
+        added = [s for s in new_snapshots if s.get("id") not in existing_ids]
+        merged = existing_snapshots + added
+        self.logger.info(f"Added {len(added)} new snapshots (total: {len(merged)})")
 
-                # Generate HTML file
-                html_file = await self.generate_html_file(snapshots, date_from, date_to)
+        if not merged:
+            self.logger.warning("No snapshots found")
+            return None
 
-                # Print statistics
-                self.print_statistics()
+        # Persist updated cache and state
+        self.save_snapshot_cache(merged)
+        html_file = await self.generate_html_file(merged, date_from or fetch_from, date_to)
+        self.save_last_run_date(date_to)
 
-                return html_file
-
-            except Exception as e:
-                self.logger.error(f"Error during snapshot download: {e}")
-                raise
+        self.print_statistics()
+        return html_file
 
     def print_statistics(self):
         """Print download statistics."""
diff --git a/tests/test_incremental_snapshot.py b/tests/test_incremental_snapshot.py
index 857e654..c7076e9 100644
--- a/tests/test_incremental_snapshot.py
+++ b/tests/test_incremental_snapshot.py
@@ -86,3 +86,74 @@ def test_generate_html_file_uses_fixed_filename(tmp_path):
         result = asyncio.run(d.generate_html_file([], "2024-01-01", "2025-01-01"))
     assert result.name == "snapshots.html"
     assert (tmp_path / "snapshots.html").exists()
+
+
+# --- incremental download_snapshots ---
+
+def _run_download(d, **kwargs):
+    """Run download_snapshots with mocked API calls."""
+    new_snapshots = kwargs.pop("new_snapshots", [])
+    mock_fetch = AsyncMock(return_value=new_snapshots)
+    with patch.object(d, "authenticate", new_callable=AsyncMock):
+        with patch.object(d, "fetch_all_snapshots", mock_fetch):
+            with patch.object(d, "generate_html_file", new_callable=AsyncMock,
+                              return_value=d.output_dir / "snapshots.html"):
+                asyncio.run(d.download_snapshots(**kwargs))
+    return mock_fetch
+
+
+def test_first_run_saves_cache_and_state(tmp_path):
+    d = _downloader(tmp_path)
+    new_snapshots = [{"id": "abc", "startTime": "2025-01-15T10:00:00Z"}]
+    _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
+
+    assert d.load_snapshot_cache() == new_snapshots
+    assert d.load_last_run_date() is not None
+
+
+def test_subsequent_run_uses_last_run_date_as_fetch_from(tmp_path):
+    d = _downloader(tmp_path)
+    d.save_last_run_date("2025-03-01")
+    d.save_snapshot_cache([{"id": "old", "startTime": "2025-02-01T00:00:00Z"}])
+
+    new_snapshots = [{"id": "new", "startTime": "2025-03-15T00:00:00Z"}]
+    mock_fetch = _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
+
+    # Third positional arg to fetch_all_snapshots is date_from (after session, type_ids)
+    assert mock_fetch.call_args.args[2] == "2025-03-01"
+
+    ids = {s["id"] for s in d.load_snapshot_cache()}
+    assert ids == {"old", "new"}
+
+
+def test_deduplication_by_id(tmp_path):
+    d = _downloader(tmp_path)
+    d.save_last_run_date("2025-01-01")
+    d.save_snapshot_cache([{"id": "dup", "startTime": "2025-01-01T00:00:00Z"}])
+
+    # API returns the boundary snapshot again plus one new one
+    new_snapshots = [
+        {"id": "dup", "startTime": "2025-01-01T00:00:00Z"},
+        {"id": "fresh", "startTime": "2025-01-02T00:00:00Z"},
+    ]
+    _run_download(d, date_from="2024-01-01", new_snapshots=new_snapshots)
+
+    cache = d.load_snapshot_cache()
+    ids = [s["id"] for s in cache]
+    assert ids.count("dup") == 1
+    assert "fresh" in ids
+
+
+def test_fetch_failure_does_not_update_state(tmp_path):
+    d = _downloader(tmp_path)
+    d.save_last_run_date("2025-01-01")
+    d.save_snapshot_cache([{"id": "existing"}])
+
+    with patch.object(d, "authenticate", new_callable=AsyncMock):
+        with patch.object(d, "fetch_all_snapshots", new_callable=AsyncMock,
+                          side_effect=Exception("network error")):
+            with pytest.raises(Exception, match="network error"):
+                asyncio.run(d.download_snapshots(date_from="2024-01-01"))
+
+    assert d.load_last_run_date() == "2025-01-01"
+    assert d.load_snapshot_cache() == [{"id": "existing"}]