Skip to content

Commit f4fbd47

Browse files
wesm and claude
committed
fix: Prevent cache invalidation from display filters and ensure no gaps
Three critical fixes for the two-tier cache system:

1. Separate display filters from cache behavior
   - --mtd and --year now only filter the VIEW, not what's cached
   - Cache always stores full data (year=None, since=None)
   - Prevents --mtd from nuking an existing full cache

2. Fix partial refresh API calls
   - Monarch API requires BOTH startDate and endDate
   - Was passing None for one date, causing API failure
   - Added get_hot_refresh_date_range() and get_cold_refresh_date_range()

3. Prevent gaps between cache tiers
   - Hot refresh now uses cold's latest_date (from metadata)
   - Cold refresh now uses hot's earliest_date (from metadata)
   - Both use 7-day overlap (TIER_OVERLAP_DAYS)
   - Fixes gap that would grow daily as boundary moves

Added 12 regression tests covering:
- Display filters don't invalidate cache
- Partial refresh date ranges always non-None
- Tier overlap ensures no gaps

🤖 Generated with [Claude Code](https://claude.com/claude-code)

Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
1 parent 35e41d9 commit f4fbd47

File tree

3 files changed

+381
-31
lines changed

3 files changed

+381
-31
lines changed

moneyflow/app.py

Lines changed: 42 additions & 31 deletions
Original file line numberDiff line numberDiff line change
@@ -245,6 +245,7 @@ def __init__(
245245
self.cache_manager = None # Will be set if caching is enabled
246246
self.cache_year_filter = None # Track what filters the cache uses
247247
self.cache_since_filter = None
248+
self.display_start_date = None # Display filter (--mtd/--since) separate from cache
248249
self.config_dir = config_dir # Custom config directory (None = default ~/.moneyflow)
249250
self.encryption_key: Optional[bytes] = None # Encryption key for cache (set after login)
250251
# Controller will be initialized after data_manager is ready
@@ -367,27 +368,27 @@ def _initialize_managers(
367368
def _determine_date_range(self):
368369
"""Determine date range based on CLI arguments.
369370
371+
Separates display filtering (--mtd, --since) from cache behavior:
372+
- display_start_date: What the user wants to VIEW (filters the UI)
373+
- cache filters: What the cache actually STORES (preserved on refresh)
374+
370375
Returns:
371-
tuple: (start_date, end_date, cache_year_filter, cache_since_filter)
376+
tuple: (display_start_date, cache_year_filter, cache_since_filter)
372377
"""
378+
# Display filter - what user wants to see
373379
if self.custom_start_date:
374-
start_date = self.custom_start_date
375-
end_date = datetime.now().strftime("%Y-%m-%d")
376-
cache_year_filter = None
377-
cache_since_filter = self.custom_start_date
380+
display_start_date = self.custom_start_date
378381
elif self.start_year:
379-
start_date = f"{self.start_year}-01-01"
380-
end_date = datetime.now().strftime("%Y-%m-%d")
381-
cache_year_filter = self.start_year
382-
cache_since_filter = None
382+
display_start_date = f"{self.start_year}-01-01"
383383
else:
384-
# Fetch ALL transactions (no date filter for offline-first approach)
385-
start_date = None
386-
end_date = None
387-
cache_year_filter = None
388-
cache_since_filter = None
384+
display_start_date = None
385+
386+
# Cache filters - determined by existing cache or first fetch
387+
# These are set later based on what's actually cached
388+
cache_year_filter = None
389+
cache_since_filter = None
389390

390-
return start_date, end_date, cache_year_filter, cache_since_filter
391+
return display_start_date, cache_year_filter, cache_since_filter
391392

392393
@staticmethod
393394
def _filter_df_by_start_date(df: pl.DataFrame, start_date: str) -> pl.DataFrame:
@@ -1027,14 +1028,15 @@ async def fetch_operation():
10271028
)
10281029

10291030
# Save to cache for next time (only if --cache was passed)
1031+
# Always save as full cache (no filters) - display filters applied separately
10301032
if self.cache_manager:
10311033
loading_status.update("💾 Saving to cache...")
10321034
self.cache_manager.save_cache(
10331035
transactions_df=df,
10341036
categories=categories,
10351037
category_groups=category_groups,
1036-
year=self.cache_year_filter,
1037-
since=self.cache_since_filter,
1038+
year=None, # Full cache - no year filter
1039+
since=None, # Full cache - no since filter
10381040
)
10391041
loading_status.update(f"✅ Loaded {len(df):,} transactions and cached!")
10401042
else:
@@ -1102,14 +1104,12 @@ def update_progress(msg: str) -> None:
11021104

11031105
try:
11041106
# Fetch the expired tier from API
1107+
# Use helper methods to ensure both dates are always provided (API requirement)
11051108
if is_hot_refresh:
1106-
fetch_start, fetch_end = boundary_str, None
1107-
loading_status.update(f"📊 Fetching transactions since {boundary_str}...")
1109+
fetch_start, fetch_end = self.cache_manager.get_hot_refresh_date_range()
1110+
loading_status.update(f"📊 Fetching transactions since {fetch_start}...")
11081111
else:
1109-
fetch_start, fetch_end = (
1110-
None,
1111-
(boundary_date - timedelta(days=1)).strftime("%Y-%m-%d"),
1112-
)
1112+
fetch_start, fetch_end = self.cache_manager.get_cold_refresh_date_range()
11131113
loading_status.update(
11141114
f"📊 Fetching historical transactions before {boundary_str}..."
11151115
)
@@ -1322,8 +1322,8 @@ async def initialize_data(self) -> None:
13221322
profile_dir=determined_profile_dir, backend_type=determined_backend_type
13231323
)
13241324

1325-
# Step 4: Determine date range
1326-
start_date, end_date, self.cache_year_filter, self.cache_since_filter = (
1325+
# Step 4: Determine display filter (separate from cache)
1326+
self.display_start_date, self.cache_year_filter, self.cache_since_filter = (
13271327
self._determine_date_range()
13281328
)
13291329

@@ -1335,9 +1335,9 @@ async def initialize_data(self) -> None:
13351335
df, categories, category_groups = cached_data
13361336
# Filter cached data to match requested date range (e.g., --mtd)
13371337
# Cache may contain more data than requested (e.g., full year cache for MTD request)
1338-
if start_date:
1338+
if self.display_start_date:
13391339
original_count = len(df)
1340-
df = self._filter_df_by_start_date(df, start_date)
1340+
df = self._filter_df_by_start_date(df, self.display_start_date)
13411341
if len(df) < original_count:
13421342
loading_status.update(
13431343
f"📦 Filtered cache: {len(df):,} of {original_count:,} transactions"
@@ -1348,32 +1348,43 @@ async def initialize_data(self) -> None:
13481348
if partial_result:
13491349
df, categories, category_groups = partial_result
13501350
# Filter if needed
1351-
if start_date:
1351+
if self.display_start_date:
13521352
original_count = len(df)
1353-
df = self._filter_df_by_start_date(df, start_date)
1353+
df = self._filter_df_by_start_date(df, self.display_start_date)
13541354
if len(df) < original_count:
13551355
loading_status.update(
13561356
f"📦 Filtered: {len(df):,} of {original_count:,} transactions"
13571357
)
13581358
else:
13591359
# Partial refresh failed, fall back to full fetch
1360+
# Always fetch full data - display filter applied after
13601361
fetch_result = await self._fetch_data_with_retry(
1361-
creds, start_date, end_date, loading_status
1362+
creds, None, None, loading_status
13621363
)
13631364
if fetch_result is None:
13641365
has_error = True
13651366
return
13661367
df, categories, category_groups = fetch_result
13671368
else:
13681369
# Step 6: Full fetch from API (BOTH, ALL, or no cache)
1370+
# Always fetch full data - display filter applied after
13691371
fetch_result = await self._fetch_data_with_retry(
1370-
creds, start_date, end_date, loading_status
1372+
creds, None, None, loading_status
13711373
)
13721374
if fetch_result is None:
13731375
has_error = True
13741376
return
13751377
df, categories, category_groups = fetch_result
13761378

1379+
# Apply display filter after fetch (cache stores full data)
1380+
if self.display_start_date and strategy != RefreshStrategy.NONE:
1381+
original_count = len(df)
1382+
df = self._filter_df_by_start_date(df, self.display_start_date)
1383+
if len(df) < original_count:
1384+
loading_status.update(
1385+
f"📦 Filtered: {len(df):,} of {original_count:,} transactions"
1386+
)
1387+
13771388
# Step 7: Store data
13781389
self._store_data(df, categories, category_groups)
13791390

moneyflow/cache_manager.py

Lines changed: 67 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -88,6 +88,73 @@ def _get_boundary_date(self) -> date:
8888
"""Get the boundary date between hot and cold cache (90 days ago)."""
8989
return date.today() - timedelta(days=self.HOT_WINDOW_DAYS)
9090

91+
# Overlap days to fetch before cold's end date (ensures no gaps from timing/date changes)
92+
TIER_OVERLAP_DAYS = 7
93+
94+
def get_hot_refresh_date_range(self) -> tuple[str, str]:
95+
"""Get the date range for refreshing the hot cache tier.
96+
97+
CRITICAL: Must start from cold cache's latest_date to avoid gaps.
98+
The boundary moves forward each day, but cold data is fixed until
99+
cold cache expires (30 days). Without this, gaps would grow daily.
100+
101+
Subtracts TIER_OVERLAP_DAYS to handle transactions that might change
102+
dates or timing variations during refresh.
103+
104+
Returns:
105+
Tuple of (start_date, end_date) as ISO format strings.
106+
Both values are always non-None to satisfy API requirements.
107+
"""
108+
today = date.today()
109+
110+
# MUST use cold cache's latest date to avoid gaps
111+
try:
112+
metadata = self.load_metadata()
113+
cold_meta = metadata.get("cold", {})
114+
cold_latest = cold_meta.get("latest_date")
115+
if cold_latest:
116+
cold_end = date.fromisoformat(cold_latest)
117+
# Overlap: start a few days before cold ends
118+
start = (cold_end - timedelta(days=self.TIER_OVERLAP_DAYS))
119+
return start.isoformat(), today.isoformat()
120+
except Exception:
121+
pass
122+
123+
# Fallback only if no cold metadata (shouldn't happen in normal use)
124+
boundary = self._get_boundary_date()
125+
start = (boundary - timedelta(days=self.TIER_OVERLAP_DAYS)).isoformat()
126+
return start, today.isoformat()
127+
128+
def get_cold_refresh_date_range(self) -> tuple[str, str]:
129+
"""Get the date range for refreshing the cold cache tier.
130+
131+
CRITICAL: Must end at hot cache's earliest_date + overlap to ensure
132+
proper coverage. Uses stored metadata, not computed boundary.
133+
134+
Returns:
135+
Tuple of (start_date, end_date) as ISO format strings.
136+
Both values are always non-None to satisfy API requirements.
137+
"""
138+
start = "2000-01-01"
139+
140+
# Use hot cache's earliest date to ensure overlap
141+
try:
142+
metadata = self.load_metadata()
143+
hot_meta = metadata.get("hot", {})
144+
hot_earliest = hot_meta.get("earliest_date")
145+
if hot_earliest:
146+
hot_start = date.fromisoformat(hot_earliest)
147+
# Overlap: end a few days after hot starts
148+
end = (hot_start + timedelta(days=self.TIER_OVERLAP_DAYS)).isoformat()
149+
return start, end
150+
except Exception:
151+
pass
152+
153+
# Fallback only if no hot metadata
154+
boundary = self._get_boundary_date()
155+
end = (boundary + timedelta(days=self.TIER_OVERLAP_DAYS)).isoformat()
156+
return start, end
157+
91158
def cache_exists(self) -> bool:
92159
"""Check if two-tier cache files exist."""
93160
return (

0 commit comments

Comments (0)