resolve gemini comments

Leahlijuan · Leahlijuan · commit d41d2a2537ca · 2026-02-24T19:43:26.000Z
diff --git a/src/ml_flashpoint/adapter/pytorch/custom_state_dict_saver.py b/src/ml_flashpoint/adapter/pytorch/custom_state_dict_saver.py
@@ -41,9 +41,11 @@
 
 
 # Type for the plan cache: hash -> (SavePlan, Metadata)
-# The SavePlan stored here is a "template" plan with empty tensor_data to avoid memory leaks.
+# The SavePlan stored here is a "template" plan with tensor_data set to None to avoid memory leaks.
 PlanCache = dict[int, tuple[SavePlan, torchdistsaver.Metadata]]
 
+_MAX_PLAN_CACHE_SIZE = 16
+
 
 def _compute_plan_structure_hash(plan: SavePlan) -> int:
     """Computes a hash of the plan structure (FQN, type, shape, dtype).
@@ -161,6 +163,12 @@ def generate_plan(
             cached_template_plan, global_metadata = cached_entry
             updated_local_plan = _rehydrate_plan(cached_template_plan, local_plan)
             _LOGGER.info("Plan cache HIT for hash %d. Skipping global planning (reduce_scatter).", plan_hash)
+
+            # Move to end to mark as recently used
+            if plan_cache is not None:
+                plan_cache.pop(plan_hash)
+                plan_cache[plan_hash] = cached_entry
+
         except ValueError:
             _LOGGER.warning(
                 "Plan cache HIT for hash %d but rehydration failed. Falling back to global planning.",
@@ -200,6 +208,19 @@ def global_step(all_local_plans: list[SavePlan]) -> list[SavePlan]:
 
         # Cache the result
         if plan_cache is not None:
+            # TODO: Revisit this; ideally only one plan should be cached per training run.
+            # Check size and evict if needed (LRU policy)
+            if len(plan_cache) >= _MAX_PLAN_CACHE_SIZE:
+                # Remove the first item inserted (which will be the least recently used)
+                # Note: Python 3.7+ dicts preserve insertion order.
+                oldest_key = next(iter(plan_cache))
+                plan_cache.pop(oldest_key)
+                _LOGGER.debug(
+                    "Evicted oldest plan cache entry with hash %d. Cache size is now %d.",
+                    oldest_key,
+                    len(plan_cache),
+                )
+
             # Sanitize to avoid memory leaks
             sanitized_plan = _sanitize_plan_for_cache(updated_local_plan)
             plan_cache[plan_hash] = (sanitized_plan, global_metadata)
diff --git a/tests/adapter/pytorch/test_custom_state_dict_saver.py b/tests/adapter/pytorch/test_custom_state_dict_saver.py
@@ -363,11 +363,101 @@ def test_generate_plan_fallback_on_rehydration_failure(
             dist_wrapper,
             plan_cache=plan_cache,
         )
-
         # Then
         # Check that reduce_scatter WAS called (fallback occurred)
         assert mock_reduce_scatter.call_count == 1
 
+    def test_plan_cache_lru_behavior(self, mock_storage_writer, mock_save_planner, dist_wrapper, mocker):
+        """Tests that the plan cache respects the LRU policy."""
+        # Given
+        # Mock _MAX_PLAN_CACHE_SIZE to a small number for testing
+        mocker.patch.object(custom_state_dict_saver, "_MAX_PLAN_CACHE_SIZE", 2)
+        state_dict = {"model": "test"}
+        global_metadata = Metadata(state_dict_metadata={})
+
+        mock_save_planner.create_global_plan.return_value = ([], global_metadata)
+        mock_storage_writer.prepare_global_plan.return_value = []
+        mock_save_planner.finish_plan.side_effect = lambda x: x
+        mocker.patch.object(dist_wrapper, "broadcast_object", side_effect=lambda x: x)
+
+        # Stub reduce_scatter so it simply runs the local step and returns its result
+        def reduce_scatter_side_effect(tag, local_fn, global_fn):
+            return local_fn()
+
+        mocker.patch.object(dist_wrapper, "reduce_scatter", side_effect=reduce_scatter_side_effect)
+
+        plan_cache = {}
+
+        # 1. Insert Item A
+        plan_a = SavePlan([WriteItem(index=MetadataIndex("A"), type=WriteItemType.TENSOR)])
+        mock_save_planner.create_local_plan.return_value = plan_a
+        mock_storage_writer.prepare_local_plan.return_value = plan_a
+
+        custom_state_dict_saver.generate_plan(
+            CheckpointContainerId("/ckpt_a"),
+            state_dict,
+            mock_storage_writer,
+            mock_save_planner,
+            dist_wrapper,
+            plan_cache,
+        )
+        assert len(plan_cache) == 1
+        hash_a = custom_state_dict_saver._compute_plan_structure_hash(plan_a)
+
+        # 2. Insert Item B
+        plan_b = SavePlan([WriteItem(index=MetadataIndex("B"), type=WriteItemType.TENSOR)])
+        mock_save_planner.create_local_plan.return_value = plan_b
+        mock_storage_writer.prepare_local_plan.return_value = plan_b
+
+        custom_state_dict_saver.generate_plan(
+            CheckpointContainerId("/ckpt_b"),
+            state_dict,
+            mock_storage_writer,
+            mock_save_planner,
+            dist_wrapper,
+            plan_cache,
+        )
+        assert len(plan_cache) == 2
+        hash_b = custom_state_dict_saver._compute_plan_structure_hash(plan_b)
+
+        # 3. Access Item A (Mark as recently used)
+        # We need to simulate a hit
+        # The generate_plan logic computes hash based on local plan.
+        # So we pass plan_a again.
+        mock_save_planner.create_local_plan.return_value = plan_a
+        mock_storage_writer.prepare_local_plan.return_value = plan_a
+
+        custom_state_dict_saver.generate_plan(
+            CheckpointContainerId("/ckpt_a_2"),
+            state_dict,
+            mock_storage_writer,
+            mock_save_planner,
+            dist_wrapper,
+            plan_cache,
+        )
+        # Verify A is now at the end (most recently used)
+        keys = list(plan_cache.keys())
+        assert keys[-1] == hash_a
+
+        # 4. Insert Item C (Should evict oldest, which is now B because A was accessed)
+        plan_c = SavePlan([WriteItem(index=MetadataIndex("C"), type=WriteItemType.TENSOR)])
+        mock_save_planner.create_local_plan.return_value = plan_c
+        mock_storage_writer.prepare_local_plan.return_value = plan_c
+
+        custom_state_dict_saver.generate_plan(
+            CheckpointContainerId("/ckpt_c"),
+            state_dict,
+            mock_storage_writer,
+            mock_save_planner,
+            dist_wrapper,
+            plan_cache,
+        )
+
+        assert len(plan_cache) == 2
+        assert hash_a in plan_cache
+        assert hash_b not in plan_cache  # B should be evicted
+        assert custom_state_dict_saver._compute_plan_structure_hash(plan_c) in plan_cache
+
 
 class TestWriteData:
     """Tests for the write_data function."""