Skip to content

Commit 88df022

Browse files
committed
round kv cache to allow expert sharding
Signed-off-by: Mohit Khatwani <mohitkhatwani@google.com>
1 parent 9d7b0d6 commit 88df022

File tree

1 file changed

+12
-3
lines changed

1 file changed

+12
-3
lines changed

tpu_inference/runner/kv_cache_manager.py

Lines changed: 12 additions & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -394,9 +394,18 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
394394
num_blocks = kv_cache_tensor.size // page_size_bytes
395395
if duplicate_shared_layers:
396396
num_blocks //= num_shared_layers
397-
dp_size = self.runner.vllm_config.sharding_config.total_dp_size
398-
# num_blocks must be a multiple of dp_size
399-
num_blocks = (num_blocks // dp_size) * dp_size
397+
sharding_config = self.runner.vllm_config.sharding_config
398+
if self.use_mla and not sharding_config.sharding_strategy.get(
399+
"enable_dp_attention", False):
400+
# MLA KV cache is sharded with MLP_TENSOR = (attn_dp, attn_dp_expert, model, expert)
401+
divisor = (sharding_config.attn_dp_size *
402+
sharding_config.attn_dp_expert_size *
403+
sharding_config.tp_size *
404+
sharding_config.expert_size)
405+
else:
406+
divisor = sharding_config.total_dp_size
407+
# num_blocks must be a multiple of the sharding divisor
408+
num_blocks = (num_blocks // divisor) * divisor
400409

401410
if isinstance(layer_spec, MambaSpec):
402411
mamba_states = []

0 commit comments

Comments (0)