
Commit 405ef39

round kv cache to allow expert sharding
Signed-off-by: Mohit Khatwani <mohitkhatwani@google.com>
1 parent 9d7b0d6 commit 405ef39

File tree

1 file changed: +37 -0 lines changed


tpu_inference/runner/kv_cache_manager.py

Lines changed: 37 additions & 0 deletions
@@ -389,6 +389,7 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
         for j, layer_name in enumerate(kv_cache_tensor.shared_by):
             layer_spec = layer_name_to_spec[layer_name]
 
+<<<<<<< HEAD
             page_size_bytes = layer_spec.page_size_bytes
             assert kv_cache_tensor.size % page_size_bytes == 0
             num_blocks = kv_cache_tensor.size // page_size_bytes
@@ -397,6 +398,42 @@ def initialize_kv_cache(self, kv_cache_config: KVCacheConfig) -> None:
             dp_size = self.runner.vllm_config.sharding_config.total_dp_size
             # num_blocks must be a multiple of dp_size
             num_blocks = (num_blocks // dp_size) * dp_size
+=======
+            page_size_bytes = layer_spec.page_size_bytes
+            assert kv_cache_tensor.size % page_size_bytes == 0
+            num_blocks = kv_cache_tensor.size // page_size_bytes
+            sharding_config = self.runner.vllm_config.sharding_config
+            if self.use_mla and not sharding_config.enable_dp_attention:
+                # MLA KV cache is sharded with MLP_TENSOR = (attn_dp, attn_dp_expert, model, expert)
+                divisor = (sharding_config.attn_dp_size *
+                           sharding_config.attn_dp_expert_size *
+                           sharding_config.tp_size *
+                           sharding_config.expert_size)
+            else:
+                divisor = sharding_config.total_dp_size
+            # num_blocks must be a multiple of the sharding divisor
+            num_blocks = (num_blocks // divisor) * divisor
+            # NOTE: we'll multiply the num_kv_heads by 2 in the function
+            if self.use_mla:
+                head_size = self.runner.model_config.hf_config.kv_lora_rank + \
+                    self.runner.model_config.hf_config.qk_rope_head_dim
+            else:
+                head_size = layer_spec.head_size
+            kv_cache = create_kv_caches(
+                num_blocks=num_blocks,
+                block_size=layer_spec.block_size,
+                num_kv_heads=layer_spec.num_kv_heads,
+                head_size=head_size,
+                mesh=self.runner.mesh,
+                layer_names=[f'kv_cache_tensor.{i}'],
+                cache_dtype=t2j_dtype(layer_spec.dtype),
+                use_mla=self.use_mla,
+            )[0]
+            kv_caches.append(kv_cache)
+            num_blocks_list.append(num_blocks)
+            for layer_name in kv_cache_tensor.shared_by:
+                self.runner.layer_name_to_kvcache_index[layer_name] = i
+>>>>>>> 0c544707 (round kv cache to allow expert sharding)
 
             if isinstance(layer_spec, MambaSpec):
                 mamba_states = []
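
The substance of the change is that the number of KV-cache blocks is rounded down to a multiple of a sharding divisor, so the cache tensor can be split evenly across the data-parallel and expert axes of the mesh. Below is a minimal sketch of that rounding logic; the function name and the sharding sizes are hypothetical example values for illustration, not taken from this repository.

def round_num_blocks(tensor_size_bytes: int, page_size_bytes: int,
                     divisor: int) -> int:
    """Round the block count down to a multiple of the sharding divisor."""
    assert tensor_size_bytes % page_size_bytes == 0
    num_blocks = tensor_size_bytes // page_size_bytes
    # Drop the remainder so every shard gets the same whole number of blocks.
    return (num_blocks // divisor) * divisor

# Hypothetical MLA case: the divisor is the product of every mesh axis the
# cache is sharded over (attn_dp, attn_dp_expert, model/tp, expert).
attn_dp_size, attn_dp_expert_size, tp_size, expert_size = 2, 1, 4, 8
mla_divisor = attn_dp_size * attn_dp_expert_size * tp_size * expert_size  # 64

page_size_bytes = 2048
print(round_num_blocks(10_000 * page_size_bytes, page_size_bytes, mla_divisor))
# 9984 (10_000 blocks rounded down to the nearest multiple of 64)

When MLA is in use, the diff also derives the cache head size from the model config as kv_lora_rank + qk_rope_head_dim instead of the per-layer head_size, since the latent KV cache stores the compressed KV latent plus the decoupled RoPE component.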
