@@ -33,8 +33,6 @@
 from tensorrt_llm.runtime.kv_cache_manager_v2 import (LayerId, TokenIdExt,
                                                       _KVCache)
 from tensorrt_llm.runtime.kv_cache_manager_v2._config import DataRole
-from tensorrt_llm.runtime.kv_cache_manager_v2._copy_engine import \
-    copy_batch_block_offsets as copy_batch_block_offsets_nanobind
 from tensorrt_llm.runtime.kv_cache_manager_v2._utils import (exact_div,
                                                              typed_range)
 from tensorrt_llm.sampling_params import SamplingParams
@@ -1506,20 +1504,17 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
 
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
 
-        self.index_mapper = IndexMapper(max_batch_size, max_beam_width)
+        # Plus 1 for the CUDA graph dummy request
+        self.index_mapper = IndexMapper(max_batch_size + 1, max_beam_width)
 
         self.host_kv_cache_block_offsets = torch.empty(
             self.num_pools,
-            max_batch_size * max_beam_width,
+            (max_batch_size + 1) * max_beam_width,
             2,  # key and value
             self.max_blocks_per_seq,
             dtype=torch.int32,
             pin_memory=True,
             device='cpu')
-        import os
-
-        # V2 is using zero copy api for index copy. By default we use zero copy api.
-        self.index_copy_v1 = os.environ.get("INDEX_COPY_V1", "0") == "1"
 
     @property
     def blocks_in_primary_pool(self) -> int:
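The extra slot keeps the CUDA graph dummy request (see `add_dummy_requests` below) addressable in the pinned staging tensor without displacing a live sequence. A quick sanity sketch of the resulting shape math, using small illustrative values for the constructor arguments:

```python
# Sketch of the staging-tensor sizing after this change; the values are
# illustrative, only the formula mirrors the constructor code above.
max_batch_size, max_beam_width = 8, 1
num_pools, max_blocks_per_seq = 2, 64

num_slots = (max_batch_size + 1) * max_beam_width  # +1 dummy-request slot
shape = (num_pools, num_slots, 2, max_blocks_per_seq)  # 2 = key and value
assert shape == (2, 9, 2, 64)
```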
@@ -1734,8 +1729,6 @@ def add_dummy_requests(
     def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
         kv_cache = self.kv_cache_map.pop(request.py_request_id)
         kv_cache.close()
-        if self.index_copy_v1:
-            return
         self.index_mapper.remove_sequence(request.py_request_id)
 
     def get_batch_cache_indices(
@@ -1918,17 +1911,6 @@ def update_resources(self,
     def copy_batch_block_offsets(self, dst_tensor: torch.Tensor,
                                  request_ids: List[int], beam_width: int,
                                  num_contexts: int, num_gen: int):
-        if self.index_copy_v1:
-            self.copy_batch_block_offsets_v1(dst_tensor, request_ids,
-                                             beam_width, num_contexts, num_gen)
-        else:
-            self.copy_batch_block_offsets_v2(dst_tensor, request_ids,
-                                             beam_width, num_contexts, num_gen)
-
-    @nvtx_range("copy_batch_block_offsets_v2")
-    def copy_batch_block_offsets_v2(self, dst_tensor: torch.Tensor,
-                                    request_ids: List[int], beam_width: int,
-                                    num_contexts: int, num_gen: int):
         assert beam_width == 1, "beam_width must be 1 for KVCacheManagerV2"
 
         assert num_contexts + num_gen == len(
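With the v1 branch and its nanobind helper removed, this method always takes the zero-copy route: the IndexMapper pins each live sequence to a stable row of `host_kv_cache_block_offsets`, and a single gathered copy per step moves the batch's rows to the device. A self-contained sketch of that pattern, with illustrative names rather than the manager's real API:

```python
# Illustrative stand-in for the zero-copy index-copy pattern: stable rows
# in a pinned host tensor, gathered to the device by row index per step.
import torch

device = 'cuda' if torch.cuda.is_available() else 'cpu'

num_pools, max_slots, max_blocks_per_seq = 1, 4, 8
host_offsets = torch.zeros(num_pools, max_slots, 2, max_blocks_per_seq,
                           dtype=torch.int32,
                           pin_memory=device == 'cuda')

row_of_request = {101: 0, 102: 1, 103: 2}   # IndexMapper stand-in
batch = [103, 101]                          # request ids in this step
copy_idx = [row_of_request[r] for r in batch]

dst = torch.zeros_like(host_offsets, device=device)
# One gathered host-to-device copy per step; the real code routes this
# through a helper launched on the current CUDA stream.
dst[:, :len(batch)] = host_offsets[:, copy_idx].to(device, non_blocking=True)
```

Because the host tensor is pinned and each sequence's row never moves, only `copy_idx` changes between steps; no per-request host buffers are rebuilt.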
@@ -1941,35 +1923,11 @@ def copy_batch_block_offsets_v2(self, dst_tensor: torch.Tensor,
             self.host_kv_cache_block_offsets, dst_tensor, copy_idx, True,
             torch.cuda.current_stream().cuda_stream)
 
-    def copy_batch_block_offsets_v1(self, dst_tensor: torch.Tensor,
-                                    request_ids: List[int], beam_width: int,
-                                    num_contexts: int, num_gen: int):
-        assert beam_width == 1, "beam_width must be 1 for KVCacheManager"
-
-        num_seqs = num_contexts + num_gen * beam_width
-
-        for offset, end in [(0, num_contexts), (num_contexts, num_seqs)]:
-            batch_cache_indices = []
-            for pool_idx in range(self.num_pools):
-                for req_id in request_ids[offset:end]:
-                    batch_cache_indices.append(
-                        self.kv_cache_map[req_id].get_page_indices(
-                            pool_idx, 0).buffer_info())
-            if len(batch_cache_indices) > 0:
-                copy_batch_block_offsets_nanobind(
-                    self.host_kv_cache_block_offsets, end - offset,
-                    batch_cache_indices, self.num_pools, offset)
-
-        dst_tensor[:, :num_seqs].copy_(
-            self.host_kv_cache_block_offsets[:, :num_seqs], non_blocking=True)
-
     def _create_kv_cache(self, request_id: int, lora_task_id: int,
                          input_tokens: Sequence[TokenIdExt]):
         assert request_id not in self.kv_cache_map, f"KV cache for request {request_id} already exists"
         kv_cache = self.impl.create_kv_cache(lora_task_id, input_tokens)
         self.kv_cache_map[request_id] = kv_cache
-        if self.index_copy_v1:
-            return kv_cache
         index = self.index_mapper.add_new_sequence(request_id)
         for i in range(self.max_beam_width):
             for pool_idx in range(self.num_pools):
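With the early returns gone, IndexMapper registration is unconditional: every `_create_kv_cache` call must now be balanced by a `free_resources` call that releases the same row. A minimal sketch of that pairing; `SimpleIndexMapper` is a hypothetical stand-in, not the real class:

```python
# Hypothetical stand-in showing the now-unconditional lifecycle:
# add_new_sequence on creation, remove_sequence on release.
class SimpleIndexMapper:

    def __init__(self, max_slots: int):
        self.free = list(range(max_slots))
        self.index_of = {}

    def add_new_sequence(self, request_id: int) -> int:
        index = self.free.pop()
        self.index_of[request_id] = index
        return index

    def remove_sequence(self, request_id: int) -> None:
        self.free.append(self.index_of.pop(request_id))


mapper = SimpleIndexMapper(max_slots=3)
row = mapper.add_new_sequence(101)   # _create_kv_cache path
mapper.remove_sequence(101)          # free_resources path
assert mapper.free == [0, 1, 2]      # the row is reusable again
```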