
Commit f957715

Minor fix
Signed-off-by: yizhang-nv <[email protected]>
1 parent e76e4f7 commit f957715

File tree: 2 files changed, +3 -46 lines


tensorrt_llm/_torch/attention_backend/trtllm.py

Lines changed: 0 additions & 1 deletion
@@ -726,7 +726,6 @@ def _post_init_with_buffers(self, buffers) -> None:
             capture_graph=capture_graph,
         )
         self.host_kv_cache_block_offsets = self.kv_cache_manager.host_kv_cache_block_offsets
-        assert self.host_kv_cache_block_offsets.shape == self.kv_cache_block_offsets.shape, f"host_kv_cache_block_offsets and kv_cache_block_offsets should have the same shape, but got {self.host_kv_cache_block_offsets.shape} and {self.kv_cache_block_offsets.shape}"
         self.block_ids_per_seq = None
         self.kv_block_ids_per_seq = None
         if self.enable_flash_mla:
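
The dropped assert appears to go hand in hand with the resource_manager.py change below: once the host block-offset table gains an extra sequence slot for the CUDA graph dummy request, it can no longer be shape-equal to a device-side table sized for max_batch_size alone. A minimal sketch of that shape mismatch, assuming the device table keeps its previous sizing (all sizes below are illustrative placeholders, not values from the repository):

# Illustrative sizes only; the device-side sizing is an assumption, not shown in this diff.
num_pools, max_batch_size, max_beam_width, max_blocks_per_seq = 1, 8, 1, 64

# Host table after this commit: one extra slot for the CUDA graph dummy request.
host_shape = (num_pools, (max_batch_size + 1) * max_beam_width, 2, max_blocks_per_seq)
# Hypothetical device table still sized for max_batch_size sequences.
device_shape = (num_pools, max_batch_size * max_beam_width, 2, max_blocks_per_seq)

assert host_shape != device_shape  # the removed equality assert would now fire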

tensorrt_llm/_torch/pyexecutor/resource_manager.py

Lines changed: 3 additions & 45 deletions
@@ -33,8 +33,6 @@
 from tensorrt_llm.runtime.kv_cache_manager_v2 import (LayerId, TokenIdExt,
                                                       _KVCache)
 from tensorrt_llm.runtime.kv_cache_manager_v2._config import DataRole
-from tensorrt_llm.runtime.kv_cache_manager_v2._copy_engine import \
-    copy_batch_block_offsets as copy_batch_block_offsets_nanobind
 from tensorrt_llm.runtime.kv_cache_manager_v2._utils import (exact_div,
                                                              typed_range)
 from tensorrt_llm.sampling_params import SamplingParams
@@ -1506,20 +1504,17 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
 
         self.enable_block_reuse = kv_cache_config.enable_block_reuse
 
-        self.index_mapper = IndexMapper(max_batch_size, max_beam_width)
+        # Plus 1 for cuda graph dummy request
+        self.index_mapper = IndexMapper(max_batch_size + 1, max_beam_width)
 
         self.host_kv_cache_block_offsets = torch.empty(
             self.num_pools,
-            max_batch_size * max_beam_width,
+            (max_batch_size + 1) * max_beam_width,
             2,  # key and value
             self.max_blocks_per_seq,
             dtype=torch.int32,
             pin_memory=True,
             device='cpu')
-        import os
-
-        # V2 is using zero copy api for index copy. By default we use zero copy api.
-        self.index_copy_v1 = os.environ.get("INDEX_COPY_V1", "0") == "1"
 
     @property
     def blocks_in_primary_pool(self) -> int:
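
For reference, a minimal runnable sketch of the allocation in the hunk above, showing how the extra "+ 1" sequence slot for the CUDA graph dummy request changes the host table. The sizes are placeholder values, and pin_memory is guarded here so the sketch also runs on a machine without CUDA (the actual code always pins the buffer):

import torch

# Placeholder sizes, not repository defaults.
num_pools, max_batch_size, max_beam_width, max_blocks_per_seq = 1, 8, 1, 64

host_kv_cache_block_offsets = torch.empty(
    num_pools,
    (max_batch_size + 1) * max_beam_width,  # one extra slot for the dummy request
    2,  # key and value
    max_blocks_per_seq,
    dtype=torch.int32,
    pin_memory=torch.cuda.is_available(),  # the diff always pins; guarded here for portability
    device='cpu')

print(host_kv_cache_block_offsets.shape)  # e.g. torch.Size([1, 9, 2, 64])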
@@ -1734,8 +1729,6 @@ def add_dummy_requests(
     def free_resources(self, request: LlmRequest, pin_on_release: bool = False):
         kv_cache = self.kv_cache_map.pop(request.py_request_id)
         kv_cache.close()
-        if self.index_copy_v1:
-            return
         self.index_mapper.remove_sequence(request.py_request_id)
 
     def get_batch_cache_indices(
@@ -1918,17 +1911,6 @@ def update_resources(self,
     def copy_batch_block_offsets(self, dst_tensor: torch.Tensor,
                                  request_ids: List[int], beam_width: int,
                                  num_contexts: int, num_gen: int):
-        if self.index_copy_v1:
-            self.copy_batch_block_offsets_v1(dst_tensor, request_ids,
-                                             beam_width, num_contexts, num_gen)
-        else:
-            self.copy_batch_block_offsets_v2(dst_tensor, request_ids,
-                                             beam_width, num_contexts, num_gen)
-
-    @nvtx_range("copy_batch_block_offsets_v2")
-    def copy_batch_block_offsets_v2(self, dst_tensor: torch.Tensor,
-                                    request_ids: List[int], beam_width: int,
-                                    num_contexts: int, num_gen: int):
         assert beam_width == 1, "beam_width must be 1 for KVCacheManagerV2"
 
         assert num_contexts + num_gen == len(
@@ -1941,35 +1923,11 @@ def copy_batch_block_offsets_v2(self, dst_tensor: torch.Tensor,
             self.host_kv_cache_block_offsets, dst_tensor, copy_idx, True,
             torch.cuda.current_stream().cuda_stream)
 
-    def copy_batch_block_offsets_v1(self, dst_tensor: torch.Tensor,
-                                    request_ids: List[int], beam_width: int,
-                                    num_contexts: int, num_gen: int):
-        assert beam_width == 1, "beam_width must be 1 for KVCacheManager"
-
-        num_seqs = num_contexts + num_gen * beam_width
-
-        for offset, end in [(0, num_contexts), (num_contexts, num_seqs)]:
-            batch_cache_indices = []
-            for pool_idx in range(self.num_pools):
-                for req_id in request_ids[offset:end]:
-                    batch_cache_indices.append(
-                        self.kv_cache_map[req_id].get_page_indices(
-                            pool_idx, 0).buffer_info())
-            if len(batch_cache_indices) > 0:
-                copy_batch_block_offsets_nanobind(
-                    self.host_kv_cache_block_offsets, end - offset,
-                    batch_cache_indices, self.num_pools, offset)
-
-        dst_tensor[:, :num_seqs].copy_(
-            self.host_kv_cache_block_offsets[:, :num_seqs], non_blocking=True)
-
     def _create_kv_cache(self, request_id: int, lora_task_id: int,
                          input_tokens: Sequence[TokenIdExt]):
         assert request_id not in self.kv_cache_map, f"KV cache for request {request_id} already exists"
         kv_cache = self.impl.create_kv_cache(lora_task_id, input_tokens)
         self.kv_cache_map[request_id] = kv_cache
-        if self.index_copy_v1:
-            return kv_cache
         index = self.index_mapper.add_new_sequence(request_id)
         for i in range(self.max_beam_width):
             for pool_idx in range(self.num_pools):
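
With the INDEX_COPY_V1 escape hatch gone, every request takes the index-mapper path: _create_kv_cache registers a slot via add_new_sequence, free_resources releases it via remove_sequence, and the returned index drives the zero-copy offset gather. The real IndexMapper ships with TensorRT-LLM; the stand-in below is only a hypothetical sketch of the fixed-capacity slot bookkeeping that contract implies, not the library's implementation.

from typing import Dict, List


class SlotIndexMapper:
    """Hypothetical stand-in for IndexMapper; illustrates the slot-reuse contract only."""

    def __init__(self, max_batch_size: int, max_beam_width: int):
        # beam_width is asserted to be 1 in the diff, so effectively one slot per sequence.
        self._free: List[int] = list(range(max_batch_size * max_beam_width))
        self._slots: Dict[int, int] = {}

    def add_new_sequence(self, request_id: int) -> int:
        index = self._free.pop()  # fails loudly when the table is full
        self._slots[request_id] = index
        return index

    def remove_sequence(self, request_id: int) -> None:
        self._free.append(self._slots.pop(request_id))


# Usage mirroring _create_kv_cache / free_resources above; "+ 1" is the dummy-request slot.
mapper = SlotIndexMapper(max_batch_size=8 + 1, max_beam_width=1)
slot = mapper.add_new_sequence(request_id=42)
mapper.remove_sequence(request_id=42)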
