8 changes: 7 additions & 1 deletion cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -68,7 +68,7 @@ std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optional
 class PyKvCacheManager : public tbk::BaseKVCacheManager
 {
 public:
-    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 30);
+    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 36);

     // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
     void allocatePools(bool useUvm = false) override
@@ -255,6 +255,12 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
     {
         NB_OVERRIDE_PURE(flushIterationEvents);
     }
+
+    SizeType32 countReusableBlocks(VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest,
+        bool onlyAllocated = false) const override
+    {
+        NB_OVERRIDE_PURE(countReusableBlocks, uniqueTokens, llmRequest, onlyAllocated);
+    }
 };

 // TODO: Deduplicate executor bindings KvCacheStats
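The trampoline's slot count grows to accommodate the new pure-virtual forwarder, which makes countReusableBlocks overridable from Python. A minimal sketch of such an override, assuming BaseKVCacheManager is importable from tensorrt_llm.bindings.internal.batch_manager like the other bindings touched in this PR (the stub class itself is illustrative, not part of the change):

# Illustrative only: a Python test double exercising the new trampoline slot.
# NB_OVERRIDE_PURE looks the override up by its C++ identifier, hence the
# camelCase name; the remaining pure-virtual methods of BaseKVCacheManager
# would also need stubs before this class could be instantiated.
from tensorrt_llm.bindings.internal.batch_manager import BaseKVCacheManager

class StubKVCacheManager(BaseKVCacheManager):

    def countReusableBlocks(self, uniqueTokens, llmRequest,
                            onlyAllocated=False):
        # Pretend every prefix block of the request is reusable.
        return len(uniqueTokens)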
8 changes: 4 additions & 4 deletions security_scanning/docs/poetry.lock


2 changes: 1 addition & 1 deletion security_scanning/docs/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"breathe (>=4.36.0,<5.0.0)",
"sphinx-copybutton (>=0.5.2,<0.6.0)",
"autodoc-pydantic (>=2.2.0,<3.0.0)",
"sphinx-togglebutton (>=0.4.4,<0.5.0)",
"sphinx-togglebutton (>=0.4.5,<0.5.0)",
"sphinxcontrib-mermaid (>=2.0.1,<3.0.0)"
]

6 changes: 3 additions & 3 deletions security_scanning/examples/auto_deploy/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/contrib/mmdit/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/contrib/stdit/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/core/mixtral/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/core/mllama/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/serve/poetry.lock


4 changes: 2 additions & 2 deletions security_scanning/metadata.json
@@ -1,4 +1,4 @@
 {
-    "commit_hash": "d0d12138a352f40cf420795c192f18ecfaea6a81",
-    "timestamp": "2026-03-27T02:47:11Z"
+    "commit_hash": "789494fcfe75d130a9c79cc781d9628426b51835",
+    "timestamp": "2026-03-28T02:47:37Z"
 }
47 changes: 23 additions & 24 deletions security_scanning/triton_backend/poetry.lock


2 changes: 1 addition & 1 deletion security_scanning/triton_backend/pyproject.toml
@@ -9,7 +9,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "regex (>=2026.2.28,<2027.0.0)",
     "fire (>=0.7.1,<0.8.0)",
-    "tritonclient[all] (>=2.66.0,<3.0.0)",
+    "tritonclient[all] (>=2.67.0,<3.0.0)",
     "transformers (==4.57.3)",
     "tabulate (>=0.10.0,<0.11.0)",
     "torchao (>=0.14.1)"
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -1335,6 +1335,7 @@ def create_py_executor_instance(
     waiting_queue_policy = (scheduler_config.waiting_queue_policy
                             if scheduler_config is not None else
                             WaitingQueuePolicy.FCFS)
+
     return PyExecutor(
         resource_manager,
         scheduler,
15 changes: 11 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -67,7 +67,7 @@
 from .scheduler import (RequestScheduler, ScheduledRequests,
                         SerializableSchedulerOutput, WaitingQueue,
                         create_waiting_queue)
-from .scheduler.adp_router import ADPRouter, DefaultADPRouter
+from .scheduler.adp_router import ADPRouter

 # Environment variable to specify iteration ranges for profiling start/stop.
 # Format: "start1-stop1,start2-stop2,..." or single iterations "iter1,iter2,..."
@@ -285,8 +285,7 @@ def __init__(
                  virtual_memory_pools: Optional[dict] = None,
                  hang_detection_timeout: Optional[int] = None,
                  execution_stream: Optional[torch.cuda.Stream] = None,
-                 waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS,
-                 adp_router: Optional[ADPRouter] = None):
+                 waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS):
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = dist.rank
@@ -313,7 +312,6 @@ def __init__(
         self.model_engine = model_engine
         self.enable_attention_dp = model_engine.enable_attention_dp
         self.dist = dist
-        self.adp_router: ADPRouter = (adp_router or DefaultADPRouter(dist=dist))
         self.sampler = sampler
         self.drafter = drafter
         self.draft_model_engine = getattr(self.drafter, "draft_model_engine",
@@ -387,6 +385,12 @@ def __init__(
             self.enable_kv_cache_reuse
             and self.kv_cache_manager.enable_partial_reuse)

+        self.adp_router: ADPRouter = ADPRouter.create(
+            dist=self.dist,
+            kv_cache_manager=self.kv_cache_manager,
+            attention_dp_config=self.llm_args.attention_dp_config,
+        )
+
         self.max_input_len = max_input_len
         # _executor_loop private data
         self.max_num_active_requests = model_engine.get_max_num_sequences()
@@ -2583,6 +2587,9 @@ def _fetch_new_requests(

         # 6. Schedule requests across ranks (DP only)
         if self.enable_attention_dp:
+            if self.adp_router.needs_prefix_matches:
+                self.adp_router.gather_prefix_matches(new_requests)
+
             all_ranks_new_requests, self.expected_num_active_requests = \
                 self.adp_router.route_requests(
                     all_rank_states, new_requests,
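With DefaultADPRouter no longer injected through the constructor, router selection now lives behind ADPRouter.create, which can see both the KV cache manager and the attention-DP config. A rough sketch of the factory shape this implies; the selection rule and the minimal subclasses below are assumptions, not this PR's code:

# Hypothetical factory shape. ADPRouter, DefaultADPRouter,
# KVCacheAwareADPRouter, needs_prefix_matches and create() are all named in
# this PR; everything else here is a guess for illustration.
class ADPRouter:
    needs_prefix_matches = False  # the cache-aware subclass flips this

    def __init__(self, dist, kv_cache_manager=None):
        self.dist = dist
        self.kv_cache_manager = kv_cache_manager

    @classmethod
    def create(cls, dist, kv_cache_manager, attention_dp_config):
        # Guessed rule: use cache-aware routing only when attention DP is
        # configured and block reuse gives the prefix probe something to hit.
        if (attention_dp_config is not None and kv_cache_manager is not None
                and kv_cache_manager.enable_block_reuse):
            return KVCacheAwareADPRouter(dist, kv_cache_manager)
        return DefaultADPRouter(dist)


class DefaultADPRouter(ADPRouter):
    pass


class KVCacheAwareADPRouter(ADPRouter):
    needs_prefix_matches = True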
31 changes: 31 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -559,6 +559,37 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
                                 pin_memory=prefer_pinned(),
                                 device='cpu')

+    def probe_prefix_match_length(self, input_tokens, lora_task_id=None):
+        """Probe the KV cache radix tree for prefix match length.
+
+        Returns the number of prefix tokens already cached on this rank.
+        Used by KVCacheAwareADPRouter for cache-aware routing.
+        """
+        if not self.enable_block_reuse:
+            return 0
+        # is_variable_window is only defined on the concrete KVCacheManager
+        # nanobind class, not on BaseKVCacheManager. Use getattr to avoid
+        # AttributeError on other subclasses or mocks.
+        if getattr(self.impl, 'is_variable_window', False):
+            return 0
+        if not input_tokens:
+            return 0
+        from tensorrt_llm.bindings import SamplingConfig
+        from tensorrt_llm.bindings.internal.batch_manager import BlockKey
+        from tensorrt_llm.bindings.internal.batch_manager import \
+            LlmRequest as CppLlmRequest
+        block_key = BlockKey(tokens=input_tokens, lora_task_id=lora_task_id)
+        unique_tokens = block_key.unique_tokens
+        dummy_req = CppLlmRequest(request_id=0,
+                                  max_new_tokens=0,
+                                  input_tokens=input_tokens,
+                                  sampling_config=SamplingConfig(),
+                                  is_streaming=False,
+                                  lora_task_id=lora_task_id)
+        num_blocks = self.impl.count_reusable_blocks(unique_tokens, dummy_req,
+                                                     False)
+        return num_blocks * self.tokens_per_block
+
     def shutdown(self):
         self.impl.release_pools()

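For orientation, a sketch of how the router side might consume this probe; KVCacheAwareADPRouter.gather_prefix_matches is outside this diff, and the request attributes and the tp_allgather call below are assumptions:

# Illustrative consumer of probe_prefix_match_length; not this PR's code.
# Assumed method body for KVCacheAwareADPRouter.gather_prefix_matches.
def gather_prefix_matches(self, new_requests):
    # Each rank probes its own radix tree for every incoming request.
    local_matches = [
        self.kv_cache_manager.probe_prefix_match_length(
            req.input_tokens, lora_task_id=req.lora_task_id)
        for req in new_requests
    ]
    # All-gather the per-rank match lengths so route_requests can prefer
    # the rank that already holds the longest cached prefix.
    self.all_rank_matches = self.dist.tp_allgather(local_matches)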