8 changes: 7 additions & 1 deletion cpp/tensorrt_llm/nanobind/batch_manager/kvCacheManager.cpp
@@ -68,7 +68,7 @@ std::optional<tensorrt_llm::runtime::ITensor::UniquePtr> from_torch(std::optional
 class PyKvCacheManager : public tbk::BaseKVCacheManager
 {
 public:
-    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 30);
+    NB_TRAMPOLINE(tbk::BaseKVCacheManager, 36);

     // using BaseKVCacheManager::BaseKVCacheManager; // Inherit constructors
     void allocatePools(bool useUvm = false) override
@@ -255,6 +255,12 @@ class PyKvCacheManager : public tbk::BaseKVCacheManager
     {
         NB_OVERRIDE_PURE(flushIterationEvents);
     }
+
+    SizeType32 countReusableBlocks(VecUniqueTokens const& uniqueTokens, tb::LlmRequest const& llmRequest,
+        bool onlyAllocated = false) const override
+    {
+        NB_OVERRIDE_PURE(countReusableBlocks, uniqueTokens, llmRequest, onlyAllocated);
+    }
 };

 // TODO: Deduplicate executor bindings KvCacheStats
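The trampoline's slot count grows to accommodate the new pure-virtual forwarder, which makes countReusableBlocks overridable from Python. A minimal sketch of such an override, assuming BaseKVCacheManager is importable from tensorrt_llm.bindings.internal.batch_manager like the other bindings touched in this PR (the stub class itself is illustrative, not part of the change):

# Illustrative only: a Python test double exercising the new trampoline slot.
# NB_OVERRIDE_PURE looks the override up by its C++ identifier, hence the
# camelCase name; the remaining pure-virtual methods of BaseKVCacheManager
# would also need stubs before this class could be instantiated.
from tensorrt_llm.bindings.internal.batch_manager import BaseKVCacheManager

class StubKVCacheManager(BaseKVCacheManager):

    def countReusableBlocks(self, uniqueTokens, llmRequest,
                            onlyAllocated=False):
        # Pretend every prefix block of the request is reusable.
        return len(uniqueTokens)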
8 changes: 4 additions & 4 deletions security_scanning/docs/poetry.lock


2 changes: 1 addition & 1 deletion security_scanning/docs/pyproject.toml
@@ -14,7 +14,7 @@ dependencies = [
"breathe (>=4.36.0,<5.0.0)",
"sphinx-copybutton (>=0.5.2,<0.6.0)",
"autodoc-pydantic (>=2.2.0,<3.0.0)",
"sphinx-togglebutton (>=0.4.4,<0.5.0)",
"sphinx-togglebutton (>=0.4.5,<0.5.0)",
"sphinxcontrib-mermaid (>=2.0.1,<3.0.0)"
]

6 changes: 3 additions & 3 deletions security_scanning/examples/auto_deploy/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/contrib/mmdit/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/contrib/stdit/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/core/mixtral/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/models/core/mllama/poetry.lock


6 changes: 3 additions & 3 deletions security_scanning/examples/serve/poetry.lock


4 changes: 2 additions & 2 deletions security_scanning/metadata.json
@@ -1,4 +1,4 @@
 {
-    "commit_hash": "d0d12138a352f40cf420795c192f18ecfaea6a81",
-    "timestamp": "2026-03-27T02:47:11Z"
+    "commit_hash": "789494fcfe75d130a9c79cc781d9628426b51835",
+    "timestamp": "2026-03-28T02:47:37Z"
 }
47 changes: 23 additions & 24 deletions security_scanning/triton_backend/poetry.lock


2 changes: 1 addition & 1 deletion security_scanning/triton_backend/pyproject.toml
@@ -9,7 +9,7 @@ requires-python = ">=3.10,<3.13"
 dependencies = [
     "regex (>=2026.2.28,<2027.0.0)",
     "fire (>=0.7.1,<0.8.0)",
-    "tritonclient[all] (>=2.66.0,<3.0.0)",
+    "tritonclient[all] (>=2.67.0,<3.0.0)",
     "transformers (==4.57.3)",
     "tabulate (>=0.10.0,<0.11.0)",
     "torchao (>=0.14.1)"
1 change: 1 addition & 0 deletions tensorrt_llm/_torch/pyexecutor/_util.py
@@ -1335,6 +1335,7 @@ def create_py_executor_instance(
     waiting_queue_policy = (scheduler_config.waiting_queue_policy
                             if scheduler_config is not None else
                             WaitingQueuePolicy.FCFS)
+
     return PyExecutor(
         resource_manager,
         scheduler,
15 changes: 11 additions & 4 deletions tensorrt_llm/_torch/pyexecutor/py_executor.py
@@ -67,7 +67,7 @@
 from .scheduler import (RequestScheduler, ScheduledRequests,
                         SerializableSchedulerOutput, WaitingQueue,
                         create_waiting_queue)
-from .scheduler.adp_router import ADPRouter, DefaultADPRouter
+from .scheduler.adp_router import ADPRouter

 # Environment variable to specify iteration ranges for profiling start/stop.
 # Format: "start1-stop1,start2-stop2,..." or single iterations "iter1,iter2,..."
@@ -285,8 +285,7 @@ def __init__(
                  virtual_memory_pools: Optional[dict] = None,
                  hang_detection_timeout: Optional[int] = None,
                  execution_stream: Optional[torch.cuda.Stream] = None,
-                 waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS,
-                 adp_router: Optional[ADPRouter] = None):
+                 waiting_queue_policy: WaitingQueuePolicy = WaitingQueuePolicy.FCFS):
         super(PyExecutor, self).__init__()
         self.device_id = torch.cuda.current_device()
         self.global_rank = dist.rank
@@ -313,7 +312,6 @@ def __init__(
         self.model_engine = model_engine
         self.enable_attention_dp = model_engine.enable_attention_dp
         self.dist = dist
-        self.adp_router: ADPRouter = (adp_router or DefaultADPRouter(dist=dist))
         self.sampler = sampler
         self.drafter = drafter
         self.draft_model_engine = getattr(self.drafter, "draft_model_engine",
@@ -387,6 +385,12 @@ def __init__(
             self.enable_kv_cache_reuse
             and self.kv_cache_manager.enable_partial_reuse)

+        self.adp_router: ADPRouter = ADPRouter.create(
+            dist=self.dist,
+            kv_cache_manager=self.kv_cache_manager,
+            attention_dp_config=self.llm_args.attention_dp_config,
+        )
+
         self.max_input_len = max_input_len
         # _executor_loop private data
         self.max_num_active_requests = model_engine.get_max_num_sequences()
@@ -2583,6 +2587,9 @@ def _fetch_new_requests(

         # 6. Schedule requests across ranks (DP only)
         if self.enable_attention_dp:
+            if self.adp_router.needs_prefix_matches:
+                self.adp_router.gather_prefix_matches(new_requests)
+
             all_ranks_new_requests, self.expected_num_active_requests = \
                 self.adp_router.route_requests(
                     all_rank_states, new_requests,
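With DefaultADPRouter no longer injected through the constructor, router selection now lives behind ADPRouter.create, which can see both the KV cache manager and the attention-DP config. A rough sketch of the factory shape this implies; the selection rule and the minimal subclasses below are assumptions, not this PR's code:

# Hypothetical factory shape. ADPRouter, DefaultADPRouter,
# KVCacheAwareADPRouter, needs_prefix_matches and create() are all named in
# this PR; everything else here is a guess for illustration.
class ADPRouter:
    needs_prefix_matches = False  # the cache-aware subclass flips this

    def __init__(self, dist, kv_cache_manager=None):
        self.dist = dist
        self.kv_cache_manager = kv_cache_manager

    @classmethod
    def create(cls, dist, kv_cache_manager, attention_dp_config):
        # Guessed rule: use cache-aware routing only when attention DP is
        # configured and block reuse gives the prefix probe something to hit.
        if (attention_dp_config is not None and kv_cache_manager is not None
                and kv_cache_manager.enable_block_reuse):
            return KVCacheAwareADPRouter(dist, kv_cache_manager)
        return DefaultADPRouter(dist)


class DefaultADPRouter(ADPRouter):
    pass


class KVCacheAwareADPRouter(ADPRouter):
    needs_prefix_matches = True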
31 changes: 31 additions & 0 deletions tensorrt_llm/_torch/pyexecutor/resource_manager.py
@@ -559,6 +559,37 @@ def append_to_kv_heads_per_layer(num_kv_heads_per_layer: List[int],
                                 pin_memory=prefer_pinned(),
                                 device='cpu')

+    def probe_prefix_match_length(self, input_tokens, lora_task_id=None):
+        """Probe the KV cache radix tree for prefix match length.
+
+        Returns the number of prefix tokens already cached on this rank.
+        Used by KVCacheAwareADPRouter for cache-aware routing.
+        """
+        if not self.enable_block_reuse:
+            return 0
+        # is_variable_window is only defined on the concrete KVCacheManager
+        # nanobind class, not on BaseKVCacheManager. Use getattr to avoid
+        # AttributeError on other subclasses or mocks.
+        if getattr(self.impl, 'is_variable_window', False):
+            return 0
+        if not input_tokens:
+            return 0
+        from tensorrt_llm.bindings import SamplingConfig
+        from tensorrt_llm.bindings.internal.batch_manager import BlockKey
+        from tensorrt_llm.bindings.internal.batch_manager import \
+            LlmRequest as CppLlmRequest
+        block_key = BlockKey(tokens=input_tokens, lora_task_id=lora_task_id)
+        unique_tokens = block_key.unique_tokens
+        dummy_req = CppLlmRequest(request_id=0,
+                                  max_new_tokens=0,
+                                  input_tokens=input_tokens,
+                                  sampling_config=SamplingConfig(),
+                                  is_streaming=False,
+                                  lora_task_id=lora_task_id)
+        num_blocks = self.impl.count_reusable_blocks(unique_tokens, dummy_req,
+                                                     False)
+        return num_blocks * self.tokens_per_block
+
     def shutdown(self):
         self.impl.release_pools()

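For orientation, a sketch of how the router side might consume this probe; KVCacheAwareADPRouter.gather_prefix_matches is outside this diff, and the request attributes and the tp_allgather call below are assumptions:

# Illustrative consumer of probe_prefix_match_length; not this PR's code.
# Assumed method body for KVCacheAwareADPRouter.gather_prefix_matches.
def gather_prefix_matches(self, new_requests):
    # Each rank probes its own radix tree for every incoming request.
    local_matches = [
        self.kv_cache_manager.probe_prefix_match_length(
            req.input_tokens, lora_task_id=req.lora_task_id)
        for req in new_requests
    ]
    # All-gather the per-rank match lengths so route_requests can prefer
    # the rank that already holds the longest cached prefix.
    self.all_rank_matches = self.dist.tp_allgather(local_matches)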