yingguo-trt · pull · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026 · Jan 21, 2026
diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp
@@ -98,13 +98,14 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const
             mBatchFreed = true;
             TLLM_LOG_DEBUG("Batch ID %lu freed in wait()", mBatchId);
             syncSegmentCache(mEngine);
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
             return TransferState::kSUCCESS;
         }
 
         // If timeout_ms < 0, wait indefinitely
         if (timeout_ms < 0)
         {
-            std::this_thread::yield();
+            std::this_thread::sleep_for(std::chrono::milliseconds(1));
             continue;
         }
 
@@ -117,7 +118,7 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const
             return TransferState::kIN_PROGRESS;
         }
 
-        std::this_thread::yield();
+        std::this_thread::sleep_for(std::chrono::milliseconds(1));
     }
 }
 

diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu
@@ -32,6 +32,10 @@ namespace kernels::moe_comm
 #define ENABLE_DEBUG_PRINT 0
 #define DISABLE_SYNC_FOR_PROFILING 0
 
+#ifndef DISABLE_TIMEOUT
+#define DISABLE_TIMEOUT 0
+#endif
+
 // Macros for concise launch-time specialization
 #define SWITCH_BOOL(flag, NAME, ...)                                                                                   \
     if (flag)                                                                                                          \
@@ -141,6 +145,13 @@ namespace kernels::moe_comm
         __VA_ARGS__                                                                                                    \
     }
 
+#if DISABLE_TIMEOUT
+#define check_timeout(s) false
+#else
+// Timeout in clock64() cycles: ~300 s assuming a 2000 MHz clock. Far above any real wait, but finite to prevent a hang
+#define check_timeout(s) ((clock64() - (s)) > (300ll * 2000ll * 1000ll * 1000ll))
+#endif
+
 // ============================================================================
 // Helper Functions for Expert-to-Rank Mapping
 // ============================================================================
@@ -515,6 +526,7 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
             for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
             {
                 bool flag_set = false;
+                auto s = clock64();
                 do
                 {
                     uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank];
@@ -528,7 +540,15 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
                         rank_id, peer_rank, flag_value, expected_value, flag_ptr);
 #endif
                     flag_set = flag_value == expected_value;
-                } while (!flag_set);
+                } while (!flag_set && !check_timeout(s));
+
+                if (__builtin_expect(!flag_set, 0))
+                {
+                    printf("dispatch: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id,
+                        peer_rank);
+                    asm volatile("trap;");
+                    return;
+                }
             }
 #endif
         }
@@ -1038,6 +1058,7 @@ __global__ void moeA2ACombineKernel(
         for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
         {
             bool flag_set = false;
+            auto s = clock64();
             do
             {
                 uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank];
@@ -1046,12 +1067,20 @@ __global__ void moeA2ACombineKernel(
                 asm volatile("ld.relaxed.sys.u32 %0, [%1];" : "=r"(flag_value) : "l"(flag_ptr));
 #if ENABLE_DEBUG_PRINT
                 printf(
-                    "combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: %d, "
+                    "combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: "
+                    "%d, "
                     "address: %p\n",
                     rank_id, peer_rank, flag_value, expected_value, flag_ptr);
 #endif
                 flag_set = flag_value == expected_value;
-            } while (!flag_set);
+            } while (!flag_set && !check_timeout(s));
+
+            if (__builtin_expect(!flag_set, 0))
+            {
+                printf("combine: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id, peer_rank);
+                asm volatile("trap;");
+                return;
+            }
         }
 #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
         // .acquire and .release qualifiers for fence instruction require sm_90 or higher.

diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
@@ -1304,6 +1304,11 @@ TEST_P(AsymmetricalCacheTest, TestCase)
     {
         GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP.";
     }
+    if (isIndexerKCache && tensorrt_llm::common::getEnvUseMooncakeKvCache())
+    {
+        // https://nvbugs/5760737
+        GTEST_SKIP() << "Temporarily skipping cache transceiver tests with Mooncake backend for Indexer KCache.";
+    }
     std::vector<int> lenList = {30, 10, 60, 80};
     if (genCp > 1)
     {

diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json
@@ -1,4 +1,4 @@
 {
-  "commit_hash": "864b61cadd1b112ed3e28391f39def529d7788f0",
-  "timestamp": "2026-01-20T18:11:54Z"
+  "commit_hash": "c381790d15585e8f9e014e72218d1fef6945ed5f",
+  "timestamp": "2026-01-21T02:50:03Z"
 }
diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py
@@ -334,11 +334,11 @@ def __init__(
                              key="sparse_attention_config")
 
             if config.sparse_attention_config.algorithm == "rocket":
-                logger.info_once("disable rope_fusion for RocketKV.")
+                logger.warning("disable rope_fusion for RocketKV.")
                 self.rope_fusion = False
 
         if self.rope_fusion and not attn_cls.support_fused_rope():
-            logger.info_once(
+            logger.warning(
                 "rope_fusion is true but the attention backend does not support it. Will disable rope_fusion."
             )
             self.rope_fusion = False

diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml
@@ -22,7 +22,7 @@ l0_a100:
     - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
     - unittest/trt/model_api/test_model_quantization.py
     # executor
-    - unittest/executor/test_base_worker.py
+    - unittest/executor/test_base_worker.py ISOLATION
     - unittest/executor/test_rpc_proxy.py
     - unittest/executor/test_rpc_worker.py
 - condition:

diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt
@@ -229,7 +229,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826)
-unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075)
 accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075)
 unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981)
@@ -250,9 +249,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep
 cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
 examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
 examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
-unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698)
 triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
-cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737)
 accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/5760747)
 unittest/_torch/multi_gpu/test_mnnvl_allreduce.py::test_row_linear_residual_norm_fusion[no_fusion-strategy:8-dtype:bfloat16-hidden:8192-seqlen:[15]] SKIP (https://nvbugs/5761364)
 accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338)