diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp index ce46e1035155..bd956862ecbe 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -98,13 +98,14 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const mBatchFreed = true; TLLM_LOG_DEBUG("Batch ID %lu freed in wait()", mBatchId); syncSegmentCache(mEngine); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); return TransferState::kSUCCESS; } // If timeout_ms < 0, wait indefinitely if (timeout_ms < 0) { - std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); continue; } @@ -117,7 +118,7 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const return TransferState::kIN_PROGRESS; } - std::this_thread::yield(); + std::this_thread::sleep_for(std::chrono::milliseconds(1)); } } diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index f32ce4c7d06d..303255e6a7ef 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -32,6 +32,10 @@ namespace kernels::moe_comm #define ENABLE_DEBUG_PRINT 0 #define DISABLE_SYNC_FOR_PROFILING 0 +#ifndef DISABLE_TIMEOUT +#define DISABLE_TIMEOUT 0 +#endif + // Macros for concise launch-time specialization #define SWITCH_BOOL(flag, NAME, ...) \ if (flag) \ @@ -141,6 +145,13 @@ namespace kernels::moe_comm __VA_ARGS__ \ } +#if DISABLE_TIMEOUT +#define check_timeout(s) false +#else +// 300 * 2000 MHz - should be high enough on any GPU but will prevent a hang +#define check_timeout(s) ((clock64() - (s)) > (300ll * 2000ll * 1000ll * 1000ll)) +#endif + // ============================================================================ // Helper Functions for Expert-to-Rank Mapping // ============================================================================ @@ -515,6 +526,7 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize) { bool flag_set = false; + auto s = clock64(); do { uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank]; @@ -528,7 +540,15 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ rank_id, peer_rank, flag_value, expected_value, flag_ptr); #endif flag_set = flag_value == expected_value; - } while (!flag_set); + } while (!flag_set && !check_timeout(s)); + + if (__builtin_expect(!flag_set, 0)) + { + printf("dispatch: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id, + peer_rank); + asm volatile("trap;"); + return; + } } #endif } @@ -1038,6 +1058,7 @@ __global__ void moeA2ACombineKernel( for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize) { bool flag_set = false; + auto s = clock64(); do { uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank]; @@ -1046,12 +1067,20 @@ __global__ void moeA2ACombineKernel( asm volatile("ld.relaxed.sys.u32 %0, [%1];" : "=r"(flag_value) : "l"(flag_ptr)); #if ENABLE_DEBUG_PRINT printf( - "combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: %d, " + "combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: " + "%d, " "address: %p\n", rank_id, peer_rank, flag_value, expected_value, flag_ptr); #endif flag_set = flag_value == expected_value; - } while (!flag_set); + } while (!flag_set && !check_timeout(s)); + + if (__builtin_expect(!flag_set, 0)) + { + printf("combine: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id, peer_rank); + asm volatile("trap;"); + return; + } } #if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 // .acquire and .release qualifiers for fence instruction require sm_90 or higher. diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index 5e7528b8dd4e..8732d46cae26 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -1304,6 +1304,11 @@ TEST_P(AsymmetricalCacheTest, TestCase) { GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } + if (isIndexerKCache && tensorrt_llm::common::getEnvUseMooncakeKvCache()) + { + // https://nvbugs/5760737 + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with Mooncake backend for Indexer KCache."; + } std::vector lenList = {30, 10, 60, 80}; if (genCp > 1) { diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 81b7d99378ba..ba92245290ea 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "864b61cadd1b112ed3e28391f39def529d7788f0", - "timestamp": "2026-01-20T18:11:54Z" + "commit_hash": "c381790d15585e8f9e014e72218d1fef6945ed5f", + "timestamp": "2026-01-21T02:50:03Z" } diff --git a/tensorrt_llm/_torch/modules/attention.py b/tensorrt_llm/_torch/modules/attention.py index 695001286451..69ae313713ad 100644 --- a/tensorrt_llm/_torch/modules/attention.py +++ b/tensorrt_llm/_torch/modules/attention.py @@ -334,11 +334,11 @@ def __init__( key="sparse_attention_config") if config.sparse_attention_config.algorithm == "rocket": - logger.info_once("disable rope_fusion for RocketKV.") + logger.warning("disable rope_fusion for RocketKV.") self.rope_fusion = False if self.rope_fusion and not attn_cls.support_fused_rope(): - logger.info_once( + logger.warning( "rope_fusion is true but the attention backend does not support it. Will disable rope_fusion." ) self.rope_fusion = False diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml index e26d31da0102..ac0938c23937 100644 --- a/tests/integration/test_lists/test-db/l0_a100.yml +++ b/tests/integration/test_lists/test-db/l0_a100.yml @@ -22,7 +22,7 @@ l0_a100: - unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream - unittest/trt/model_api/test_model_quantization.py # executor - - unittest/executor/test_base_worker.py + - unittest/executor/test_base_worker.py ISOLATION - unittest/executor/test_rpc_proxy.py - unittest/executor/test_rpc_worker.py - condition: diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index ddde25ba756e..d1a28d25db3e 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -229,7 +229,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) -unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) @@ -250,9 +249,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941) examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979) examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979) -unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698) triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) -cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/5760747) unittest/_torch/multi_gpu/test_mnnvl_allreduce.py::test_row_linear_residual_norm_fusion[no_fusion-strategy:8-dtype:bfloat16-hidden:8192-seqlen:[15]] SKIP (https://nvbugs/5761364) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338)