Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
Expand Up @@ -98,13 +98,14 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const
mBatchFreed = true;
TLLM_LOG_DEBUG("Batch ID %lu freed in wait()", mBatchId);
syncSegmentCache(mEngine);
std::this_thread::sleep_for(std::chrono::milliseconds(1));
return TransferState::kSUCCESS;
}

// If timeout_ms < 0, wait indefinitely
if (timeout_ms < 0)
{
std::this_thread::yield();
std::this_thread::sleep_for(std::chrono::milliseconds(1));
continue;
}

Expand All @@ -117,7 +118,7 @@ TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const
return TransferState::kIN_PROGRESS;
}

std::this_thread::yield();
std::this_thread::sleep_for(std::chrono::milliseconds(1));
}
}

Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -32,6 +32,10 @@ namespace kernels::moe_comm
#define ENABLE_DEBUG_PRINT 0
#define DISABLE_SYNC_FOR_PROFILING 0

// Build-time switch for the spin-wait timeout checks. A build may predefine DISABLE_TIMEOUT
// (e.g. -DDISABLE_TIMEOUT=1); when it does not, default to 0 so the checks stay enabled.
#if !defined(DISABLE_TIMEOUT)
#define DISABLE_TIMEOUT 0
#endif

// Macros for concise launch-time specialization
#define SWITCH_BOOL(flag, NAME, ...) \
if (flag) \
Expand Down Expand Up @@ -141,6 +145,13 @@ namespace kernels::moe_comm
__VA_ARGS__ \
}

// Spin-wait watchdog. check_timeout(s) yields true once more than 300 * 2000 * 10^6 clock64() ticks
// have elapsed since the start stamp `s` (roughly 300 seconds if the counter runs at 2000 MHz).
// The budget should be high enough on any GPU while still preventing an indefinite hang.
#if DISABLE_TIMEOUT
// Timeouts compiled out: the check never fires. `s` is still evaluated and discarded so that the
// caller's start-stamp variable does not become an unused variable in this configuration.
#define check_timeout(s) ((void) (s), false)
#else
#define check_timeout(s) ((clock64() - (s)) > (300ll * 2000ll * 1000ll * 1000ll))
#endif

// ============================================================================
// Helper Functions for Expert-to-Rank Mapping
// ============================================================================
Expand Down Expand Up @@ -515,6 +526,7 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
{
bool flag_set = false;
auto s = clock64();
do
{
uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank];
Expand All @@ -528,7 +540,15 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
rank_id, peer_rank, flag_value, expected_value, flag_ptr);
#endif
flag_set = flag_value == expected_value;
} while (!flag_set);
} while (!flag_set && !check_timeout(s));

if (__builtin_expect(!flag_set, 0))
{
printf("dispatch: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id,
peer_rank);
asm volatile("trap;");
return;
}
}
#endif
}
Expand Down Expand Up @@ -1038,6 +1058,7 @@ __global__ void moeA2ACombineKernel(
for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
{
bool flag_set = false;
auto s = clock64();
do
{
uint32_t* flag_ptr = &ptrs.completion_flags[rank_id][peer_rank];
Expand All @@ -1046,12 +1067,20 @@ __global__ void moeA2ACombineKernel(
asm volatile("ld.relaxed.sys.u32 %0, [%1];" : "=r"(flag_value) : "l"(flag_ptr));
#if ENABLE_DEBUG_PRINT
printf(
"combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: %d, "
"combine: ---Rank %d received completion flag from rank %d, flag_value: %d, expected_value: "
"%d, "
"address: %p\n",
rank_id, peer_rank, flag_value, expected_value, flag_ptr);
#endif
flag_set = flag_value == expected_value;
} while (!flag_set);
} while (!flag_set && !check_timeout(s));

if (__builtin_expect(!flag_set, 0))
{
printf("combine: ---Rank %d timed out waiting for completion flag from rank %d\n", rank_id, peer_rank);
asm volatile("trap;");
return;
}
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
// .acquire and .release qualifiers for fence instruction require sm_90 or higher.
Expand Down
5 changes: 5 additions & 0 deletions cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -1304,6 +1304,11 @@ TEST_P(AsymmetricalCacheTest, TestCase)
{
GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP.";
}
if (isIndexerKCache && tensorrt_llm::common::getEnvUseMooncakeKvCache())
{
// https://nvbugs/5760737
GTEST_SKIP() << "Temporarily skipping cache transceiver tests with Mooncake backend for Indexer KCache.";
}
std::vector<int> lenList = {30, 10, 60, 80};
if (genCp > 1)
{
Expand Down
4 changes: 2 additions & 2 deletions security_scanning/metadata.json
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
{
"commit_hash": "864b61cadd1b112ed3e28391f39def529d7788f0",
"timestamp": "2026-01-20T18:11:54Z"
"commit_hash": "c381790d15585e8f9e014e72218d1fef6945ed5f",
"timestamp": "2026-01-21T02:50:03Z"
}
4 changes: 2 additions & 2 deletions tensorrt_llm/_torch/modules/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -334,11 +334,11 @@ def __init__(
key="sparse_attention_config")

if config.sparse_attention_config.algorithm == "rocket":
logger.info_once("disable rope_fusion for RocketKV.")
logger.warning("disable rope_fusion for RocketKV.")
self.rope_fusion = False

if self.rope_fusion and not attn_cls.support_fused_rope():
logger.info_once(
logger.warning(
"rope_fusion is true but the attention backend does not support it. Will disable rope_fusion."
)
self.rope_fusion = False
Expand Down
2 changes: 1 addition & 1 deletion tests/integration/test_lists/test-db/l0_a100.yml
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@ l0_a100:
- unittest/llmapi/test_memory_profiling.py::test_pyexecutor_and_kvcache_share_execution_stream # test that PyExecutor and KVCacheManager share the same execution_stream
- unittest/trt/model_api/test_model_quantization.py
# executor
- unittest/executor/test_base_worker.py
- unittest/executor/test_base_worker.py ISOLATION
- unittest/executor/test_rpc_proxy.py
- unittest/executor/test_rpc_worker.py
- condition:
Expand Down
3 changes: 0 additions & 3 deletions tests/integration/test_lists/waives.txt
Original file line number Diff line number Diff line change
Expand Up @@ -229,7 +229,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826)
unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075)
accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075)
unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981)
Expand All @@ -250,9 +249,7 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus_online_ep
cpp/test_multi_gpu.py::TestDisagg::test_symmetric_executor[gpt-2proc-mpi_kvcache-90] SKIP (https://nvbugs/5755941)
examples/test_granite.py::test_llm_granite[granite-3.0-1b-a400m-instruct-bfloat16] SKIP (https://nvbugs/5608979)
examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKIP (https://nvbugs/5608979)
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698)
triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118)
cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737)
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/5760747)
unittest/_torch/multi_gpu/test_mnnvl_allreduce.py::test_row_linear_residual_norm_fusion[no_fusion-strategy:8-dtype:bfloat16-hidden:8192-seqlen:[15]] SKIP (https://nvbugs/5761364)
accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338)
Expand Down
Loading