4 changes: 2 additions & 2 deletions 3rdparty/CMakeLists.txt
@@ -38,8 +38,8 @@ FetchContent_Declare(

FetchContent_Declare(
deepgemm
GIT_REPOSITORY https://github.com/ruoqianguo/DeepGEMM
GIT_TAG 6cb8161516302550785d9af924d2778afef1f3f6 # swapab_sm100 branch
GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM
GIT_TAG 4ff3f54d9b7ed3129e4f36f9871232ea7ecab86b # nv_dev branch
GIT_SUBMODULES_RECURSE
ON
SOURCE_SUBDIR
46 changes: 4 additions & 42 deletions cpp/tensorrt_llm/common/customAllReduceUtils.h
@@ -40,50 +40,12 @@ inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept
{
return common::getEnvAllReduceWorkspaceSize();
}
if (worldSize <= 2)
char const* envWorkspaceSize = std::getenv("TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE");
if (envWorkspaceSize != nullptr)
{
return 16 * 1000 * 1000;
}
return 8 * 1000 * 1000;
}

// (SM major_version, TP_size) -> (NCCL_num_token_threshold, TWO_SHOT_numel_threshold)
inline std::unordered_map<int, std::unordered_map<int, std::pair<size_t, size_t>>> HeuristicThresholdLP{
{90,
{
{2, {4096, 4096 * 4096}},
{4, {4096, 1024 * 1024}},
{8, {2048, 512 * 512}},
}},
{100,
{
{2, {4096, 4096 * 4096}},
{4, {4096, 1024 * 2048}},
{8, {4096, 1024 * 1024}},
}},
};

inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size, int world_size, AllReduceFusionOp op)
{
// The heuristic is based on the following assumptions:
//  __________________________________
// |                \   TWO-SHOT zone |
// |  ONE-SHOT zone  \                |  NCCL zone
// |__________________\_______________|___
// sm_major is 90 or 100

auto const sm_major = std::min(100, std::max(90, tensorrt_llm::common::getSMVersion()));

auto const [nccl_num_token_threshold, two_shot_numel_threshold] = HeuristicThresholdLP[sm_major][world_size];
auto const message_size = seq_len * hidden_size;
if (message_size >= two_shot_numel_threshold)
{
return AllReduceStrategyType::TWOSHOT;
}
else
{
return AllReduceStrategyType::ONESHOT;
return static_cast<size_t>(std::atoi(envWorkspaceSize));
}
return 67108864; // 64 MiB
}

// use 1D vector to store the best strategy instead of a map for each sm version
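The net effect of this hunk: the old per-world-size defaults (16 MB for TP <= 2, 8 MB otherwise) and the `SelectStrategyLP` heuristic table are removed, and the fusion workspace size is now either the `TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE` override or a fixed 64 MiB. A minimal Python sketch of the resulting fallback chain (the helper name is hypothetical, and the earlier `getEnvAllReduceWorkspaceSize()` path kept above is ignored here):

```python
import os

def max_required_allreduce_workspace_bytes() -> int:
    # An explicit override via the environment variable wins, mirroring the new C++ branch.
    env = os.getenv("TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE")
    if env is not None:
        return int(env)
    # Otherwise fall back to the fixed 64 MiB default (67108864 bytes).
    return 64 * 1024 * 1024
```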
8 changes: 7 additions & 1 deletion cpp/tensorrt_llm/deep_gemm/CMakeLists.txt
@@ -38,7 +38,13 @@ foreach(SOURCE_FILE ${DEEP_GEMM_ALL_FILES})
if(FILE_EXT STREQUAL ".py")
# Read file content and replace module imports for Python files
file(READ ${SOURCE_FILE} _content)
string(REPLACE "deep_gemm_cpp" "tensorrt_llm.deep_gemm_cpp_tllm" _content
string(REPLACE "from . import _C" "import tensorrt_llm.deep_gemm_cpp_tllm"
_content "${_content}")
string(REPLACE ".._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content
"${_content}")
string(REPLACE "._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content
"${_content}")
string(REPLACE "_C." "tensorrt_llm.deep_gemm_cpp_tllm." _content
"${_content}")

# Add adaptation header
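The four `string(REPLACE ...)` calls above retarget the vendored DeepGEMM Python sources from their local `_C` extension to the `tensorrt_llm.deep_gemm_cpp_tllm` binding. A minimal, self-contained sketch of the same textual rewrite in Python (the helper name and the sample input are illustrative only):

```python
# Patterns applied in the same order as the CMake code above; the longer
# relative-import pattern ".._C" must run before "._C".
_REPLACEMENTS = [
    ("from . import _C", "import tensorrt_llm.deep_gemm_cpp_tllm"),
    (".._C", "tensorrt_llm.deep_gemm_cpp_tllm"),
    ("._C", "tensorrt_llm.deep_gemm_cpp_tllm"),
    ("_C.", "tensorrt_llm.deep_gemm_cpp_tllm."),
]

def rewrite_deep_gemm_imports(source: str) -> str:
    for old, new in _REPLACEMENTS:
        source = source.replace(old, new)
    return source

# Example: "from . import _C\nout = _C.gemm(x)" becomes
# "import tensorrt_llm.deep_gemm_cpp_tllm\nout = tensorrt_llm.deep_gemm_cpp_tllm.gemm(x)"
```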
2 changes: 2 additions & 0 deletions docs/source/commands/trtllm-serve/trtllm-serve.rst
@@ -299,6 +299,8 @@ To configure the nested level arguments like ``moe_config.backend``, the yaml fi
Syntax
------

This syntax section lists all command-line arguments for ``trtllm-serve``'s subcommands. Some arguments are accompanied by a stability tag indicating their development status. Refer to our `API Reference <https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html>`__ for details.

.. click:: tensorrt_llm.commands.serve:main
:prog: trtllm-serve
:nested: full
3 changes: 1 addition & 2 deletions examples/models/core/deepseek_v3/README.md
@@ -90,7 +90,6 @@ To quickly run DeepSeek-V3, [examples/llm-api/quickstart_advanced.py](../llm-api
cd examples/llm-api
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --tp_size 8
```
Please include `--tokens_per_block 64` when running DeepSeek-V3.2-Exp, as this model uses the deep_gemm.fp8_paged_mqa_logits kernel, which requires a KV cache block size of 64.

The model will be run by the PyTorch backend and generate outputs like:
```
@@ -108,7 +107,7 @@ cd examples/llm-api
python quickstart_advanced.py --model_dir <YOUR_MODEL_DIR> --spec_decode_algo MTP --spec_decode_max_draft_len N
```

`N` is the number of MTP modules. When `N` is `0` (the default), MTP is not used; when `N` is greater than `0`, `N` MTP modules are enabled. In the current implementation, the weights of all MTP modules are shared. Please include `--tokens_per_block 64` when running DeepSeek-V3.2-Exp.
`N` is the number of MTP modules. When `N` is `0` (the default), MTP is not used; when `N` is greater than `0`, `N` MTP modules are enabled. In the current implementation, the weights of all MTP modules are shared.

#### Relaxed acceptance
**NOTE: This feature can only be used for DeepSeek R1.**
19 changes: 5 additions & 14 deletions tensorrt_llm/_torch/attention_backend/sparse/dsa.py
@@ -785,7 +785,6 @@ def on_update_kv_lens(self):
# After changing the kv_lens/kv_lens_cuda, we may need to update other metadatas.
# Especially for the changes in the _preprocess_inputs() of model_engine.py.
if self.num_generations > 0:
tokens_per_block = self.kv_cache_manager.indexer_k_cache_tokens_per_block
torch.cumsum(
self.kv_lens_cuda[self.num_contexts:self.
num_seqs], # num_contexts should be 0
@@ -800,7 +799,7 @@ def on_update_kv_lens(self):
out=self.gen_cached_token_indptr[1:self.num_generations + 1])
scheduler_metadata_buffer = get_paged_mqa_logits_metadata(
self.kv_lens_cuda[self.num_contexts:self.num_seqs],
tokens_per_block, self.num_sms)
self.kv_cache_manager.tokens_per_block, self.num_sms)
self.scheduler_metadata_buffer.copy_(scheduler_metadata_buffer,
non_blocking=True)
if self.use_expanded_buffers_for_mtp:
@@ -827,7 +826,6 @@ def on_update_kv_lens(self):

def update_for_spec_dec(self):
super().update_for_spec_dec()
self.kv_cache_manager.indexer_k_cache_tokens_per_block
# host
self.max_ctx_kv_len = 0
self.num_ctx_cached_tokens = 0
@@ -1030,7 +1028,7 @@ def prepare(metadata: DSAtrtllmAttentionMetadata):
request_ids = metadata.request_ids
seq_lens = metadata.seq_lens
head_dim = metadata.kv_cache_manager.index_head_dim
tokens_per_block = metadata.kv_cache_manager.indexer_k_cache_tokens_per_block
tokens_per_block = metadata.kv_cache_manager.tokens_per_block
quant_block_size = metadata.kv_cache_manager.quant_block_size
cached_tokens = metadata.kv_cache_params.num_cached_tokens_per_seq
total_tokens = seq_lens.sum().item()
@@ -1750,9 +1748,6 @@ def __init__(
) -> None:
self.quant_block_size = 128
self.index_head_dim = sparse_attn_config.index_head_dim
# Use a fixed tokens_per_block for indexer k cache due to DG kernel constraints
self.indexer_k_cache_tokens_per_block = 64
assert self.indexer_k_cache_tokens_per_block == tokens_per_block, "tokens_per_block must be set to 64 for DeepSeek v3.2"

super().__init__(
kv_cache_config,
@@ -1778,7 +1773,7 @@ def __init__(
self.num_blocks = self.blocks_in_primary_pool

# Indexer K cache pool for DSA attention
# Shape: [num_blocks, self.indexer_k_cache_tokens_per_block * (index_head_dim + scale_size)]
# Shape: [num_blocks, self.tokens_per_block * (index_head_dim + scale_size)]
# Non-interleaved layout: [fp8_tok0 | fp8_tok1 | ... | scale_tok0 | scale_tok1 | ...]
# Store FP8-quantized k values from the indexer
self.indexer_k_cache_pool_per_layer = [
@@ -1805,9 +1800,7 @@ def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
config = model_config.pretrained_config
sparse_attn_config = model_config.sparse_attention_config
index_head_dim = sparse_attn_config.index_head_dim
tokens_per_block = kwargs['tokens_per_block']
quant_block_size = 128
indexer_k_cache_tokens_per_block = 64

# get kv cache dtype bytes
mem_per_token = 2
@@ -1827,17 +1820,15 @@ def get_cache_size_per_token(model_config: ModelConfig, mapping: Mapping,
# 1 for K, others for indexer K cache
head_dim_factor = (index_head_dim +
index_head_dim // quant_block_size * 4) / head_dim
tokens_per_block_factor = indexer_k_cache_tokens_per_block / tokens_per_block
kv_factor = 1 + head_dim_factor * tokens_per_block_factor
kv_factor = 1 + head_dim_factor
mem_per_token *= kv_factor
return mem_per_token

def get_cache_bytes_per_token(self):
# self.kv_factor for K, others for indexer K cache
head_dim_factor = (self.index_head_dim + self.index_head_dim //
self.quant_block_size * 4) / self.head_dim
tokens_per_block_factor = self.indexer_k_cache_tokens_per_block / self.tokens_per_block
kv_factor = self.kv_factor + head_dim_factor * tokens_per_block_factor
kv_factor = self.kv_factor + head_dim_factor
cache_size_per_token = math.ceil(
kv_factor * sum(self.num_kv_heads_per_layer) * self.head_dim)

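With the indexer K cache now sharing `tokens_per_block` with the primary KV cache, the `tokens_per_block_factor` term drops out and the per-token overhead reduces to `head_dim_factor` alone. A small worked example of the simplified formula from `get_cache_size_per_token`, using illustrative values rather than any specific model configuration:

```python
index_head_dim = 128    # indexer K head dimension (illustrative)
quant_block_size = 128  # FP8 quantization block size
head_dim = 576          # primary KV-cache head dimension (illustrative)

# Per token the indexer stores index_head_dim FP8 values plus a 4-byte scale
# per quantization block, expressed relative to the primary head dimension.
head_dim_factor = (index_head_dim + index_head_dim // quant_block_size * 4) / head_dim
# 1 for the K cache itself, plus the indexer K cache on top.
kv_factor = 1 + head_dim_factor
print(head_dim_factor, kv_factor)  # ~0.229, ~1.229
```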
5 changes: 5 additions & 0 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -23,6 +23,7 @@

from tensorrt_llm._torch.attention_backend.interface import AttentionRuntimeFeatures
from tensorrt_llm._torch.auto_deploy.utils._graph import get_input_embeddings, get_lm_head_weights
from tensorrt_llm._torch.autotuner import AutoTuner
from tensorrt_llm._torch.models.modeling_speculative import Eagle3ForCausalLM
from tensorrt_llm._torch.pyexecutor._util import (
_create_kv_cache_manager,
@@ -1008,6 +1009,10 @@ def create_autodeploy_executor(ad_config: LlmArgs, tokenizer: Optional[Tokenizer
torch.cuda.set_device(rank)
port = mpi_dist.broadcast(dist.get_free_port()) # use MPI broadcast to pick a free port
dist.initialize_or_skip(rank, world_size, port)

# Setup AutoTuner with distributed state for allreduce autotuning
AutoTuner.get().setup_distributed_state(dist_mapping, mpi_dist)

# some config
assert ad_config.max_beam_width <= 1, "_autodeploy + beam_search is not supported"
