Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/auto-assign-author.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ jobs:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_URL: ${{ github.event.pull_request.html_url }}
AUTHOR: ${{ github.actor }}
run: gh pr edit $PR_URL --add-assignee $AUTHOR
run: gh pr edit $PR_URL --add-assignee $AUTHOR || echo "Could not assign $AUTHOR (not a collaborator), skipping."
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def get_valid_tactics(self, inputs: List[torch.Tensor],

tactic==-1 has a special meaning: it denotes the fallback kernel, which should be able to handle any shape
This fallback tactic is needed for 2 reasons:
* when the autotuner cannot find a valid tactic in it's cache.
* when the autotuner cannot find a valid tactic in its cache.
* in eager mode, w/o autotuning the custom op should have at least one kernel, which makes the autotuning
process an optional process, such that user can opt out.

Expand Down Expand Up @@ -1437,10 +1437,10 @@ def _create_tensor_like(self, origin_tensor: torch.Tensor,
# during the tuning process. This can be controlled in the preparation phase by the runner.
# It must not use all zero tensors. Otherwise the timing results become unreliable.
if dtype == torch.float4_e2m1fn_x2:
return torch.randint(-5, 5, shapes,
device=device).to(torch.uint8).view(dtype)
return (torch.rand(shapes, device=device) * 10 - 5).to(
torch.uint8).view(dtype)
else:
return torch.randint(-5, 5, shapes, device=device).to(dtype)
return (torch.rand(shapes, device=device) * 10 - 5).to(dtype)

def _prepare_input_tensors(
self, profile: OptimizationProfile,
Expand Down
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,7 @@ def get_valid_tactics(self, inputs: List[torch.Tensor],
def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]:
"""Get the dynamic tensor specs for use with the AutoTuner."""

# These indices correspond to the 0th input tensor and it's first dimension
# These indices correspond to the 0th input tensor and its first dimension
# i.e. we are tuning M where the first input tensor is of shape [B, M, K]

MAT1_IDX = 0
Expand Down
12 changes: 6 additions & 6 deletions tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -660,7 +660,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -967,7 +967,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1273,7 +1273,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1561,7 +1561,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1833,7 +1833,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down
16 changes: 15 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,21 @@ def check_is_moe(config: ExaoneMoEConfig, layer_idx: int, is_mtp_layer: bool = F
"""
Check if the current layer is a MoE layer.
"""
return not is_mtp_layer and hasattr(config, "is_moe_layer") and config.is_moe_layer[layer_idx]
# The MTP layer of K-EXAONE is always dense.
if is_mtp_layer:
return False

if hasattr(config, "mlp_layer_types") and config.mlp_layer_types is not None:
return config.mlp_layer_types[layer_idx] == "sparse"

# For backward compatibility, older K-EXAONE checkpoints do not include `mlp_layer_types`.
if hasattr(config, "is_moe_layer") and config.is_moe_layer is not None:
return config.is_moe_layer[layer_idx]

raise ValueError(
"Invalid configuration: Neither `mlp_layer_types` nor `is_moe_layer` found in config. "
"Please check if the checkpoint and config are compatible with ExaoneMoEConfig."
)


def enable_attn_allreduce(mapping: Mapping):
Expand Down
15 changes: 8 additions & 7 deletions tensorrt_llm/_torch/modules/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor],

def apply_qk_norm(self, q, k):
raise NotImplementedError(
f"QK norm is not implemented for {self.__class__.__name__}."
f"QK norm is not implemented for {self.__class__.__name__}. "
"Please override the `apply_qk_norm` method in the subclass.")


Expand Down Expand Up @@ -959,7 +959,7 @@ def __init__(
self)
self.register_to_config = True

# only support one kind of sparse attention, dsa now.
# Currently only DSA sparse attention is supported.
if config is not None and config.sparse_attention_config is not None and config.sparse_attention_config.algorithm == "dsa":
self.is_dsa = True
else:
Expand All @@ -982,7 +982,7 @@ def __init__(
dp_size = tp_size
tp_size = 1
if self.mapping.has_cp_ulysses():
raise NotImplementedError("MLA doesn't support CP Ulyssees yet")
raise NotImplementedError("MLA doesn't support CP Ulysses yet")
if self.mapping.cp_size > 1:
assert self.mapping.has_cp_helix(
), f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."
Expand Down Expand Up @@ -1360,13 +1360,13 @@ def forward_impl(self,
output: torch.Tensor,
latent_cache_gen: Optional[torch.Tensor] = None) -> None:
"""
Forward pass for the MLA module.
Forward pass for the MLA module. Writes result into output tensor in-place.

Args:
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
output (torch.Tensor): Pre-allocated output tensor, written in-place.
output (torch.Tensor): The output tensor to write results into.
latent_cache_gen (Optional[torch.Tensor]): The latent cache used in generation.
"""
# split q, k, v into context and gen batches
Expand Down Expand Up @@ -1464,12 +1464,13 @@ def forward_impl_with_dsa(self, position_ids: Optional[torch.Tensor],
output: torch.Tensor) -> None:
"""
Forward pass for the MLA module with DSA (always in MQA mode).
Writes result into output tensor in-place.

Args:
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
output (torch.Tensor): Pre-allocated output tensor, written in-place.
output (torch.Tensor): The output tensor to write results into.
"""
assert self.mqa is not None, "DSA is only supported in MQA mode"
# split q, k, v into context and gen batches
Expand Down Expand Up @@ -1800,7 +1801,7 @@ def forward_context_with_chunked_prefill(
# currently we assume that the chunk size is the same as the max_num_tokens
chunked_loop_num = attn_metadata.chunked_loop_num

# [toal_token_q, num_heads, 2] -> [toal_token_q, num_heads] float2
# [total_token_q, num_heads, 2] -> [total_token_q, num_heads] float2
self.softmax_stats_tensor = torch.empty(
(attn_metadata.num_ctx_tokens, self.num_heads_tp, 2),
dtype=torch.float,
Expand Down
10 changes: 5 additions & 5 deletions tensorrt_llm/_torch/modules/fused_moe/create_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_moe_cls(
return TRTLLMGenFusedMoE
else:
logger.warning(
"TRTLLMGenFusedMoE only supports fp8_block_scales, nvfp4, w4a16_mxfp4, w4a8_mxfp4_fp8 and w4a8_mxfp4_mxfp8. "
"TRTLLMGenFusedMoE only supports fp8_block_scales, nvfp4, w4a16_mxfp4, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, and w4a8_mxfp4_mxfp8. "
f"Check out details in quant_config: {quant_config}. Using CutlassFusedMoE instead."
)
return CutlassFusedMoE
Expand Down Expand Up @@ -140,7 +140,7 @@ def create_moe_backend(
assert moe_cls in [
WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE,
DeepGemmFusedMoE
], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE, and DeepGemmFusedMoE."
], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE, and DeepGemmFusedMoE."

if bias:
assert moe_cls in [CutlassFusedMoE, TritonFusedMoE, TRTLLMGenFusedMoE
Expand Down Expand Up @@ -371,14 +371,14 @@ def create_moe(
activation_type=activation_type,
)
else:
# Check if this is a TRTLLM backend request that fallback to CutlassFusedMoE
# Check if this is a TRTLLM or CUTEDSL backend request that fell back to CutlassFusedMoE
requested_backend = model_config.moe_backend.upper()
if requested_backend in ("TRTLLM",
"CUTEDSL") and moe_cls == CutlassFusedMoE:
# Workaround for test cases where TRTLLM backend fallbacks to CutlassFusedMoE due to quant_config incompatibility
# Workaround for test cases where TRTLLM backend falls back to CutlassFusedMoE due to quant_config incompatibility
# Log warning and continue with the fallback backend
logger.warning(
f"ENABLE_CONFIGURABLE_MOE is set but TRTLLM backend fallback to {moe_cls.__name__} due to quant_config. "
f"ENABLE_CONFIGURABLE_MOE is set but {requested_backend} backend fell back to {moe_cls.__name__} due to quant_config. "
f"ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends. "
f"Continuing with legacy MoE backend {moe_cls.__name__}.")
else:
Expand Down
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/modules/fused_moe/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def precompute_common_perfect_router_logits(num_experts: int,
5120,
6144,
7168,
8192 # Powers of 2 and common sizes
8192 # Common sizes
]

print(
Expand Down
26 changes: 10 additions & 16 deletions tensorrt_llm/_torch/modules/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def split_dim(cls, mode):
return 1 if mode == cls.ROW else 0

# Helper to shard the corresponding per-channel activation scales
# Which shard along the dimension orthogonal to the weights
# which are sharded along the dimension orthogonal to the weights
@classmethod
def flip(cls, mode):
return cls.ROW if mode == cls.COLUMN else cls.COLUMN
Expand Down Expand Up @@ -190,7 +190,7 @@ def load_weights_vanilla_helper(module: Linear,

if weight is not None:
if module.has_weight_only_quant:
# NOTE: without the preprocess during the runtime, the gemm output nan's. in order to use the preprocess_weights_for_mixed_gemm
# NOTE: without the preprocess during the runtime, the gemm outputs NaNs. In order to use the preprocess_weights_for_mixed_gemm
# we need to cast the weight to int8 first.
activation_dtype = torch.float8_e4m3fn if module.has_w4a8_awq else torch.float16
weight_dtype, _ = get_weight_dtype_and_id(module)
Expand Down Expand Up @@ -571,7 +571,7 @@ def create_weights(self, module: Linear, in_features: int,
# K, V scales for NVFP4 KV cache
module.kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
# K, V scales for NVFP4 KV cache
# Inverse K, V scales for NVFP4 KV cache
module.inv_kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
if bias:
Expand Down Expand Up @@ -919,7 +919,7 @@ def apply(self, module: Linear, input: torch.Tensor,
return output

def _get_scale_name(self, weights: List[Dict]):
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# Actually they hold identical values of data_amax / 448.
scale_name = "weight_scale_inv"
if scale_name not in weights[0]:
Expand Down Expand Up @@ -1065,7 +1065,7 @@ def apply(self, module: Linear, input: torch.Tensor,
return output

def _get_scale_name(self, weights: List[Dict]):
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# Actually they hold identical values of data_amax / 448.
for w in weights:
if "weight_scale_inv" in w:
Expand Down Expand Up @@ -1230,7 +1230,7 @@ def create_weights(self, module: Linear, in_features: int,
# K, V scales for NVFP4 KV cache
module.kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
# K, V scales for NVFP4 KV cache
# Inverse K, V scales for NVFP4 KV cache
module.inv_kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)

Expand Down Expand Up @@ -1531,14 +1531,8 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
copy_weight(module.pre_quant_scale, pre_quant_scale)

def post_load_weights(self, module: Linear):
"""Pad weight and weight_scale tensors to meet torch trtllm NVFP4 GEMM alignment requirements."""
super().post_load_weights(module)
"""
Pad weight and weight_scale tensors to meet torch trtllm NVFP4 GEMM alignment requirements.

Args:
row_alignment: Required row alignment (default: 32)
col_alignment: Required column alignment (default: 16)
"""
row_alignment, col_alignment = 32, 16
row_pad_size = (row_alignment - module.weight.size(0)) % row_alignment
col_pad_size = (col_alignment - module.weight.size(1)) % col_alignment
Expand Down Expand Up @@ -1682,7 +1676,7 @@ def load_weight_scales(
weight_scale_2 = w["weight_scale_2"][...]
else:
assert weight_scale_2 == w["weight_scale_2"][...], (
f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}*6"
f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}"
)

# TODO: ModelOpt's o_proj.weight_scale_2 is bfloat16, which should be float32
Expand Down Expand Up @@ -2195,7 +2189,7 @@ def apply(self, module: Linear, input: torch.Tensor,
1. multiply pre_quant_scale to input
2. quantize input to fp8 using input_scale
3. unpack_weights and multiply by weight_scales (int4 -> fp16)
4. divied by weight_scale_2 (fp16 -> fp8 to allow gemm in fp8).
4. divided by weight_scale_2 (fp16 -> fp8 to allow gemm in fp8).
5. apply gemm in fp8.
6. rescale using alpha which is input_scale * weight_scale_2
"""
Expand Down Expand Up @@ -2731,7 +2725,7 @@ def load_weights(self,

weight_mode = self.weights_loading_config.weight_mode
if not isinstance(self.quant_method, UnquantizedLinearMethod):
assert allow_partial_loading is False, "allow_partial_loading is only supported for non-unquantized linear methods now"
assert allow_partial_loading is False, "allow_partial_loading is only supported for unquantized linear methods now"
self.quant_method.load_weights(
self,
weights,
Expand Down
12 changes: 6 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def __init__(
# 3) The model configuration is not loaded until the model engine
# is initialized.
#
# NOTE: This can simplified by decoupling the model config loading and
# NOTE: This can be simplified by decoupling the model config loading and
# the model engine.
self.attn_metadata = None
self.iter_states = {}
Expand Down Expand Up @@ -904,8 +904,8 @@ def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager):
gc.collect()
torch.cuda.empty_cache()

# When using piecewise cuda graph, the logits may suffer severe memory faction problem.
# When the num of requests is growing, the block allocated by torch cannot be reused.
# When using piecewise cuda graph, the logits may suffer severe memory fragmentation problem.
# As the number of requests grows, the blocks allocated by torch cannot be reused.
# So after piecewise cuda graph capture, a request with most requests is triggered to make
# sure that large enough blocks are allocated and can be correctly reused.
for num_tokens in piecewise_cuda_graph_num_tokens:
Expand Down Expand Up @@ -1389,14 +1389,14 @@ def _release_cuda_graphs(self):

def get_max_num_sequences(self) -> int:
"""
Return the maximum number of sequences that the model supports. PyExecutor need this to compute max_num_active_requests
Return the maximum number of sequences that the model supports. PyExecutor needs this to compute max_num_active_requests
"""
num_batches = self.mapping.pp_size
return num_batches * self.batch_size

def _preprocess_inputs(self, inputs: Dict[str, Any]):
"""
Make some changes to the device inputs and avoid block the async data transfer
Make some changes to the device inputs and avoid blocking the async data transfer
"""
if self.enable_spec_decode and not self._disable_overlap_scheduler:
# When enabling overlap scheduler, the kv cache for draft tokens will
Expand Down Expand Up @@ -1554,7 +1554,7 @@ def get_padded_piecewise_tokens(tokens):
return padded_num_tokens, True, None
else:
logger.debug(
f"Picewise cudagraph cannot be used with {total_num_tokens} tokens, {num_ctx_requests} context requests"
f"Piecewise CUDA graph cannot be used with {total_num_tokens} tokens, {num_ctx_requests} context requests"
)
return total_num_tokens, False, None

Expand Down
Loading
Loading