Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion .github/workflows/auto-assign-author.yml
Original file line number Diff line number Diff line change
Expand Up @@ -15,4 +15,4 @@ jobs:
GH_TOKEN: ${{ secrets.GITHUB_TOKEN }}
PR_URL: ${{ github.event.pull_request.html_url }}
AUTHOR: ${{ github.actor }}
run: gh pr edit $PR_URL --add-assignee $AUTHOR
run: gh pr edit $PR_URL --add-assignee $AUTHOR || echo "Could not assign $AUTHOR (not a collaborator), skipping."
8 changes: 4 additions & 4 deletions tensorrt_llm/_torch/autotuner.py
Original file line number Diff line number Diff line change
Expand Up @@ -201,7 +201,7 @@ def get_valid_tactics(self, inputs: List[torch.Tensor],

tactic==-1 has a special meaning: it denotes the fallback kernel, which should be able to handle any shape
This fallback tactic is needed for 2 reasons:
* when the autotuner cannot find a valid tactic in it's cache.
* when the autotuner cannot find a valid tactic in its cache.
* in eager mode, w/o autotuning the custom op should have at least one kernel, which makes the autotuning
process an optional process, such that user can opt out.

Expand Down Expand Up @@ -1437,10 +1437,10 @@ def _create_tensor_like(self, origin_tensor: torch.Tensor,
# during the tuning process. This can be controlled in the preparation phase by the runner.
# It must not use all zero tensors. Otherwise the timing results become unreliable.
if dtype == torch.float4_e2m1fn_x2:
return torch.randint(-5, 5, shapes,
device=device).to(torch.uint8).view(dtype)
return (torch.rand(shapes, device=device) * 10 - 5).to(
torch.uint8).view(dtype)
else:
return torch.randint(-5, 5, shapes, device=device).to(dtype)
return (torch.rand(shapes, device=device) * 10 - 5).to(dtype)

def _prepare_input_tensors(
self, profile: OptimizationProfile,
Expand Down
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/custom_ops/torch_custom_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -1069,7 +1069,7 @@ def get_valid_tactics(self, inputs: List[torch.Tensor],
def get_dynamic_tensor_specs(cls) -> Tuple[DynamicTensorSpec, ...]:
"""Get the dynamic tensor specs for use with the AutoTuner."""

# These indices correspond to the 0th input tensor and it's first dimension
# These indices correspond to the 0th input tensor and its first dimension
# i.e. we are tuning M where the first input tensor is of shape [B, M, K]

MAT1_IDX = 0
Expand Down
12 changes: 6 additions & 6 deletions tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py
Original file line number Diff line number Diff line change
Expand Up @@ -284,7 +284,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -660,7 +660,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -967,7 +967,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1273,7 +1273,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1561,7 +1561,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down Expand Up @@ -1833,7 +1833,7 @@ def get_dynamic_tensor_specs(cls,
ep_size: int) -> Tuple[DynamicTensorSpec, ...]:
HIDDEN_STATES_IDX = 2
TUNED_DIM = 0
MAX_PROFILE_BUCKET = 4096
MAX_PROFILE_BUCKET = 8192

m_values = get_last_power_of_2_num_tokens_buckets(MAX_PROFILE_BUCKET)

Expand Down
16 changes: 15 additions & 1 deletion tensorrt_llm/_torch/models/modeling_exaone_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -62,7 +62,21 @@ def check_is_moe(config: ExaoneMoEConfig, layer_idx: int, is_mtp_layer: bool = F
"""
Check if the current layer is a MoE layer.
"""
return not is_mtp_layer and hasattr(config, "is_moe_layer") and config.is_moe_layer[layer_idx]
# The MTP layer of K-EXAONE is always dense.
if is_mtp_layer:
return False

if hasattr(config, "mlp_layer_types") and config.mlp_layer_types is not None:
return config.mlp_layer_types[layer_idx] == "sparse"

# For backward compatibility, older K-EXAONE checkpoints do not include `mlp_layer_types`.
if hasattr(config, "is_moe_layer") and config.is_moe_layer is not None:
return config.is_moe_layer[layer_idx]

raise ValueError(
"Invalid configuration: Neither `mlp_layer_types` nor `is_moe_layer` found in config. "
"Please check if the checkpoint and config are compatible with ExaoneMoEConfig."
)


def enable_attn_allreduce(mapping: Mapping):
Expand Down
15 changes: 8 additions & 7 deletions tensorrt_llm/_torch/modules/attention.py
Original file line number Diff line number Diff line change
Expand Up @@ -811,7 +811,7 @@ def apply_rope(self, q: torch.Tensor, k: Optional[torch.Tensor],

def apply_qk_norm(self, q, k):
raise NotImplementedError(
f"QK norm is not implemented for {self.__class__.__name__}."
f"QK norm is not implemented for {self.__class__.__name__}. "
"Please override the `apply_qk_norm` method in the subclass.")


Expand Down Expand Up @@ -959,7 +959,7 @@ def __init__(
self)
self.register_to_config = True

# only support one kind of sparse attention, dsa now.
# Currently only DSA sparse attention is supported.
if config is not None and config.sparse_attention_config is not None and config.sparse_attention_config.algorithm == "dsa":
self.is_dsa = True
else:
Expand All @@ -982,7 +982,7 @@ def __init__(
dp_size = tp_size
tp_size = 1
if self.mapping.has_cp_ulysses():
raise NotImplementedError("MLA doesn't support CP Ulyssees yet")
raise NotImplementedError("MLA doesn't support CP Ulysses yet")
if self.mapping.cp_size > 1:
assert self.mapping.has_cp_helix(
), f"CP type must be HELIX for MLA, but got {self.mapping.cp_config['cp_type']}."
Expand Down Expand Up @@ -1360,13 +1360,13 @@ def forward_impl(self,
output: torch.Tensor,
latent_cache_gen: Optional[torch.Tensor] = None) -> None:
"""
Forward pass for the MLA module.
Forward pass for the MLA module. Writes result into output tensor in-place.

Args:
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
output (torch.Tensor): Pre-allocated output tensor, written in-place.
output (torch.Tensor): The output tensor to write results into.
latent_cache_gen (Optional[torch.Tensor]): The latent cache used in generation.
"""
# split q, k, v into context and gen batches
Expand Down Expand Up @@ -1464,12 +1464,13 @@ def forward_impl_with_dsa(self, position_ids: Optional[torch.Tensor],
output: torch.Tensor) -> None:
"""
Forward pass for the MLA module with DSA (always in MQA mode).
Writes result into output tensor in-place.

Args:
position_ids (Optional[torch.IntTensor]): The position IDs.
hidden_states (torch.Tensor): The hidden states.
attn_metadata (AttentionMetadata): The attention metadata.
output (torch.Tensor): Pre-allocated output tensor, written in-place.
output (torch.Tensor): The output tensor to write results into.
"""
assert self.mqa is not None, "DSA is only supported in MQA mode"
# split q, k, v into context and gen batches
Expand Down Expand Up @@ -1800,7 +1801,7 @@ def forward_context_with_chunked_prefill(
# currently we assume that the chunk size is the same as the max_num_tokens
chunked_loop_num = attn_metadata.chunked_loop_num

# [toal_token_q, num_heads, 2] -> [toal_token_q, num_heads] float2
# [total_token_q, num_heads, 2] -> [total_token_q, num_heads] float2
self.softmax_stats_tensor = torch.empty(
(attn_metadata.num_ctx_tokens, self.num_heads_tp, 2),
dtype=torch.float,
Expand Down
10 changes: 5 additions & 5 deletions tensorrt_llm/_torch/modules/fused_moe/create_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -56,7 +56,7 @@ def get_moe_cls(
return TRTLLMGenFusedMoE
else:
logger.warning(
"TRTLLMGenFusedMoE only supports fp8_block_scales, nvfp4, w4a16_mxfp4, w4a8_mxfp4_fp8 and w4a8_mxfp4_mxfp8. "
"TRTLLMGenFusedMoE only supports fp8_block_scales, nvfp4, w4a16_mxfp4, w4a8_nvfp4_fp8, w4a8_mxfp4_fp8, and w4a8_mxfp4_mxfp8. "
f"Check out details in quant_config: {quant_config}. Using CutlassFusedMoE instead."
)
return CutlassFusedMoE
Expand Down Expand Up @@ -140,7 +140,7 @@ def create_moe_backend(
assert moe_cls in [
WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE,
DeepGemmFusedMoE
], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE, and DeepGemmFusedMoE."
], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE, and DeepGemmFusedMoE."

if bias:
assert moe_cls in [CutlassFusedMoE, TritonFusedMoE, TRTLLMGenFusedMoE
Expand Down Expand Up @@ -371,14 +371,14 @@ def create_moe(
activation_type=activation_type,
)
else:
# Check if this is a TRTLLM backend request that fallback to CutlassFusedMoE
# Check if this is a TRTLLM or CUTEDSL backend request that fell back to CutlassFusedMoE
requested_backend = model_config.moe_backend.upper()
if requested_backend in ("TRTLLM",
"CUTEDSL") and moe_cls == CutlassFusedMoE:
# Workaround for test cases where TRTLLM backend fallbacks to CutlassFusedMoE due to quant_config incompatibility
# Workaround for test cases where TRTLLM backend falls back to CutlassFusedMoE due to quant_config incompatibility
# Log warning and continue with the fallback backend
logger.warning(
f"ENABLE_CONFIGURABLE_MOE is set but TRTLLM backend fallback to {moe_cls.__name__} due to quant_config. "
f"ENABLE_CONFIGURABLE_MOE is set but {requested_backend} backend fell back to {moe_cls.__name__} due to quant_config. "
f"ConfigurableMoE only supports TRTLLMGenFusedMoE and CuteDslFusedMoE backends. "
f"Continuing with legacy MoE backend {moe_cls.__name__}.")
else:
Expand Down
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/modules/fused_moe/routing.py
Original file line number Diff line number Diff line change
Expand Up @@ -75,7 +75,7 @@ def precompute_common_perfect_router_logits(num_experts: int,
5120,
6144,
7168,
8192 # Powers of 2 and common sizes
8192 # Common sizes
]

print(
Expand Down
26 changes: 10 additions & 16 deletions tensorrt_llm/_torch/modules/linear.py
Original file line number Diff line number Diff line change
Expand Up @@ -91,7 +91,7 @@ def split_dim(cls, mode):
return 1 if mode == cls.ROW else 0

# Helper to shard the corresponding per-channel activation scales
# Which shard along the dimension orthogonal to the weights
# which are sharded along the dimension orthogonal to the weights
@classmethod
def flip(cls, mode):
return cls.ROW if mode == cls.COLUMN else cls.COLUMN
Expand Down Expand Up @@ -190,7 +190,7 @@ def load_weights_vanilla_helper(module: Linear,

if weight is not None:
if module.has_weight_only_quant:
# NOTE: without the preprocess during the runtime, the gemm output nan's. in order to use the preprocess_weights_for_mixed_gemm
# NOTE: without the preprocess during the runtime, the gemm outputs NaNs. In order to use the preprocess_weights_for_mixed_gemm
# we need to cast the weight to int8 first.
activation_dtype = torch.float8_e4m3fn if module.has_w4a8_awq else torch.float16
weight_dtype, _ = get_weight_dtype_and_id(module)
Expand Down Expand Up @@ -571,7 +571,7 @@ def create_weights(self, module: Linear, in_features: int,
# K, V scales for NVFP4 KV cache
module.kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
# K, V scales for NVFP4 KV cache
# Inverse K, V scales for NVFP4 KV cache
module.inv_kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
if bias:
Expand Down Expand Up @@ -919,7 +919,7 @@ def apply(self, module: Linear, input: torch.Tensor,
return output

def _get_scale_name(self, weights: List[Dict]):
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# Actually they hold identical values of data_amax / 448.
scale_name = "weight_scale_inv"
if scale_name not in weights[0]:
Expand Down Expand Up @@ -1065,7 +1065,7 @@ def apply(self, module: Linear, input: torch.Tensor,
return output

def _get_scale_name(self, weights: List[Dict]):
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# `weight_scale_inv` for DS recipe and `weight_scale` for ModelOpt recipe.
# Actually they hold identical values of data_amax / 448.
for w in weights:
if "weight_scale_inv" in w:
Expand Down Expand Up @@ -1230,7 +1230,7 @@ def create_weights(self, module: Linear, in_features: int,
# K, V scales for NVFP4 KV cache
module.kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)
# K, V scales for NVFP4 KV cache
# Inverse K, V scales for NVFP4 KV cache
module.inv_kv_scales = Parameter(torch.ones(3, dtype=torch.float32),
requires_grad=False)

Expand Down Expand Up @@ -1531,14 +1531,8 @@ def load_weights_fused_gate_up_linear(self, module: Linear,
copy_weight(module.pre_quant_scale, pre_quant_scale)

def post_load_weights(self, module: Linear):
"""Pad weight and weight_scale tensors to meet torch trtllm NVFP4 GEMM alignment requirements."""
super().post_load_weights(module)
"""
Pad weight and weight_scale tensors to meet torch trtllm NVFP4 GEMM alignment requirements.

Args:
row_alignment: Required row alignment (default: 32)
col_alignment: Required column alignment (default: 16)
"""
row_alignment, col_alignment = 32, 16
row_pad_size = (row_alignment - module.weight.size(0)) % row_alignment
col_pad_size = (col_alignment - module.weight.size(1)) % col_alignment
Expand Down Expand Up @@ -1682,7 +1676,7 @@ def load_weight_scales(
weight_scale_2 = w["weight_scale_2"][...]
else:
assert weight_scale_2 == w["weight_scale_2"][...], (
f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}*6"
f"The weight_scale_2 should be same for all the weights: {weight_scale_2} vs. {w['weight_scale_2']}"
)

# TODO: ModelOpt's o_proj.weight_scale_2 is bfloat16, which should be float32
Expand Down Expand Up @@ -2195,7 +2189,7 @@ def apply(self, module: Linear, input: torch.Tensor,
1. multiply pre_quant_scale to input
2. quantize input to fp8 using input_scale
3. unpack_weights and multiply by weight_scales (int4 -> fp16)
4. divied by weight_scale_2 (fp16 -> fp8 to allow gemm in fp8).
4. divided by weight_scale_2 (fp16 -> fp8 to allow gemm in fp8).
5. apply gemm in fp8.
6. rescale using alpha which is input_scale * weight_scale_2
"""
Expand Down Expand Up @@ -2731,7 +2725,7 @@ def load_weights(self,

weight_mode = self.weights_loading_config.weight_mode
if not isinstance(self.quant_method, UnquantizedLinearMethod):
assert allow_partial_loading is False, "allow_partial_loading is only supported for non-unquantized linear methods now"
assert allow_partial_loading is False, "allow_partial_loading is only supported for unquantized linear methods now"
self.quant_method.load_weights(
self,
weights,
Expand Down
12 changes: 6 additions & 6 deletions tensorrt_llm/_torch/pyexecutor/model_engine.py
Original file line number Diff line number Diff line change
Expand Up @@ -362,7 +362,7 @@ def __init__(
# 3) The model configuration is not loaded until the model engine
# is initialized.
#
# NOTE: This can simplified by decoupling the model config loading and
# NOTE: This can be simplified by decoupling the model config loading and
# the model engine.
self.attn_metadata = None
self.iter_states = {}
Expand Down Expand Up @@ -904,8 +904,8 @@ def _capture_piecewise_cuda_graphs(self, resource_manager: ResourceManager):
gc.collect()
torch.cuda.empty_cache()

# When using piecewise cuda graph, the logits may suffer severe memory faction problem.
# When the num of requests is growing, the block allocated by torch cannot be reused.
# When using piecewise cuda graph, the logits may suffer severe memory fragmentation problem.
# As the number of requests grows, the blocks allocated by torch cannot be reused.
# So after piecewise cuda graph capture, a request with most requests is triggered to make
# sure that large enough blocks are allocated and can be correctly reused.
for num_tokens in piecewise_cuda_graph_num_tokens:
Expand Down Expand Up @@ -1389,14 +1389,14 @@ def _release_cuda_graphs(self):

def get_max_num_sequences(self) -> int:
"""
Return the maximum number of sequences that the model supports. PyExecutor need this to compute max_num_active_requests
Return the maximum number of sequences that the model supports. PyExecutor needs this to compute max_num_active_requests
"""
num_batches = self.mapping.pp_size
return num_batches * self.batch_size

def _preprocess_inputs(self, inputs: Dict[str, Any]):
"""
Make some changes to the device inputs and avoid block the async data transfer
Make some changes to the device inputs and avoid blocking the async data transfer
"""
if self.enable_spec_decode and not self._disable_overlap_scheduler:
# When enabling overlap scheduler, the kv cache for draft tokens will
Expand Down Expand Up @@ -1554,7 +1554,7 @@ def get_padded_piecewise_tokens(tokens):
return padded_num_tokens, True, None
else:
logger.debug(
f"Picewise cudagraph cannot be used with {total_num_tokens} tokens, {num_ctx_requests} context requests"
f"Piecewise CUDA graph cannot be used with {total_num_tokens} tokens, {num_ctx_requests} context requests"
)
return total_num_tokens, False, None

Expand Down
Loading
Loading