Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion docs/source/torch/features/lora.md
Original file line number Diff line number Diff line change
Expand Up @@ -33,7 +33,7 @@ The PyTorch backend provides LoRA support, allowing you to:

```python
from tensorrt_llm import LLM
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.lora_helper import LoraConfig
from tensorrt_llm.executor.request import LoRARequest
from tensorrt_llm.sampling_params import SamplingParams

Expand Down
2 changes: 1 addition & 1 deletion examples/llm-api/llm_multilora.py
Original file line number Diff line number Diff line change
Expand Up @@ -5,7 +5,7 @@

from tensorrt_llm import LLM
from tensorrt_llm.executor import LoRARequest
from tensorrt_llm.lora_manager import LoraConfig
from tensorrt_llm.lora_helper import LoraConfig


def main():
Expand Down
1 change: 1 addition & 0 deletions examples/models/core/multimodal/requirements-qwen2vl.txt
Original file line number Diff line number Diff line change
@@ -1,2 +1,3 @@
accelerate
qwen-vl-utils==0.0.8 # 0.0.9 has a bug (https://github.com/QwenLM/Qwen2-VL/pull/673); pinned to 0.0.8 until a newer version is released
transformers==4.51.0 # nvbugs/5385987
2 changes: 2 additions & 0 deletions tensorrt_llm/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -33,6 +33,7 @@ def _add_trt_llm_dll_directory():
# otherwise `MemoryError: std::bad_alloc` pattern error will be raised.
import xgrammar # noqa

import tensorrt_llm._torch.models as torch_models
import tensorrt_llm.functional as functional
import tensorrt_llm.math_utils as math_utils
import tensorrt_llm.models as models
Expand Down Expand Up @@ -82,6 +83,7 @@ def _add_trt_llm_dll_directory():
'default_trtnet',
'precision',
'net_guard',
'torch_models',
'Network',
'Mapping',
'MnnvlMemory',
Expand Down
5 changes: 3 additions & 2 deletions tensorrt_llm/_torch/models/modeling_deepseekv3.py
Original file line number Diff line number Diff line change
Expand Up @@ -453,7 +453,7 @@ def __init__(self,
False, # In both low-latency and attention-DP modes, FusedMoE skips the in-op all-reduce.
model_config=model_config,
override_quant_config=override_quant_config,
aux_stream=aux_stream_dict[AuxStreamType.MoeChunkingOverlap],
aux_stream_dict=aux_stream_dict,
layer_idx=layer_idx)

self.mapping = model_config.mapping
Expand Down Expand Up @@ -1049,11 +1049,12 @@ def __init__(self, model_config: ModelConfig[PretrainedConfig]):
config = model_config.pretrained_config
self.vocab_size = config.vocab_size
self.num_hidden_layers = config.num_hidden_layers
aux_stream_list = [torch.cuda.Stream() for _ in range(2)]
aux_stream_list = [torch.cuda.Stream() for _ in range(3)]
self.aux_stream_dict = {
AuxStreamType.Attention: aux_stream_list[0],
AuxStreamType.MoeShared: aux_stream_list[0],
AuxStreamType.MoeChunkingOverlap: aux_stream_list[1],
AuxStreamType.MoeBalancer: aux_stream_list[2],
}

self.embed_tokens = Embedding(
Expand Down
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/models/modeling_mixtral.py
Original file line number Diff line number Diff line change
Expand Up @@ -15,6 +15,7 @@
from ..modules.fused_moe import RenormalizeMoeRoutingMethod, create_moe
from ..modules.linear import Linear
from ..modules.rms_norm import RMSNorm
from ..utils import AuxStreamType
from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
register_auto_model)

Expand Down Expand Up @@ -49,7 +50,7 @@ def __init__(
routing_method=RenormalizeMoeRoutingMethod(top_k=self.top_k),
hidden_size=self.hidden_dim,
intermediate_size=self.ffn_dim,
aux_stream=aux_stream,
aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
dtype=config.torch_dtype,
reduce_results=reduce_results,
model_config=model_config,
Expand Down
2 changes: 1 addition & 1 deletion tensorrt_llm/_torch/models/modeling_phi4mm.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
from ...inputs import (ExtraProcessedInputs, InputProcessor, TextPrompt,
register_input_processor)
from ...logger import logger
from ...lora_manager import LoraConfig
from ...lora_helper import LoraConfig
from ...sampling_params import SamplingParams
from ..attention_backend import AttentionMetadata
from ..model_config import ModelConfig
Expand Down
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/models/modeling_qwen3_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -22,6 +22,7 @@
from ..modules.linear import TensorParallelMode
from ..modules.rms_norm import RMSNorm
from ..speculative import SpecMetadata
from ..utils import AuxStreamType
from .modeling_qwen3 import Qwen3Attention
from .modeling_speculative import SpecDecOneEngineForCausalLM
from .modeling_utils import DecoderModel, EagerFusionConfig, register_auto_model
Expand Down Expand Up @@ -107,7 +108,7 @@ def __init__(
routing_method=self.gate.routing_method,
hidden_size=self.hidden_dim,
intermediate_size=self.moe_intermediate_size,
aux_stream=aux_stream,
aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
dtype=config.torch_dtype,
reduce_results=False,
model_config=model_config,
Expand Down
3 changes: 2 additions & 1 deletion tensorrt_llm/_torch/models/modeling_qwen_moe.py
Original file line number Diff line number Diff line change
Expand Up @@ -17,6 +17,7 @@
from ..modules.gated_mlp import GatedMLP
from ..modules.linear import Linear, TensorParallelMode
from ..modules.rms_norm import RMSNorm
from ..utils import AuxStreamType
from .modeling_utils import (DecoderModel, DecoderModelForCausalLM,
register_auto_model)

Expand Down Expand Up @@ -53,7 +54,7 @@ def __init__(
routing_method=DefaultMoeRoutingMethod(top_k=self.top_k),
hidden_size=self.hidden_dim,
intermediate_size=self.moe_intermediate_size,
aux_stream=aux_stream,
aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
dtype=config.torch_dtype,
reduce_results=reduce_results,
model_config=model_config,
Expand Down
14 changes: 7 additions & 7 deletions tensorrt_llm/_torch/modules/fused_moe/create_moe.py
Original file line number Diff line number Diff line change
@@ -1,11 +1,12 @@
from typing import Optional, Type
from typing import Dict, Optional, Type

import torch

from tensorrt_llm.logger import logger
from tensorrt_llm.models.modeling_utils import QuantConfig

from ...model_config import ModelConfig
from ...utils import AuxStreamType
from .fused_moe_cute_dsl import CuteDslFusedMoE
from .fused_moe_cutlass import CutlassFusedMoE
from .fused_moe_deepgemm import DeepGemmFusedMoE
Expand Down Expand Up @@ -66,7 +67,7 @@ def create_moe(
reduce_results: bool = False,
model_config: ModelConfig = ModelConfig(),
override_quant_config: Optional[QuantConfig] = None,
aux_stream: Optional[torch.cuda.Stream] = None,
aux_stream_dict: Optional[Dict[AuxStreamType, torch.cuda.Stream]] = None,
weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.VANILLA,
bias: bool = False,
apply_router_weight_on_input: bool = False,
Expand Down Expand Up @@ -123,7 +124,7 @@ def create_moe(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
bias=bias,
apply_router_weight_on_input=apply_router_weight_on_input,
Expand All @@ -141,7 +142,7 @@ def create_moe(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
apply_router_weight_on_input=apply_router_weight_on_input,
layer_idx=layer_idx,
Expand Down Expand Up @@ -169,7 +170,7 @@ def create_moe(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
apply_router_weight_on_input=apply_router_weight_on_input,
layer_idx=layer_idx,
Expand All @@ -183,7 +184,7 @@ def create_moe(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
apply_router_weight_on_input=apply_router_weight_on_input,
layer_idx=layer_idx,
Expand All @@ -199,7 +200,6 @@ def create_moe(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
weight_loading_mode=weight_loading_mode,
bias=bias,
layer_idx=layer_idx,
Expand Down
11 changes: 6 additions & 5 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py
Original file line number Diff line number Diff line change
@@ -1,13 +1,13 @@
import math
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union

import torch
import torch.nn.functional as F

from tensorrt_llm._utils import get_sm_version

from ...model_config import ModelConfig
from ...utils import Fp4QuantizedTensor
from ...utils import AuxStreamType, Fp4QuantizedTensor
from .fused_moe_cutlass import CutlassFusedMoE
from .quantization import MoEWeightLoadingMode
from .routing import BaseMoeRoutingMethod
Expand Down Expand Up @@ -97,7 +97,7 @@ class CuteDslFusedMoE(CutlassFusedMoE):
top_k (int): Number of top experts to select for each input token.
hidden_size (int): Size of the hidden state.
intermediate_size (int): Size of the intermediate state.
aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks.
aux_stream_dict (Optional[Dict[AuxStreamType, torch.cuda.Stream]]): Auxiliary CUDA streams for overlapping.
dtype (Optional[torch.dtype]): Data type for the weights.
reduce_results (bool): Whether to reduce the results across devices.
model_config (ModelConfig): Configuration object for the model.
Expand All @@ -118,7 +118,8 @@ def __init__(
dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
model_config: ModelConfig = ModelConfig(),
aux_stream: Optional[torch.cuda.Stream] = None,
aux_stream_dict: Optional[Dict[AuxStreamType,
torch.cuda.Stream]] = None,
weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.
VANILLA,
apply_router_weight_on_input: bool = False,
Expand All @@ -133,7 +134,7 @@ def __init__(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
apply_router_weight_on_input=apply_router_weight_on_input,
layer_idx=layer_idx,
Expand Down
14 changes: 9 additions & 5 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py
Original file line number Diff line number Diff line change
Expand Up @@ -9,7 +9,8 @@

from ...distributed import allgather
from ...model_config import ModelConfig
from ...utils import EventType, Fp4QuantizedTensor, ceil_div, swizzle_sf
from ...utils import (AuxStreamType, EventType, Fp4QuantizedTensor, ceil_div,
swizzle_sf)
from .interface import MoE

# isort: off
Expand All @@ -31,7 +32,7 @@ class CutlassFusedMoE(MoE):
top_k (int): Number of top experts to select for each input token.
hidden_size (int): Size of the hidden state.
intermediate_size (int): Size of the intermediate state.
aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks.
aux_stream_dict (Optional[Dict[AuxStreamType, torch.cuda.Stream]]): Auxiliary CUDA streams keyed by AuxStreamType; the MoeChunkingOverlap stream is used to overlap chunks when MoE chunking is applied.
dtype (Optional[torch.dtype]): Data type for the weights.
reduce_results (bool): Whether to reduce the results across devices.
model_config (ModelConfig): Configuration object for the model.
Expand Down Expand Up @@ -60,7 +61,8 @@ def __init__(
dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
model_config: ModelConfig = ModelConfig(),
aux_stream: Optional[torch.cuda.Stream] = None,
aux_stream_dict: Optional[Dict[AuxStreamType,
torch.cuda.Stream]] = None,
weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.
VANILLA,
bias: bool = False,
Expand Down Expand Up @@ -115,8 +117,10 @@ def __init__(
self.moe_max_num_tokens = model_config.moe_max_num_tokens or max_num_tokens
# The auxiliary CUDA stream and CUDA events are only used when MoE chunking is applied
if self.moe_max_num_tokens < max_num_tokens:
self.aux_stream = aux_stream if aux_stream is not None else torch.cuda.Stream(
)
self.aux_stream = aux_stream_dict[
AuxStreamType.
MoeChunkingOverlap] if aux_stream_dict is not None else torch.cuda.Stream(
)
self.event_dict = {
key: torch.cuda.Event()
for key in [EventType.Main, EventType.MoeChunkingOverlap]
Expand Down
11 changes: 6 additions & 5 deletions tensorrt_llm/_torch/modules/fused_moe/fused_moe_deepgemm.py
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
from typing import List, Optional, Union
from typing import Dict, List, Optional, Union

import torch
import torch.nn.functional as F
Expand All @@ -11,7 +11,7 @@

from ...distributed import allgather
from ...model_config import ModelConfig
from ...utils import Fp4QuantizedTensor
from ...utils import AuxStreamType, Fp4QuantizedTensor
from .fused_moe_cutlass import CutlassFusedMoE
from .quantization import (DeepSeekFP8BlockScalesFusedMoEMethodDeepGemm,
MoEWeightLoadingMode, UnquantizedFusedMoEMethod)
Expand Down Expand Up @@ -299,7 +299,7 @@ class DeepGemmFusedMoE(CutlassFusedMoE):
top_k (int): Number of top experts to select for each input token.
hidden_size (int): Size of the hidden state.
intermediate_size (int): Size of the intermediate state.
aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks.
aux_stream_dict (Optional[Dict[AuxStreamType, torch.cuda.Stream]]): Auxiliary CUDA streams for overlapping.
dtype (Optional[torch.dtype]): Data type for the weights.
reduce_results (bool): Whether to reduce the results across devices.
model_config (ModelConfig): Configuration object for the model.
Expand All @@ -320,7 +320,8 @@ def __init__(
dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
model_config: ModelConfig = ModelConfig(),
aux_stream: Optional[torch.cuda.Stream] = None,
aux_stream_dict: Optional[Dict[AuxStreamType,
torch.cuda.Stream]] = None,
weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.
VANILLA,
apply_router_weight_on_input: bool = False,
Expand All @@ -335,7 +336,7 @@ def __init__(
dtype=dtype,
reduce_results=reduce_results,
model_config=model_config,
aux_stream=aux_stream,
aux_stream_dict=aux_stream_dict,
weight_loading_mode=weight_loading_mode,
apply_router_weight_on_input=apply_router_weight_on_input,
layer_idx=layer_idx,
Expand Down
1 change: 0 additions & 1 deletion tensorrt_llm/_torch/modules/fused_moe/fused_moe_triton.py
Original file line number Diff line number Diff line change
Expand Up @@ -1207,7 +1207,6 @@ def __init__(
dtype: Optional[torch.dtype] = None,
reduce_results: bool = False,
model_config: ModelConfig = ModelConfig(),
aux_stream: Optional[torch.cuda.Stream] = None,
weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.
VANILLA,
bias: bool = False,
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -23,7 +23,6 @@ class TRTLLMGenFusedMoE(MoE):
top_k (int): Number of top experts to select for each input token.
hidden_size (int): Size of the hidden state.
intermediate_size (int): Size of the intermediate state.
aux_stream (Optional[torch.cuda.Stream]): Auxiliary CUDA stream to overlap chunks.
dtype (Optional[torch.dtype]): Data type for the weights.
reduce_results (bool): Whether to reduce the results across devices.
model_config (ModelConfig): Configuration object for the model.
Expand Down
Loading
Loading