2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
[![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org)
- [![version](https://img.shields.io/badge/release-1.2.0rc7-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+ [![version](https://img.shields.io/badge/release-1.2.0rc8-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)

[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
2 changes: 1 addition & 1 deletion examples/constraints.txt
@@ -1,3 +1,3 @@
- tensorrt_llm==1.2.0rc7
+ tensorrt_llm==1.2.0rc8
evaluate~=0.4.1
rouge_score~=0.1.2
3 changes: 2 additions & 1 deletion examples/models/core/mistral_large_3/README.md
@@ -19,7 +19,8 @@ mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickst
--max_tokens 100 \
--checkpoint_format mistral \
--model_type mistral_large_3 \
- --moe_backend TRTLLM
+ --moe_backend TRTLLM \
+ --image_format pil
```

## LLM-only run
@@ -113,11 +113,6 @@ def __init__(self, config, layer_idx: int):
A = torch.arange(1, self.num_heads + 1)
self.A_log = nn.Parameter(torch.log(A))
self.A_log._no_weight_decay = True
- # Instead of recomputing `torch.exp(self.A_log.float())` on every forward pass, we will register a hook
- # that sets this appropriately when loading weights.
- # NOTE: we explicitly register this as a non-persistent buffer so that it does not appear in the state dict of
- # this module, or an equivalent graph module trace from it, but still gets included in e.g. `to()` calls.
- self.register_buffer("_minus_A", -A.float(), persistent=False)
self.norm = MambaRMSNormGated(
self.intermediate_size,
eps=self.layer_norm_epsilon,
@@ -129,8 +124,6 @@ def __init__(self, config, layer_idx: int):
self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
self.use_bias = config.use_bias

- self.register_load_state_dict_post_hook(self._load_state_dict_post_hook)
-
def torch_forward(self, input_states):
batch_size, seq_len, _ = input_states.shape
dtype = input_states.dtype
@@ -166,7 +159,7 @@ def torch_forward(self, input_states):
)

# 3. SSM transformation
- A = self._minus_A
+ A = -torch.exp(self.A_log.float())
y = torch.ops.auto_deploy.torch_ssm(
hidden_states=hidden_states.view(batch_size, seq_len, -1, self.head_dim),
A=A,
@@ -193,10 +186,6 @@ def torch_forward(self, input_states):
def forward(self, hidden_states):
return self.torch_forward(hidden_states)

- @staticmethod
- def _load_state_dict_post_hook(module, incompatible_keys) -> None:
- module._minus_A.data = -torch.exp(module.A_log.float())
-

class NemotronHRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
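For context, the pattern removed above (a non-persistent buffer caching `-exp(A_log)`, kept in sync by a load-state-dict post hook) looks roughly like the following minimal sketch. The module name, head count, and shapes are illustrative stand-ins, not the actual NemotronH mixer:

```python
import torch
import torch.nn as nn


class SSMWithCachedA(nn.Module):
    """Illustrative sketch of the removed caching pattern."""

    def __init__(self, num_heads: int = 8):
        super().__init__()
        A = torch.arange(1, num_heads + 1).float()
        self.A_log = nn.Parameter(torch.log(A))
        # Non-persistent: stays out of the state dict (and of any graph-module
        # trace of it) but still follows the module through `.to()` calls.
        self.register_buffer("_minus_A", -A, persistent=False)
        # Refresh the cache whenever new weights land in `A_log`.
        self.register_load_state_dict_post_hook(self._refresh_minus_A)

    @staticmethod
    def _refresh_minus_A(module, incompatible_keys) -> None:
        module._minus_A.data = -torch.exp(module.A_log.float())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The cached buffer stands in for `-torch.exp(self.A_log.float())`.
        return x * self._minus_A
```

The diff reverts to recomputing `-torch.exp(self.A_log.float())` inside `torch_forward`, trading a small per-call cost for simpler state handling.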
11 changes: 8 additions & 3 deletions tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py
@@ -29,9 +29,6 @@ def init_model_and_config(self, model: Union[nn.Module,
raise ValueError("model must have a config attribute")

self._tp_size = 1 if model.model_config.mapping.enable_attention_dp else model.model_config.mapping.tp_size
- self._head_dim = model.config.head_dim if hasattr(
- model.config, 'head_dim'
- ) and model.config.head_dim is not None else model.config.hidden_size // model.config.num_attention_heads

self.map_weights()

@@ -173,3 +170,11 @@ def model(self) -> Union[nn.Module, DecoderModelForCausalLM]:
if self._model is None:
raise RuntimeError("Weight mapper is not initialized")
return self._model

+ @property
+ def _head_dim(self) -> int:
+ model = self.model
+ head_dim = model.config.head_dim if hasattr(
+ model.config, 'head_dim'
+ ) and model.config.head_dim is not None else model.config.hidden_size // model.config.num_attention_heads
+ return head_dim
19 changes: 19 additions & 0 deletions tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_weight_mapper.py
@@ -1,3 +1,8 @@
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import (
+ Qwen3VLTextConfig,
+ Qwen3VLVisionConfig,
+ )
+
from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
from tensorrt_llm._torch.models.modeling_utils import register_mapper

@@ -6,3 +11,17 @@
class Qwen3VLHfWeightMapper(HfWeightMapper):
def preprocess_weights(self, weights: dict) -> dict:
return weights

+ @property
+ def _head_dim(self) -> int:
+ config = self.model.config
+ if (head_dim := getattr(config, "head_dim", None)) is not None:
+ return head_dim
+ if isinstance(config, Qwen3VLTextConfig):
+ num_heads = config.num_attention_heads
+ elif isinstance(config, Qwen3VLVisionConfig):
+ num_heads = config.num_heads
+ else:
+ raise TypeError(f"Unexpected config class {type(config).__name__}.")
+
+ return config.hidden_size // num_heads
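Moving `_head_dim` from an attribute set eagerly in `init_model_and_config` to a property lets subclasses such as `Qwen3VLHfWeightMapper` specialize it for configs that name their head count differently. A hedged sketch of the idea, with generic stand-in classes rather than the real mapper API:

```python
class BaseMapper:
    """Stand-in for the base weight mapper (illustrative only)."""

    def __init__(self, model):
        self.model = model

    @property
    def _head_dim(self) -> int:
        cfg = self.model.config
        if getattr(cfg, "head_dim", None) is not None:
            return cfg.head_dim
        return cfg.hidden_size // cfg.num_attention_heads


class VisionAwareMapper(BaseMapper):
    @property
    def _head_dim(self) -> int:
        cfg = self.model.config
        # Vision configs expose `num_heads` rather than `num_attention_heads`.
        if hasattr(cfg, "num_heads"):
            return cfg.hidden_size // cfg.num_heads
        return super()._head_dim
```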
@@ -103,17 +103,14 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_parameters"] = {
config["rope_scaling"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}

- if rope_theta := config.pop("rope_theta", None):
- config["rope_parameters"]["rope_theta"] = rope_theta
-
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
config["rope_scaling"][new_name] = yarn_config.pop(old_name)

assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"

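To make the rename concrete, here is a hedged before/after example for `_remap_mistral_yarn_args`. Only the `apply_scale` → `apply_yarn_scaling` entry of the rename map is visible in this hunk, so the input fields below are assumptions:

```python
# Hypothetical Mistral-style config section before remapping:
config = {"yarn": {"apply_scale": True}}

# After _remap_mistral_yarn_args(config), the yarn settings are exposed under
# the HF-style "rope_scaling" key (previously "rope_parameters"):
# {
#     "rope_scaling": {
#         "rope_type": "yarn",
#         "mscale_all_dim": 1,
#         "apply_yarn_scaling": True,
#     },
# }
```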
64 changes: 36 additions & 28 deletions tensorrt_llm/_torch/models/modeling_mistral.py
@@ -46,6 +46,7 @@
MultimodalPlaceholderPlacement, TextPrompt,
register_input_processor)
from tensorrt_llm.inputs.multimodal import MultimodalParams
+ from tensorrt_llm.inputs.utils import encode_base64_image
from tensorrt_llm.llmapi import SamplingParams
from tensorrt_llm.logger import logger

@@ -58,16 +59,28 @@ def __init__(
layer_idx: int | None = None,
):
config = model_config.pretrained_config
+ rope_params = RopeParams.from_config(config)
+ rope_params_section = getattr(config, "rope_scaling", None) or getattr(
+ config, "rope_parameters", None)
+ rope_type = getattr(rope_params_section, "rope_type", None)
+ if rope_type == "yarn":
+ pos_embd_params = PositionalEmbeddingParams(
+ type=PositionEmbeddingType.yarn,
+ rope=rope_params,
+ is_neox=False)
+ else:
+ pos_embd_params = PositionalEmbeddingParams(
+ type=PositionEmbeddingType.rope_gpt_neox,
+ rope=rope_params,
+ )
+
super().__init__(
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
max_position_embeddings=config.max_position_embeddings,
bias=False,
- pos_embd_params=PositionalEmbeddingParams(
- type=PositionEmbeddingType.rope_gpt_neox,
- rope=RopeParams.from_config(config),
- ),
+ pos_embd_params=pos_embd_params,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
@@ -266,20 +279,18 @@ def _get_num_multimodal_tokens(self, image_sizes):
}

def get_num_tokens_per_image(self, image_sizes):
- # FIXME avoid double loading with custom loader
h, w = image_sizes
ncols, nrows = self.image_processor._image_to_num_tokens(
Image.new("RGB", (w, h)))
return ncols * nrows + nrows

- def __call__(self, text, images, media, **kwargs):
- assert media is not None
- if isinstance(media, str):
- media = [media]
-
- mm_items = [{"type": "image_url", "image_url": url} for url in media]
-
- logger.debug(f"text: {text}")
+ def __call__(self, text, images, **kwargs):
+ mm_items = []
+ if images:
+ mm_items = [{
+ "type": "image",
+ "base64": encode_base64_image(image)
+ } for image in images]

conversation = [{
"role": "user",
@@ -292,19 +303,20 @@ def __call__(self, text, images, media, **kwargs):
encoded = self.tokenizer.transformers_tokenizer.apply_chat_template(
conversation, tokenize=True, return_dict=True, return_tensors='pt')

- logger.debug(
- f"encoded.pixel_values.shape: {encoded.pixel_values.shape}, encoded.input_ids: {encoded.input_ids[0][-20:]}"
- )
- logger.debug(
- f"encoded.input_ids list: {self.tokenizer.transformers_tokenizer.apply_chat_template(conversation)}"
- )

processed = {
"input_ids": encoded.input_ids,
"pixel_values": encoded.pixel_values.to(self.dtype),
"attention_mask": encoded.attention_mask,
"image_sizes": torch.tensor([encoded.pixel_values.shape[2:]])
}

+ # text-only mode for VLM
+ if "pixel_values" in encoded:
+ processed.update({
+ "pixel_values":
+ encoded.pixel_values.to(self.dtype),
+ "attention_mask":
+ encoded.attention_mask,
+ "image_sizes":
+ torch.tensor([encoded.pixel_values.shape[2:]])
+ })
return processed


@@ -376,26 +388,22 @@ def __call__(
self, inputs: TextPrompt, sampling_params: SamplingParams
) -> Tuple[List[int], ExtraProcessedInputs | None]:
images = inputs.get("multi_modal_data", {}).get("image")
mm_processor_kwargs = inputs.get("mm_processor_kwargs", {})
do_rescale = getattr(self.processor.image_processor, "do_rescale",
False)
if images is not None and isinstance(images[0], torch.Tensor):
# The default multimodal input loader will normalize images to [0, 1] when the requested
# format is "pt" (pytorch tensors), but not for "pil" (PIL images).
do_rescale = False

- if mm_processor_kwargs:
# Currently, we only support image modality in MistralCommonImageProcessor.
+ if images is not None:
processed = self.processor(
text=inputs["prompt"],
images=images,
do_rescale=do_rescale,
**mm_processor_kwargs,
)
else:
processed = self.text_processor(
text=inputs["prompt"],
images=images,
do_rescale=do_rescale,
)
input_ids = processed.pop("input_ids").tolist()[0]
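As a usage sketch of the reworked processor path: images now arrive through the `images` argument (the `media` argument is gone) and are embedded as base64 chat-template items via `encode_base64_image`. The prompt, the image, and the ordering of items inside the user turn are illustrative assumptions:

```python
from PIL import Image

from tensorrt_llm.inputs.utils import encode_base64_image

text = "Describe the image."           # placeholder prompt
images = [Image.new("RGB", (64, 64))]  # placeholder image

# Mirrors the new __call__ body: each image becomes a base64 chat-template item.
mm_items = [{"type": "image", "base64": encode_base64_image(img)} for img in images]
conversation = [{
    "role": "user",
    "content": [{"type": "text", "text": text}, *mm_items],
}]
```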