2 changes: 1 addition & 1 deletion README.md
@@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.
[![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/)
[![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads)
[![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org)
- [![version](https://img.shields.io/badge/release-1.2.0rc7-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
+ [![version](https://img.shields.io/badge/release-1.2.0rc8-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py)
[![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE)

[Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Documentation](https://nvidia.github.io/TensorRT-LLM/)&nbsp;&nbsp;&nbsp;|&nbsp;&nbsp;&nbsp;[Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap)
2 changes: 1 addition & 1 deletion examples/constraints.txt
@@ -1,3 +1,3 @@
- tensorrt_llm==1.2.0rc7
+ tensorrt_llm==1.2.0rc8
evaluate~=0.4.1
rouge_score~=0.1.2
3 changes: 2 additions & 1 deletion examples/models/core/mistral_large_3/README.md
@@ -19,7 +19,8 @@ mpirun -n 1 --allow-run-as-root --oversubscribe python3 examples/llm-api/quickst
--max_tokens 100 \
--checkpoint_format mistral \
--model_type mistral_large_3 \
- --moe_backend TRTLLM
+ --moe_backend TRTLLM \
+ --image_format pil
```

## LLM-only run
@@ -113,11 +113,6 @@ def __init__(self, config, layer_idx: int):
A = torch.arange(1, self.num_heads + 1)
self.A_log = nn.Parameter(torch.log(A))
self.A_log._no_weight_decay = True
- # Instead of recomputing `torch.exp(self.A_log.float())` on every forward pass, we will register a hook
- # that sets this appropriately when loading weights.
- # NOTE: we explicitly register this as a non-persistent buffer so that it does not appear in the state dict of
- # this module, or an equivalent graph module trace from it, but still gets included in e.g. `to()` calls.
- self.register_buffer("_minus_A", -A.float(), persistent=False)
self.norm = MambaRMSNormGated(
self.intermediate_size,
eps=self.layer_norm_epsilon,
@@ -129,8 +124,6 @@ def __init__(self, config, layer_idx: int):
self.out_proj = nn.Linear(self.intermediate_size, self.hidden_size, bias=config.use_bias)
self.use_bias = config.use_bias

- self.register_load_state_dict_post_hook(self._load_state_dict_post_hook)
-
def torch_forward(self, input_states):
batch_size, seq_len, _ = input_states.shape
dtype = input_states.dtype
@@ -166,7 +159,7 @@ def torch_forward(self, input_states):
)

# 3. SSM transformation
- A = self._minus_A
+ A = -torch.exp(self.A_log.float())
y = torch.ops.auto_deploy.torch_ssm(
hidden_states=hidden_states.view(batch_size, seq_len, -1, self.head_dim),
A=A,
@@ -193,10 +186,6 @@ def torch_forward(self, input_states):
def forward(self, hidden_states):
return self.torch_forward(hidden_states)

- @staticmethod
- def _load_state_dict_post_hook(module, incompatible_keys) -> None:
- module._minus_A.data = -torch.exp(module.A_log.float())
-

class NemotronHRMSNorm(nn.Module):
def __init__(self, hidden_size, eps=1e-6):
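For context, the pattern removed above (a non-persistent buffer caching `-exp(A_log)`, kept in sync by a load-state-dict post hook) looks roughly like the following minimal sketch. The module name, head count, and shapes are illustrative stand-ins, not the actual NemotronH mixer:

```python
import torch
import torch.nn as nn


class SSMWithCachedA(nn.Module):
    """Illustrative sketch of the removed caching pattern."""

    def __init__(self, num_heads: int = 8):
        super().__init__()
        A = torch.arange(1, num_heads + 1).float()
        self.A_log = nn.Parameter(torch.log(A))
        # Non-persistent: stays out of the state dict (and of any graph-module
        # trace of it) but still follows the module through `.to()` calls.
        self.register_buffer("_minus_A", -A, persistent=False)
        # Refresh the cache whenever new weights land in `A_log`.
        self.register_load_state_dict_post_hook(self._refresh_minus_A)

    @staticmethod
    def _refresh_minus_A(module, incompatible_keys) -> None:
        module._minus_A.data = -torch.exp(module.A_log.float())

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        # The cached buffer stands in for `-torch.exp(self.A_log.float())`.
        return x * self._minus_A
```

The diff reverts to recomputing `-torch.exp(self.A_log.float())` inside `torch_forward`, trading a small per-call cost for simpler state handling.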
11 changes: 8 additions & 3 deletions tensorrt_llm/_torch/models/checkpoints/base_weight_mapper.py
@@ -29,9 +29,6 @@ def init_model_and_config(self, model: Union[nn.Module,
raise ValueError("model must have a config attribute")

self._tp_size = 1 if model.model_config.mapping.enable_attention_dp else model.model_config.mapping.tp_size
- self._head_dim = model.config.head_dim if hasattr(
- model.config, 'head_dim'
- ) and model.config.head_dim is not None else model.config.hidden_size // model.config.num_attention_heads

self.map_weights()

@@ -173,3 +170,11 @@ def model(self) -> Union[nn.Module, DecoderModelForCausalLM]:
if self._model is None:
raise RuntimeError("Weight mapper is not initialized")
return self._model

+ @property
+ def _head_dim(self) -> int:
+ model = self.model
+ head_dim = model.config.head_dim if hasattr(
+ model.config, 'head_dim'
+ ) and model.config.head_dim is not None else model.config.hidden_size // model.config.num_attention_heads
+ return head_dim
19 changes: 19 additions & 0 deletions tensorrt_llm/_torch/models/checkpoints/hf/qwen3vl_weight_mapper.py
@@ -1,3 +1,8 @@
+ from transformers.models.qwen3_vl.configuration_qwen3_vl import (
+ Qwen3VLTextConfig,
+ Qwen3VLVisionConfig,
+ )
+
from tensorrt_llm._torch.models.checkpoints.hf.weight_mapper import HfWeightMapper
from tensorrt_llm._torch.models.modeling_utils import register_mapper

@@ -6,3 +11,17 @@
class Qwen3VLHfWeightMapper(HfWeightMapper):
def preprocess_weights(self, weights: dict) -> dict:
return weights

+ @property
+ def _head_dim(self) -> int:
+ config = self.model.config
+ if (head_dim := getattr(config, "head_dim", None)) is not None:
+ return head_dim
+ if isinstance(config, Qwen3VLTextConfig):
+ num_heads = config.num_attention_heads
+ elif isinstance(config, Qwen3VLVisionConfig):
+ num_heads = config.num_heads
+ else:
+ raise TypeError(f"Unexpected config class {type(config).__name__}.")
+
+ return config.hidden_size // num_heads
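Moving `_head_dim` from an attribute set eagerly in `init_model_and_config` to a property lets subclasses such as `Qwen3VLHfWeightMapper` specialize it for configs that name their head count differently. A hedged sketch of the idea, with generic stand-in classes rather than the real mapper API:

```python
class BaseMapper:
    """Stand-in for the base weight mapper (illustrative only)."""

    def __init__(self, model):
        self.model = model

    @property
    def _head_dim(self) -> int:
        cfg = self.model.config
        if getattr(cfg, "head_dim", None) is not None:
            return cfg.head_dim
        return cfg.hidden_size // cfg.num_attention_heads


class VisionAwareMapper(BaseMapper):
    @property
    def _head_dim(self) -> int:
        cfg = self.model.config
        # Vision configs expose `num_heads` rather than `num_attention_heads`.
        if hasattr(cfg, "num_heads"):
            return cfg.hidden_size // cfg.num_heads
        return super()._head_dim
```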
@@ -103,17 +103,14 @@ def _remap_mistral_yarn_args(config: dict) -> dict:
"apply_scale": "apply_yarn_scaling",
}
yarn_config = config.get("yarn") or {}
config["rope_parameters"] = {
config["rope_scaling"] = {
"rope_type": "yarn",
"mscale_all_dim": 1,
}

- if rope_theta := config.pop("rope_theta", None):
- config["rope_parameters"]["rope_theta"] = rope_theta
-
for old_name, new_name in yarn_config_map.items():
if old_name in yarn_config:
config["rope_parameters"][new_name] = yarn_config.pop(old_name)
config["rope_scaling"][new_name] = yarn_config.pop(old_name)

assert len(yarn_config) == 0, f"Unparsed yarn config: {yarn_config}"

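To make the rename concrete, here is a hedged before/after example for `_remap_mistral_yarn_args`. Only the `apply_scale` → `apply_yarn_scaling` entry of the rename map is visible in this hunk, so the input fields below are assumptions:

```python
# Hypothetical Mistral-style config section before remapping:
config = {"yarn": {"apply_scale": True}}

# After _remap_mistral_yarn_args(config), the yarn settings are exposed under
# the HF-style "rope_scaling" key (previously "rope_parameters"):
# {
#     "rope_scaling": {
#         "rope_type": "yarn",
#         "mscale_all_dim": 1,
#         "apply_yarn_scaling": True,
#     },
# }
```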
64 changes: 36 additions & 28 deletions tensorrt_llm/_torch/models/modeling_mistral.py
@@ -46,6 +46,7 @@
MultimodalPlaceholderPlacement, TextPrompt,
register_input_processor)
from tensorrt_llm.inputs.multimodal import MultimodalParams
+ from tensorrt_llm.inputs.utils import encode_base64_image
from tensorrt_llm.llmapi import SamplingParams
from tensorrt_llm.logger import logger

@@ -58,16 +59,28 @@ def __init__(
layer_idx: int | None = None,
):
config = model_config.pretrained_config
+ rope_params = RopeParams.from_config(config)
+ rope_params_section = getattr(config, "rope_scaling", None) or getattr(
+ config, "rope_parameters", None)
+ rope_type = getattr(rope_params_section, "rope_type", None)
+ if rope_type == "yarn":
+ pos_embd_params = PositionalEmbeddingParams(
+ type=PositionEmbeddingType.yarn,
+ rope=rope_params,
+ is_neox=False)
+ else:
+ pos_embd_params = PositionalEmbeddingParams(
+ type=PositionEmbeddingType.rope_gpt_neox,
+ rope=rope_params,
+ )
+
super().__init__(
hidden_size=config.hidden_size,
num_attention_heads=config.num_attention_heads,
num_key_value_heads=config.num_key_value_heads,
max_position_embeddings=config.max_position_embeddings,
bias=False,
- pos_embd_params=PositionalEmbeddingParams(
- type=PositionEmbeddingType.rope_gpt_neox,
- rope=RopeParams.from_config(config),
- ),
+ pos_embd_params=pos_embd_params,
layer_idx=layer_idx,
dtype=config.torch_dtype,
config=model_config,
@@ -266,20 +279,18 @@ def _get_num_multimodal_tokens(self, image_sizes):
}

def get_num_tokens_per_image(self, image_sizes):
- # FIXME avoid double loading with custom loader
h, w = image_sizes
ncols, nrows = self.image_processor._image_to_num_tokens(
Image.new("RGB", (w, h)))
return ncols * nrows + nrows

- def __call__(self, text, images, media, **kwargs):
- assert media is not None
- if isinstance(media, str):
- media = [media]
-
- mm_items = [{"type": "image_url", "image_url": url} for url in media]
-
- logger.debug(f"text: {text}")
+ def __call__(self, text, images, **kwargs):
+ mm_items = []
+ if images:
+ mm_items = [{
+ "type": "image",
+ "base64": encode_base64_image(image)
+ } for image in images]

conversation = [{
"role": "user",
@@ -292,19 +303,20 @@ def __call__(self, text, images, media, **kwargs):
encoded = self.tokenizer.transformers_tokenizer.apply_chat_template(
conversation, tokenize=True, return_dict=True, return_tensors='pt')

- logger.debug(
- f"encoded.pixel_values.shape: {encoded.pixel_values.shape}, encoded.input_ids: {encoded.input_ids[0][-20:]}"
- )
- logger.debug(
- f"encoded.input_ids list: {self.tokenizer.transformers_tokenizer.apply_chat_template(conversation)}"
- )

processed = {
"input_ids": encoded.input_ids,
"pixel_values": encoded.pixel_values.to(self.dtype),
"attention_mask": encoded.attention_mask,
"image_sizes": torch.tensor([encoded.pixel_values.shape[2:]])
}

+ # text-only mode for VLM
+ if "pixel_values" in encoded:
+ processed.update({
+ "pixel_values":
+ encoded.pixel_values.to(self.dtype),
+ "attention_mask":
+ encoded.attention_mask,
+ "image_sizes":
+ torch.tensor([encoded.pixel_values.shape[2:]])
+ })
return processed


@@ -376,26 +388,22 @@ def __call__(
self, inputs: TextPrompt, sampling_params: SamplingParams
) -> Tuple[List[int], ExtraProcessedInputs | None]:
images = inputs.get("multi_modal_data", {}).get("image")
mm_processor_kwargs = inputs.get("mm_processor_kwargs", {})
do_rescale = getattr(self.processor.image_processor, "do_rescale",
False)
if images is not None and isinstance(images[0], torch.Tensor):
# The default multimodal input loader will normalize images to [0, 1] when the requested
# format is "pt" (pytorch tensors), but not for "pil" (PIL images).
do_rescale = False

- if mm_processor_kwargs:
# Currently, we only support image modality in MistralCommonImageProcessor.
+ if images is not None:
processed = self.processor(
text=inputs["prompt"],
images=images,
do_rescale=do_rescale,
**mm_processor_kwargs,
)
else:
processed = self.text_processor(
text=inputs["prompt"],
images=images,
do_rescale=do_rescale,
)
input_ids = processed.pop("input_ids").tolist()[0]
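As a usage sketch of the reworked processor path: images now arrive through the `images` argument (the `media` argument is gone) and are embedded as base64 chat-template items via `encode_base64_image`. The prompt, the image, and the ordering of items inside the user turn are illustrative assumptions:

```python
from PIL import Image

from tensorrt_llm.inputs.utils import encode_base64_image

text = "Describe the image."           # placeholder prompt
images = [Image.new("RGB", (64, 64))]  # placeholder image

# Mirrors the new __call__ body: each image becomes a base64 chat-template item.
mm_items = [{"type": "image", "base64": encode_base64_image(img)} for img in images]
conversation = [{
    "role": "user",
    "content": [{"type": "text", "text": text}, *mm_items],
}]
```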