Skip to content

Commit e71a200

Browse files
authored
[TRTLLM-9019][feat] Expose video_pruning_rate as llmargs and fix nano-v2-vl (NVIDIA#12194)
Signed-off-by: Wanli Jiang <35160485+Wanli-Jiang@users.noreply.github.com>
1 parent 355d993 commit e71a200

File tree

15 files changed

+151
-40
lines changed

15 files changed

+151
-40
lines changed

examples/llm-api/quickstart_multimodal.py

Lines changed: 8 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -132,6 +132,11 @@ def add_multimodal_args(parser):
132132
type=int,
133133
default=2,
134134
help="Number of conversation turns for automated testing.")
135+
parser.add_argument("--video_pruning_rate",
136+
type=float,
137+
default=None,
138+
help="Pruning rate for video frames (EVS). "
139+
"None disables EVS, values in [0, 1) enable pruning.")
135140
return parser
136141

137142

@@ -181,7 +186,9 @@ def main():
181186
lora_config.max_loras = 2
182187
lora_config.max_cpu_loras = 2
183188

184-
llm, sampling_params = setup_llm(args, lora_config=lora_config)
189+
llm, sampling_params = setup_llm(args,
190+
lora_config=lora_config,
191+
video_pruning_rate=args.video_pruning_rate)
185192

186193
image_format = args.image_format
187194
if args.model_type is not None:

examples/models/core/nemotron/README_nano-v2-vl.md

Lines changed: 6 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -38,7 +38,7 @@ python3 examples/llm-api/quickstart_multimodal.py --model_dir nvidia/NVIDIA-Nemo
3838
* Video modality input with Efficient video sampling (EVS):
3939

4040
```bash
41-
TLLM_VIDEO_PRUNING_RATIO=0.9 python3 examples/llm-api/quickstart_multimodal.py --model_dir nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16 --disable_kv_cache_reuse --max_batch_size 128 --trust_remote_code --modality video --max_num_tokens 131072
41+
python3 examples/llm-api/quickstart_multimodal.py --model_dir nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16 --disable_kv_cache_reuse --max_batch_size 128 --trust_remote_code --modality video --max_num_tokens 131072 --video_pruning_rate 0.9
4242
```
4343

4444
## Online serving example CMDs
@@ -55,7 +55,7 @@ EOF
5555

5656
# CMD to launch serve without EVS.
5757
trtllm-serve \
58-
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\
58+
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16 \
5959
--host 0.0.0.0 \
6060
--port 8000 \
6161
--backend pytorch \
@@ -65,16 +65,17 @@ nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\
6565
--media_io_kwargs "{\"video\": {\"fps\": 2, \"num_frames\": 128} }" \
6666
--config config.yml
6767

68-
# CMD to launch serve with EVS (video_pruning_ratio=0.9).
69-
TLLM_VIDEO_PRUNING_RATIO=0.9 trtllm-serve \
70-
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16\
68+
# CMD to launch serve with EVS (video_pruning_rate=0.9).
69+
trtllm-serve \
70+
nvidia/NVIDIA-Nemotron-Nano-12B-v2-VL-BF16 \
7171
--host 0.0.0.0 \
7272
--port 8000 \
7373
--backend pytorch \
7474
--max_batch_size 16 \
7575
--max_num_tokens 131072 \
7676
--trust_remote_code \
7777
--media_io_kwargs "{\"video\": {\"fps\": 2, \"num_frames\": 128} }" \
78+
--video_pruning_rate 0.9 \
7879
--config config.yml
7980
```
8081

tensorrt_llm/_torch/model_config.py

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -131,6 +131,9 @@ class ModelConfig(Generic[TConfig]):
131131
# If true, ONLY the vision encoder part of the full model is loaded/executed.
132132
mm_encoder_only: bool = False
133133

134+
# Video pruning rate for VLM models (None = EVS disabled)
135+
video_pruning_rate: Optional[float] = None
136+
134137
def __setattr__(self, key, value):
135138
"""
136139
Prevent modification of frozen instance attributes.

tensorrt_llm/_torch/models/modeling_nemotron_nano.py

Lines changed: 38 additions & 24 deletions
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,6 @@
4242
from .modeling_radio import RADIOVisionModel, calc_seq_lens
4343
from .modeling_utils import register_auto_model
4444

45-
VIDEO_PRUNING_RATIO = float(os.getenv("TLLM_VIDEO_PRUNING_RATIO", "0"))
4645
# Set max_num_tiles to 1 for video modality, to match the training behavior.
4746
VIDEO_MAX_NUM_TILES = 1
4847
IMAGE_PLACEHOLDER = "<image>"
@@ -257,7 +256,10 @@ def __init__(self, model_config: ModelConfig[transformers.PretrainedConfig]):
257256
raise NotImplementedError(
258257
f"Unsupported {config.ps_version=}. Supported versions: {supported_versions}."
259258
)
260-
self.video_pruning_ratio = VIDEO_PRUNING_RATIO
259+
# Use config value if explicitly set (EVS enabled), otherwise default to 0.0 (EVS disabled)
260+
self.video_pruning_rate = (
261+
model_config.video_pruning_rate if model_config.video_pruning_rate is not None else 0.0
262+
)
261263

262264
# Construct the vision projection.
263265
self.vit_hidden_size = config.vit_hidden_size
@@ -414,7 +416,7 @@ def apply_evs_per_video(
414416
video_embeds=reshaped_partial_mm_embed,
415417
video_size=(t, p * ih, iw),
416418
spatial_merge_size=self.spatial_merge_size,
417-
pruning_ratio=self.video_pruning_ratio,
419+
pruning_ratio=self.video_pruning_rate,
418420
flatten_output=False,
419421
).flatten(start_dim=1)
420422
# -> [num_frames, num_patches_per_frame*h*w]
@@ -437,7 +439,7 @@ def apply_evs(
437439
) -> Tuple[List[torch.Tensor], Optional[List[List[int] | None]]]:
438440
"""Apply EVS to the multimodal embedding."""
439441
# Skip EVS if pruning ratio is 0.
440-
if self.video_pruning_ratio <= 0:
442+
if self.video_pruning_rate <= 0:
441443
return mm_embedding, None
442444

443445
modality_types = [
@@ -448,7 +450,7 @@ def apply_evs(
448450
return mm_embedding, None
449451

450452
video_size_list = [
451-
multimodal_data[modality_type]["video_size"]
453+
multimodal_data[modality_type].get("video_size") if modality_type == "video" else None
452454
for modality_type, multimodal_data in zip(modality_types, multimodal_data_lst)
453455
]
454456
mm_embedding_evs = []
@@ -487,17 +489,23 @@ def forward(
487489
pixel_values_flat = data["pixel_values"]
488490
image_sizes = data["image_sizes"]
489491
embeds = self.extract_feature_dynamic(pixel_values_flat, image_sizes)
490-
mm_embedding.append(embeds.reshape(-1, self.llm_hidden_size))
492+
# Keep 3D shape for apply_evs, will reshape to 2D after EVS
493+
mm_embedding.append(embeds)
491494
# This applies to images without dynamic resolution, or videos.
492495
else:
493496
# Fallback to fixed-tile extraction for this modality.
494497
pixel_values = data["pixel_values"]
495498
embeds = self.extract_feature(pixel_values)
496-
mm_embedding.append(embeds.reshape(-1, self.llm_hidden_size))
499+
# Keep 3D shape [num_patches, h*w, hidden] for apply_evs
500+
mm_embedding.append(embeds)
497501

498-
return mm_embedding, [None] * len(modality_types)
502+
# Apply EVS if video_pruning_rate > 0
503+
mm_embedding, num_tokens_in_videos = self.apply_evs(mm_embedding, multimodal_data_lst)
504+
# Reshape to 2D after EVS: [num_patches*h*w, hidden_size]
505+
mm_embedding = [m.reshape(-1, self.llm_hidden_size) for m in mm_embedding]
506+
return mm_embedding, num_tokens_in_videos
499507

500-
# Existing fixed-tile path.
508+
# Existing fixed-tile path (unreachable, kept for reference).
501509
pixel_values = [
502510
multimodal_data[modality_type]["pixel_values"]
503511
for modality_type, multimodal_data in zip(modality_types, multimodal_data_lst)
@@ -530,6 +538,9 @@ def __init__(
530538
trust_remote_code: bool = True,
531539
**kwargs,
532540
):
541+
# Extract video_pruning_rate before passing kwargs to parent
542+
video_pruning_rate = kwargs.pop("video_pruning_rate", None) or 0.0
543+
533544
super().__init__(
534545
model_path=model_path,
535546
config=config,
@@ -563,7 +574,7 @@ def __init__(
563574
self.num_image_token = int(
564575
(self.image_size // self.patch_size) ** 2 * (self.downsample_ratio**2)
565576
)
566-
self.video_pruning_ratio = VIDEO_PRUNING_RATIO
577+
self.video_pruning_rate = video_pruning_rate
567578
self.img_context_token = self.config.img_context_token
568579
self.video_context_token = self.config.video_context_token
569580
self.img_start_token = self.config.img_start_token
@@ -747,15 +758,15 @@ def get_num_tokens_per_video(
747758
self,
748759
*,
749760
video: List[Image.Image],
750-
video_pruning_ratio: Optional[float] = None,
761+
video_pruning_rate: Optional[float] = None,
751762
**kwargs,
752763
):
753764
# Fall back to self.video_pruning_rate if not explicitly provided
754-
if video_pruning_ratio is None:
755-
video_pruning_ratio = self.video_pruning_ratio
765+
if video_pruning_rate is None:
766+
video_pruning_rate = self.video_pruning_rate
756767

757768
num_frames = len(video)
758-
if video_pruning_ratio > 0:
769+
if video_pruning_rate > 0:
759770
num_tokens_per_frame = self.get_num_tokens_per_image(
760771
image=video[0],
761772
max_num_tiles=VIDEO_MAX_NUM_TILES,
@@ -767,7 +778,7 @@ def get_num_tokens_per_video(
767778
num_total_tokens = compute_retained_tokens_count(
768779
video_size=video_size,
769780
spatial_merge_size=self.spatial_merge_size,
770-
pruning_ratio=video_pruning_ratio,
781+
pruning_ratio=video_pruning_rate,
771782
)
772783
# Add special tokens for each frame.
773784
num_total_tokens += num_frames * len(self.get_mm_special_token_ids())
@@ -776,7 +787,7 @@ def get_num_tokens_per_video(
776787
num_total_tokens = sum(
777788
self.get_num_tokens_per_image(
778789
image=frame,
779-
video_pruning_ratio=None,
790+
video_pruning_rate=None,
780791
max_num_tiles=VIDEO_MAX_NUM_TILES,
781792
**kwargs,
782793
)
@@ -961,7 +972,7 @@ def _process_video_prompts(
961972
processed_query.extend(frame_prompts)
962973
# Video_context_token as placeholder,
963974
# it will be replaced with the real image_tokens_per_frames during model forward.
964-
if self.video_pruning_ratio > 0:
975+
if self.video_pruning_rate > 0:
965976
evs_query.append(split_text_prompt[video_index])
966977
evs_query.append("This is a video:\n")
967978
for frame_sep in frame_separators:
@@ -986,7 +997,7 @@ def _process_video_prompts(
986997
]
987998
input_ids = torch.cat(input_ids_lst, dim=1)
988999

989-
if self.video_pruning_ratio > 0:
1000+
if self.video_pruning_rate > 0:
9901001
evs_query.append(split_text_prompt[-1])
9911002
evs_ids = [
9921003
self.tokenizer.encode(
@@ -1009,11 +1020,11 @@ def _compute_token_numbers_per_video(self, video_size_lst: List[Tuple]) -> List[
10091020
img_height = video_size[2]
10101021
img_width = video_size[3]
10111022

1012-
if self.video_pruning_ratio > 0:
1023+
if self.video_pruning_rate > 0:
10131024
desired_num_tokens = compute_retained_tokens_count(
10141025
video_size=(num_frames, num_patches_per_frame * img_height, img_width),
10151026
spatial_merge_size=self.spatial_merge_size,
1016-
pruning_ratio=self.video_pruning_ratio,
1027+
pruning_ratio=self.video_pruning_rate,
10171028
)
10181029
# It is dummy tokens and will be adjusted in VisionEncoder after applied EVS.
10191030
# Need to know the length of the full input ids ahead,
@@ -1069,7 +1080,7 @@ def __call__(
10691080
# Store input_ids for image modality here when EVS is enabled,
10701081
# which will be used in merge_evs_mm_embeds later.
10711082
modality_data["evs_ids"] = (
1072-
input_ids[0].to(torch.int32) if self.video_pruning_ratio > 0 else None
1083+
input_ids[0].to(torch.int32) if self.video_pruning_rate > 0 else None
10731084
)
10741085
elif videos is not None:
10751086
modality_type = "video"
@@ -1249,7 +1260,10 @@ def __init__(self, model_config: ModelConfig):
12491260
self.sound_context_token_id = getattr(config, "sound_context_token_id", None)
12501261
self.post_config()
12511262
self.is_loaded = True
1252-
self.video_pruning_ratio = VIDEO_PRUNING_RATIO
1263+
# Use config value if explicitly set (EVS enabled), otherwise default to 0.0 (EVS disabled)
1264+
self.video_pruning_rate = (
1265+
model_config.video_pruning_rate if model_config.video_pruning_rate is not None else 0.0
1266+
)
12531267

12541268
def load_weights(self, weights):
12551269
# Load vision encoder weights.
@@ -1378,7 +1392,7 @@ def _encode_multimodal(
13781392
if modality_type in ("image", "video"):
13791393
embs, num_tokens = self.vision_encoder([param])
13801394
mm_embeddings.append(embs[0])
1381-
mm_num_tokens.append(num_tokens[0])
1395+
mm_num_tokens.append(num_tokens[0] if num_tokens is not None else None)
13821396
elif modality_type == "audio":
13831397
mm_embeddings.append(self._encode_audio(param))
13841398
mm_num_tokens.append(None)
@@ -1421,7 +1435,7 @@ def forward(
14211435
"the TLLM_MULTIMODAL_DISAGGREGATED environment variable, or set it to '0'."
14221436
)
14231437
# Adjust input_ids in videos if EVS is applied.
1424-
if self.video_pruning_ratio > 0:
1438+
if self.video_pruning_rate > 0:
14251439
input_ids = self.merge_evs_mm_embeds(
14261440
num_tokens_in_videos,
14271441
multimodal_params=multimodal_params[:num_context_requests],

tensorrt_llm/_torch/modules/mamba/fuse_elementwise_ops.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -42,7 +42,9 @@ def _extract_transpose_prefill_kernel(
4242
conv_mask = conv_offsets < conv_dim
4343
mask = seq_mask[:, None] & conv_mask[None, :]
4444

45-
src_offsets = seq_offsets[:, None] * d_in_proj + (d_inner + conv_offsets[None, :])
45+
# Cast to int64 to avoid overflow: seq_offsets * d_in_proj can exceed INT32_MAX
46+
# (e.g., 131071 * 22656 = 2,969,544,576 > 2,147,483,647)
47+
src_offsets = seq_offsets[:, None].to(tl.int64) * d_in_proj + d_inner + conv_offsets[None, :]
4648
data = tl.load(src_ptr + src_offsets, mask=mask, other=0.0)
4749

4850
dst_offsets = conv_offsets[:, None] * num_prefill_tokens + seq_offsets[None, :]

tensorrt_llm/_torch/modules/mamba/layernorm_gated.py

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -52,7 +52,9 @@ def _layer_norm_fwd_1pass_kernel(
5252
X += row * stride_x_row + group * N
5353
Y += row * stride_y_row + group * N
5454
if HAS_Z:
55-
Z += row * stride_z_row + group * N
55+
# Cast to int64 to avoid overflow: row * stride_z_row can exceed INT32_MAX
56+
# when Z is a non-contiguous slice (e.g., 131071 * 22656 = 2,969,544,576)
57+
Z += tl.cast(row, tl.int64) * stride_z_row + group * N
5658
if not IS_RMS_NORM:
5759
Mean += group * M
5860
Rstd += group * M

tensorrt_llm/_torch/pyexecutor/model_engine.py

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -203,10 +203,15 @@ def __init__(
203203
self.attn_runtime_features = attn_runtime_features or AttentionRuntimeFeatures(
204204
)
205205

206+
input_processor_kwargs = {}
207+
if llm_args.video_pruning_rate is not None:
208+
input_processor_kwargs[
209+
'video_pruning_rate'] = llm_args.video_pruning_rate
206210
self.input_processor = create_input_processor(
207211
model_path,
208212
tokenizer=None,
209-
checkpoint_format=llm_args.checkpoint_format)
213+
checkpoint_format=llm_args.checkpoint_format,
214+
**input_processor_kwargs)
210215
self.input_processor_with_hash = create_input_processor_with_hash(
211216
self.input_processor)
212217
if model is None:

tensorrt_llm/_torch/pyexecutor/model_loader.py

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -485,6 +485,7 @@ def _load_and_validate_config(
485485
use_cute_dsl_blockscaling_mm,
486486
use_cute_dsl_blockscaling_bmm=self.llm_args.
487487
use_cute_dsl_blockscaling_bmm,
488+
video_pruning_rate=self.llm_args.video_pruning_rate,
488489
)
489490

490491
# Only pass model_kwargs if it's explicitly set (not None)

tensorrt_llm/commands/serve.py

Lines changed: 16 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -154,6 +154,7 @@ def get_llm_args(
154154
fail_fast_on_attention_window_too_large: bool = True,
155155
otlp_traces_endpoint: Optional[str] = None,
156156
enable_chunked_prefill: bool = False,
157+
video_pruning_rate: Optional[float] = None,
157158
**llm_args_extra_dict: Any):
158159

159160
if gpus_per_node is None:
@@ -236,6 +237,8 @@ def get_llm_args(
236237
otlp_traces_endpoint,
237238
"fail_fast_on_attention_window_too_large":
238239
fail_fast_on_attention_window_too_large,
240+
"video_pruning_rate":
241+
video_pruning_rate,
239242
}
240243

241244
llm_args = {
@@ -718,6 +721,14 @@ def convert(self, value: Any, param: Optional["click.Parameter"],
718721
default=None,
719722
help=help_info_with_stability_tag(
720723
"Keyword arguments for media I/O.", "prototype"))
724+
@click.option("--video_pruning_rate",
725+
type=float,
726+
default=None,
727+
help=help_info_with_stability_tag(
728+
"Pruning rate for video frames in multimodal models. "
729+
"Applied by Efficient Video Sampling (EVS). "
730+
"None disables EVS, values in [0, 1) enable pruning.",
731+
"prototype"))
721732
@click.option("--chat_template",
722733
type=str,
723734
default=None,
@@ -760,8 +771,9 @@ def serve(
760771
fail_fast_on_attention_window_too_large: bool,
761772
otlp_traces_endpoint: Optional[str], enable_chunked_prefill: bool,
762773
disagg_cluster_uri: Optional[str], media_io_kwargs: Optional[str],
763-
custom_module_dirs: list[Path], chat_template: Optional[str],
764-
grpc: bool, served_model_name: Optional[str],
774+
video_pruning_rate: Optional[float], custom_module_dirs: list[Path],
775+
chat_template: Optional[str], grpc: bool,
776+
served_model_name: Optional[str],
765777
extra_visual_gen_options: Optional[str]):
766778
"""Running an OpenAI API compatible server
767779
@@ -815,7 +827,8 @@ def _serve_llm():
815827
fail_fast_on_attention_window_too_large=
816828
fail_fast_on_attention_window_too_large,
817829
otlp_traces_endpoint=otlp_traces_endpoint,
818-
enable_chunked_prefill=enable_chunked_prefill)
830+
enable_chunked_prefill=enable_chunked_prefill,
831+
video_pruning_rate=video_pruning_rate)
819832

820833
llm_args_extra_dict = {}
821834
if extra_llm_api_options is not None:

0 commit comments

Comments
 (0)