Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
urllib3>=2.6.3
# WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx
wheel>=0.46.2
# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7
protobuf>=6.33.5
# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg
aiohttp>=3.13.3
14 changes: 14 additions & 0 deletions examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ for concurrency in ${concurrency_list}; do
--percentile-metrics "ttft,tpot,itl,e2el" \
$([ "${streaming}" = "false" ] && echo "--non-streaming")

# Print failed request count (consistent with non-nv_sa benchmark format)
python - "${output_dir}/result.json" <<-'PYEOF'
import json
import sys


def main(path: str) -> None:
    """Print the failed-request count parsed from an aiperf result.json.

    Best-effort: any read/parse/schema problem is reported as a warning on
    stderr rather than failing the surrounding benchmark run.

    :param path: Path to the result.json file produced by the benchmark.
    """
    try:
        with open(path, encoding="utf-8") as f:
            d = json.load(f)
        # failed = submitted prompts minus successfully completed requests
        failed = d["num_prompts"] - d["completed"]
        print(f"Total failed requests: {failed}")
    except (OSError, json.JSONDecodeError, KeyError) as exc:
        print(f"WARNING: failed to read request counts from {path}: {exc}", file=sys.stderr)


if __name__ == "__main__":
    # Guard against a missing argument: an empty path falls through to the
    # OSError branch above instead of raising an uncaught IndexError.
    main(sys.argv[1] if len(sys.argv) > 1 else "")
PYEOF

echo "Benchmark with concurrency ${concurrency} done"
do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "log"
done
Expand Down
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202603051044-11898
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ opentelemetry-api>=1.26.0
opentelemetry-exporter-otlp>=1.26.0
opentelemetry-semantic-conventions-ai>=0.4.1
fuzzywuzzy==0.18.0
aiperf==0.3.0
aiperf==0.4.0
nanobind>=2.9.0
nixl==0.8.0
hf-transfer==0.1.9
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0
# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7
nvidia-nccl-cu13>=2.27.7,<=2.28.9
nvidia-cuda-nvrtc
transformers==4.57.1
transformers==4.57.3
prometheus_client
prometheus_fastapi_instrumentator
pydantic>=2.9.1
Expand Down
26 changes: 14 additions & 12 deletions tensorrt_llm/_torch/pyexecutor/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1792,7 +1792,8 @@ def _write_finish_reasons(
if not single_token_stop_words_only
else self._are_stop_words_single_token
)
batched_finish_reasons[:, stop_word_indices] = torch.where(
batched_finish_reasons_stop_words = batched_finish_reasons[:, stop_word_indices]
_ = batched_finish_reasons_stop_words.masked_fill_(
stop_words_func(
stop_seq_slots,
stop_tokens,
Expand All @@ -1801,18 +1802,17 @@ def _write_finish_reasons(
else num_accepted_tokens,
),
FinishReason.STOP_WORDS.value,
batched_finish_reasons[:, stop_word_indices],
)
batched_finish_reasons[:, stop_word_indices] = batched_finish_reasons_stop_words

batched_finish_reasons = torch.where(
_ = batched_finish_reasons.masked_fill_(
self._are_max_length(seq_lens, store.max_lengths_cuda[seq_slots]),
FinishReason.LENGTH.value,
batched_finish_reasons,
)
batched_finish_reasons = torch.where(

_ = batched_finish_reasons.masked_fill_(
self._are_end_id(store.end_ids_cuda[seq_slots], tokens),
FinishReason.END_ID.value,
batched_finish_reasons,
)

finish_reasons[:, seq_slots] = batched_finish_reasons
Expand Down Expand Up @@ -1916,7 +1916,7 @@ def _are_stop_words(
# Fill in the new tokens at the end of the past tokens buffer
full_tokens[-self._max_tokens :] = tokens
# short words are padded with _PAD_STOP_WORD_TOKEN_ID, so we need to mask them
mask = stop_words != self._PAD_STOP_WORD_TOKEN_ID
mask = stop_words == self._PAD_STOP_WORD_TOKEN_ID
matches = torch.empty(
(
self._max_tokens,
Expand All @@ -1941,15 +1941,15 @@ def _are_stop_words(
stop_words_for_match = stop_words.unsqueeze(0)
_ = torch.eq(full_tokens_for_match, stop_words_for_match, out=matches)
# Mask the padding tokens
matches_after_mask = torch.where(
mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), matches, True
_ = matches.masked_fill_(
mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), True
)
# Update the past tokens storage for the next iteration
store.past_tokens_cuda[:, seq_slots] = full_tokens
# Return the result
word_len_dim = 2
num_words_dim = 1
return torch.any(matches_after_mask.all(dim=word_len_dim), dim=num_words_dim)
return torch.any(matches.all(dim=word_len_dim), dim=num_words_dim)

@nvtx_range("_are_stop_words_single_token")
def _are_stop_words_single_token(
Expand Down Expand Up @@ -3721,8 +3721,10 @@ def _sample_batched_by_strategy(
group_logits_indices_for_processed_logprobs_cuda
]
current_softmax_cuda = group_softmax_cuda[logit_indices_for_processed_logprobs_cuda]
processed_logits_cuda = torch.where(
current_softmax_cuda > 0, current_logits_cuda, float("-inf")

# processed_logits_cuda is an alias to current_logits_cuda after this operation
processed_logits_cuda = current_logits_cuda.masked_fill_(
current_softmax_cuda == 0, float("-inf")
)
temperature_for_processed_logprobs = group_temperature_cuda
if isinstance(temperature_for_processed_logprobs, torch.Tensor):
Expand Down
7 changes: 7 additions & 0 deletions tensorrt_llm/serve/openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
from tensorrt_llm.serve.responses_utils import \
request_preprocess as responses_api_request_preprocess
from tensorrt_llm.serve.tool_parser.tool_parser_factory import ToolParserFactory
from tensorrt_llm.serve.visual_gen_utils import (VIDEO_STORE,
parse_visual_gen_params)
from tensorrt_llm.version import __version__ as VERSION
Expand Down Expand Up @@ -809,6 +810,12 @@ async def chat_stream_generator(
gather_generation_logits,
reasoning_parser=self.generator.args.reasoning_parser,
backend=self.generator.args.backend)
if self.tool_parser and request.tools:
tool_parser_cls = ToolParserFactory.parsers.get(
self.tool_parser.lower())
if tool_parser_cls and getattr(
tool_parser_cls, 'needs_raw_special_tokens', False):
sampling_params.skip_special_tokens = False
postproc_args = ChatPostprocArgs.from_request(request)
disaggregated_params = to_llm_disaggregated_params(
request.disaggregated_params)
Expand Down
2 changes: 2 additions & 0 deletions tensorrt_llm/serve/tool_parser/base_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
class BaseToolParser(ABC):
"""Base class providing two sets of interfaces: one-time and streaming incremental."""

needs_raw_special_tokens: bool = False

def __init__(self):
# Streaming state management
# Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
Expand Down
8 changes: 7 additions & 1 deletion tensorrt_llm/serve/tool_parser/deepseekv32_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ class DeepSeekV32Parser(BaseToolParser):
Reference: DeepSeek V3.2 format specification
"""

needs_raw_special_tokens = True

_eos_token = "<|end▁of▁sentence|>" # nosec B105

def __init__(self):
super().__init__()
self.bot_token = "<|DSML|function_calls>" # nosec B105
Expand Down Expand Up @@ -118,6 +122,8 @@ def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult
:param tools: List of available tools.
:return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
"""
if self._eos_token in text:
text = text.replace(self._eos_token, "")
idx = text.find(self.bot_token)
normal_text = text[:idx].strip() if idx != -1 else text
if self.bot_token not in text:
Expand Down Expand Up @@ -177,7 +183,7 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami

if not has_tool_call and not potentially_dsml and not ends_with_prefix:
self._buffer = ""
for e_token in [self.eot_token, self.invoke_end_token]:
for e_token in [self.eot_token, self.invoke_end_token, self._eos_token]:
if e_token in new_text:
new_text = new_text.replace(e_token, "")
return StreamingParseResult(normal_text=new_text)
Expand Down
Loading
Loading