Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions constraints.txt
Original file line number Diff line number Diff line change
Expand Up @@ -4,3 +4,7 @@
urllib3>=2.6.3
# WAR against https://github.com/advisories/GHSA-8rrh-rw8j-w5fx
wheel>=0.46.2
# WAR against https://github.com/advisories/GHSA-7gcm-g887-7qv7
protobuf>=6.33.5
# WAR against https://github.com/advisories/GHSA-6mq8-rvhq-8wgg
aiohttp>=3.13.3
14 changes: 14 additions & 0 deletions examples/disaggregated/slurm/benchmark/run_benchmark_nv_sa.sh
Original file line number Diff line number Diff line change
Expand Up @@ -191,6 +191,20 @@ for concurrency in ${concurrency_list}; do
--percentile-metrics "ttft,tpot,itl,e2el" \
$([ "${streaming}" = "false" ] && echo "--non-streaming")

# Print failed request count (consistent with non-nv_sa benchmark format)
python - "${output_dir}/result.json" <<-'PYEOF'
import json
import sys


def main(path: str) -> None:
    """Print the failed-request count parsed from an aiperf result.json.

    Best-effort: any read/parse/schema problem is reported as a warning on
    stderr rather than failing the surrounding benchmark run.

    :param path: Path to the result.json file produced by the benchmark.
    """
    try:
        with open(path, encoding="utf-8") as f:
            d = json.load(f)
        # failed = submitted prompts minus successfully completed requests
        failed = d["num_prompts"] - d["completed"]
        print(f"Total failed requests: {failed}")
    except (OSError, json.JSONDecodeError, KeyError) as exc:
        print(f"WARNING: failed to read request counts from {path}: {exc}", file=sys.stderr)


if __name__ == "__main__":
    # Guard against a missing argument: an empty path falls through to the
    # OSError branch above instead of raising an uncaught IndexError.
    main(sys.argv[1] if len(sys.argv) > 1 else "")
PYEOF

echo "Benchmark with concurrency ${concurrency} done"
do_process_all_logs ${log_path}/ ${log_path}/concurrency_${concurrency} "log"
done
Expand Down
8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
Original file line number Diff line number Diff line change
Expand Up @@ -13,7 +13,7 @@
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202602011118-10901
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202603051044-11898
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202603051044-11898
2 changes: 1 addition & 1 deletion requirements-dev.txt
Original file line number Diff line number Diff line change
Expand Up @@ -36,7 +36,7 @@ opentelemetry-api>=1.26.0
opentelemetry-exporter-otlp>=1.26.0
opentelemetry-semantic-conventions-ai>=0.4.1
fuzzywuzzy==0.18.0
aiperf==0.3.0
aiperf==0.4.0
nanobind>=2.9.0
nixl==0.8.0
hf-transfer==0.1.9
Expand Down
2 changes: 1 addition & 1 deletion requirements.txt
Original file line number Diff line number Diff line change
Expand Up @@ -30,7 +30,7 @@ nvidia-modelopt[torch]~=0.37.0
# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7
nvidia-nccl-cu13>=2.27.7,<=2.28.9
nvidia-cuda-nvrtc
transformers==4.57.1
transformers==4.57.3
prometheus_client
prometheus_fastapi_instrumentator
pydantic>=2.9.1
Expand Down
26 changes: 14 additions & 12 deletions tensorrt_llm/_torch/pyexecutor/sampler.py
Original file line number Diff line number Diff line change
Expand Up @@ -1792,7 +1792,8 @@ def _write_finish_reasons(
if not single_token_stop_words_only
else self._are_stop_words_single_token
)
batched_finish_reasons[:, stop_word_indices] = torch.where(
batched_finish_reasons_stop_words = batched_finish_reasons[:, stop_word_indices]
_ = batched_finish_reasons_stop_words.masked_fill_(
stop_words_func(
stop_seq_slots,
stop_tokens,
Expand All @@ -1801,18 +1802,17 @@ def _write_finish_reasons(
else num_accepted_tokens,
),
FinishReason.STOP_WORDS.value,
batched_finish_reasons[:, stop_word_indices],
)
batched_finish_reasons[:, stop_word_indices] = batched_finish_reasons_stop_words

batched_finish_reasons = torch.where(
_ = batched_finish_reasons.masked_fill_(
self._are_max_length(seq_lens, store.max_lengths_cuda[seq_slots]),
FinishReason.LENGTH.value,
batched_finish_reasons,
)
batched_finish_reasons = torch.where(

_ = batched_finish_reasons.masked_fill_(
self._are_end_id(store.end_ids_cuda[seq_slots], tokens),
FinishReason.END_ID.value,
batched_finish_reasons,
)

finish_reasons[:, seq_slots] = batched_finish_reasons
Expand Down Expand Up @@ -1916,7 +1916,7 @@ def _are_stop_words(
# Fill in the new tokens at the end of the past tokens buffer
full_tokens[-self._max_tokens :] = tokens
# short words are padded with _PAD_STOP_WORD_TOKEN_ID, so we need to mask them
mask = stop_words != self._PAD_STOP_WORD_TOKEN_ID
mask = stop_words == self._PAD_STOP_WORD_TOKEN_ID
matches = torch.empty(
(
self._max_tokens,
Expand All @@ -1941,15 +1941,15 @@ def _are_stop_words(
stop_words_for_match = stop_words.unsqueeze(0)
_ = torch.eq(full_tokens_for_match, stop_words_for_match, out=matches)
# Mask the padding tokens
matches_after_mask = torch.where(
mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), matches, True
_ = matches.masked_fill_(
mask.unsqueeze(0).expand(self._max_tokens, -1, -1, -1, -1), True
)
# Update the past tokens storage for the next iteration
store.past_tokens_cuda[:, seq_slots] = full_tokens
# Return the result
word_len_dim = 2
num_words_dim = 1
return torch.any(matches_after_mask.all(dim=word_len_dim), dim=num_words_dim)
return torch.any(matches.all(dim=word_len_dim), dim=num_words_dim)

@nvtx_range("_are_stop_words_single_token")
def _are_stop_words_single_token(
Expand Down Expand Up @@ -3721,8 +3721,10 @@ def _sample_batched_by_strategy(
group_logits_indices_for_processed_logprobs_cuda
]
current_softmax_cuda = group_softmax_cuda[logit_indices_for_processed_logprobs_cuda]
processed_logits_cuda = torch.where(
current_softmax_cuda > 0, current_logits_cuda, float("-inf")

# processed_logits_cuda is an alias to current_logits_cuda after this operation
processed_logits_cuda = current_logits_cuda.masked_fill_(
current_softmax_cuda == 0, float("-inf")
)
temperature_for_processed_logprobs = group_temperature_cuda
if isinstance(temperature_for_processed_logprobs, torch.Tensor):
Expand Down
7 changes: 7 additions & 0 deletions tensorrt_llm/serve/openai_server.py
Original file line number Diff line number Diff line change
Expand Up @@ -81,6 +81,7 @@
from tensorrt_llm.serve.responses_utils import get_steady_clock_now_in_seconds
from tensorrt_llm.serve.responses_utils import \
request_preprocess as responses_api_request_preprocess
from tensorrt_llm.serve.tool_parser.tool_parser_factory import ToolParserFactory
from tensorrt_llm.serve.visual_gen_utils import (VIDEO_STORE,
parse_visual_gen_params)
from tensorrt_llm.version import __version__ as VERSION
Expand Down Expand Up @@ -809,6 +810,12 @@ async def chat_stream_generator(
gather_generation_logits,
reasoning_parser=self.generator.args.reasoning_parser,
backend=self.generator.args.backend)
if self.tool_parser and request.tools:
tool_parser_cls = ToolParserFactory.parsers.get(
self.tool_parser.lower())
if tool_parser_cls and getattr(
tool_parser_cls, 'needs_raw_special_tokens', False):
sampling_params.skip_special_tokens = False
postproc_args = ChatPostprocArgs.from_request(request)
disaggregated_params = to_llm_disaggregated_params(
request.disaggregated_params)
Expand Down
2 changes: 2 additions & 0 deletions tensorrt_llm/serve/tool_parser/base_tool_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -16,6 +16,8 @@
class BaseToolParser(ABC):
"""Base class providing two sets of interfaces: one-time and streaming incremental."""

needs_raw_special_tokens: bool = False

def __init__(self):
# Streaming state management
# Buffer for accumulating incomplete patterns that arrive across multiple streaming chunks
Expand Down
8 changes: 7 additions & 1 deletion tensorrt_llm/serve/tool_parser/deepseekv32_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -61,6 +61,10 @@ class DeepSeekV32Parser(BaseToolParser):
Reference: DeepSeek V3.2 format specification
"""

needs_raw_special_tokens = True

_eos_token = "<|end▁of▁sentence|>" # nosec B105

def __init__(self):
super().__init__()
self.bot_token = "<|DSML|function_calls>" # nosec B105
Expand Down Expand Up @@ -118,6 +122,8 @@ def detect_and_parse(self, text: str, tools: List[Tool]) -> StreamingParseResult
:param tools: List of available tools.
:return: ParseResult indicating success or failure, consumed text, leftover text, and parsed calls.
"""
if self._eos_token in text:
text = text.replace(self._eos_token, "")
idx = text.find(self.bot_token)
normal_text = text[:idx].strip() if idx != -1 else text
if self.bot_token not in text:
Expand Down Expand Up @@ -177,7 +183,7 @@ def parse_streaming_increment(self, new_text: str, tools: List[Tool]) -> Streami

if not has_tool_call and not potentially_dsml and not ends_with_prefix:
self._buffer = ""
for e_token in [self.eot_token, self.invoke_end_token]:
for e_token in [self.eot_token, self.invoke_end_token, self._eos_token]:
if e_token in new_text:
new_text = new_text.replace(e_token, "")
return StreamingParseResult(normal_text=new_text)
Expand Down
Loading
Loading