1 change: 1 addition & 0 deletions cpp/kernels/fmha_v2/pytest.ini
@@ -6,6 +6,7 @@ markers =
fmhca
debug
bench
needs_l40s
# bin: unit tests
# test: python script for invoking fmha.exe
testpaths = bin test
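
A minimal sketch of how a test opts into the new marker — the test name and body below are illustrative, not part of this change:

    import pytest

    @pytest.mark.needs_l40s
    def test_fmha_on_l40s():
        ...  # illustrative body; real tests live under bin/ and test/

Runs on other hardware can then deselect these tests with `pytest -m "not needs_l40s"`.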
@@ -493,7 +493,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
#if !DISABLE_SYNC_FOR_PROFILING
uint32_t expected_value = *ptrs.flag_val;

#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
// .acquire and .release qualifiers for fence instruction require sm_90 or higher.
asm volatile("fence.release.sys;");
#else
asm volatile("fence.acq_rel.sys;");
#endif
#pragma unroll 1 // No unroll as one iter is typically enough
for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize)
{
@@ -525,7 +530,6 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [
flag_set = flag_value == expected_value;
} while (!flag_set);
}
// asm volatile("fence.acquire.sys;");
#endif
}
}
@@ -1018,7 +1022,6 @@ __global__ void moeA2ACombineKernel(

if (blockIdx.x == 0)
{
// asm volatile("fence.release.sys;");
#pragma unroll 1 // No unroll
for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize)
{
@@ -1050,7 +1053,12 @@
flag_set = flag_value == expected_value;
} while (!flag_set);
}
#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900
// .acquire and .release qualifiers for fence instruction require sm_90 or higher.
asm volatile("fence.acquire.sys;");
#else
asm volatile("fence.acq_rel.sys;");
#endif
}
__syncthreads();
#endif
@@ -78,8 +78,10 @@ enum class RoutingMethodType : int64_t
Llama4 = 3,
// RenormalizeNaive: Softmax -> TopK -> Renormalize
RenormalizeNaive = 4,
// MiniMaxM2: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize(without bias)
MiniMax2 = 5,
// Unspecified
Unspecified = 5,
Unspecified = 6,
};

inline int32_t maybeGetMinTokenCount(int32_t numPaddedTokens, int32_t hiddenSize, int32_t dtypeSizeBits)
@@ -98,6 +100,7 @@ inline std::string serializeMoeRoutingMethodType(RoutingMethodType routingMethod
case RoutingMethodType::DeepSeekV3: return "DeepSeekV3";
case RoutingMethodType::Llama4: return "Llama4";
case RoutingMethodType::RenormalizeNaive: return "RenormalizeNaive";
case RoutingMethodType::MiniMax2: return "MiniMax2";
default: TLLM_CHECK_WITH_INFO(false, "Invalid routing method"); return "";
};
}
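
The comment on `MiniMax2` names the routing pipeline: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize (without bias). A PyTorch sketch of that ordering, assuming the bias only steers expert selection while the returned weights renormalize the unbiased sigmoid scores (an illustration, not the fused kernel):

    import torch

    def minimax_m2_routing(router_logits, expert_bias, top_k):
        scores = torch.sigmoid(router_logits)              # [tokens, experts]
        _, expert_ids = torch.topk(scores + expert_bias, top_k, dim=-1)
        top_scores = torch.gather(scores, -1, expert_ids)  # bias dropped before renormalizing
        weights = top_scores / top_scores.sum(dim=-1, keepdim=True)
        return expert_ids, weights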
5 changes: 3 additions & 2 deletions docker/Dockerfile.multi
@@ -1,8 +1,8 @@
# Multi-stage Dockerfile
ARG BASE_IMAGE=nvcr.io/nvidia/pytorch
ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver
ARG BASE_TAG=25.10-py3
ARG TRITON_BASE_TAG=25.10-py3
ARG BASE_TAG=25.12-py3
ARG TRITON_BASE_TAG=25.12-py3
ARG DEVEL_IMAGE=devel

FROM ${BASE_IMAGE}:${BASE_TAG} AS base
@@ -147,6 +147,7 @@ RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,sour
pip install /tmp/wheel/tensorrt_llm*.whl

COPY README.md ./
COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl ./
COPY docs docs
COPY cpp/include include

7 changes: 3 additions & 4 deletions docker/Makefile
@@ -202,17 +202,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V
jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE)
jenkins-rockylinux8_%: STAGE = tritondevel
jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available
jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
jenkins-rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8

rockylinux8_%: STAGE = tritondevel
rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda
rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8
rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8

# For x86_64 and aarch64
ubuntu22_%: STAGE = tritondevel
ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda
ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04
ubuntu22_%: BASE_TAG = 13.1.0-devel-ubuntu22.04

trtllm_%: STAGE = release
trtllm_%: PUSH_TO_STAGING := 0
2 changes: 1 addition & 1 deletion docker/common/install_cuda_toolkit.sh
@@ -5,7 +5,7 @@ set -ex
# This script is used for reinstalling CUDA on Rocky Linux 8 with the run file.
# CUDA version is usually aligned with the latest NGC CUDA image tag.
# Only use when public CUDA image is not ready.
CUDA_VER="13.0.2_580.95.05"
CUDA_VER="13.1.0_590.44.01"
CUDA_VER_SHORT="${CUDA_VER%_*}"

NVCC_VERSION_OUTPUT=$(nvcc --version)
2 changes: 1 addition & 1 deletion docker/common/install_polygraphy.sh
@@ -5,7 +5,7 @@ set -ex
if [ -n "${GITHUB_MIRROR}" ]; then
export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple"
fi
pip3 install polygraphy==0.49.9
pip3 install polygraphy==0.49.26

# Clean up pip cache and temporary files
pip3 cache purge
4 changes: 2 additions & 2 deletions docker/common/install_pytorch.sh
@@ -4,8 +4,8 @@ set -ex

# Use latest stable version from https://pypi.org/project/torch/#history
# and closest to the version specified in
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
TORCH_VERSION="2.9.0"
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12
TORCH_VERSION="2.9.1"
SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"')

prepare_environment() {
23 changes: 14 additions & 9 deletions docker/common/install_tensorrt.sh
@@ -2,20 +2,20 @@

set -ex

TRT_VER="10.13.3.9"
TRT_VER="10.14.1.48"
# Align with the pre-installed cuDNN / cuBLAS / NCCL versions from
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10
CUDA_VER="13.0" # 13.0.2
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12
CUDA_VER="13.1" # 13.1.0
# Keep the installation for cuDNN if users want to install PyTorch with source codes.
# PyTorch 2.x can compile with cuDNN v9.
CUDNN_VER="9.14.0.64-1"
NCCL_VER="2.27.7-1+cuda13.0"
CUBLAS_VER="13.1.0.3-1"
CUDNN_VER="9.17.0.29-1"
NCCL_VER="2.28.9-1+cuda13.0"
CUBLAS_VER="13.2.0.9-1"
# Align with the pre-installed CUDA / NVCC / NVRTC versions from
# https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html
NVRTC_VER="13.0.88-1"
CUDA_RUNTIME="13.0.96-1"
CUDA_DRIVER_VERSION="580.95.05-1.el8"
NVRTC_VER="13.1.80-1"
CUDA_RUNTIME="13.1.80-1"
CUDA_DRIVER_VERSION="590.44.01-1.el8"

for i in "$@"; do
case $i in
@@ -118,7 +118,12 @@ install_rockylinux_requirements() {
install_tensorrt() {
PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))')
PARSED_PY_VERSION=$(echo "${PY_VERSION//./}")

TRT_CUDA_VERSION=${CUDA_VER}
# No CUDA 13.1 version for TensorRT yet. Use CUDA 13.0 package instead.
if [ "$CUDA_VER" = "13.1" ]; then
TRT_CUDA_VERSION="13.0"
fi
TRT_VER_SHORT=$(echo $TRT_VER | cut -d. -f1-3)

if [ -z "$RELEASE_URL_TRT" ];then
18 changes: 18 additions & 0 deletions docs/source/commands/trtllm-serve/trtllm-serve.rst
@@ -170,6 +170,24 @@ TRT-LLM multimodal supports the following modalities and data types (depending o
`load_base64_image utility <https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/utils/load_base64_image.py>`__
for implementation details.

**Image embeddings**

It is also possible to provide precomputed image embeddings directly to the multimodal model.

* Using "image_embeds" with base64-encoded data:

.. code-block:: json

{"role": "user", "content": [
{"type": "text", "text": "What's in this image?"},
{"type": "image_embeds", "image_embeds": {"data": "{image_embeddings_base64}"}}}
]}

.. note::
The contents of `image_embeddings_base64` can be generated by base64-encoding
the result of serializing a tensor via `torch.save`.
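
A sketch of producing such a payload; the embedding shape is a placeholder that depends on the model's vision encoder:

    import base64
    import io

    import torch

    embeds = torch.randn(1, 576, 4096)  # placeholder shape
    buf = io.BytesIO()
    torch.save(embeds, buf)
    image_embeddings_base64 = base64.b64encode(buf.getvalue()).decode("utf-8")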

**Video**

* Using "video_url":
6 changes: 3 additions & 3 deletions jenkins/Build.groovy
@@ -83,19 +83,19 @@ def BUILD_CONFIGS = [
(WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
(TARNAME) : "TensorRT-LLM-GH200.tar.gz",
(WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
(BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
(BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
],
(CONFIG_LINUX_AARCH64_PYBIND): [
(WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
(TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
(WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
(BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
(BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
],
(CONFIG_LINUX_AARCH64_LLVM) : [
(WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
(TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
(WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
(BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
(BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fix the build OOM issue on SBSA
],
]

22 changes: 14 additions & 8 deletions jenkins/L0_Test.groovy
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312

// DLFW torch image
DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.12-py3"

//Ubuntu base image
UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -316,6 +316,11 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
foundRunningLine = true
return false // Don't include the "Running" line itself
}
// Stop collecting when we hit the warnings/errors summary separator
if (foundRunningLine && line.contains('======================')) {
foundRunningLine = false // Stop collecting
return false
}

def hasDoubleColon = line.contains('::')
def shouldInclude = foundRunningLine && hasDoubleColon
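
The added guard bounds the scan of pytest output: start collecting after the "Running" banner, stop at the "======" summary separator, and keep only lines containing pytest's `::` node-ID delimiter. A Python sketch of the same logic, with the marker strings assumed from the surrounding code:

    def collect_shard_tests(lines):
        collecting, tests = False, []
        for line in lines:
            if not collecting and "Running" in line:
                collecting = True           # skip the banner line itself
            elif collecting and "======================" in line:
                collecting = False          # summary separator: stop collecting
            elif collecting and "::" in line:
                tests.append(line.strip())  # pytest node IDs contain '::'
        return tests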
@@ -3389,7 +3394,7 @@ def launchTestJobs(pipeline, testFilter)
// Python version and OS for sanity check
x86SanityCheckConfigs = [
"PY312-DLFW": [
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
LLM_DOCKER_IMAGE, // Workaround ABI incompatibilities between PyTorch 2.9.1 and 2.10.0a0
"B200_PCIe",
X86_64_TRIPLE,
false,
@@ -3418,15 +3423,16 @@
]

aarch64SanityCheckConfigs = [
/* //Disable PY312-UB2404 temporarily since there is no official PyTorch build for CUDA 13.1.
"PY312-UB2404": [
LLM_DOCKER_IMAGE,
"GH200",
AARCH64_TRIPLE,
false,
"",
UBUNTU_24_04_IMAGE,
true, // Extra PyTorch CUDA 13.0 install
],
DLFW_IMAGE,
false, // Extra PyTorch CUDA 13.0 install
],*/
"PY312-DLFW": [
LLM_DOCKER_IMAGE,
"GH200",
@@ -3524,17 +3530,17 @@
def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa"
trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-1")
}
// Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8)
if (values[6]) {
echo "###### Extra PyTorch CUDA 13.0 install Start ######"
// Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability.
// PyTorch CUDA 13.0 package and torchvision package can be installed as expected.
if (k8s_arch == "amd64") {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
} else {
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple")
}
}

8 changes: 4 additions & 4 deletions jenkins/current_image_tags.properties
@@ -13,7 +13,7 @@
# images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead.
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm

LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512241744-10055
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512241744-10055
LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818
LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202601011103-9818
LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202601011103-9818
13 changes: 7 additions & 6 deletions requirements.txt
@@ -19,13 +19,14 @@ pandas
h5py==3.12.1
StrEnum
sentencepiece>=0.1.99
tensorrt~=10.13.3
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0.
torch>=2.9.0a0,<=2.9.0
tensorrt~=10.14.1
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.10.0a0.
torch>=2.9.1,<=2.10.0a0
torchvision
nvidia-modelopt[torch]~=0.37.0
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7
nvidia-nccl-cu13==2.27.7
# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.28.9
# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7
nvidia-nccl-cu13>=2.27.7,<=2.28.9
nvidia-cuda-nvrtc
transformers==4.57.1
prometheus_client
@@ -65,7 +66,7 @@ ninja
etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a
blake3
soundfile
triton==3.5.0
triton==3.5.1
tiktoken
blobfile
openai-harmony==0.0.4
15 changes: 15 additions & 0 deletions tensorrt_llm/_torch/attention_backend/interface.py
@@ -24,6 +24,14 @@
from ..pyexecutor.resource_manager import KVCacheManager
from ..utils import get_model_extra_attrs

try:
# Transformers v5
from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES
except ImportError:
# Transformers v4
from transformers.configuration_utils import \
ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES


@dataclass
class AttentionRuntimeFeatures:
@@ -448,6 +456,13 @@ class RopeParams:
def from_config(config) -> "RopeParams":
rope_params = RopeParams()

hf_rope_parameters = getattr(config, 'rope_parameters', None)
if hf_rope_parameters is not None:
assert not set(hf_rope_parameters.keys()).issubset(
ALLOWED_ATTENTION_LAYER_TYPES), (
"Per-layer-type RoPE configuration is not supported yet.")
config.update(hf_rope_parameters)

# get rotary parameters.
hidden_size = config.hidden_size
num_attention_heads = config.num_attention_heads
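
The new guard distinguishes a flat `rope_parameters` dict from the Transformers v5 per-layer-type form, whose keys come from `ALLOWED_ATTENTION_LAYER_TYPES`. A hedged illustration of the two shapes (key and field names assumed):

    # Flat form: accepted, merged into the config via config.update(...)
    rope_parameters = {"rope_type": "yarn", "factor": 4.0}

    # Per-layer-type form (keys are attention layer types): rejected by the assert
    rope_parameters = {
        "full_attention": {"rope_type": "default"},
        "sliding_attention": {"rope_type": "default"},
    }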
6 changes: 6 additions & 0 deletions tensorrt_llm/_torch/auto_deploy/llm_args.py
@@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings):

_quant_config: Optional[QuantConfig] = PrivateAttr(default=None)

max_stats_len: int = Field(
default=1000,
description="The max number of performance statistic entries.",
status="prototype",
)

@property
def quant_config(self) -> QuantConfig:
if self._quant_config is None:
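
For illustration, the new prototype field can be set like any other AutoDeploy `LlmArgs` knob; the model path here is a placeholder:

    from tensorrt_llm._torch.auto_deploy.llm_args import LlmArgs

    args = LlmArgs(model="/path/to/model", max_stats_len=2000)  # default is 1000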
2 changes: 2 additions & 0 deletions tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py
@@ -490,10 +490,12 @@ def __init__(
self.max_beam_width = ad_config.max_beam_width
self.spec_config = ad_config.speculative_config
self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler
self.llm_args.max_stats_len = ad_config.max_stats_len
else:
self.max_beam_width = 1
self.spec_config = None
self._disable_overlap_scheduler = False
self.llm_args.max_stats_len = 1000

# check for max total draft tokens
if self.spec_config is not None: