diff --git a/cpp/kernels/fmha_v2/pytest.ini b/cpp/kernels/fmha_v2/pytest.ini index 1b7c95070116..4ffcf349e9c4 100644 --- a/cpp/kernels/fmha_v2/pytest.ini +++ b/cpp/kernels/fmha_v2/pytest.ini @@ -6,6 +6,7 @@ markers = fmhca debug bench + needs_l40s # bin: unit tests # test: python script for invoking fmha.exe testpaths = bin test diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index a72d8d95ee60..f32ce4c7d06d 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -493,7 +493,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ #if !DISABLE_SYNC_FOR_PROFILING uint32_t expected_value = *ptrs.flag_val; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + // .acquire and .release qualifiers for fence instruction require sm_90 or higher. asm volatile("fence.release.sys;"); +#else + asm volatile("fence.acq_rel.sys;"); +#endif #pragma unroll 1 // No unroll as one iter is typically enough for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize) { @@ -525,7 +530,6 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ flag_set = flag_value == expected_value; } while (!flag_set); } - // asm volatile("fence.acquire.sys;"); #endif } } @@ -1018,7 +1022,6 @@ __global__ void moeA2ACombineKernel( if (blockIdx.x == 0) { - // asm volatile("fence.release.sys;"); #pragma unroll 1 // No unroll for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize) { @@ -1050,7 +1053,12 @@ __global__ void moeA2ACombineKernel( flag_set = flag_value == expected_value; } while (!flag_set); } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + // .acquire and .release qualifiers for fence instruction require sm_90 or higher. 
asm volatile("fence.acquire.sys;"); +#else + asm volatile("fence.acq_rel.sys;"); +#endif } __syncthreads(); #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h index 987b953ee3c4..31eba5bb8d4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h @@ -78,8 +78,10 @@ enum class RoutingMethodType : int64_t Llama4 = 3, // RenormalizeNaive: Softmax -> TopK -> Renormalize RenormalizeNaive = 4, + // MiniMaxM2: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize(without bias) + MiniMax2 = 5, // Unspecified - Unspecified = 5, + Unspecified = 6, }; inline int32_t maybeGetMinTokenCount(int32_t numPaddedTokens, int32_t hiddenSize, int32_t dtypeSizeBits) @@ -98,6 +100,7 @@ inline std::string serializeMoeRoutingMethodType(RoutingMethodType routingMethod case RoutingMethodType::DeepSeekV3: return "DeepSeekV3"; case RoutingMethodType::Llama4: return "Llama4"; case RoutingMethodType::RenormalizeNaive: return "RenormalizeNaive"; + case RoutingMethodType::MiniMax2: return "MiniMax2"; default: TLLM_CHECK_WITH_INFO(false, "Invalid routing method"); return ""; }; } diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 74e18b2cd2a2..a6bf164d1ad0 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,8 +1,8 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=25.10-py3 -ARG TRITON_BASE_TAG=25.10-py3 +ARG BASE_TAG=25.12-py3 +ARG TRITON_BASE_TAG=25.12-py3 ARG DEVEL_IMAGE=devel FROM ${BASE_IMAGE}:${BASE_TAG} AS base @@ -147,6 +147,7 @@ RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,sour pip install /tmp/wheel/tensorrt_llm*.whl COPY README.md ./ +COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl ./ COPY docs docs COPY cpp/include include diff --git a/docker/Makefile b/docker/Makefile index 519dbbda13d3..67c0a36015c5 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -202,17 +202,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE) jenkins-rockylinux8_%: STAGE = tritondevel jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available -jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +jenkins-rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8 rockylinux8_%: STAGE = tritondevel rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = tritondevel ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda -ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04 +ubuntu22_%: BASE_TAG = 13.1.0-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh index 0dc5cb305aa5..555a3b348b87 100644 --- a/docker/common/install_cuda_toolkit.sh +++ b/docker/common/install_cuda_toolkit.sh @@ -5,7 +5,7 @@ set -ex # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file. # CUDA version is usually aligned with the latest NGC CUDA image tag. # Only use when public CUDA image is not ready. 
-CUDA_VER="13.0.2_580.95.05" +CUDA_VER="13.1.0_590.44.01" CUDA_VER_SHORT="${CUDA_VER%_*}" NVCC_VERSION_OUTPUT=$(nvcc --version) diff --git a/docker/common/install_polygraphy.sh b/docker/common/install_polygraphy.sh index 315658d7a0d3..da9df6495464 100644 --- a/docker/common/install_polygraphy.sh +++ b/docker/common/install_polygraphy.sh @@ -5,7 +5,7 @@ set -ex if [ -n "${GITHUB_MIRROR}" ]; then export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" fi -pip3 install polygraphy==0.49.9 +pip3 install polygraphy==0.49.26 # Clean up pip cache and temporary files pip3 cache purge diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 069b26846c89..9a971a3bb25c 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -TORCH_VERSION="2.9.0" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 +TORCH_VERSION="2.9.1" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index 3887be6fa260..855daa366e4f 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,20 +2,20 @@ set -ex -TRT_VER="10.13.3.9" +TRT_VER="10.14.1.48" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -CUDA_VER="13.0" # 13.0.2 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 +CUDA_VER="13.1" # 13.1.0 # Keep the installation for cuDNN if users want to install PyTorch with source codes. # PyTorch 2.x can compile with cuDNN v9. -CUDNN_VER="9.14.0.64-1" -NCCL_VER="2.27.7-1+cuda13.0" -CUBLAS_VER="13.1.0.3-1" +CUDNN_VER="9.17.0.29-1" +NCCL_VER="2.28.9-1+cuda13.0" +CUBLAS_VER="13.2.0.9-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html -NVRTC_VER="13.0.88-1" -CUDA_RUNTIME="13.0.96-1" -CUDA_DRIVER_VERSION="580.95.05-1.el8" +NVRTC_VER="13.1.80-1" +CUDA_RUNTIME="13.1.80-1" +CUDA_DRIVER_VERSION="590.44.01-1.el8" for i in "$@"; do case $i in @@ -118,7 +118,12 @@ install_rockylinux_requirements() { install_tensorrt() { PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") + TRT_CUDA_VERSION=${CUDA_VER} + # No CUDA 13.1 version for TensorRT yet. Use CUDA 13.0 package instead. + if [ "$CUDA_VER" = "13.1" ]; then + TRT_CUDA_VERSION="13.0" + fi TRT_VER_SHORT=$(echo $TRT_VER | cut -d. -f1-3) if [ -z "$RELEASE_URL_TRT" ];then diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index b26e45de9242..c73e903e6cd0 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -170,6 +170,24 @@ TRT-LLM multimodal supports the following modalities and data types (depending o `load_base64_image utility `__ for implementation details. +**Image embeddings** + +It is also possible to directly provide the image embeddings to use by the multimodal +model. 
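+
+The embeddings payload can be produced by serializing a tensor with `torch.save` and
+base64-encoding the resulting bytes. A minimal sketch (the tensor name and shape below
+are only illustrative; the expected shape depends on the model's vision encoder):
+
+.. code-block:: python
+
+   import base64
+   import io
+
+   import torch
+
+   # Hypothetical image embeddings, e.g. [num_image_tokens, hidden_size].
+   image_embeds = torch.randn(576, 4096, dtype=torch.float16)
+
+   buffer = io.BytesIO()
+   torch.save(image_embeds, buffer)
+   image_embeddings_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+The resulting string can then be passed in the `data` field, as shown below.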
+
+* Using "image_embeds" with base64-encoded data:
+
+  .. code-block:: json
+
+     {"role": "user", "content": [
+         {"type": "text", "text": "What's in this image?"},
+         {"type": "image_embeds", "image_embeds": {"data": "{image_embeddings_base64}"}}
+     ]}
+
+.. note::
+   The contents of `image_embeddings_base64` can be generated by base64-encoding
+   the result of serializing a tensor via `torch.save`.
+
 **Video**

 * Using "video_url":
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 5ecaa43a2233..35b95d4cac9e 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -83,19 +83,19 @@ def BUILD_CONFIGS = [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_PYBIND): [
         (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_LLVM) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
         (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
 ]
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index f62874af41a0..bd2cdd21a132 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312

 // DLFW torch image
-DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
+DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.12-py3"

 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -316,6 +316,11 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
             foundRunningLine = true
             return false // Don't include the "Running" line itself
         }
+        // Stop collecting when we hit the warnings/errors summary separator
+        if (foundRunningLine && line.contains('======================')) {
+            foundRunningLine = false // Stop collecting
+            return false
+        }
         def hasDoubleColon = line.contains('::')

         def shouldInclude = foundRunningLine && hasDoubleColon
@@ -3389,7 +3394,7 @@ def launchTestJobs(pipeline, testFilter)
     // Python version and OS for sanity check
     x86SanityCheckConfigs = [
         "PY312-DLFW": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+            LLM_DOCKER_IMAGE, // Workaround ABI incompatibilities between PyTorch 2.9.1 and 2.10.0a0
            "B200_PCIe",
            X86_64_TRIPLE,
            false,
@@ -3418,15 +3423,16 @@
    ]

    aarch64SanityCheckConfigs = [
+        /* //Disable PY312-UB2404 temporarily due to the lack of an official PyTorch build for CUDA 13.1.
"PY312-UB2404": [ LLM_DOCKER_IMAGE, "GH200", AARCH64_TRIPLE, false, "", - UBUNTU_24_04_IMAGE, - true, // Extra PyTorch CUDA 13.0 install - ], + DLFW_IMAGE, + false, // Extra PyTorch CUDA 13.0 install + ],*/ "PY312-DLFW": [ LLM_DOCKER_IMAGE, "GH200", @@ -3524,7 +3530,7 @@ def launchTestJobs(pipeline, testFilter) def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa" trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb") trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb") - trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-1") } // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8) if (values[6]) { @@ -3532,9 +3538,9 @@ def launchTestJobs(pipeline, testFilter) // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability. // PyTorch CUDA 13.0 package and torchvision package can be installed as expected. if (k8s_arch == "amd64") { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } else { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } } diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 32643c10d3c9..24e44e26fabe 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512241744-10055 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202601011103-9818 diff --git a/requirements.txt b/requirements.txt index a21b8ca2819c..2e789cbc7f61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,13 +19,14 @@ pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.13.3 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0. -torch>=2.9.0a0,<=2.9.0 +tensorrt~=10.14.1 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.10.0a0. 
+torch>=2.9.1,<=2.10.0a0 torchvision nvidia-modelopt[torch]~=0.37.0 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7 -nvidia-nccl-cu13==2.27.7 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.28.9 +# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7 +nvidia-nccl-cu13>=2.27.7,<=2.28.9 nvidia-cuda-nvrtc transformers==4.57.1 prometheus_client @@ -65,7 +66,7 @@ ninja etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a blake3 soundfile -triton==3.5.0 +triton==3.5.1 tiktoken blobfile openai-harmony==0.0.4 diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 677a441c4059..71f7c8be6757 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -24,6 +24,14 @@ from ..pyexecutor.resource_manager import KVCacheManager from ..utils import get_model_extra_attrs +try: + # Transformers v5 + from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES +except ImportError: + # Transformers v4 + from transformers.configuration_utils import \ + ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES + @dataclass class AttentionRuntimeFeatures: @@ -448,6 +456,13 @@ class RopeParams: def from_config(config) -> "RopeParams": rope_params = RopeParams() + hf_rope_parameters = getattr(config, 'rope_parameters', None) + if hf_rope_parameters is not None: + assert not set(hf_rope_parameters.keys()).issubset( + ALLOWED_ATTENTION_LAYER_TYPES), ( + "Per-layer-type RoPE configuration is not supported yet.") + config.update(hf_rope_parameters) + # get rotary parameters. hidden_size = config.hidden_size num_attention_heads = config.num_attention_heads diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index aa9f0147cd20..7c46a48df36f 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings): _quant_config: Optional[QuantConfig] = PrivateAttr(default=None) + max_stats_len: int = Field( + default=1000, + description="The max number of performance statistic entries.", + status="prototype", + ) + @property def quant_config(self) -> QuantConfig: if self._quant_config is None: diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 48066cb2565d..a81e0f3f5ca7 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -490,10 +490,12 @@ def __init__( self.max_beam_width = ad_config.max_beam_width self.spec_config = ad_config.speculative_config self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler + self.llm_args.max_stats_len = ad_config.max_stats_len else: self.max_beam_width = 1 self.spec_config = None self._disable_overlap_scheduler = False + self.llm_args.max_stats_len = 1000 # check for max total draft tokens if self.spec_config is not None: diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index 06e93eb3e5e2..5b683637c6e3 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -252,14 +252,32 @@ def fused_moe( ) run_moe = 
moe_runner.fused_moe_runner.run_moe_min_latency if min_latency_mode else moe_runner.fused_moe_runner.run_moe - output = run_moe(input, token_selected_experts, token_final_scales, - fc1_expert_weights, fc1_expert_biases, fc2_expert_weights, - fc2_expert_biases, quant_scales, input_sf, - swizzled_input_sf, swiglu_alpha, swiglu_beta, swiglu_limit, - tp_size, tp_rank, ep_size, ep_rank, cluster_size, - cluster_rank, enable_alltoall, min_latency_mode, - [gemm_tactic_1, gemm_tactic_2], activation_type, - unpadded_hidden_size, tuner_num_tokens, out_tensor) + try: + output = run_moe(input, token_selected_experts, token_final_scales, + fc1_expert_weights, fc1_expert_biases, + fc2_expert_weights, fc2_expert_biases, quant_scales, + input_sf, swizzled_input_sf, swiglu_alpha, swiglu_beta, + swiglu_limit, tp_size, tp_rank, ep_size, ep_rank, + cluster_size, cluster_rank, enable_alltoall, + min_latency_mode, [gemm_tactic_1, gemm_tactic_2], + activation_type, unpadded_hidden_size, + tuner_num_tokens, out_tensor) + except RuntimeError as e: + error_msg = str(e) + if "DeepGEMM only supports Hopper" in error_msg: + raise RuntimeError( + f"{error_msg}" + "Note: This is the Cutlass backend with DeepGemm JIT path. " + "For Blackwell (SM100+) support, please use the DEEPGEMM backend instead." + ) from e + raise + + # When out_tensor is provided, the result is written in-place to out_tensor. + # Return empty list to avoid aliasing constraint violation in PyTorch 2.9.1+ + # (custom op output cannot be the same tensor as input). + # Callers should use out_tensor directly when they provide it. + if out_tensor is not None and not min_latency_mode: + return [] return output if min_latency_mode else [output] diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index 412b461a93c5..ea7d7f93e689 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -1102,9 +1102,17 @@ def mxe4m3_mxe2m1_block_scale_moe_runner( 0] = routing_logits # replace dummy routing logits with actual routing logits input_tensors[-2] = topk_weights # replace dummy topk_weights with actual input_tensors[-1] = topk_ids # replace dummy topk_ids with actual - return kernel_runner(input_tensors, - tactic=[-1, -1] if best_tactic == -1 else best_tactic, - output=output) + result = kernel_runner( + input_tensors, + tactic=[-1, -1] if best_tactic == -1 else best_tactic, + output=output) + # When output is provided, the result is written in-place to output. + # Return empty tensor to avoid aliasing constraint violation in PyTorch 2.9.1+ + # (custom op output cannot be the same tensor as input). + # Callers should use output directly when they provide it. 
+    if output is not None:
+        return torch.empty(0, device=result.device, dtype=result.dtype)
+    return result


 @dataclass(frozen=True)
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index 7b8718c10bdc..c56bf86faffd 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -15,6 +15,7 @@
 from .modeling_hyperclovax import HCXVisionForCausalLM
 from .modeling_llama import LlamaForCausalLM
 from .modeling_llava_next import LlavaNextModel
+from .modeling_minimaxm2 import MiniMaxM2ForCausalLM
 from .modeling_mistral import Mistral3VLM, MistralForCausalLM
 from .modeling_mixtral import MixtralForCausalLM
 from .modeling_nemotron import NemotronForCausalLM
@@ -80,6 +81,7 @@
     "SeedOssForCausalLM",
     "Glm4MoeForCausalLM",
     "Qwen3VLModel",
+    "MiniMaxM2ForCausalLM",
 ]

 if transformers.__version__ >= "4.45.1":
diff --git a/tensorrt_llm/_torch/models/modeling_minimaxm2.py b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
new file mode 100644
index 000000000000..73cd480ee7cc
--- /dev/null
+++ b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
@@ -0,0 +1,314 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from tensorrt_llm.functional import PositionEmbeddingType
+
+from ..attention_backend import AttentionMetadata
+from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams
+from ..models.modeling_utils import ModelConfig
+from ..modules.attention import Attention
+from ..modules.decoder_layer import DecoderLayer
+from ..modules.embedding import Embedding
+from ..modules.fused_moe import MiniMaxM2MoeRoutingMethod, create_moe
+from ..modules.linear import Linear
+from ..modules.rms_norm import RMSNorm
+from ..utils import AuxStreamType
+from .modeling_utils import DecoderModel, DecoderModelForCausalLM, register_auto_model
+
+
+# MiniMax M2/M2.1 requires implementing the following two additional components:
+# 1. MoE routing method: TRT-LLM does not currently support the
+#    sigmoid -> add bias -> topk -> renorm routing method.
+# 2. QK layer normalization needs to be performed across the head_num * head_size dimension,
+#    which conflicts with the current TP-mode attention logic.
+# For better performance, we suggest enabling attention DP when using MiniMax M2/M2.1 models.
+class MiniMaxM2MoE(nn.Module):
+    def __init__(
+        self,
+        model_config: ModelConfig[PretrainedConfig],
+        aux_stream: torch.cuda.Stream,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        config = model_config.pretrained_config
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        self.enable_attention_dp = model_config.mapping.enable_attention_dp
+
+        # moe gate (linear layer) only runs in half/full precision for now
+        self.gate = Linear(
+            self.hidden_dim, self.num_experts, bias=False, dtype=torch.float32, quant_config=None
+        )
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((self.num_experts), dtype=torch.float32), requires_grad=False
+        )
+
+        reduce_results = True
+        self.experts = create_moe(
+            routing_method=MiniMaxM2MoeRoutingMethod(
+                top_k=self.top_k,
+                num_experts=self.num_experts,
+                callable_e_score_correction_bias=lambda: self.e_score_correction_bias,
+            ),
+            num_experts=self.num_experts,
+            aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
+            reduce_results=reduce_results,
+            model_config=model_config,
+            layer_idx=layer_idx,
+        )
+
+    def load_weights(self, weights: List[Dict]):
+        assert len(weights) == 1
+
+        self.e_score_correction_bias.copy_(
+            weights[0]["e_score_correction_bias"][:].to(self.e_score_correction_bias.dtype)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        all_rank_num_tokens = attn_metadata.all_rank_num_tokens
+        hidden_states_f32 = hidden_states.to(torch.float32)
+        router_logits = self.gate(hidden_states_f32)
+        final_hidden_states = self.experts(
+            hidden_states,
+            router_logits,
+            all_rank_num_tokens=all_rank_num_tokens,
+            use_dp_padding=False,
+        )
+        return final_hidden_states
+
+
+# Implementing the special QK norm is a little tricky: the RMS norm dimension is
+# num_heads * head_size, not hidden_size, while after the QKV linear each rank only
+# holds num_heads * head_size / tp_size elements.
+# There are two strategies to implement QK-norm attention:
+# 1. Keep the first linear layer non-column-parallel so the normal RMS norm can be used;
+#    each rank's attention then works on the full QKV.
+# 2. Use a column-parallel linear layer, allgather QKV from all GPUs, apply RMS norm to
+#    q and k, then split QKV back to each GPU and continue.
+# For better performance, we choose the second strategy here.
+# Most adaptations are from QKNormRoPEAttention.
+class MiniMaxM2Attention(Attention): + def __init__( + self, + *, + model_config: ModelConfig[PretrainedConfig], + layer_idx: Optional[int] = None, + ): + config = model_config.pretrained_config + self.pretrained_config = config + + super().__init__( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + bias=False, + pos_embd_params=PositionalEmbeddingParams( + type=PositionEmbeddingType.rope_gpt_neox, + rope=RopeParams.from_config(config), + ), + rope_fusion=True, + layer_idx=layer_idx, + dtype=config.torch_dtype, + config=model_config, + ) + + self.q_norm = RMSNorm( + hidden_size=self.q_size * self.tp_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype, + ) + self.k_norm = RMSNorm( + hidden_size=self.kv_size * self.tp_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype, + ) + + def apply_qk_norm(self, q, k): + if self.qkv_proj.mapping.tp_size > 1: + # collect q and k from all gpus + from ..distributed import allgather + + temp_q = allgather(q, self.qkv_proj.mapping) + temp_k = allgather(k, self.qkv_proj.mapping) + temp_q = self.q_norm(temp_q) + temp_k = self.k_norm(temp_k) + q = temp_q.reshape(-1, self.tp_size, self.q_size)[:, self.tp_rank, :].reshape( + -1, self.q_size + ) + k = temp_k.reshape(-1, self.tp_size, self.kv_size)[:, self.tp_rank, :].reshape( + -1, self.kv_size + ) + else: + q = self.q_norm(q) + k = self.k_norm(k) + + return q, k + + def apply_rope( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + position_ids: torch.Tensor, + ): + """ + The apply_rope method is called in the forward method of the Attention class. + The apply_rope method is overridden in this class to apply QK norm and RoPE to the input tensor. + """ + # Apply QK norm before RoPE. 
+ q, k, v = self.split_qkv(q, k, v) + q, k = self.apply_qk_norm(q, k) + return super().apply_rope(q, k, v, position_ids) + + +class MiniMaxM2DecoderLayer(DecoderLayer): + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + layer_idx: int, + aux_stream: torch.cuda.Stream, + ): + super().__init__() + config = model_config.pretrained_config + self.hidden_size = config.hidden_size + + self.self_attn = MiniMaxM2Attention(model_config=model_config, layer_idx=layer_idx) + + self.block_sparse_moe = MiniMaxM2MoE( + model_config=model_config, aux_stream=aux_stream, layer_idx=layer_idx + ) + + self.input_layernorm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + + self.post_attention_layernorm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + self.mapping = model_config.mapping + self.layer_idx = layer_idx + + def forward( + self, + position_ids: torch.IntTensor, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + **kwargs, + ) -> torch.Tensor: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + **kwargs, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states, attn_metadata) + return hidden_states, residual + + +class MiniMaxM2Model(DecoderModel): + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__(model_config) + # add this for kv cache initialization (if we use bf16 for kv cache) + quant_config = model_config.quant_config + if quant_config is None or ( + (not quant_config.quant_mode.has_fp8_kv_cache()) + and (not quant_config.quant_mode.has_fp4_kv_cache()) + ): + model_config.pretrained_config.torch_dtype = torch.bfloat16 + config = model_config.pretrained_config + self.vocab_size = config.vocab_size + self.aux_stream = torch.cuda.Stream() + + self.embed_tokens = Embedding( + config.vocab_size, + config.hidden_size, + dtype=config.torch_dtype, + enable_torch_compile_for_embedding=model_config.enable_torch_compile_for_embedding, + ) + + self.layers = nn.ModuleList( + [ + MiniMaxM2DecoderLayer(model_config, layer_idx, self.aux_stream) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + + def forward( + self, + attn_metadata: AttentionMetadata, + input_ids: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.IntTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.Tensor: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + residual = None + for decoder_layer in self.layers: + hidden_states, residual = decoder_layer( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + residual=residual, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return 
hidden_states + + +@register_auto_model("MiniMaxM2ForCausalLM") +class MiniMaxM2ForCausalLM(DecoderModelForCausalLM[MiniMaxM2Model, PretrainedConfig]): + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__( + MiniMaxM2Model(model_config), + config=model_config, + hidden_size=model_config.pretrained_config.hidden_size, + vocab_size=model_config.pretrained_config.vocab_size, + ) diff --git a/tensorrt_llm/_torch/modules/fused_moe/__init__.py b/tensorrt_llm/_torch/modules/fused_moe/__init__.py index 053ecaa25fe0..51d6ba5f9475 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/__init__.py +++ b/tensorrt_llm/_torch/modules/fused_moe/__init__.py @@ -12,7 +12,8 @@ from .routing import (BaseMoeRoutingMethod, DeepSeekV3MoeRoutingMethod, DefaultMoeRoutingMethod, Llama4RenormalizeMoeRoutingMethod, - LoadBalancedMoeRoutingMethod, RenormalizeMoeRoutingMethod, + LoadBalancedMoeRoutingMethod, MiniMaxM2MoeRoutingMethod, + RenormalizeMoeRoutingMethod, RenormalizeNaiveMoeRoutingMethod, RoutingMethodType, SparseMixerMoeRoutingMethod, StaticMoeRoutingMethod, create_renormalize_expert_load_balanced_logits) @@ -33,6 +34,7 @@ "MoE", "MoeLoadBalancer", "MoEWeightLoadingMode", + "MiniMaxM2MoeRoutingMethod", "RenormalizeMoeRoutingMethod", "RenormalizeNaiveMoeRoutingMethod", "RoutingMethodType", diff --git a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py index f1559edfb68d..530d903ad354 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py @@ -138,8 +138,9 @@ def create_moe_backend( moe_load_balancer = get_moe_load_balancer() if moe_load_balancer is not None: assert moe_cls in [ - WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE - ], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE now." + WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE, + DeepGemmFusedMoE + ], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE, and DeepGemmFusedMoE." if bias: assert moe_cls in [CutlassFusedMoE, TritonFusedMoE, TRTLLMGenFusedMoE diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index 71e13e1324b7..41eacfe65c39 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -433,7 +433,7 @@ def run_moe( elif self.has_w4a16_mxfp4: weight_dtype = torch.uint8 - final_hidden_states = torch.ops.trtllm.fused_moe( + result = torch.ops.trtllm.fused_moe( x, token_selected_experts, token_final_scales, @@ -468,10 +468,13 @@ def run_moe( unpadded_hidden_size=self.unpadded_hidden_size, out_tensor=moe_output, ) - # Custom op requires all inputs are in the same type. - # Only in cutlass_min_latency_mode, the output is a list of tensors. - # Otherwise, the output should be unpacked as a single tensor. - final_hidden_states = final_hidden_states[0] + # When moe_output is provided, the result is written in-place and + # fused_moe returns empty list to avoid aliasing constraint violation. + # Otherwise, unpack the single tensor from the returned list. 
+ if moe_output is not None: + final_hidden_states = moe_output + else: + final_hidden_states = result[0] return final_hidden_states diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py index df0a419b3872..5cd181812fca 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py @@ -610,7 +610,7 @@ def run_moe( intermediate_size_per_partition_padded = self.w3_w1_weight.shape[ -2] // 2 - final_hidden_states = torch.ops.trtllm.mxe4m3_mxe2m1_block_scale_moe_runner( + result = torch.ops.trtllm.mxe4m3_mxe2m1_block_scale_moe_runner( router_logits, routing_bias, x, @@ -640,6 +640,10 @@ def run_moe( token_selected_experts, output=moe_output, ) + + # When output is provided, use it directly as the result + # (custom op returns empty tensor to avoid PyTorch aliasing constraints) + final_hidden_states = moe_output if moe_output is not None else result else: raise NotImplementedError( "TRTLLMGenFusedMoE only supports fp8_block_scaling, nvfp4, w4a16_mxfp4, w4a8_mxfp4_mxfp8 and w4a8_mxfp4_fp8 dtypes." diff --git a/tensorrt_llm/_torch/modules/fused_moe/routing.py b/tensorrt_llm/_torch/modules/fused_moe/routing.py index 3927e9bd6bf1..be21b5716a91 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/routing.py +++ b/tensorrt_llm/_torch/modules/fused_moe/routing.py @@ -155,8 +155,10 @@ class RoutingMethodType(IntEnum): Llama4 = 3, # Qwen3: Softmax -> TopK -> Renormalize RenormalizeNaive = 4, + # MiniMaxM2: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize(without bias) + MiniMax2 = 5, # Unspecified - Unspecified = 5, + Unspecified = 6, class BaseMoeRoutingMethod(nn.Module): @@ -379,6 +381,57 @@ def routing_method_type(self): return RoutingMethodType.DeepSeekV3 +class MiniMaxM2MoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__( + self, + top_k: int, + num_experts: int, + callable_e_score_correction_bias: Callable[[], torch.Tensor], + output_dtype: torch.dtype = torch.float32, + ): + super().__init__() + self.top_k = top_k + self.num_experts = num_experts + assert callable(callable_e_score_correction_bias) + self.callable_e_score_correction_bias = callable_e_score_correction_bias + self.output_dtype = output_dtype + + @staticmethod + @torch.compile(options={"max-autotune": True}) + def get_scores(logits, e_score_correction_bias): + scores = F.sigmoid(logits) + scores_with_bias = scores + e_score_correction_bias + if enable_llm_debug(): + has_nan = torch.isnan(scores_with_bias).any() + if has_nan: + warnings.warn( + "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation." 
+ ) + + return scores, scores_with_bias + + def apply(self, + router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + scores, scores_with_bias = self.get_scores(router_logits, + self.e_score_correction_bias) + _, topk_idx = torch.topk(scores_with_bias, + k=self.top_k, + dim=-1, + sorted=False) + top_k_weights = scores.gather(1, topk_idx) + top_k_weights /= (top_k_weights.sum(dim=-1, keepdim=True) + 1e-20) + return topk_idx.to(torch.int32), top_k_weights.to(self.output_dtype) + + @property + def e_score_correction_bias(self) -> torch.Tensor: + return self.callable_e_score_correction_bias() + + @property + def routing_method_type(self): + return RoutingMethodType.MiniMax2 + + class RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): def __init__( @@ -587,6 +640,8 @@ def routing_method_type(self) -> RoutingMethodType: RenormalizeNaiveMoeRoutingMethod, RoutingMethodType.Unspecified: BaseMoeRoutingMethod, + RoutingMethodType.MiniMax2: + MiniMaxM2MoeRoutingMethod, } diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 412997336396..94fa5020ca7f 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -143,7 +143,6 @@ def __init__(self, super(PyExecutor, self).__init__() self.device_id = torch.cuda.current_device() self.global_rank = dist.rank - # Store the execution stream for model forward operations. # This stream is used for proper synchronization with KVCacheTransferManager. # execution_stream can be provided by create_py_executor @@ -181,6 +180,7 @@ def __init__(self, self.max_draft_len = max_draft_len self.max_total_draft_tokens = max_total_draft_tokens self.llm_args = self.model_engine.llm_args + self.max_stats_len = max(self.llm_args.max_stats_len, 1) self.max_num_tokens = self.llm_args.max_num_tokens self.print_log = self.llm_args.print_iter_log self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats @@ -866,6 +866,8 @@ def _append_iter_stats(self, req_stats: Optional[List[RequestStats]] = None): with self.stats_lock: + if len(self.stats) > self.max_stats_len: + self.stats.pop(0) self.stats.append((stats, req_stats)) def _process_iter_stats( diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index 406a71d4f5ca..4587d39f033e 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -16,7 +16,8 @@ async_load_audio, async_load_image, async_load_video, convert_image_mode, default_multimodal_input_loader, encode_base64_content_from_url, encode_base64_image, - get_cache_salt_id, load_image, load_video) + get_cache_salt_id, load_base64_image_embeds, load_image, + load_video) __all__ = [ "ALL_SUPPORTED_MULTIMODAL_MODELS", @@ -57,4 +58,5 @@ "get_cache_salt_id", "compute_retained_tokens_count", "compute_retention_mask", + "load_base64_image_embeds", ] diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index bbbd5f4f8f22..0dc09547b3d2 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -114,6 +114,15 @@ def load_base64_image(parsed_url: str) -> Image.Image: return image +def load_base64_image_embeds(str_content: str) -> torch.Tensor: + content_bytes = base64.b64decode(str_content) + with BytesIO(content_bytes) as buf: + image_data: torch.Tensor = torch.load(buf, + weights_only=True, + map_location="cpu") + return image_data + + def load_image(image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: @@ 
-425,13 +434,14 @@ class MultimodalData(TypedDict): """Type definition for multimodal data structure.""" modality: str data: Any + is_embedding: bool class ConversationMessage(TypedDict): """Type definition for conversation message structure.""" role: str content: List[dict[str, Any]] - media: List[MultimodalData] | List[torch.Tensor] | List[Dict[str, Any]] + media: List[MultimodalData] # @classmethod # def fromSample(cls, sample: dict[str, str]) -> "ConversationMessage": @@ -446,33 +456,57 @@ def __init__( model_type: str, multimodal_server_config: Optional[MultimodalServerConfig] = None): self._model_type = model_type - self._data = defaultdict[str](list) - self._placeholder_counts = defaultdict[str](int) + self._data = defaultdict[str, list](list) + self._embeddings = defaultdict[str, list](list) + self._placeholder_counts = defaultdict[str, int](int) self._multimodal_server_config = multimodal_server_config if multimodal_server_config is not None else MultimodalServerConfig( ) - async def retrieve_all_async(self) -> Optional[Dict[str, List[Any]]]: - """Retrieve all collected multimodal data.""" - if not self._data: - return None - - return { - modality: await asyncio.gather(*items) - for modality, items in self._data.items() - } - - def retrieve_all_sync(self) -> Optional[Dict[str, List[Any]]]: - """Retrieve all collected multimodal data.""" - if not self._data: - return None - - return {modality: items for modality, items in self._data.items()} - - def add_data(self, media_type: str, data: Union[Coroutine, Any]): - current_count = len(self._data[media_type]) + 1 + async def retrieve_all_async( + self + ) -> tuple[Optional[Dict[str, List[Any]]], Optional[Dict[str, List[Any]]]]: + """Retrieve all collected multimodal data and embeddings.""" + + async def _retrieve( + data: Optional[dict[str, + list]]) -> Optional[Dict[str, List[Any]]]: + if not data: + return None + return { + modality: await asyncio.gather(*items) + for modality, items in data.items() if items + } + + return await _retrieve(self._data), await _retrieve(self._embeddings) + + def retrieve_all_sync( + self + ) -> tuple[Optional[Dict[str, List[Any]]], Optional[Dict[str, List[Any]]]]: + """Retrieve all collected multimodal data and embeddings.""" + + def _retrieve( + data: Optional[dict[str, + list]]) -> Optional[Dict[str, List[Any]]]: + if not data: + return None + return { + modality: items + for modality, items in data.items() if items + } + + return _retrieve(self._data), _retrieve(self._embeddings) + + def add_data(self, + media_type: str, + data: Union[Coroutine, Any], + *, + is_embedding: bool = False): + current_count = len(self._data[media_type]) + len( + self._embeddings[media_type]) + 1 placeholder = retrieve_multimodal_placeholder(self._model_type, media_type, current_count) - self._data[media_type].append(data) + (self._embeddings + if is_embedding else self._data)[media_type].append(data) if placeholder: self._placeholder_counts[placeholder] += 1 @@ -643,33 +677,34 @@ def convert_to_conversation_message( media = [media] if modality in ["image", "multiple_image"]: if is_embedding: + _load = lambda mm: mm + # each mm_embedding corresponds to each image placeholder if not isinstance(media, list): media = [media] - - mm_data = [{ - 'modality': modality, - 'mm_embedding_info': mm - } for mm in media] else: - mm_data = [ - MultimodalData(modality=modality, - data=load_image(i, - format=image_data_format, - device=device)) - for i in media - ] + _load = lambda mm: load_image( + mm, format=image_data_format, 
device=device) + + mm_data = [ + MultimodalData(modality=modality, + data=_load(mm), + is_embedding=is_embedding) for mm in media + ] elif modality == "video": if is_embedding: raise ValueError( "External embedding is not supported for video modality yet." ) mm_data = [ - MultimodalData(modality=modality, - data=load_video(i, - num_frames, - format=image_data_format, - device=device)) for i in media + MultimodalData( + modality=modality, + data=load_video(i, + num_frames, + format=image_data_format, + device=device), + is_embedding=False, + ) for i in media ] elif modality == "audio": if is_embedding: @@ -677,8 +712,11 @@ def convert_to_conversation_message( "External embedding is not supported for audio modality yet." ) mm_data = [ - MultimodalData(modality=modality, - data=load_audio(i, device=device)) for i in media + MultimodalData( + modality=modality, + data=load_audio(i, device=device), + is_embedding=False, + ) for i in media ] elif modality == "image_audio": if is_embedding: @@ -706,16 +744,22 @@ def convert_to_conversation_message( pass if _modal is None: raise ValueError(f"Unknown matching modality: {modality}") - mm_data.append(MultimodalData(modality=_modal, data=data)) + mm_data.append( + MultimodalData(modality=_modal, + data=data, + is_embedding=False)) elif modality == "mixture_text_image": mm_data = [] for m in media: if m: mm_data.append( - MultimodalData(modality="image", - data=load_image(m, - format=image_data_format, - device=device))) + MultimodalData( + modality="image", + data=load_image(m, + format=image_data_format, + device=device), + is_embedding=False, + )) else: raise ValueError(f"Unknown modality: {modality}") return ConversationMessage(role="user", content=prompt, media=mm_data) @@ -749,17 +793,12 @@ def convert_to_conversation_message( is_embedding) mm_data_tracker = MultimodalDataTracker(model_type) for mdata in conv["media"]: - # Check if mdata is a MultimodalData - if isinstance(mdata, - dict) and "modality" in mdata and "data" in mdata: - mdata_modality = mdata["modality"] - if modality == "multiple_image": - mdata_modality = "image" - mm_data_tracker.add_data(mdata_modality, mdata["data"]) - else: - # Add embeddings to the tracker for placeholder handling - mm_data_tracker.add_data(mdata["modality"], - mdata["mm_embedding_info"]) + mdata_modality = mdata["modality"] + if modality == "multiple_image": + mdata_modality = "image" + mm_data_tracker.add_data(mdata_modality, + mdata["data"], + is_embedding=is_embedding) mm_placeholder_counts = mm_data_tracker.placeholder_counts() prompt = conv["content"] if mm_placeholder_counts: @@ -776,11 +815,13 @@ def convert_to_conversation_message( if mm_placeholder_counts: if mm_embeddings is not None: - input[ + _, input[ "multi_modal_embeddings"] = mm_data_tracker.retrieve_all_sync( ) else: - input["multi_modal_data"] = mm_data_tracker.retrieve_all_sync() + input[ + "multi_modal_data"], _ = mm_data_tracker.retrieve_all_sync( + ) inputs.append(input) return inputs diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index ac869d765a5e..71d34af1a647 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -31,7 +31,7 @@ GenerationResult, IterationResult, LoRARequest, PostprocWorkerConfig, PromptAdapterRequest) from ..executor.postproc_worker import PostprocParams -from ..executor.utils import (create_mpi_comm_session, +from ..executor.utils import (RequestError, create_mpi_comm_session, get_spawn_proxy_process_env) from ..inputs import (PromptInputs, create_input_processor, 
create_input_processor_with_hash, get_cache_salt_id, @@ -686,7 +686,7 @@ def _check_arguments(self, prompt_len: int, query_len: int, if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only: max_num_tokens = self.args.max_num_tokens if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens: - raise ValueError( + raise RequestError( f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed " f"max_num_tokens ({max_num_tokens})") return diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 3f15252b84fb..9a13eab29a83 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs): status="prototype", ) + max_stats_len: int = Field( + default=1000, + description="The max number of performance statistic entries.", + status="prototype", + ) + @property def quant_config(self) -> QuantConfig: if self._quant_config is None: diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py index 26ee17c4f407..e08caadaaf5f 100644 --- a/tensorrt_llm/serve/chat_utils.py +++ b/tensorrt_llm/serve/chat_utils.py @@ -17,7 +17,8 @@ from tensorrt_llm.inputs import (ConversationMessage, MultimodalData, MultimodalDataTracker, add_multimodal_placeholders, async_load_audio, - async_load_image, async_load_video) + async_load_image, async_load_video, + load_base64_image_embeds) from tensorrt_llm.inputs.multimodal import MultimodalServerConfig from tensorrt_llm.logger import logger @@ -33,24 +34,45 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] +class ImageEmbedsData(TypedDict): + """Type definition for serialized image embeddings structure.""" + data: Required[str] + + +class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): + """Type definition for image embeddings passed in base64-encoded PyTorch tensor format.""" + image_embeds: Required[ + # TODO: Besides "data", could support "url" and "ipc_handle" in the future. + ImageEmbedsData] + type: Required[Literal["image_embeds"]] + + # Type Aliases and Constants ChatCompletionContentPartParam: TypeAlias = Union[ - OpenAIChatCompletionContentPartParam, ChatCompletionContentPartVideoParam, - str] + OpenAIChatCompletionContentPartParam, + ChatCompletionContentPartVideoParam, + ChatCompletionContentPartImageEmbedsParam, + str, +] # TODO: Add "input_audio" to support byte_encoded audio input. 
VALID_MESSAGE_CONTENT_MM_PART_TYPES = [ - "text", "image_url", "video_url", "audio_url" + "text", + "image_url", + "video_url", + "audio_url", + "image_embeds", ] # Parser Functions _TextParser = partial(cast, ChatCompletionContentPartTextParam) _ImageParser = partial(cast, ChatCompletionContentPartImageParam) +_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) _AudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) MM_PARSER_MAP: dict[str, Callable[[ChatCompletionContentPartParam], Union[ - str, dict[str, str]]]] = { + str, dict[str, str], None]]] = { "text": lambda part: _TextParser(part).get("text", None), "image_url": @@ -59,12 +81,15 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): lambda part: _VideoParser(part).get("video_url", {}).get("url", None), "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), + "image_embeds": + lambda part: _ImageEmbedsParser(part).get("image_embeds", {}).get( + "data", None), } def _parse_chat_message_content_mm_part( part: ChatCompletionContentPartParam -) -> tuple[str, Union[str, dict[str, str]]]: +) -> tuple[str, Union[str, dict[str, str], None]]: """Parse a single multimodal part of a chat message.""" assert isinstance(part, dict) part_type = part.get("type", None) @@ -78,9 +103,9 @@ def _parse_chat_message_content_mm_part( def parse_chat_message_content_part( - part: ChatCompletionMessageParam, + part: ChatCompletionContentPartParam, mm_data_tracker: MultimodalDataTracker, -) -> Optional[Any]: +) -> str | MultimodalData | None: """Parse a single part of a chat message.""" if isinstance(part, str): return part @@ -110,7 +135,23 @@ async def load_image_async(): logger.error(f"Failed to load image: {str(e)}") return None - return MultimodalData(modality="image", data=load_image_async()) + return MultimodalData(modality="image", + data=load_image_async(), + is_embedding=False) + + if part_type == "image_embeds": + str_content = cast(str, content) + + async def decode_image_embeds_async(): + try: + return load_base64_image_embeds(str_content) + except Exception as e: + logger.error(f"Failed to decode image data: {str(e)}") + return None + + return MultimodalData(modality="image", + data=decode_image_embeds_async(), + is_embedding=True) if part_type == "video_url": str_content = cast(str, content) @@ -125,7 +166,9 @@ async def load_video_async(): logger.error(f"Failed to load video: {str(e)}") return None - return MultimodalData(modality="video", data=load_video_async()) + return MultimodalData(modality="video", + data=load_video_async(), + is_embedding=False) if part_type == "audio_url": str_content = cast(str, content) @@ -140,14 +183,16 @@ async def load_audio_async(): logger.error(f"Failed to load audio: {str(e)}") return None - return MultimodalData(modality="audio", data=load_audio_async()) + return MultimodalData(modality="audio", + data=load_audio_async(), + is_embedding=False) raise NotImplementedError(f"Unknown part type: {part_type}") def parse_chat_message_content_parts( role: str, - parts: Iterable[ChatCompletionMessageParam], + parts: Iterable[ChatCompletionContentPartParam], mm_data_tracker: MultimodalDataTracker, ) -> ConversationMessage: """Parse multiple parts of a chat message.""" @@ -224,8 +269,9 @@ def parse_chat_messages_coroutines( messages: List[ChatCompletionMessageParam], model_config: AutoConfig, multimodal_server_config: Optional[MultimodalServerConfig] = None -) -> 
Tuple[List[ConversationMessage], Optional[Coroutine[ - Any, Any, Optional[Dict[str, List[Any]]]]]]: +) -> Tuple[List[ConversationMessage], Coroutine[Any, Any, tuple[Optional[Dict[ + str, List[Any]]], Optional[Dict[str, List[Any]]]]], list[dict[str, + int]]]: """Parse multiple chat messages and return conversation and coroutine.""" conversation = [] mm_placeholder_counts = [] @@ -237,7 +283,9 @@ def parse_chat_messages_coroutines( conversation.append(parsed_msg) if parsed_msg["media"]: for mdata in parsed_msg["media"]: - mm_data_tracker.add_data(mdata["modality"], mdata["data"]) + mm_data_tracker.add_data(mdata["modality"], + mdata["data"], + is_embedding=mdata["is_embedding"]) mm_placeholder_count = mm_data_tracker.placeholder_counts() if mm_placeholder_count: parsed_msg["content"] = add_multimodal_placeholders( diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 44983306dc83..afb97aa6f0c6 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -563,9 +563,13 @@ async def create_chat_response( ) prompt = prompt_inputs(prompt) - mm_data = await mm_coroutines - if mm_data is not None: + mm_data, mm_embeddings = await mm_coroutines + if mm_data: prompt["multi_modal_data"] = mm_data + if mm_embeddings: + prompt["multi_modal_embeddings"] = mm_embeddings + if mm_data and mm_embeddings: + raise ValueError("Passing 'multi_modal_data' and 'multi_modal_embeddings' at the same time is not supported.") postproc_args.reasoning_parser = self.llm.args.reasoning_parser postproc_args.tool_parser = self.tool_parser @@ -666,7 +670,9 @@ async def create_mm_embedding_response(promise: RequestOutput): ) prompt = prompt_inputs(prompt) - mm_data = await mm_coroutines + mm_data, mm_embeddings = await mm_coroutines + if mm_embeddings: + raise ValueError("Cannot use multimodal embeddings as input") if mm_data is not None: prompt["multi_modal_data"] = mm_data diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 76cf65bcb3d4..96a9ef6b94c4 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -313,3 +313,7 @@ nvidia/Nemotron-3-Nano: - quant_algo: FP8 kv_cache_quant_algo: FP8 accuracy: 68.73 +MiniMaxAI/MiniMax-M2: + - accuracy: 85 + - quant_algo: FP8_BLOCK_SCALES + accuracy: 85 diff --git a/tests/integration/defs/accuracy/references/longbench_v1.yaml b/tests/integration/defs/accuracy/references/longbench_v1.yaml index c638ab92bb8e..e54288d0945e 100644 --- a/tests/integration/defs/accuracy/references/longbench_v1.yaml +++ b/tests/integration/defs/accuracy/references/longbench_v1.yaml @@ -1,8 +1,8 @@ Qwen3/Qwen3-30B-A3B-Instruct-2507: # Skip Softmax Attention ref accuracy - extra_acc_spec: "target_sparsity=0.0" - accuracy: 47.22 + accuracy: 47.357 - extra_acc_spec: "target_sparsity=0.5" - accuracy: 47.22 + accuracy: 47.102 - extra_acc_spec: "target_sparsity=0.9" - accuracy: 45.90 + accuracy: 46.169 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 0ca0842b006e..d5835744a7c9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3884,7 +3884,6 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}" @skip_pre_hopper - # @pytest.mark.skip_less_device_memory(140000) # Only test 
for H200, B200 @pytest.mark.parametrize( "target_sparsity,thr_prefill,thr_decode", [ @@ -3903,15 +3902,51 @@ def test_skip_softmax_attention(self, target_sparsity: float, "prefill": thr_prefill, "decode": thr_decode, }) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) if get_sm_version() >= 100: - pytest.skip("Bug to be fixed on Blackwell") + pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell") + + with LLM(self.MODEL_PATH, + attn_backend="TRTLLM", + max_batch_size=256, + max_num_tokens=100000, + kv_cache_config=kv_cache_config, + sparse_attention_config=sparse_attention_config) as llm: + task = LongBenchV1(self.MODEL_NAME) + task.evaluate(llm, + extra_acc_spec=f"target_sparsity={target_sparsity}") + + @pytest.mark.parametrize( + "target_sparsity,thr_prefill,thr_decode", + [ + (0.0, 0.0, 0.0), + (0.5, 85.97384174442398, 55.48258322852407), + (0.9, 1418.142868970396, 863.147841750025), + ], + ids=[ + "target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9" + ], + ) + def test_skip_softmax_attention_2gpus(self, target_sparsity: float, + thr_prefill: float, + thr_decode: float): + sparse_attention_config = SkipSoftmaxAttentionConfig( + threshold_scale_factor={ + "prefill": thr_prefill, + "decode": thr_decode, + }) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) with LLM(self.MODEL_PATH, attn_backend="TRTLLM", max_batch_size=256, max_num_tokens=100000, + tensor_parallel_size=2, + moe_expert_parallel_size=2, + enable_attention_dp=True, kv_cache_config=kv_cache_config, sparse_attention_config=sparse_attention_config) as llm: task = LongBenchV1(self.MODEL_NAME) @@ -5350,3 +5385,35 @@ def test_nvfp4_8gpus(self, attention_dp): task = GSM8K(self.MODEL_NAME) task.evaluate(llm, extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS) + + +@skip_pre_hopper +class TestMiniMaxM2(LlmapiAccuracyTestHarness): + MODEL_NAME = "MiniMaxAI/MiniMax-M2" + MODEL_PATH = f"{llm_models_root()}/MiniMax-M2" + + @parametrize_with_ids("tp_size,ep_size", [(4, 4)]) + @pytest.mark.skip_less_device(4) + @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler", + [(False, True, True), (True, True, True)]) + def test_4gpus(self, tp_size, ep_size, attention_dp, cuda_graph, + overlap_scheduler): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig( + backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS")) + + with LLM(self.MODEL_PATH, + tensor_parallel_size=tp_size, + pipeline_parallel_size=1, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + max_seq_len=4096, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index ab1690c87a68..f1d12a3ffbc6 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1683,9 +1683,13 @@ def test_openai_lora(llm_root, llm_venv): def test_openai_chat_multimodal_example(llm_root, llm_venv): test_root = unittest_path() / "llmapi" / "apps" - llm_venv.run_cmd( - ["-m", "pytest", - str(test_root / "_test_openai_chat_multimodal.py")]) + llm_venv.run_cmd([ + "-m", + 
"pytest", + str(test_root / "_test_openai_chat_multimodal.py"), + "-m", + "not needs_l40s", + ]) def test_openai_mmencoder_example(llm_root, llm_venv): diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 8a276d4615fc..8fae1d62933f 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -148,7 +148,6 @@ accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True] accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 809810c6d963..02616d7eda35 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -55,7 +55,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index a914a00c5348..84af0aae2b16 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -17,7 +17,7 @@ l0_dgx_b200: tests: - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[NVLinkTwoSided] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-disable_finalize_fusion-TRTLLM-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] @@ -31,6 +31,8 @@ l0_dgx_b200: - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4] + - 
accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (60) + # ------------- AutoDeploy tests --------------- - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16 - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 85b018c1894c..91bf2542b7ef 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -48,6 +48,9 @@ l0_dgx_h100: - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] # llmapi - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks + # ------------- Skip softmax attention tests --------------- + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5] + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9] - condition: ranges: system_gpu_count: @@ -146,7 +149,7 @@ l0_dgx_h100: - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp4-bf16-trtllm-deepseekv3_lite] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[NVLinkTwoSided] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype0] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype0] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index adae47f6268b..994c43a1fcbc 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -78,10 +78,6 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format - # Waive known failures in https://nvbugs/5774869 - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True] diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index c3037894895d..d10ba9fc2cdd 100644 --- a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -28,6 +28,7 @@ l0_l40s: - 
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] + - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s # MMMU sanity check - accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype - accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8e763d0000a7..047c4019d109 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -220,11 +220,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-I accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697) test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP (https://nvbugs/5664904) +unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] SKIP (https://nvbugs/5808500) test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469) test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804) @@ -258,7 +254,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) -unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) @@ -283,7 +278,6 @@ examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKI 
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698) triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737) -unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_allreduce_pg_op[seqlen:16-hidden:1024] SKIP (https://nvbugs/5760740) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/5760747) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338) @@ -304,6 +298,9 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) +cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642) +unittest/_torch/thop/parallel/test_fp8_block_scale_gemm.py::test_deep_gemm_in_subprocess[env2] SKIP (https://nvbugs/5766853) +test_e2e.py::test_openai_responses SKIP (https://nvbugs/5804146) triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960) stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5766952) full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5596337) @@ -324,9 +321,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5774869) triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu[MoEWeightLoadingMode.FUSED_GATE_UP_PROJ-DefaultMoeRoutingMethod-1] SKIP (https://nvbugs/5775256) @@ -387,3 +381,6 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (h accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343) 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343) accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5799901) +accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) +accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] SKIP (https://nvbugs/5804683) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] SKIP (https://nvbugs/5809169) diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 4b6f8cedab1c..2e3457f3d452 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -227,6 +227,10 @@ methods: annotation: Optional[Dict[str, str]] default: null status: prototype + max_stats_len: + annotation: int + default: 1000 + status: prototype return_annotation: None generate: parameters: diff --git a/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py b/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py new file mode 100644 index 000000000000..7d8281ecd2b2 --- /dev/null +++ b/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
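+
+# Patches Qwen2VLInputProcessorBase with a stand-in 'attach_multimodal_embeddings' so the
+# OpenAI-server image-embeddings test can intercept the embeddings passed to the model.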
+ +# used by tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py + +import tempfile +from pathlib import Path +from typing import Optional + +import torch + +from tensorrt_llm._torch.models.modeling_qwen2vl import Qwen2VLInputProcessorBase +from tensorrt_llm.inputs import ExtraProcessedInputs, TextPrompt +from tensorrt_llm.sampling_params import SamplingParams + +_attach_multimodal_embeddings_orig = Qwen2VLInputProcessorBase.attach_multimodal_embeddings + + +# signature taken from tensorrt_llm/inputs/registry.py +def _attach_multimodal_embeddings( + self, + inputs: TextPrompt, + multimodal_embedding: dict[str, list[torch.Tensor]], + sampling_params: SamplingParams, +) -> tuple[list[int], Optional[ExtraProcessedInputs]]: + try: + _attach_multimodal_embeddings_orig(self, inputs, multimodal_embedding, sampling_params) + except NotImplementedError: + pass + else: + raise ValueError( + "Remove this custom module, Qwen2VLInputProcessorBase implements attach_multimodal_embeddings" + ) + + tempdir = tempfile.gettempdir() + file_path = Path(tempdir) / "multimodal_embedding.pickle" + with open(file_path, "wb") as f: + torch.save(multimodal_embedding, f) + raise ValueError(file_path) + + +Qwen2VLInputProcessorBase.attach_multimodal_embeddings = _attach_multimodal_embeddings diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py index fda0f8a49333..4183e1874e75 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py @@ -1,13 +1,18 @@ +import io import os +import sys import tempfile +from base64 import b64encode from pathlib import Path from typing import List import openai import pytest +import torch import yaml from PIL import Image +from tensorrt_llm._torch.shared_tensor import SharedTensorContainer from tensorrt_llm.inputs import encode_base64_image from ..test_llm import get_model_path @@ -17,6 +22,13 @@ from utils.llm_data import llm_models_root +from ._test_openai_mmencoder import RemoteMMEncoderServer +from ._test_openai_mmencoder import server as mm_encoder_server +from ._test_openai_mmencoder import \ + test_multimodal_content_mm_encoder as _test_multimodal_content_mm_encoder + +assert mm_encoder_server is not None # keep 'mm_encoder_server' fixture visible in this module + @pytest.fixture(scope="module", ids=["Qwen2.5-VL-3B-Instruct"]) def model_name(): @@ -25,7 +37,7 @@ def model_name(): @pytest.fixture(scope="module") def temp_extra_llm_api_options_file(request): - temp_dir = tempfile.gettempdir() + temp_dir = tempfile.mkdtemp() temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml") try: extra_llm_api_options_dict = { @@ -123,6 +135,98 @@ def test_single_chat_session_image(client: openai.OpenAI, model_name: str): == chat_completion.choices[0].message.content +# used by mm_encoder_server +@pytest.fixture(scope="module") +def extra_encoder_options() -> bool: + return False + + +# used by mm_encoder_server +@pytest.fixture(scope="module") +def temp_extra_encoder_options_file() -> str: + return "/dummy/path" + + +@pytest.fixture(scope="module") +def server_patched(model_name: str, temp_extra_llm_api_options_file: str): + # Custom module implements missing 'attach_multimodal_embeddings' to intercept + # embeddings. 
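+    # The patch saves the embeddings the server receives to a temporary file and raises
+    # an error carrying that file path; the test below loads that file to check the
+    # embeddings round-trip intact (see _attach_multimodal_embeddings_patch/__init__.py).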
+ model_path = get_model_path(model_name) + args = [ + "--extra_llm_api_options", + temp_extra_llm_api_options_file, + "--max_batch_size", + "64", + "--max_num_tokens", + "16384", + "--custom_module_dirs", + str( + Path(sys.modules[test_single_chat_session_image_embeds.__module__]. + __file__).parent / "_attach_multimodal_embeddings_patch"), + ] + with RemoteOpenAIServer(model_path, args) as remote_server: + yield remote_server + + +@pytest.mark.needs_l40s +@pytest.mark.asyncio(loop_scope="module") +def test_single_chat_session_image_embeds( + server_patched: RemoteOpenAIServer, + model_name: str, + mm_encoder_server: RemoteMMEncoderServer, +): + client = server_patched.get_client() + messages, mm_embed_handle = _test_multimodal_content_mm_encoder( + mm_encoder_server.get_client(), model_name) + + max_completion_tokens = 10 + + chat_completion_image = client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=0.0, + logprobs=False) + + mm_embed = SharedTensorContainer.from_dict(mm_embed_handle).get_local_view() + with io.BytesIO() as buf: + torch.save(mm_embed, buf) + mm_embed_bytes = buf.getvalue() + + image_content = messages[0]["content"][1] + assert image_content["type"] == "image_url" + image_content.clear() + image_content["type"] = "image_embeds" + image_content["image_embeds"] = { + "data": b64encode(mm_embed_bytes).decode("ascii") + } + + # test single completion + # + # FIXME: Remove try-except and use 'server' instead of 'server_patched', + # once Qwen2VLInputProcessorBase implements attach_multimodal_embeddings. + try: + chat_completion_embeds = client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=0.0, + logprobs=False) + + assert chat_completion_embeds.choices[ + 0].message == chat_completion_image.choices[0].message + except openai.BadRequestError as e: + assert isinstance(e.body, dict) + with open(Path(e.body["message"]), "rb") as f: + intercepted_embeddings = torch.load(f, weights_only=True) + assert list(intercepted_embeddings.keys()) == ["image"] + assert len(intercepted_embeddings["image"]) == 1 + torch.testing.assert_close(intercepted_embeddings["image"][0], + mm_embed.cpu()) + pytest.xfail( + reason="Model does not implement 'attach_multimodal_embeddings'") + + @pytest.mark.asyncio(loop_scope="module") def test_single_chat_session_multi_image(client: openai.OpenAI, model_name: str): diff --git a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py index 312f9232d400..483f9ad994c2 100644 --- a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py +++ b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py @@ -1,5 +1,6 @@ import os import tempfile +from typing import Any import openai import pytest @@ -67,7 +68,9 @@ def async_client(server: RemoteMMEncoderServer): return server.get_async_client() -def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): +def test_multimodal_content_mm_encoder( + client: openai.OpenAI, + model_name: str) -> tuple[list[dict[str, Any]], dict[str, Any]]: content_text = "Describe the natural environment in the image." 
image_url = str(llm_models_root() / "multimodals" / "test_data" / @@ -105,6 +108,8 @@ def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): assert mm_handle["tensor_size"][ 1] == 2048 # qwen2.5-vl: hidden_size of the vision encoder + return messages, mm_handle # used by tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py + def test_health(server: RemoteMMEncoderServer): health_url = server.url_for("health") diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index f8ffe8fc7bd7..87624b61b8e4 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -2393,7 +2393,8 @@ def fail_path(): enable_chunked_prefill=False, fast_build=True) - with pytest.raises(ValueError): + # max_num_tokens validation now raises RequestError consistently + with pytest.raises(RequestError): output = llm.generate_async( "A " * build_config.max_num_tokens, sampling_params=sampling_params, @@ -2436,13 +2437,9 @@ def _test_llm_capture_request_error(pytorch_backend: bool, tp_size: int = 1): ) prompt = 'A ' * 65 # the minimum max_num_tokens is 64 - if pytorch_backend: - # pytorch backend will raise ValueError for max_num_tokens - with pytest.raises(ValueError): - llm.generate(prompt) - else: - with pytest.raises(RequestError): - llm.generate(prompt) + # Both backends now consistently raise RequestError for max_num_tokens validation + with pytest.raises(RequestError): + llm.generate(prompt) def test_llm_capture_request_error(): diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 1ea15a2e781e..68cdc62ba7ea 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -66,7 +66,7 @@ def test_llama_7b_multi_lora_tp2(): cuda_graph_config=None) -@skip_ray +@skip_ray # https://nvbugs/5727075 @pytest.mark.gpu2 def test_phi3_lora_fused_modules_output_on_tp2_identical_to_tp1() -> None: check_phi3_lora_fused_modules_output_tp2_identical_to_tp1( diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index b9702133152e..86f48d31266e 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -8,7 +8,7 @@ from tensorrt_llm import LLM from tensorrt_llm.disaggregated_params import DisaggregatedParams -from tensorrt_llm.executor import GenerationExecutorWorker +from tensorrt_llm.executor import GenerationExecutorWorker, RequestError from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig @@ -830,10 +830,13 @@ def test_max_num_token_check(self): kv_cache_config=global_kvcache_config, max_num_tokens=100) - with pytest.raises(ValueError, - match="should not exceed max_num_tokens"): - ids = [random.randint(10, 100) for _ in range(101)] - llm.generate([ids]) + try: + with pytest.raises(RequestError, + match="should not exceed max_num_tokens"): + ids = [random.randint(10, 100) for _ in range(101)] + llm.generate([ids]) + finally: + llm.shutdown() class FailingExecutorWorker(GenerationExecutorWorker): @@ -962,10 +965,13 @@ def test_max_num_token_check(self): kv_cache_config=global_kvcache_config, max_num_tokens=100) - with pytest.raises(ValueError, - match="should not exceed max_num_tokens"): - ids = [random.randint(10, 100) for _ in range(101)] - llm.generate([ids]) + 
try: + with pytest.raises(RequestError, + match="should not exceed max_num_tokens"): + ids = [random.randint(10, 100) for _ in range(101)] + llm.generate([ids]) + finally: + llm.shutdown() @skip_ray