diff --git a/cpp/kernels/fmha_v2/pytest.ini b/cpp/kernels/fmha_v2/pytest.ini index 1b7c95070116..4ffcf349e9c4 100644 --- a/cpp/kernels/fmha_v2/pytest.ini +++ b/cpp/kernels/fmha_v2/pytest.ini @@ -6,6 +6,7 @@ markers = fmhca debug bench + needs_l40s # bin: unit tests # test: python script for invoking fmha.exe testpaths = bin test diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index a72d8d95ee60..f32ce4c7d06d 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -493,7 +493,12 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ #if !DISABLE_SYNC_FOR_PROFILING uint32_t expected_value = *ptrs.flag_val; +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + // .acquire and .release qualifiers for fence instruction require sm_90 or higher. asm volatile("fence.release.sys;"); +#else + asm volatile("fence.acq_rel.sys;"); +#endif #pragma unroll 1 // No unroll as one iter is typically enough for (int target_rank = lane_id; target_rank < ep_size; target_rank += warpSize) { @@ -525,7 +530,6 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ flag_set = flag_value == expected_value; } while (!flag_set); } - // asm volatile("fence.acquire.sys;"); #endif } } @@ -1018,7 +1022,6 @@ __global__ void moeA2ACombineKernel( if (blockIdx.x == 0) { - // asm volatile("fence.release.sys;"); #pragma unroll 1 // No unroll for (int peer_rank = lane_id; peer_rank < ep_size; peer_rank += warpSize) { @@ -1050,7 +1053,12 @@ __global__ void moeA2ACombineKernel( flag_set = flag_value == expected_value; } while (!flag_set); } +#if defined(__CUDA_ARCH__) && __CUDA_ARCH__ >= 900 + // .acquire and .release qualifiers for fence instruction require sm_90 or higher. 
asm volatile("fence.acquire.sys;"); +#else + asm volatile("fence.acq_rel.sys;"); +#endif } __syncthreads(); #endif diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h index 987b953ee3c4..31eba5bb8d4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/blockScaleMoe/runner.h @@ -78,8 +78,10 @@ enum class RoutingMethodType : int64_t Llama4 = 3, // RenormalizeNaive: Softmax -> TopK -> Renormalize RenormalizeNaive = 4, + // MiniMaxM2: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize(without bias) + MiniMax2 = 5, // Unspecified - Unspecified = 5, + Unspecified = 6, }; inline int32_t maybeGetMinTokenCount(int32_t numPaddedTokens, int32_t hiddenSize, int32_t dtypeSizeBits) @@ -98,6 +100,7 @@ inline std::string serializeMoeRoutingMethodType(RoutingMethodType routingMethod case RoutingMethodType::DeepSeekV3: return "DeepSeekV3"; case RoutingMethodType::Llama4: return "Llama4"; case RoutingMethodType::RenormalizeNaive: return "RenormalizeNaive"; + case RoutingMethodType::MiniMax2: return "MiniMax2"; default: TLLM_CHECK_WITH_INFO(false, "Invalid routing method"); return ""; }; } diff --git a/docker/Dockerfile.multi b/docker/Dockerfile.multi index 74e18b2cd2a2..a6bf164d1ad0 100644 --- a/docker/Dockerfile.multi +++ b/docker/Dockerfile.multi @@ -1,8 +1,8 @@ # Multi-stage Dockerfile ARG BASE_IMAGE=nvcr.io/nvidia/pytorch ARG TRITON_IMAGE=nvcr.io/nvidia/tritonserver -ARG BASE_TAG=25.10-py3 -ARG TRITON_BASE_TAG=25.10-py3 +ARG BASE_TAG=25.12-py3 +ARG TRITON_BASE_TAG=25.12-py3 ARG DEVEL_IMAGE=devel FROM ${BASE_IMAGE}:${BASE_TAG} AS base @@ -147,6 +147,7 @@ RUN --mount=type=cache,target=/root/.cache/pip --mount=type=bind,from=wheel,sour pip install /tmp/wheel/tensorrt_llm*.whl COPY README.md ./ +COPY --from=wheel /src/tensorrt_llm/build/tensorrt_llm*.whl ./ COPY docs docs COPY cpp/include include diff --git a/docker/Makefile b/docker/Makefile index 519dbbda13d3..67c0a36015c5 100644 --- a/docker/Makefile +++ b/docker/Makefile @@ -202,17 +202,16 @@ jenkins-rockylinux8_%: PYTHON_VERSION_TAG_ID = $(if $(findstring 3.12,${PYTHON_V jenkins-rockylinux8_%: IMAGE_WITH_TAG = $(shell . ../jenkins/current_image_tags.properties && echo $$LLM_ROCKYLINUX8_${PYTHON_VERSION_TAG_ID}_DOCKER_IMAGE) jenkins-rockylinux8_%: STAGE = tritondevel jenkins-rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -# [TODO] Update to NVIDIA CUDA 13.0.2 when it's available -jenkins-rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +jenkins-rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8 rockylinux8_%: STAGE = tritondevel rockylinux8_%: BASE_IMAGE = nvcr.io/nvidia/cuda -rockylinux8_%: BASE_TAG = 13.0.1-devel-rockylinux8 +rockylinux8_%: BASE_TAG = 13.1.0-devel-rockylinux8 # For x86_64 and aarch64 ubuntu22_%: STAGE = tritondevel ubuntu22_%: BASE_IMAGE = nvcr.io/nvidia/cuda -ubuntu22_%: BASE_TAG = 13.0.1-devel-ubuntu22.04 +ubuntu22_%: BASE_TAG = 13.1.0-devel-ubuntu22.04 trtllm_%: STAGE = release trtllm_%: PUSH_TO_STAGING := 0 diff --git a/docker/common/install_cuda_toolkit.sh b/docker/common/install_cuda_toolkit.sh index 0dc5cb305aa5..555a3b348b87 100644 --- a/docker/common/install_cuda_toolkit.sh +++ b/docker/common/install_cuda_toolkit.sh @@ -5,7 +5,7 @@ set -ex # This script is used for reinstalling CUDA on Rocky Linux 8 with the run file. # CUDA version is usually aligned with the latest NGC CUDA image tag. # Only use when public CUDA image is not ready. 
-CUDA_VER="13.0.2_580.95.05" +CUDA_VER="13.1.0_590.44.01" CUDA_VER_SHORT="${CUDA_VER%_*}" NVCC_VERSION_OUTPUT=$(nvcc --version) diff --git a/docker/common/install_polygraphy.sh b/docker/common/install_polygraphy.sh index 315658d7a0d3..da9df6495464 100644 --- a/docker/common/install_polygraphy.sh +++ b/docker/common/install_polygraphy.sh @@ -5,7 +5,7 @@ set -ex if [ -n "${GITHUB_MIRROR}" ]; then export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" fi -pip3 install polygraphy==0.49.9 +pip3 install polygraphy==0.49.26 # Clean up pip cache and temporary files pip3 cache purge diff --git a/docker/common/install_pytorch.sh b/docker/common/install_pytorch.sh index 069b26846c89..9a971a3bb25c 100644 --- a/docker/common/install_pytorch.sh +++ b/docker/common/install_pytorch.sh @@ -4,8 +4,8 @@ set -ex # Use latest stable version from https://pypi.org/project/torch/#history # and closest to the version specified in -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -TORCH_VERSION="2.9.0" +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 +TORCH_VERSION="2.9.1" SYSTEM_ID=$(grep -oP '(?<=^ID=).+' /etc/os-release | tr -d '"') prepare_environment() { diff --git a/docker/common/install_tensorrt.sh b/docker/common/install_tensorrt.sh index 3887be6fa260..855daa366e4f 100644 --- a/docker/common/install_tensorrt.sh +++ b/docker/common/install_tensorrt.sh @@ -2,20 +2,20 @@ set -ex -TRT_VER="10.13.3.9" +TRT_VER="10.14.1.48" # Align with the pre-installed cuDNN / cuBLAS / NCCL versions from -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 -CUDA_VER="13.0" # 13.0.2 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 +CUDA_VER="13.1" # 13.1.0 # Keep the installation for cuDNN if users want to install PyTorch with source codes. # PyTorch 2.x can compile with cuDNN v9. -CUDNN_VER="9.14.0.64-1" -NCCL_VER="2.27.7-1+cuda13.0" -CUBLAS_VER="13.1.0.3-1" +CUDNN_VER="9.17.0.29-1" +NCCL_VER="2.28.9-1+cuda13.0" +CUBLAS_VER="13.2.0.9-1" # Align with the pre-installed CUDA / NVCC / NVRTC versions from # https://docs.nvidia.com/cuda/cuda-toolkit-release-notes/index.html -NVRTC_VER="13.0.88-1" -CUDA_RUNTIME="13.0.96-1" -CUDA_DRIVER_VERSION="580.95.05-1.el8" +NVRTC_VER="13.1.80-1" +CUDA_RUNTIME="13.1.80-1" +CUDA_DRIVER_VERSION="590.44.01-1.el8" for i in "$@"; do case $i in @@ -118,7 +118,12 @@ install_rockylinux_requirements() { install_tensorrt() { PY_VERSION=$(python3 -c 'import sys; print(".".join(map(str, sys.version_info[0:2])))') PARSED_PY_VERSION=$(echo "${PY_VERSION//./}") + TRT_CUDA_VERSION=${CUDA_VER} + # No CUDA 13.1 version for TensorRT yet. Use CUDA 13.0 package instead. + if [ "$CUDA_VER" = "13.1" ]; then + TRT_CUDA_VERSION="13.0" + fi TRT_VER_SHORT=$(echo $TRT_VER | cut -d. -f1-3) if [ -z "$RELEASE_URL_TRT" ];then diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index b26e45de9242..c73e903e6cd0 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -170,6 +170,24 @@ TRT-LLM multimodal supports the following modalities and data types (depending o `load_base64_image utility `__ for implementation details. +**Image embeddings** + +It is also possible to directly provide the image embeddings to use by the multimodal +model. 
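+
+The embeddings payload can be produced by serializing a tensor with `torch.save` and
+base64-encoding the resulting bytes. A minimal sketch (the tensor name and shape below
+are only illustrative; the expected shape depends on the model's vision encoder):
+
+.. code-block:: python
+
+   import base64
+   import io
+
+   import torch
+
+   # Hypothetical image embeddings, e.g. [num_image_tokens, hidden_size].
+   image_embeds = torch.randn(576, 4096, dtype=torch.float16)
+
+   buffer = io.BytesIO()
+   torch.save(image_embeds, buffer)
+   image_embeddings_base64 = base64.b64encode(buffer.getvalue()).decode("utf-8")
+
+The resulting string can then be passed in the `data` field, as shown below.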
+
+* Using "image_embeds" with base64-encoded data:
+
+  .. code-block:: json
+
+     {"role": "user", "content": [
+         {"type": "text", "text": "What's in this image?"},
+         {"type": "image_embeds", "image_embeds": {"data": "{image_embeddings_base64}"}}
+     ]}
+
+.. note::
+   The contents of `image_embeddings_base64` can be generated by base64-encoding
+   the result of serializing a tensor via `torch.save`.
+
 **Video**

 * Using "video_url":
diff --git a/jenkins/Build.groovy b/jenkins/Build.groovy
index 5ecaa43a2233..35b95d4cac9e 100644
--- a/jenkins/Build.groovy
+++ b/jenkins/Build.groovy
@@ -83,19 +83,19 @@ def BUILD_CONFIGS = [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_PYBIND): [
         (WHEEL_EXTRA_ARGS) : "--binding_type pybind --extra-cmake-vars WARNING_IS_ERROR=ON --extra-cmake-vars NIXL_ROOT=/opt/nvidia/nvda_nixl --extra-cmake-vars MOONCAKE_ROOT=/usr/local/Mooncake",
         (TARNAME) : "pybind-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
     (CONFIG_LINUX_AARCH64_LLVM) : [
         (WHEEL_EXTRA_ARGS) : "--extra-cmake-vars WARNING_IS_ERROR=ON -DCMAKE_C_COMPILER=clang -DCMAKE_CXX_COMPILER=clang++ -DCMAKE_CUDA_HOST_COMPILER=clang -DCMAKE_LINKER_TYPE=LLD",
         (TARNAME) : "llvm-TensorRT-LLM-GH200.tar.gz",
         (WHEEL_ARCHS): "90-real;100-real;103-real;120-real",
-        (BUILD_JOBS_FOR_CONFIG): "4", // TODO: Remove after fix the build OOM issue on SBSA
+        (BUILD_JOBS_FOR_CONFIG): "8", // TODO: Remove after fixing the build OOM issue on SBSA
     ],
 ]
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
index f62874af41a0..bd2cdd21a132 100644
--- a/jenkins/L0_Test.groovy
+++ b/jenkins/L0_Test.groovy
@@ -39,7 +39,7 @@ LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE = env.wheelDockerImagePy310
 LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE = env.wheelDockerImagePy312

 // DLFW torch image
-DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.10-py3"
+DLFW_IMAGE = "urm.nvidia.com/docker/nvidia/pytorch:25.12-py3"

 //Ubuntu base image
 UBUNTU_22_04_IMAGE = "urm.nvidia.com/docker/ubuntu:22.04"
@@ -316,6 +316,11 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
             foundRunningLine = true
             return false // Don't include the "Running" line itself
         }
+        // Stop collecting when we hit the warnings/errors summary separator
+        if (foundRunningLine && line.contains('======================')) {
+            foundRunningLine = false // Stop collecting
+            return false
+        }
         def hasDoubleColon = line.contains('::')

         def shouldInclude = foundRunningLine && hasDoubleColon
@@ -3389,7 +3394,7 @@ def launchTestJobs(pipeline, testFilter)
     // Python version and OS for sanity check
     x86SanityCheckConfigs = [
         "PY312-DLFW": [
-            LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE,
+            LLM_DOCKER_IMAGE, // Workaround ABI incompatibilities between PyTorch 2.9.1 and 2.10.0a0
            "B200_PCIe",
            X86_64_TRIPLE,
            false,
@@ -3418,15 +3423,16 @@
    ]

    aarch64SanityCheckConfigs = [
+        /* //Disable PY312-UB2404 temporarily due to the lack of an official PyTorch build for CUDA 13.1.
"PY312-UB2404": [ LLM_DOCKER_IMAGE, "GH200", AARCH64_TRIPLE, false, "", - UBUNTU_24_04_IMAGE, - true, // Extra PyTorch CUDA 13.0 install - ], + DLFW_IMAGE, + false, // Extra PyTorch CUDA 13.0 install + ],*/ "PY312-DLFW": [ LLM_DOCKER_IMAGE, "GH200", @@ -3524,7 +3530,7 @@ def launchTestJobs(pipeline, testFilter) def platform = cpu_arch == X86_64_TRIPLE ? "x86_64" : "sbsa" trtllm_utils.llmExecStepWithRetry(pipeline, script: "wget https://developer.download.nvidia.com/compute/cuda/repos/${ubuntu_version}/${platform}/cuda-keyring_1.1-1_all.deb") trtllm_utils.llmExecStepWithRetry(pipeline, script: "dpkg -i cuda-keyring_1.1-1_all.deb") - trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-0") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "apt-get update && apt-get install -y cuda-toolkit-13-1") } // Extra PyTorch CUDA 13.0 install for all bare-metal environments (Default PyTorch is for CUDA 12.8) if (values[6]) { @@ -3532,9 +3538,9 @@ def launchTestJobs(pipeline, testFilter) // Use internal mirror instead of https://download.pytorch.org/whl/cu130 for better network stability. // PyTorch CUDA 13.0 package and torchvision package can be installed as expected. if (k8s_arch == "amd64") { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1+cu130 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } else { - trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.0+cu130 torchvision==0.24.0 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") + trtllm_utils.llmExecStepWithRetry(pipeline, script: "pip3 install torch==2.9.1+cu130 torchvision==0.24.1 --extra-index-url https://urm.nvidia.com/artifactory/api/pypi/pytorch-cu128-remote/simple") } } diff --git a/jenkins/current_image_tags.properties b/jenkins/current_image_tags.properties index 32643c10d3c9..24e44e26fabe 100644 --- a/jenkins/current_image_tags.properties +++ b/jenkins/current_image_tags.properties @@ -13,7 +13,7 @@ # images are adopted from PostMerge pipelines, the abbreviated commit hash is used instead. 
IMAGE_NAME=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm -LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-x86_64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.10-py3-aarch64-ubuntu24.04-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py310-trt10.13.3.9-skip-tritondevel-202512241744-10055 -LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.0.2-devel-rocky8-x86_64-rocky8-py312-trt10.13.3.9-skip-tritondevel-202512241744-10055 +LLM_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-x86_64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_SBSA_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:pytorch-25.12-py3-aarch64-ubuntu24.04-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_ROCKYLINUX8_PY310_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py310-trt10.14.1.48-skip-tritondevel-202601011103-9818 +LLM_ROCKYLINUX8_PY312_DOCKER_IMAGE=urm.nvidia.com/sw-tensorrt-docker/tensorrt-llm:cuda-13.1.0-devel-rocky8-x86_64-rocky8-py312-trt10.14.1.48-skip-tritondevel-202601011103-9818 diff --git a/requirements.txt b/requirements.txt index a21b8ca2819c..2e789cbc7f61 100644 --- a/requirements.txt +++ b/requirements.txt @@ -19,13 +19,14 @@ pandas h5py==3.12.1 StrEnum sentencepiece>=0.1.99 -tensorrt~=10.13.3 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.9.0a0. -torch>=2.9.0a0,<=2.9.0 +tensorrt~=10.14.1 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.10.0a0. 
+torch>=2.9.1,<=2.10.0a0 torchvision nvidia-modelopt[torch]~=0.37.0 -# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-10.html#rel-25-10 uses 2.27.7 -nvidia-nccl-cu13==2.27.7 +# https://docs.nvidia.com/deeplearning/frameworks/pytorch-release-notes/rel-25-12.html#rel-25-12 uses 2.28.9 +# torch 2.9.1+cu130 depends on nvidia-nccl-cu13==2.27.7 +nvidia-nccl-cu13>=2.27.7,<=2.28.9 nvidia-cuda-nvrtc transformers==4.57.1 prometheus_client @@ -65,7 +66,7 @@ ninja etcd3 @ git+https://github.com/kragniz/python-etcd3.git@e58a899579ba416449c4e225b61f039457c8072a blake3 soundfile -triton==3.5.0 +triton==3.5.1 tiktoken blobfile openai-harmony==0.0.4 diff --git a/tensorrt_llm/_torch/attention_backend/interface.py b/tensorrt_llm/_torch/attention_backend/interface.py index 677a441c4059..71f7c8be6757 100644 --- a/tensorrt_llm/_torch/attention_backend/interface.py +++ b/tensorrt_llm/_torch/attention_backend/interface.py @@ -24,6 +24,14 @@ from ..pyexecutor.resource_manager import KVCacheManager from ..utils import get_model_extra_attrs +try: + # Transformers v5 + from transformers.configuration_utils import ALLOWED_ATTENTION_LAYER_TYPES +except ImportError: + # Transformers v4 + from transformers.configuration_utils import \ + ALLOWED_LAYER_TYPES as ALLOWED_ATTENTION_LAYER_TYPES + @dataclass class AttentionRuntimeFeatures: @@ -448,6 +456,13 @@ class RopeParams: def from_config(config) -> "RopeParams": rope_params = RopeParams() + hf_rope_parameters = getattr(config, 'rope_parameters', None) + if hf_rope_parameters is not None: + assert not set(hf_rope_parameters.keys()).issubset( + ALLOWED_ATTENTION_LAYER_TYPES), ( + "Per-layer-type RoPE configuration is not supported yet.") + config.update(hf_rope_parameters) + # get rotary parameters. hidden_size = config.hidden_size num_attention_heads = config.num_attention_heads diff --git a/tensorrt_llm/_torch/auto_deploy/llm_args.py b/tensorrt_llm/_torch/auto_deploy/llm_args.py index aa9f0147cd20..7c46a48df36f 100644 --- a/tensorrt_llm/_torch/auto_deploy/llm_args.py +++ b/tensorrt_llm/_torch/auto_deploy/llm_args.py @@ -384,6 +384,12 @@ class LlmArgs(AutoDeployConfig, BaseLlmArgs, BaseSettings): _quant_config: Optional[QuantConfig] = PrivateAttr(default=None) + max_stats_len: int = Field( + default=1000, + description="The max number of performance statistic entries.", + status="prototype", + ) + @property def quant_config(self) -> QuantConfig: if self._quant_config is None: diff --git a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py index 48066cb2565d..a81e0f3f5ca7 100644 --- a/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py +++ b/tensorrt_llm/_torch/auto_deploy/shim/ad_executor.py @@ -490,10 +490,12 @@ def __init__( self.max_beam_width = ad_config.max_beam_width self.spec_config = ad_config.speculative_config self._disable_overlap_scheduler = ad_config.disable_overlap_scheduler + self.llm_args.max_stats_len = ad_config.max_stats_len else: self.max_beam_width = 1 self.spec_config = None self._disable_overlap_scheduler = False + self.llm_args.max_stats_len = 1000 # check for max total draft tokens if self.spec_config is not None: diff --git a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py index 06e93eb3e5e2..5b683637c6e3 100644 --- a/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/torch_custom_ops.py @@ -252,14 +252,32 @@ def fused_moe( ) run_moe = 
moe_runner.fused_moe_runner.run_moe_min_latency if min_latency_mode else moe_runner.fused_moe_runner.run_moe - output = run_moe(input, token_selected_experts, token_final_scales, - fc1_expert_weights, fc1_expert_biases, fc2_expert_weights, - fc2_expert_biases, quant_scales, input_sf, - swizzled_input_sf, swiglu_alpha, swiglu_beta, swiglu_limit, - tp_size, tp_rank, ep_size, ep_rank, cluster_size, - cluster_rank, enable_alltoall, min_latency_mode, - [gemm_tactic_1, gemm_tactic_2], activation_type, - unpadded_hidden_size, tuner_num_tokens, out_tensor) + try: + output = run_moe(input, token_selected_experts, token_final_scales, + fc1_expert_weights, fc1_expert_biases, + fc2_expert_weights, fc2_expert_biases, quant_scales, + input_sf, swizzled_input_sf, swiglu_alpha, swiglu_beta, + swiglu_limit, tp_size, tp_rank, ep_size, ep_rank, + cluster_size, cluster_rank, enable_alltoall, + min_latency_mode, [gemm_tactic_1, gemm_tactic_2], + activation_type, unpadded_hidden_size, + tuner_num_tokens, out_tensor) + except RuntimeError as e: + error_msg = str(e) + if "DeepGEMM only supports Hopper" in error_msg: + raise RuntimeError( + f"{error_msg}" + "Note: This is the Cutlass backend with DeepGemm JIT path. " + "For Blackwell (SM100+) support, please use the DEEPGEMM backend instead." + ) from e + raise + + # When out_tensor is provided, the result is written in-place to out_tensor. + # Return empty list to avoid aliasing constraint violation in PyTorch 2.9.1+ + # (custom op output cannot be the same tensor as input). + # Callers should use out_tensor directly when they provide it. + if out_tensor is not None and not min_latency_mode: + return [] return output if min_latency_mode else [output] diff --git a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py index 412b461a93c5..ea7d7f93e689 100644 --- a/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/trtllm_gen_custom_ops.py @@ -1102,9 +1102,17 @@ def mxe4m3_mxe2m1_block_scale_moe_runner( 0] = routing_logits # replace dummy routing logits with actual routing logits input_tensors[-2] = topk_weights # replace dummy topk_weights with actual input_tensors[-1] = topk_ids # replace dummy topk_ids with actual - return kernel_runner(input_tensors, - tactic=[-1, -1] if best_tactic == -1 else best_tactic, - output=output) + result = kernel_runner( + input_tensors, + tactic=[-1, -1] if best_tactic == -1 else best_tactic, + output=output) + # When output is provided, the result is written in-place to output. + # Return empty tensor to avoid aliasing constraint violation in PyTorch 2.9.1+ + # (custom op output cannot be the same tensor as input). + # Callers should use output directly when they provide it. 
+    if output is not None:
+        return torch.empty(0, device=result.device, dtype=result.dtype)
+    return result


 @dataclass(frozen=True)
diff --git a/tensorrt_llm/_torch/models/__init__.py b/tensorrt_llm/_torch/models/__init__.py
index 7b8718c10bdc..c56bf86faffd 100644
--- a/tensorrt_llm/_torch/models/__init__.py
+++ b/tensorrt_llm/_torch/models/__init__.py
@@ -15,6 +15,7 @@
 from .modeling_hyperclovax import HCXVisionForCausalLM
 from .modeling_llama import LlamaForCausalLM
 from .modeling_llava_next import LlavaNextModel
+from .modeling_minimaxm2 import MiniMaxM2ForCausalLM
 from .modeling_mistral import Mistral3VLM, MistralForCausalLM
 from .modeling_mixtral import MixtralForCausalLM
 from .modeling_nemotron import NemotronForCausalLM
@@ -80,6 +81,7 @@
     "SeedOssForCausalLM",
     "Glm4MoeForCausalLM",
     "Qwen3VLModel",
+    "MiniMaxM2ForCausalLM",
 ]

 if transformers.__version__ >= "4.45.1":
diff --git a/tensorrt_llm/_torch/models/modeling_minimaxm2.py b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
new file mode 100644
index 000000000000..73cd480ee7cc
--- /dev/null
+++ b/tensorrt_llm/_torch/models/modeling_minimaxm2.py
@@ -0,0 +1,314 @@
+# SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from typing import Dict, List, Optional
+
+import torch
+from torch import nn
+from transformers import PretrainedConfig
+
+from tensorrt_llm.functional import PositionEmbeddingType
+
+from ..attention_backend import AttentionMetadata
+from ..attention_backend.interface import PositionalEmbeddingParams, RopeParams
+from ..models.modeling_utils import ModelConfig
+from ..modules.attention import Attention
+from ..modules.decoder_layer import DecoderLayer
+from ..modules.embedding import Embedding
+from ..modules.fused_moe import MiniMaxM2MoeRoutingMethod, create_moe
+from ..modules.linear import Linear
+from ..modules.rms_norm import RMSNorm
+from ..utils import AuxStreamType
+from .modeling_utils import DecoderModel, DecoderModelForCausalLM, register_auto_model
+
+
+# MiniMax M2/M2.1 requires implementing the following two additional components:
+# 1. MoE routing method: TRT-LLM does not currently support the
+#    sigmoid -> add bias -> topk -> renorm routing method.
+# 2. QK layer normalization needs to be performed across the head_num * head_size dimension,
+#    which conflicts with the current TP-mode attention logic.
+# For better performance, we suggest enabling attention DP when using MiniMax M2/M2.1 models.
+class MiniMaxM2MoE(nn.Module):
+    def __init__(
+        self,
+        model_config: ModelConfig[PretrainedConfig],
+        aux_stream: torch.cuda.Stream,
+        layer_idx: Optional[int] = None,
+    ):
+        super().__init__()
+        config = model_config.pretrained_config
+        self.hidden_dim = config.hidden_size
+        self.ffn_dim = config.intermediate_size
+        self.num_experts = config.num_local_experts
+        self.top_k = config.num_experts_per_tok
+        self.enable_attention_dp = model_config.mapping.enable_attention_dp
+
+        # moe gate (linear layer) only runs in half/full precision for now
+        self.gate = Linear(
+            self.hidden_dim, self.num_experts, bias=False, dtype=torch.float32, quant_config=None
+        )
+
+        self.e_score_correction_bias = nn.Parameter(
+            torch.empty((self.num_experts), dtype=torch.float32), requires_grad=False
+        )
+
+        reduce_results = True
+        self.experts = create_moe(
+            routing_method=MiniMaxM2MoeRoutingMethod(
+                top_k=self.top_k,
+                num_experts=self.num_experts,
+                callable_e_score_correction_bias=lambda: self.e_score_correction_bias,
+            ),
+            num_experts=self.num_experts,
+            aux_stream_dict={AuxStreamType.MoeChunkingOverlap: aux_stream},
+            reduce_results=reduce_results,
+            model_config=model_config,
+            layer_idx=layer_idx,
+        )
+
+    def load_weights(self, weights: List[Dict]):
+        assert len(weights) == 1
+
+        self.e_score_correction_bias.copy_(
+            weights[0]["e_score_correction_bias"][:].to(self.e_score_correction_bias.dtype)
+        )
+
+    def forward(
+        self,
+        hidden_states: torch.Tensor,
+        attn_metadata: AttentionMetadata,
+    ) -> torch.Tensor:
+        all_rank_num_tokens = attn_metadata.all_rank_num_tokens
+        hidden_states_f32 = hidden_states.to(torch.float32)
+        router_logits = self.gate(hidden_states_f32)
+        final_hidden_states = self.experts(
+            hidden_states,
+            router_logits,
+            all_rank_num_tokens=all_rank_num_tokens,
+            use_dp_padding=False,
+        )
+        return final_hidden_states
+
+
+# Implementing the special QK norm is a little tricky: the RMS norm dimension is
+# num_heads * head_size, not hidden_size, while after the QKV linear each rank only
+# holds num_heads * head_size / tp_size elements.
+# There are two strategies to implement QK-norm attention:
+# 1. Keep the first linear layer non-column-parallel so the normal RMS norm can be used;
+#    each rank's attention then works on the full QKV.
+# 2. Use a column-parallel linear layer, allgather QKV from all GPUs, apply RMS norm to
+#    q and k, then split QKV back to each GPU and continue.
+# For better performance, we choose the second strategy here.
+# Most adaptations are from QKNormRoPEAttention.
+class MiniMaxM2Attention(Attention): + def __init__( + self, + *, + model_config: ModelConfig[PretrainedConfig], + layer_idx: Optional[int] = None, + ): + config = model_config.pretrained_config + self.pretrained_config = config + + super().__init__( + hidden_size=config.hidden_size, + num_attention_heads=config.num_attention_heads, + num_key_value_heads=config.num_key_value_heads, + max_position_embeddings=config.max_position_embeddings, + bias=False, + pos_embd_params=PositionalEmbeddingParams( + type=PositionEmbeddingType.rope_gpt_neox, + rope=RopeParams.from_config(config), + ), + rope_fusion=True, + layer_idx=layer_idx, + dtype=config.torch_dtype, + config=model_config, + ) + + self.q_norm = RMSNorm( + hidden_size=self.q_size * self.tp_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype, + ) + self.k_norm = RMSNorm( + hidden_size=self.kv_size * self.tp_size, + eps=config.rms_norm_eps, + dtype=config.torch_dtype, + ) + + def apply_qk_norm(self, q, k): + if self.qkv_proj.mapping.tp_size > 1: + # collect q and k from all gpus + from ..distributed import allgather + + temp_q = allgather(q, self.qkv_proj.mapping) + temp_k = allgather(k, self.qkv_proj.mapping) + temp_q = self.q_norm(temp_q) + temp_k = self.k_norm(temp_k) + q = temp_q.reshape(-1, self.tp_size, self.q_size)[:, self.tp_rank, :].reshape( + -1, self.q_size + ) + k = temp_k.reshape(-1, self.tp_size, self.kv_size)[:, self.tp_rank, :].reshape( + -1, self.kv_size + ) + else: + q = self.q_norm(q) + k = self.k_norm(k) + + return q, k + + def apply_rope( + self, + q: torch.Tensor, + k: Optional[torch.Tensor], + v: Optional[torch.Tensor], + position_ids: torch.Tensor, + ): + """ + The apply_rope method is called in the forward method of the Attention class. + The apply_rope method is overridden in this class to apply QK norm and RoPE to the input tensor. + """ + # Apply QK norm before RoPE. 
+ q, k, v = self.split_qkv(q, k, v) + q, k = self.apply_qk_norm(q, k) + return super().apply_rope(q, k, v, position_ids) + + +class MiniMaxM2DecoderLayer(DecoderLayer): + def __init__( + self, + model_config: ModelConfig[PretrainedConfig], + layer_idx: int, + aux_stream: torch.cuda.Stream, + ): + super().__init__() + config = model_config.pretrained_config + self.hidden_size = config.hidden_size + + self.self_attn = MiniMaxM2Attention(model_config=model_config, layer_idx=layer_idx) + + self.block_sparse_moe = MiniMaxM2MoE( + model_config=model_config, aux_stream=aux_stream, layer_idx=layer_idx + ) + + self.input_layernorm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + + self.post_attention_layernorm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + self.mapping = model_config.mapping + self.layer_idx = layer_idx + + def forward( + self, + position_ids: torch.IntTensor, + hidden_states: torch.Tensor, + attn_metadata: AttentionMetadata, + residual: Optional[torch.Tensor], + **kwargs, + ) -> torch.Tensor: + if residual is None: + residual = hidden_states + hidden_states = self.input_layernorm(hidden_states) + else: + hidden_states, residual = self.input_layernorm(hidden_states, residual) + + # Self Attention + hidden_states = self.self_attn( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + **kwargs, + ) + + # Fully Connected + hidden_states, residual = self.post_attention_layernorm(hidden_states, residual) + hidden_states = self.block_sparse_moe(hidden_states, attn_metadata) + return hidden_states, residual + + +class MiniMaxM2Model(DecoderModel): + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__(model_config) + # add this for kv cache initialization (if we use bf16 for kv cache) + quant_config = model_config.quant_config + if quant_config is None or ( + (not quant_config.quant_mode.has_fp8_kv_cache()) + and (not quant_config.quant_mode.has_fp4_kv_cache()) + ): + model_config.pretrained_config.torch_dtype = torch.bfloat16 + config = model_config.pretrained_config + self.vocab_size = config.vocab_size + self.aux_stream = torch.cuda.Stream() + + self.embed_tokens = Embedding( + config.vocab_size, + config.hidden_size, + dtype=config.torch_dtype, + enable_torch_compile_for_embedding=model_config.enable_torch_compile_for_embedding, + ) + + self.layers = nn.ModuleList( + [ + MiniMaxM2DecoderLayer(model_config, layer_idx, self.aux_stream) + for layer_idx in range(config.num_hidden_layers) + ] + ) + self.norm = RMSNorm( + hidden_size=config.hidden_size, eps=config.rms_norm_eps, dtype=config.torch_dtype + ) + + def forward( + self, + attn_metadata: AttentionMetadata, + input_ids: Optional[torch.IntTensor] = None, + position_ids: Optional[torch.IntTensor] = None, + inputs_embeds: Optional[torch.FloatTensor] = None, + **kwargs, + ) -> torch.Tensor: + if (input_ids is None) ^ (inputs_embeds is not None): + raise ValueError( + "You cannot specify both input_ids and inputs_embeds at the same time, and must specify either one" + ) + + if inputs_embeds is None: + inputs_embeds = self.embed_tokens(input_ids) + + hidden_states = inputs_embeds + + residual = None + for decoder_layer in self.layers: + hidden_states, residual = decoder_layer( + position_ids=position_ids, + hidden_states=hidden_states, + attn_metadata=attn_metadata, + residual=residual, + ) + + hidden_states, _ = self.norm(hidden_states, residual) + return 
hidden_states + + +@register_auto_model("MiniMaxM2ForCausalLM") +class MiniMaxM2ForCausalLM(DecoderModelForCausalLM[MiniMaxM2Model, PretrainedConfig]): + def __init__(self, model_config: ModelConfig[PretrainedConfig]): + super().__init__( + MiniMaxM2Model(model_config), + config=model_config, + hidden_size=model_config.pretrained_config.hidden_size, + vocab_size=model_config.pretrained_config.vocab_size, + ) diff --git a/tensorrt_llm/_torch/modules/fused_moe/__init__.py b/tensorrt_llm/_torch/modules/fused_moe/__init__.py index 053ecaa25fe0..51d6ba5f9475 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/__init__.py +++ b/tensorrt_llm/_torch/modules/fused_moe/__init__.py @@ -12,7 +12,8 @@ from .routing import (BaseMoeRoutingMethod, DeepSeekV3MoeRoutingMethod, DefaultMoeRoutingMethod, Llama4RenormalizeMoeRoutingMethod, - LoadBalancedMoeRoutingMethod, RenormalizeMoeRoutingMethod, + LoadBalancedMoeRoutingMethod, MiniMaxM2MoeRoutingMethod, + RenormalizeMoeRoutingMethod, RenormalizeNaiveMoeRoutingMethod, RoutingMethodType, SparseMixerMoeRoutingMethod, StaticMoeRoutingMethod, create_renormalize_expert_load_balanced_logits) @@ -33,6 +34,7 @@ "MoE", "MoeLoadBalancer", "MoEWeightLoadingMode", + "MiniMaxM2MoeRoutingMethod", "RenormalizeMoeRoutingMethod", "RenormalizeNaiveMoeRoutingMethod", "RoutingMethodType", diff --git a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py index f1559edfb68d..530d903ad354 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/create_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/create_moe.py @@ -138,8 +138,9 @@ def create_moe_backend( moe_load_balancer = get_moe_load_balancer() if moe_load_balancer is not None: assert moe_cls in [ - WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE - ], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE now." + WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE, CuteDslFusedMoE, + DeepGemmFusedMoE + ], "MoE Load Balance is only supported in WideEPMoE, CutlassFusedMoE, TRTLLMGenFusedMoE and CuteDslFusedMoE, and DeepGemmFusedMoE." if bias: assert moe_cls in [CutlassFusedMoE, TritonFusedMoE, TRTLLMGenFusedMoE diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index 71e13e1324b7..41eacfe65c39 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -433,7 +433,7 @@ def run_moe( elif self.has_w4a16_mxfp4: weight_dtype = torch.uint8 - final_hidden_states = torch.ops.trtllm.fused_moe( + result = torch.ops.trtllm.fused_moe( x, token_selected_experts, token_final_scales, @@ -468,10 +468,13 @@ def run_moe( unpadded_hidden_size=self.unpadded_hidden_size, out_tensor=moe_output, ) - # Custom op requires all inputs are in the same type. - # Only in cutlass_min_latency_mode, the output is a list of tensors. - # Otherwise, the output should be unpacked as a single tensor. - final_hidden_states = final_hidden_states[0] + # When moe_output is provided, the result is written in-place and + # fused_moe returns empty list to avoid aliasing constraint violation. + # Otherwise, unpack the single tensor from the returned list. 
+ if moe_output is not None: + final_hidden_states = moe_output + else: + final_hidden_states = result[0] return final_hidden_states diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py index df0a419b3872..5cd181812fca 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_trtllm_gen.py @@ -610,7 +610,7 @@ def run_moe( intermediate_size_per_partition_padded = self.w3_w1_weight.shape[ -2] // 2 - final_hidden_states = torch.ops.trtllm.mxe4m3_mxe2m1_block_scale_moe_runner( + result = torch.ops.trtllm.mxe4m3_mxe2m1_block_scale_moe_runner( router_logits, routing_bias, x, @@ -640,6 +640,10 @@ def run_moe( token_selected_experts, output=moe_output, ) + + # When output is provided, use it directly as the result + # (custom op returns empty tensor to avoid PyTorch aliasing constraints) + final_hidden_states = moe_output if moe_output is not None else result else: raise NotImplementedError( "TRTLLMGenFusedMoE only supports fp8_block_scaling, nvfp4, w4a16_mxfp4, w4a8_mxfp4_mxfp8 and w4a8_mxfp4_fp8 dtypes." diff --git a/tensorrt_llm/_torch/modules/fused_moe/routing.py b/tensorrt_llm/_torch/modules/fused_moe/routing.py index 3927e9bd6bf1..be21b5716a91 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/routing.py +++ b/tensorrt_llm/_torch/modules/fused_moe/routing.py @@ -155,8 +155,10 @@ class RoutingMethodType(IntEnum): Llama4 = 3, # Qwen3: Softmax -> TopK -> Renormalize RenormalizeNaive = 4, + # MiniMaxM2: Sigmoid -> RoutingBiasAdd -> TopK -> Renormalize(without bias) + MiniMax2 = 5, # Unspecified - Unspecified = 5, + Unspecified = 6, class BaseMoeRoutingMethod(nn.Module): @@ -379,6 +381,57 @@ def routing_method_type(self): return RoutingMethodType.DeepSeekV3 +class MiniMaxM2MoeRoutingMethod(BaseMoeRoutingMethod): + + def __init__( + self, + top_k: int, + num_experts: int, + callable_e_score_correction_bias: Callable[[], torch.Tensor], + output_dtype: torch.dtype = torch.float32, + ): + super().__init__() + self.top_k = top_k + self.num_experts = num_experts + assert callable(callable_e_score_correction_bias) + self.callable_e_score_correction_bias = callable_e_score_correction_bias + self.output_dtype = output_dtype + + @staticmethod + @torch.compile(options={"max-autotune": True}) + def get_scores(logits, e_score_correction_bias): + scores = F.sigmoid(logits) + scores_with_bias = scores + e_score_correction_bias + if enable_llm_debug(): + has_nan = torch.isnan(scores_with_bias).any() + if has_nan: + warnings.warn( + "Detected NAN in the tensor scores_with_bias. Please check if it matches the expectation." 
+ ) + + return scores, scores_with_bias + + def apply(self, + router_logits: torch.Tensor) -> tuple[torch.Tensor, torch.Tensor]: + scores, scores_with_bias = self.get_scores(router_logits, + self.e_score_correction_bias) + _, topk_idx = torch.topk(scores_with_bias, + k=self.top_k, + dim=-1, + sorted=False) + top_k_weights = scores.gather(1, topk_idx) + top_k_weights /= (top_k_weights.sum(dim=-1, keepdim=True) + 1e-20) + return topk_idx.to(torch.int32), top_k_weights.to(self.output_dtype) + + @property + def e_score_correction_bias(self) -> torch.Tensor: + return self.callable_e_score_correction_bias() + + @property + def routing_method_type(self): + return RoutingMethodType.MiniMax2 + + class RenormalizeMoeRoutingMethod(BaseMoeRoutingMethod): def __init__( @@ -587,6 +640,8 @@ def routing_method_type(self) -> RoutingMethodType: RenormalizeNaiveMoeRoutingMethod, RoutingMethodType.Unspecified: BaseMoeRoutingMethod, + RoutingMethodType.MiniMax2: + MiniMaxM2MoeRoutingMethod, } diff --git a/tensorrt_llm/_torch/pyexecutor/py_executor.py b/tensorrt_llm/_torch/pyexecutor/py_executor.py index 412997336396..94fa5020ca7f 100644 --- a/tensorrt_llm/_torch/pyexecutor/py_executor.py +++ b/tensorrt_llm/_torch/pyexecutor/py_executor.py @@ -143,7 +143,6 @@ def __init__(self, super(PyExecutor, self).__init__() self.device_id = torch.cuda.current_device() self.global_rank = dist.rank - # Store the execution stream for model forward operations. # This stream is used for proper synchronization with KVCacheTransferManager. # execution_stream can be provided by create_py_executor @@ -181,6 +180,7 @@ def __init__(self, self.max_draft_len = max_draft_len self.max_total_draft_tokens = max_total_draft_tokens self.llm_args = self.model_engine.llm_args + self.max_stats_len = max(self.llm_args.max_stats_len, 1) self.max_num_tokens = self.llm_args.max_num_tokens self.print_log = self.llm_args.print_iter_log self.enable_iter_perf_stats = self.llm_args.enable_iter_perf_stats @@ -866,6 +866,8 @@ def _append_iter_stats(self, req_stats: Optional[List[RequestStats]] = None): with self.stats_lock: + if len(self.stats) > self.max_stats_len: + self.stats.pop(0) self.stats.append((stats, req_stats)) def _process_iter_stats( diff --git a/tensorrt_llm/inputs/__init__.py b/tensorrt_llm/inputs/__init__.py index 406a71d4f5ca..4587d39f033e 100644 --- a/tensorrt_llm/inputs/__init__.py +++ b/tensorrt_llm/inputs/__init__.py @@ -16,7 +16,8 @@ async_load_audio, async_load_image, async_load_video, convert_image_mode, default_multimodal_input_loader, encode_base64_content_from_url, encode_base64_image, - get_cache_salt_id, load_image, load_video) + get_cache_salt_id, load_base64_image_embeds, load_image, + load_video) __all__ = [ "ALL_SUPPORTED_MULTIMODAL_MODELS", @@ -57,4 +58,5 @@ "get_cache_salt_id", "compute_retained_tokens_count", "compute_retention_mask", + "load_base64_image_embeds", ] diff --git a/tensorrt_llm/inputs/utils.py b/tensorrt_llm/inputs/utils.py index bbbd5f4f8f22..0dc09547b3d2 100644 --- a/tensorrt_llm/inputs/utils.py +++ b/tensorrt_llm/inputs/utils.py @@ -114,6 +114,15 @@ def load_base64_image(parsed_url: str) -> Image.Image: return image +def load_base64_image_embeds(str_content: str) -> torch.Tensor: + content_bytes = base64.b64decode(str_content) + with BytesIO(content_bytes) as buf: + image_data: torch.Tensor = torch.load(buf, + weights_only=True, + map_location="cpu") + return image_data + + def load_image(image: Union[str, Image.Image], format: str = "pt", device: str = "cpu") -> Union[Image.Image, torch.Tensor]: @@ 
-425,13 +434,14 @@ class MultimodalData(TypedDict): """Type definition for multimodal data structure.""" modality: str data: Any + is_embedding: bool class ConversationMessage(TypedDict): """Type definition for conversation message structure.""" role: str content: List[dict[str, Any]] - media: List[MultimodalData] | List[torch.Tensor] | List[Dict[str, Any]] + media: List[MultimodalData] # @classmethod # def fromSample(cls, sample: dict[str, str]) -> "ConversationMessage": @@ -446,33 +456,57 @@ def __init__( model_type: str, multimodal_server_config: Optional[MultimodalServerConfig] = None): self._model_type = model_type - self._data = defaultdict[str](list) - self._placeholder_counts = defaultdict[str](int) + self._data = defaultdict[str, list](list) + self._embeddings = defaultdict[str, list](list) + self._placeholder_counts = defaultdict[str, int](int) self._multimodal_server_config = multimodal_server_config if multimodal_server_config is not None else MultimodalServerConfig( ) - async def retrieve_all_async(self) -> Optional[Dict[str, List[Any]]]: - """Retrieve all collected multimodal data.""" - if not self._data: - return None - - return { - modality: await asyncio.gather(*items) - for modality, items in self._data.items() - } - - def retrieve_all_sync(self) -> Optional[Dict[str, List[Any]]]: - """Retrieve all collected multimodal data.""" - if not self._data: - return None - - return {modality: items for modality, items in self._data.items()} - - def add_data(self, media_type: str, data: Union[Coroutine, Any]): - current_count = len(self._data[media_type]) + 1 + async def retrieve_all_async( + self + ) -> tuple[Optional[Dict[str, List[Any]]], Optional[Dict[str, List[Any]]]]: + """Retrieve all collected multimodal data and embeddings.""" + + async def _retrieve( + data: Optional[dict[str, + list]]) -> Optional[Dict[str, List[Any]]]: + if not data: + return None + return { + modality: await asyncio.gather(*items) + for modality, items in data.items() if items + } + + return await _retrieve(self._data), await _retrieve(self._embeddings) + + def retrieve_all_sync( + self + ) -> tuple[Optional[Dict[str, List[Any]]], Optional[Dict[str, List[Any]]]]: + """Retrieve all collected multimodal data and embeddings.""" + + def _retrieve( + data: Optional[dict[str, + list]]) -> Optional[Dict[str, List[Any]]]: + if not data: + return None + return { + modality: items + for modality, items in data.items() if items + } + + return _retrieve(self._data), _retrieve(self._embeddings) + + def add_data(self, + media_type: str, + data: Union[Coroutine, Any], + *, + is_embedding: bool = False): + current_count = len(self._data[media_type]) + len( + self._embeddings[media_type]) + 1 placeholder = retrieve_multimodal_placeholder(self._model_type, media_type, current_count) - self._data[media_type].append(data) + (self._embeddings + if is_embedding else self._data)[media_type].append(data) if placeholder: self._placeholder_counts[placeholder] += 1 @@ -643,33 +677,34 @@ def convert_to_conversation_message( media = [media] if modality in ["image", "multiple_image"]: if is_embedding: + _load = lambda mm: mm + # each mm_embedding corresponds to each image placeholder if not isinstance(media, list): media = [media] - - mm_data = [{ - 'modality': modality, - 'mm_embedding_info': mm - } for mm in media] else: - mm_data = [ - MultimodalData(modality=modality, - data=load_image(i, - format=image_data_format, - device=device)) - for i in media - ] + _load = lambda mm: load_image( + mm, format=image_data_format, 
device=device) + + mm_data = [ + MultimodalData(modality=modality, + data=_load(mm), + is_embedding=is_embedding) for mm in media + ] elif modality == "video": if is_embedding: raise ValueError( "External embedding is not supported for video modality yet." ) mm_data = [ - MultimodalData(modality=modality, - data=load_video(i, - num_frames, - format=image_data_format, - device=device)) for i in media + MultimodalData( + modality=modality, + data=load_video(i, + num_frames, + format=image_data_format, + device=device), + is_embedding=False, + ) for i in media ] elif modality == "audio": if is_embedding: @@ -677,8 +712,11 @@ def convert_to_conversation_message( "External embedding is not supported for audio modality yet." ) mm_data = [ - MultimodalData(modality=modality, - data=load_audio(i, device=device)) for i in media + MultimodalData( + modality=modality, + data=load_audio(i, device=device), + is_embedding=False, + ) for i in media ] elif modality == "image_audio": if is_embedding: @@ -706,16 +744,22 @@ def convert_to_conversation_message( pass if _modal is None: raise ValueError(f"Unknown matching modality: {modality}") - mm_data.append(MultimodalData(modality=_modal, data=data)) + mm_data.append( + MultimodalData(modality=_modal, + data=data, + is_embedding=False)) elif modality == "mixture_text_image": mm_data = [] for m in media: if m: mm_data.append( - MultimodalData(modality="image", - data=load_image(m, - format=image_data_format, - device=device))) + MultimodalData( + modality="image", + data=load_image(m, + format=image_data_format, + device=device), + is_embedding=False, + )) else: raise ValueError(f"Unknown modality: {modality}") return ConversationMessage(role="user", content=prompt, media=mm_data) @@ -749,17 +793,12 @@ def convert_to_conversation_message( is_embedding) mm_data_tracker = MultimodalDataTracker(model_type) for mdata in conv["media"]: - # Check if mdata is a MultimodalData - if isinstance(mdata, - dict) and "modality" in mdata and "data" in mdata: - mdata_modality = mdata["modality"] - if modality == "multiple_image": - mdata_modality = "image" - mm_data_tracker.add_data(mdata_modality, mdata["data"]) - else: - # Add embeddings to the tracker for placeholder handling - mm_data_tracker.add_data(mdata["modality"], - mdata["mm_embedding_info"]) + mdata_modality = mdata["modality"] + if modality == "multiple_image": + mdata_modality = "image" + mm_data_tracker.add_data(mdata_modality, + mdata["data"], + is_embedding=is_embedding) mm_placeholder_counts = mm_data_tracker.placeholder_counts() prompt = conv["content"] if mm_placeholder_counts: @@ -776,11 +815,13 @@ def convert_to_conversation_message( if mm_placeholder_counts: if mm_embeddings is not None: - input[ + _, input[ "multi_modal_embeddings"] = mm_data_tracker.retrieve_all_sync( ) else: - input["multi_modal_data"] = mm_data_tracker.retrieve_all_sync() + input[ + "multi_modal_data"], _ = mm_data_tracker.retrieve_all_sync( + ) inputs.append(input) return inputs diff --git a/tensorrt_llm/llmapi/llm.py b/tensorrt_llm/llmapi/llm.py index ac869d765a5e..71d34af1a647 100644 --- a/tensorrt_llm/llmapi/llm.py +++ b/tensorrt_llm/llmapi/llm.py @@ -31,7 +31,7 @@ GenerationResult, IterationResult, LoRARequest, PostprocWorkerConfig, PromptAdapterRequest) from ..executor.postproc_worker import PostprocParams -from ..executor.utils import (create_mpi_comm_session, +from ..executor.utils import (RequestError, create_mpi_comm_session, get_spawn_proxy_process_env) from ..inputs import (PromptInputs, create_input_processor, 
create_input_processor_with_hash, get_cache_salt_id, @@ -686,7 +686,7 @@ def _check_arguments(self, prompt_len: int, query_len: int, if self.args.backend == "pytorch" and not self.args.enable_chunked_prefill and not is_gen_only: max_num_tokens = self.args.max_num_tokens if max_num_tokens and prompt_len / self.args.parallel_config.cp_size + query_len > max_num_tokens: - raise ValueError( + raise RequestError( f"The sum of prompt length ({prompt_len/self.args.parallel_config.cp_size}), query length ({query_len}) should not exceed " f"max_num_tokens ({max_num_tokens})") return diff --git a/tensorrt_llm/llmapi/llm_args.py b/tensorrt_llm/llmapi/llm_args.py index 3f15252b84fb..9a13eab29a83 100644 --- a/tensorrt_llm/llmapi/llm_args.py +++ b/tensorrt_llm/llmapi/llm_args.py @@ -2963,6 +2963,12 @@ class TorchLlmArgs(BaseLlmArgs): status="prototype", ) + max_stats_len: int = Field( + default=1000, + description="The max number of performance statistic entries.", + status="prototype", + ) + @property def quant_config(self) -> QuantConfig: if self._quant_config is None: diff --git a/tensorrt_llm/serve/chat_utils.py b/tensorrt_llm/serve/chat_utils.py index 26ee17c4f407..e08caadaaf5f 100644 --- a/tensorrt_llm/serve/chat_utils.py +++ b/tensorrt_llm/serve/chat_utils.py @@ -17,7 +17,8 @@ from tensorrt_llm.inputs import (ConversationMessage, MultimodalData, MultimodalDataTracker, add_multimodal_placeholders, async_load_audio, - async_load_image, async_load_video) + async_load_image, async_load_video, + load_base64_image_embeds) from tensorrt_llm.inputs.multimodal import MultimodalServerConfig from tensorrt_llm.logger import logger @@ -33,24 +34,45 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): type: Required[Literal["video_url"]] +class ImageEmbedsData(TypedDict): + """Type definition for serialized image embeddings structure.""" + data: Required[str] + + +class ChatCompletionContentPartImageEmbedsParam(TypedDict, total=False): + """Type definition for image embeddings passed in base64-encoded PyTorch tensor format.""" + image_embeds: Required[ + # TODO: Besides "data", could support "url" and "ipc_handle" in the future. + ImageEmbedsData] + type: Required[Literal["image_embeds"]] + + # Type Aliases and Constants ChatCompletionContentPartParam: TypeAlias = Union[ - OpenAIChatCompletionContentPartParam, ChatCompletionContentPartVideoParam, - str] + OpenAIChatCompletionContentPartParam, + ChatCompletionContentPartVideoParam, + ChatCompletionContentPartImageEmbedsParam, + str, +] # TODO: Add "input_audio" to support byte_encoded audio input. 
VALID_MESSAGE_CONTENT_MM_PART_TYPES = [ - "text", "image_url", "video_url", "audio_url" + "text", + "image_url", + "video_url", + "audio_url", + "image_embeds", ] # Parser Functions _TextParser = partial(cast, ChatCompletionContentPartTextParam) _ImageParser = partial(cast, ChatCompletionContentPartImageParam) +_ImageEmbedsParser = partial(cast, ChatCompletionContentPartImageEmbedsParam) _VideoParser = partial(cast, ChatCompletionContentPartVideoParam) _AudioParser = partial(cast, ChatCompletionContentPartInputAudioParam) MM_PARSER_MAP: dict[str, Callable[[ChatCompletionContentPartParam], Union[ - str, dict[str, str]]]] = { + str, dict[str, str], None]]] = { "text": lambda part: _TextParser(part).get("text", None), "image_url": @@ -59,12 +81,15 @@ class ChatCompletionContentPartVideoParam(TypedDict, total=False): lambda part: _VideoParser(part).get("video_url", {}).get("url", None), "audio_url": lambda part: _AudioParser(part).get("audio_url", {}).get("url", None), + "image_embeds": + lambda part: _ImageEmbedsParser(part).get("image_embeds", {}).get( + "data", None), } def _parse_chat_message_content_mm_part( part: ChatCompletionContentPartParam -) -> tuple[str, Union[str, dict[str, str]]]: +) -> tuple[str, Union[str, dict[str, str], None]]: """Parse a single multimodal part of a chat message.""" assert isinstance(part, dict) part_type = part.get("type", None) @@ -78,9 +103,9 @@ def _parse_chat_message_content_mm_part( def parse_chat_message_content_part( - part: ChatCompletionMessageParam, + part: ChatCompletionContentPartParam, mm_data_tracker: MultimodalDataTracker, -) -> Optional[Any]: +) -> str | MultimodalData | None: """Parse a single part of a chat message.""" if isinstance(part, str): return part @@ -110,7 +135,23 @@ async def load_image_async(): logger.error(f"Failed to load image: {str(e)}") return None - return MultimodalData(modality="image", data=load_image_async()) + return MultimodalData(modality="image", + data=load_image_async(), + is_embedding=False) + + if part_type == "image_embeds": + str_content = cast(str, content) + + async def decode_image_embeds_async(): + try: + return load_base64_image_embeds(str_content) + except Exception as e: + logger.error(f"Failed to decode image data: {str(e)}") + return None + + return MultimodalData(modality="image", + data=decode_image_embeds_async(), + is_embedding=True) if part_type == "video_url": str_content = cast(str, content) @@ -125,7 +166,9 @@ async def load_video_async(): logger.error(f"Failed to load video: {str(e)}") return None - return MultimodalData(modality="video", data=load_video_async()) + return MultimodalData(modality="video", + data=load_video_async(), + is_embedding=False) if part_type == "audio_url": str_content = cast(str, content) @@ -140,14 +183,16 @@ async def load_audio_async(): logger.error(f"Failed to load audio: {str(e)}") return None - return MultimodalData(modality="audio", data=load_audio_async()) + return MultimodalData(modality="audio", + data=load_audio_async(), + is_embedding=False) raise NotImplementedError(f"Unknown part type: {part_type}") def parse_chat_message_content_parts( role: str, - parts: Iterable[ChatCompletionMessageParam], + parts: Iterable[ChatCompletionContentPartParam], mm_data_tracker: MultimodalDataTracker, ) -> ConversationMessage: """Parse multiple parts of a chat message.""" @@ -224,8 +269,9 @@ def parse_chat_messages_coroutines( messages: List[ChatCompletionMessageParam], model_config: AutoConfig, multimodal_server_config: Optional[MultimodalServerConfig] = None -) -> 
Tuple[List[ConversationMessage], Optional[Coroutine[ - Any, Any, Optional[Dict[str, List[Any]]]]]]: +) -> Tuple[List[ConversationMessage], Coroutine[Any, Any, tuple[Optional[Dict[ + str, List[Any]]], Optional[Dict[str, List[Any]]]]], list[dict[str, + int]]]: """Parse multiple chat messages and return conversation and coroutine.""" conversation = [] mm_placeholder_counts = [] @@ -237,7 +283,9 @@ def parse_chat_messages_coroutines( conversation.append(parsed_msg) if parsed_msg["media"]: for mdata in parsed_msg["media"]: - mm_data_tracker.add_data(mdata["modality"], mdata["data"]) + mm_data_tracker.add_data(mdata["modality"], + mdata["data"], + is_embedding=mdata["is_embedding"]) mm_placeholder_count = mm_data_tracker.placeholder_counts() if mm_placeholder_count: parsed_msg["content"] = add_multimodal_placeholders( diff --git a/tensorrt_llm/serve/openai_server.py b/tensorrt_llm/serve/openai_server.py index 44983306dc83..afb97aa6f0c6 100644 --- a/tensorrt_llm/serve/openai_server.py +++ b/tensorrt_llm/serve/openai_server.py @@ -563,9 +563,13 @@ async def create_chat_response( ) prompt = prompt_inputs(prompt) - mm_data = await mm_coroutines - if mm_data is not None: + mm_data, mm_embeddings = await mm_coroutines + if mm_data: prompt["multi_modal_data"] = mm_data + if mm_embeddings: + prompt["multi_modal_embeddings"] = mm_embeddings + if mm_data and mm_embeddings: + raise ValueError("Passing 'multi_modal_data' and 'multi_modal_embeddings' at the same time is not supported.") postproc_args.reasoning_parser = self.llm.args.reasoning_parser postproc_args.tool_parser = self.tool_parser @@ -666,7 +670,9 @@ async def create_mm_embedding_response(promise: RequestOutput): ) prompt = prompt_inputs(prompt) - mm_data = await mm_coroutines + mm_data, mm_embeddings = await mm_coroutines + if mm_embeddings: + raise ValueError("Cannot use multimodal embeddings as input") if mm_data is not None: prompt["multi_modal_data"] = mm_data diff --git a/tests/integration/defs/accuracy/references/gsm8k.yaml b/tests/integration/defs/accuracy/references/gsm8k.yaml index 76cf65bcb3d4..96a9ef6b94c4 100644 --- a/tests/integration/defs/accuracy/references/gsm8k.yaml +++ b/tests/integration/defs/accuracy/references/gsm8k.yaml @@ -313,3 +313,7 @@ nvidia/Nemotron-3-Nano: - quant_algo: FP8 kv_cache_quant_algo: FP8 accuracy: 68.73 +MiniMaxAI/MiniMax-M2: + - accuracy: 85 + - quant_algo: FP8_BLOCK_SCALES + accuracy: 85 diff --git a/tests/integration/defs/accuracy/references/longbench_v1.yaml b/tests/integration/defs/accuracy/references/longbench_v1.yaml index c638ab92bb8e..e54288d0945e 100644 --- a/tests/integration/defs/accuracy/references/longbench_v1.yaml +++ b/tests/integration/defs/accuracy/references/longbench_v1.yaml @@ -1,8 +1,8 @@ Qwen3/Qwen3-30B-A3B-Instruct-2507: # Skip Softmax Attention ref accuracy - extra_acc_spec: "target_sparsity=0.0" - accuracy: 47.22 + accuracy: 47.357 - extra_acc_spec: "target_sparsity=0.5" - accuracy: 47.22 + accuracy: 47.102 - extra_acc_spec: "target_sparsity=0.9" - accuracy: 45.90 + accuracy: 46.169 diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 0ca0842b006e..d5835744a7c9 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -3884,7 +3884,6 @@ class TestQwen3_30B_A3B_Instruct_2507(LlmapiAccuracyTestHarness): MODEL_PATH = f"{llm_models_root()}/{MODEL_NAME}" @skip_pre_hopper - # @pytest.mark.skip_less_device_memory(140000) # Only test 
for H200, B200 @pytest.mark.parametrize( "target_sparsity,thr_prefill,thr_decode", [ @@ -3903,15 +3902,51 @@ def test_skip_softmax_attention(self, target_sparsity: float, "prefill": thr_prefill, "decode": thr_decode, }) - kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.85) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) if get_sm_version() >= 100: - pytest.skip("Bug to be fixed on Blackwell") + pytest.skip("https://nvbugs/5783509: Bug to be fixed on Blackwell") + + with LLM(self.MODEL_PATH, + attn_backend="TRTLLM", + max_batch_size=256, + max_num_tokens=100000, + kv_cache_config=kv_cache_config, + sparse_attention_config=sparse_attention_config) as llm: + task = LongBenchV1(self.MODEL_NAME) + task.evaluate(llm, + extra_acc_spec=f"target_sparsity={target_sparsity}") + + @pytest.mark.parametrize( + "target_sparsity,thr_prefill,thr_decode", + [ + (0.0, 0.0, 0.0), + (0.5, 85.97384174442398, 55.48258322852407), + (0.9, 1418.142868970396, 863.147841750025), + ], + ids=[ + "target_sparsity_0.0", "target_sparsity_0.5", "target_sparsity_0.9" + ], + ) + def test_skip_softmax_attention_2gpus(self, target_sparsity: float, + thr_prefill: float, + thr_decode: float): + sparse_attention_config = SkipSoftmaxAttentionConfig( + threshold_scale_factor={ + "prefill": thr_prefill, + "decode": thr_decode, + }) + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75, + enable_block_reuse=False) with LLM(self.MODEL_PATH, attn_backend="TRTLLM", max_batch_size=256, max_num_tokens=100000, + tensor_parallel_size=2, + moe_expert_parallel_size=2, + enable_attention_dp=True, kv_cache_config=kv_cache_config, sparse_attention_config=sparse_attention_config) as llm: task = LongBenchV1(self.MODEL_NAME) @@ -5350,3 +5385,35 @@ def test_nvfp4_8gpus(self, attention_dp): task = GSM8K(self.MODEL_NAME) task.evaluate(llm, extra_evaluator_kwargs=self.EXTRA_EVALUATOR_KWARGS) + + +@skip_pre_hopper +class TestMiniMaxM2(LlmapiAccuracyTestHarness): + MODEL_NAME = "MiniMaxAI/MiniMax-M2" + MODEL_PATH = f"{llm_models_root()}/MiniMax-M2" + + @parametrize_with_ids("tp_size,ep_size", [(4, 4)]) + @pytest.mark.skip_less_device(4) + @parametrize_with_ids("attention_dp,cuda_graph,overlap_scheduler", + [(False, True, True), (True, True, True)]) + def test_4gpus(self, tp_size, ep_size, attention_dp, cuda_graph, + overlap_scheduler): + kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.6) + + pytorch_config = dict( + disable_overlap_scheduler=not overlap_scheduler, + cuda_graph_config=CudaGraphConfig() if cuda_graph else None, + moe_config=MoeConfig( + backend="DEEPGEMM" if get_sm_version() >= 100 else "CUTLASS")) + + with LLM(self.MODEL_PATH, + tensor_parallel_size=tp_size, + pipeline_parallel_size=1, + moe_expert_parallel_size=ep_size, + kv_cache_config=kv_cache_config, + max_seq_len=4096, + **pytorch_config, + enable_attention_dp=attention_dp) as llm: + assert llm.args.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + task = GSM8K(self.MODEL_NAME) + task.evaluate(llm) diff --git a/tests/integration/defs/test_e2e.py b/tests/integration/defs/test_e2e.py index ab1690c87a68..f1d12a3ffbc6 100644 --- a/tests/integration/defs/test_e2e.py +++ b/tests/integration/defs/test_e2e.py @@ -1683,9 +1683,13 @@ def test_openai_lora(llm_root, llm_venv): def test_openai_chat_multimodal_example(llm_root, llm_venv): test_root = unittest_path() / "llmapi" / "apps" - llm_venv.run_cmd( - ["-m", "pytest", - str(test_root / "_test_openai_chat_multimodal.py")]) + llm_venv.run_cmd([ + "-m", + 
"pytest", + str(test_root / "_test_openai_chat_multimodal.py"), + "-m", + "not needs_l40s", + ]) def test_openai_mmencoder_example(llm_root, llm_venv): diff --git a/tests/integration/test_lists/qa/llm_function_core_sanity.txt b/tests/integration/test_lists/qa/llm_function_core_sanity.txt index 8a276d4615fc..8fae1d62933f 100644 --- a/tests/integration/test_lists/qa/llm_function_core_sanity.txt +++ b/tests/integration/test_lists/qa/llm_function_core_sanity.txt @@ -148,7 +148,6 @@ accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4 accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-True-True-False] accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-1-False-False-True] accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-False-True] -accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_8gpus accuracy/test_llm_api_pytorch.py::TestPhi4::test_auto_dtype accuracy/test_llm_api_pytorch.py::TestPhi4::test_fp8 accuracy/test_llm_api_pytorch.py::TestPhi4MiniInstruct::test_auto_dtype diff --git a/tests/integration/test_lists/test-db/l0_b200.yml b/tests/integration/test_lists/test-db/l0_b200.yml index 809810c6d963..02616d7eda35 100644 --- a/tests/integration/test_lists/test-db/l0_b200.yml +++ b/tests/integration/test_lists/test-db/l0_b200.yml @@ -55,7 +55,6 @@ l0_b200: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-TRTLLM] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a8_mxfp4[mxfp8-latency-CUTLASS] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_w4a16_mxfp4[latency-TRTLLM] - - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] - accuracy/test_llm_api_pytorch.py::TestQwen3NextInstruct::test_nvfp4[tp1-cutlass] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index a914a00c5348..84af0aae2b16 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -17,7 +17,7 @@ l0_dgx_b200: tests: - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[NVLinkTwoSided] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_configurable_moe-disable_finalize_fusion-TRTLLM-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4a8_nvfp4_fp8[enable_configurable_moe-TRTLLM] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_mxfp4_mxfp8[enable_configurable_moe-True-8-64-TRTLLM] @@ -31,6 +31,8 @@ l0_dgx_b200: - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_ucx[DeepSeek-V3-Lite-fp8] - disaggregated/test_disaggregated.py::test_disaggregated_deepseek_v3_lite_fp8_nixl[DeepSeek-V3-Lite-fp8] - accuracy/test_llm_api_pytorch.py::TestDeepSeekR1::test_nvfp4_multi_gpus[latency_adp_lmtp_tp4] + - 
accuracy/test_llm_api_pytorch.py::TestMiniMaxM2::test_4gpus[attention_dp=False-cuda_graph=True-overlap_scheduler=True-tp_size=4-ep_size=4] TIMEOUT (60) + # ------------- AutoDeploy tests --------------- - accuracy/test_llm_api_autodeploy.py::TestNemotronMOE::test_bf16 - condition: diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 85b018c1894c..91bf2542b7ef 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -48,6 +48,9 @@ l0_dgx_h100: - accuracy/test_llm_api_autodeploy.py::TestLlama3_1_8B::test_auto_dtype[False-2] # llmapi - unittest/llmapi/test_mpi_session.py::test_llmapi_launch_multiple_tasks + # ------------- Skip softmax attention tests --------------- + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.5] + - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention_2gpus[target_sparsity_0.9] - condition: ranges: system_gpu_count: @@ -146,7 +149,7 @@ l0_dgx_h100: - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp4-bf16-trtllm-deepseekv3_lite] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] + - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[NVLinkTwoSided] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype0] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype1] - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype0] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index adae47f6268b..994c43a1fcbc 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -78,10 +78,6 @@ l0_h100: - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_fp8[latency-torch_compile=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B::test_dummy_load_format - # Waive known failures in https://nvbugs/5774869 - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] TIMEOUT (90) - # - accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] TIMEOUT (90) - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=False] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=True-eagle3_one_model=True] - accuracy/test_llm_api_pytorch.py::TestQwen3_8B::test_eagle3[enable_chunked_prefill=False-eagle3_one_model=True] diff --git a/tests/integration/test_lists/test-db/l0_l40s.yml b/tests/integration/test_lists/test-db/l0_l40s.yml index c3037894895d..d10ba9fc2cdd 100644 --- a/tests/integration/test_lists/test-db/l0_l40s.yml +++ b/tests/integration/test_lists/test-db/l0_l40s.yml @@ -28,6 +28,7 @@ l0_l40s: - 
test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-audio] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image] - test_e2e.py::test_ptp_quickstart_multimodal_phi4mm[phi4-multimodal-instruct-multimodals/Phi-4-multimodal-instruct-image_audio] + - unittest/llmapi/apps/_test_openai_chat_multimodal.py::test_single_chat_session_image_embeds -m needs_l40s # MMMU sanity check - accuracy/test_llm_api_pytorch_multimodal.py::TestQwen2_5_VL_7B::test_auto_dtype - accuracy/test_llm_api_pytorch_multimodal.py::TestVILA1_5_3B::test_auto_dtype diff --git a/tests/integration/test_lists/waives.txt b/tests/integration/test_lists/waives.txt index 8e763d0000a7..047c4019d109 100644 --- a/tests/integration/test_lists/waives.txt +++ b/tests/integration/test_lists/waives.txt @@ -220,11 +220,7 @@ examples/test_multimodal.py::test_llm_multimodal_general[Mistral-Small-3.1-24B-I accuracy/test_llm_api_pytorch.py::TestLlama3_1NemotronNano8Bv1::test_fp8_prequantized SKIP (https://nvbugs/5640697) accuracy/test_llm_api_pytorch.py::TestQwQ_32B::test_auto_dtype_tp4 SKIP (https://nvbugs/5640697) test_e2e.py::test_trtllm_multimodal_benchmark_serving SKIP (https://nvbugs/5647825) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[MNNVL] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] SKIP (https://nvbugs/5664904) -unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[MNNVL] SKIP (https://nvbugs/5664904) +unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] SKIP (https://nvbugs/5808500) test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-FP8-nemotron-nas/Llama-3_3-Nemotron-Super-49B-v1-FP8] SKIP (https://nvbugs/5670469) test_e2e.py::test_ptp_quickstart_advanced[Nemotron-Super-49B-v1-NVFP4-nvfp4-quantized/Llama-3_3-Nemotron-Super-49B-v1_nvfp4_hf] SKIP (https://nvbugs/5670469) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_w4_1gpu[True-True-cutlass-auto] SKIP (https://nvbugs/5756804) @@ -258,7 +254,6 @@ accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4_4gpus[moe_backe accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=CUTLASS-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False] SKIP (https://nvbugs/5722629) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_2gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-two_model-overlap_scheduler] SKIP (https://nvbugs/5702826) -unittest/llmapi/test_llm_pytorch.py::test_llm_reward_model SKIP (https://nvbugs/5670458) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[llguidance-mtp_nextn=2] SKIP (https://nvbugs/5740075) accuracy/test_disaggregated_serving.py::TestDeepSeekV3Lite::test_guided_decoding[xgrammar-mtp_nextn=2] SKIP (https://nvbugs/5740075) unittest/_torch/modeling/test_modeling_out_of_tree.py::TestOutOfTree::test_llm_api[False] SKIP (https://nvbugs/5739981) @@ -283,7 +278,6 @@ examples/test_granite.py::test_llm_granite[granite-3.0-2b-instruct-bfloat16] SKI 
unittest/executor/test_base_worker.py::TestWorkerBase SKIP (https://nvbugs/5759698) triton_server/test_triton.py::test_gpt_disaggregated_serving_bls[gpt-disaggregated-serving-bls] SKIP (https://nvbugs/5582118) cpp/test_multi_gpu.py::test_cache_transceiver[8proc-mooncake_kvcache-90] SKIP (https://nvbugs/5760737) -unittest/_torch/ray_orchestrator/multi_gpu/test_ops.py::test_allreduce_pg_op[seqlen:16-hidden:1024] SKIP (https://nvbugs/5760740) accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[trtllm-two_model-no_overlap_scheduler] SKIP (https://nvbugs/5760747) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=False-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338) accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_bfloat16[mtp_nextn=2-attention_dp=True-cuda_graph=False-overlap_scheduler=False-torch_compile=False-enable_chunked_prefill=False] SKIP (https://nvbugs/5759338) @@ -304,6 +298,9 @@ full:RTXPro6000D/accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp examples/test_ray.py::test_ray_disaggregated_serving[tp2] SKIP (https://nvbugs/5612502) unittest/executor/test_rpc_proxy.py SKIP (https://nvbugs/5605741) unittest/executor/test_rpc_worker.py SKIP (https://nvbugs/5605741) +cpp/test_e2e.py::test_model[-redrafter-86] SKIP (https://nvbugs/5761642) +unittest/_torch/thop/parallel/test_fp8_block_scale_gemm.py::test_deep_gemm_in_subprocess[env2] SKIP (https://nvbugs/5766853) +test_e2e.py::test_openai_responses SKIP (https://nvbugs/5804146) triton_server/test_triton.py::test_gpt_gather_logits[gpt-gather-logits] SKIP (https://nvbugs/5766960) stress_test/stress_test.py::test_run_stress_test[llama-v3-8b-instruct-hf_tp1-stress_time_300s_timeout_450s-GUARANTEED_NO_EVICT-pytorch-stress-test] SKIP (https://nvbugs/5766952) full:sm89/accuracy/test_disaggregated_serving.py::TestLlama3_1_8BInstruct::test_ctx_pp_gen_tp_asymmetric[MMLU-gen_tp=1-ctx_pp=2] SKIP (https://nvbugs/5596337) @@ -324,9 +321,6 @@ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_eagle3_tp8[ accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_nvfp4_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestLlama3_3_70BInstruct::test_fp8_tp4[torch_compile=False] SKIP (https://nvbugs/5794796) accuracy/test_llm_api_pytorch.py::TestQwen3_235B_A22B::test_fp8[throughput_latency] SKIP (https://nvbugs/5775544) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.5] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.0] SKIP (https://nvbugs/5774869) -accuracy/test_llm_api_pytorch.py::TestQwen3_30B_A3B_Instruct_2507::test_skip_softmax_attention[target_sparsity_0.9] SKIP (https://nvbugs/5774869) triton_server/test_triton.py::test_llava_onevision[llava_onevision] SKIP (https://nvbugs/5775205) triton_server/test_triton.py::test_gpt_ib_lad[gpt-ib-lad] SKIP (https://nvbugs/5775223) unittest/_torch/modules/test_fused_moe.py::test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu[MoEWeightLoadingMode.FUSED_GATE_UP_PROJ-DefaultMoeRoutingMethod-1] SKIP (https://nvbugs/5775256) @@ -387,3 +381,6 @@ accuracy/test_llm_api_pytorch.py::TestGemma3_1BInstruct::test_auto_dtype SKIP (h accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[one_model] SKIP (https://nvbugs/5596343) 
accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_guided_decoding_4gpus[two_model] SKIP (https://nvbugs/5596343) accuracy/test_disaggregated_serving.py::TestGemma3_1BInstruct::test_auto_dtype[True] SKIP (https://nvbugs/5799901) +accuracy/test_disaggregated_serving.py::TestQwen3_30B_A3B::test_mixed_ctx_gen_model[ctxpp2gentp2] SKIP (https://nvbugs/5748664) +accuracy/test_llm_api_pytorch.py::TestMistralLarge3_675B::test_nvfp4_4gpus[latency_moe_trtllm_eagle] SKIP (https://nvbugs/5804683) +accuracy/test_llm_api_pytorch.py::TestGPTOSS::test_eagle3_4gpus[cutlass-one_model-no_overlap_scheduler] SKIP (https://nvbugs/5809169) diff --git a/tests/unittest/api_stability/references/llm.yaml b/tests/unittest/api_stability/references/llm.yaml index 4b6f8cedab1c..2e3457f3d452 100644 --- a/tests/unittest/api_stability/references/llm.yaml +++ b/tests/unittest/api_stability/references/llm.yaml @@ -227,6 +227,10 @@ methods: annotation: Optional[Dict[str, str]] default: null status: prototype + max_stats_len: + annotation: int + default: 1000 + status: prototype return_annotation: None generate: parameters: diff --git a/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py b/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py new file mode 100644 index 000000000000..7d8281ecd2b2 --- /dev/null +++ b/tests/unittest/llmapi/apps/_attach_multimodal_embeddings_patch/__init__.py @@ -0,0 +1,53 @@ +# Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
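+
+# Patches Qwen2VLInputProcessorBase with a stand-in 'attach_multimodal_embeddings' so the
+# OpenAI-server image-embeddings test can intercept the embeddings passed to the model.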
+ +# used by tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py + +import tempfile +from pathlib import Path +from typing import Optional + +import torch + +from tensorrt_llm._torch.models.modeling_qwen2vl import Qwen2VLInputProcessorBase +from tensorrt_llm.inputs import ExtraProcessedInputs, TextPrompt +from tensorrt_llm.sampling_params import SamplingParams + +_attach_multimodal_embeddings_orig = Qwen2VLInputProcessorBase.attach_multimodal_embeddings + + +# signature taken from tensorrt_llm/inputs/registry.py +def _attach_multimodal_embeddings( + self, + inputs: TextPrompt, + multimodal_embedding: dict[str, list[torch.Tensor]], + sampling_params: SamplingParams, +) -> tuple[list[int], Optional[ExtraProcessedInputs]]: + try: + _attach_multimodal_embeddings_orig(self, inputs, multimodal_embedding, sampling_params) + except NotImplementedError: + pass + else: + raise ValueError( + "Remove this custom module, Qwen2VLInputProcessorBase implements attach_multimodal_embeddings" + ) + + tempdir = tempfile.gettempdir() + file_path = Path(tempdir) / "multimodal_embedding.pickle" + with open(file_path, "wb") as f: + torch.save(multimodal_embedding, f) + raise ValueError(file_path) + + +Qwen2VLInputProcessorBase.attach_multimodal_embeddings = _attach_multimodal_embeddings diff --git a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py index fda0f8a49333..4183e1874e75 100644 --- a/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py +++ b/tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py @@ -1,13 +1,18 @@ +import io import os +import sys import tempfile +from base64 import b64encode from pathlib import Path from typing import List import openai import pytest +import torch import yaml from PIL import Image +from tensorrt_llm._torch.shared_tensor import SharedTensorContainer from tensorrt_llm.inputs import encode_base64_image from ..test_llm import get_model_path @@ -17,6 +22,13 @@ from utils.llm_data import llm_models_root +from ._test_openai_mmencoder import RemoteMMEncoderServer +from ._test_openai_mmencoder import server as mm_encoder_server +from ._test_openai_mmencoder import \ + test_multimodal_content_mm_encoder as _test_multimodal_content_mm_encoder + +assert mm_encoder_server is not None # keep 'mm_encoder_server' fixture visible in this module + @pytest.fixture(scope="module", ids=["Qwen2.5-VL-3B-Instruct"]) def model_name(): @@ -25,7 +37,7 @@ def model_name(): @pytest.fixture(scope="module") def temp_extra_llm_api_options_file(request): - temp_dir = tempfile.gettempdir() + temp_dir = tempfile.mkdtemp() temp_file_path = os.path.join(temp_dir, "extra_llm_api_options.yaml") try: extra_llm_api_options_dict = { @@ -123,6 +135,98 @@ def test_single_chat_session_image(client: openai.OpenAI, model_name: str): == chat_completion.choices[0].message.content +# used by mm_encoder_server +@pytest.fixture(scope="module") +def extra_encoder_options() -> bool: + return False + + +# used by mm_encoder_server +@pytest.fixture(scope="module") +def temp_extra_encoder_options_file() -> str: + return "/dummy/path" + + +@pytest.fixture(scope="module") +def server_patched(model_name: str, temp_extra_llm_api_options_file: str): + # Custom module implements missing 'attach_multimodal_embeddings' to intercept + # embeddings. 
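+    # The patch saves the embeddings the server receives to a temporary file and raises
+    # an error carrying that file path; the test below loads that file to check the
+    # embeddings round-trip intact (see _attach_multimodal_embeddings_patch/__init__.py).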
+ model_path = get_model_path(model_name) + args = [ + "--extra_llm_api_options", + temp_extra_llm_api_options_file, + "--max_batch_size", + "64", + "--max_num_tokens", + "16384", + "--custom_module_dirs", + str( + Path(sys.modules[test_single_chat_session_image_embeds.__module__]. + __file__).parent / "_attach_multimodal_embeddings_patch"), + ] + with RemoteOpenAIServer(model_path, args) as remote_server: + yield remote_server + + +@pytest.mark.needs_l40s +@pytest.mark.asyncio(loop_scope="module") +def test_single_chat_session_image_embeds( + server_patched: RemoteOpenAIServer, + model_name: str, + mm_encoder_server: RemoteMMEncoderServer, +): + client = server_patched.get_client() + messages, mm_embed_handle = _test_multimodal_content_mm_encoder( + mm_encoder_server.get_client(), model_name) + + max_completion_tokens = 10 + + chat_completion_image = client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=0.0, + logprobs=False) + + mm_embed = SharedTensorContainer.from_dict(mm_embed_handle).get_local_view() + with io.BytesIO() as buf: + torch.save(mm_embed, buf) + mm_embed_bytes = buf.getvalue() + + image_content = messages[0]["content"][1] + assert image_content["type"] == "image_url" + image_content.clear() + image_content["type"] = "image_embeds" + image_content["image_embeds"] = { + "data": b64encode(mm_embed_bytes).decode("ascii") + } + + # test single completion + # + # FIXME: Remove try-except and use 'server' instead of 'server_patched', + # once Qwen2VLInputProcessorBase implements attach_multimodal_embeddings. + try: + chat_completion_embeds = client.chat.completions.create( + model=model_name, + messages=messages, + max_completion_tokens=max_completion_tokens, + temperature=0.0, + logprobs=False) + + assert chat_completion_embeds.choices[ + 0].message == chat_completion_image.choices[0].message + except openai.BadRequestError as e: + assert isinstance(e.body, dict) + with open(Path(e.body["message"]), "rb") as f: + intercepted_embeddings = torch.load(f, weights_only=True) + assert list(intercepted_embeddings.keys()) == ["image"] + assert len(intercepted_embeddings["image"]) == 1 + torch.testing.assert_close(intercepted_embeddings["image"][0], + mm_embed.cpu()) + pytest.xfail( + reason="Model does not implement 'attach_multimodal_embeddings'") + + @pytest.mark.asyncio(loop_scope="module") def test_single_chat_session_multi_image(client: openai.OpenAI, model_name: str): diff --git a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py index 312f9232d400..483f9ad994c2 100644 --- a/tests/unittest/llmapi/apps/_test_openai_mmencoder.py +++ b/tests/unittest/llmapi/apps/_test_openai_mmencoder.py @@ -1,5 +1,6 @@ import os import tempfile +from typing import Any import openai import pytest @@ -67,7 +68,9 @@ def async_client(server: RemoteMMEncoderServer): return server.get_async_client() -def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): +def test_multimodal_content_mm_encoder( + client: openai.OpenAI, + model_name: str) -> tuple[list[dict[str, Any]], dict[str, Any]]: content_text = "Describe the natural environment in the image." 
image_url = str(llm_models_root() / "multimodals" / "test_data" / @@ -105,6 +108,8 @@ def test_multimodal_content_mm_encoder(client: openai.OpenAI, model_name: str): assert mm_handle["tensor_size"][ 1] == 2048 # qwen2.5-vl: hidden_size of the vision encoder + return messages, mm_handle # used by tests/unittest/llmapi/apps/_test_openai_chat_multimodal.py + def test_health(server: RemoteMMEncoderServer): health_url = server.url_for("health") diff --git a/tests/unittest/llmapi/test_llm.py b/tests/unittest/llmapi/test_llm.py index f8ffe8fc7bd7..87624b61b8e4 100644 --- a/tests/unittest/llmapi/test_llm.py +++ b/tests/unittest/llmapi/test_llm.py @@ -2393,7 +2393,8 @@ def fail_path(): enable_chunked_prefill=False, fast_build=True) - with pytest.raises(ValueError): + # max_num_tokens validation now raises RequestError consistently + with pytest.raises(RequestError): output = llm.generate_async( "A " * build_config.max_num_tokens, sampling_params=sampling_params, @@ -2436,13 +2437,9 @@ def _test_llm_capture_request_error(pytorch_backend: bool, tp_size: int = 1): ) prompt = 'A ' * 65 # the minimum max_num_tokens is 64 - if pytorch_backend: - # pytorch backend will raise ValueError for max_num_tokens - with pytest.raises(ValueError): - llm.generate(prompt) - else: - with pytest.raises(RequestError): - llm.generate(prompt) + # Both backends now consistently raise RequestError for max_num_tokens validation + with pytest.raises(RequestError): + llm.generate(prompt) def test_llm_capture_request_error(): diff --git a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py index 1ea15a2e781e..68cdc62ba7ea 100644 --- a/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py +++ b/tests/unittest/llmapi/test_llm_multi_gpu_pytorch.py @@ -66,7 +66,7 @@ def test_llama_7b_multi_lora_tp2(): cuda_graph_config=None) -@skip_ray +@skip_ray # https://nvbugs/5727075 @pytest.mark.gpu2 def test_phi3_lora_fused_modules_output_on_tp2_identical_to_tp1() -> None: check_phi3_lora_fused_modules_output_tp2_identical_to_tp1( diff --git a/tests/unittest/llmapi/test_llm_pytorch.py b/tests/unittest/llmapi/test_llm_pytorch.py index b9702133152e..86f48d31266e 100644 --- a/tests/unittest/llmapi/test_llm_pytorch.py +++ b/tests/unittest/llmapi/test_llm_pytorch.py @@ -8,7 +8,7 @@ from tensorrt_llm import LLM from tensorrt_llm.disaggregated_params import DisaggregatedParams -from tensorrt_llm.executor import GenerationExecutorWorker +from tensorrt_llm.executor import GenerationExecutorWorker, RequestError from tensorrt_llm.executor.rpc_proxy import GenerationExecutorRpcProxy from tensorrt_llm.llmapi import CacheTransceiverConfig, KvCacheConfig from tensorrt_llm.llmapi.llm_args import NGramDecodingConfig, PeftCacheConfig @@ -830,10 +830,13 @@ def test_max_num_token_check(self): kv_cache_config=global_kvcache_config, max_num_tokens=100) - with pytest.raises(ValueError, - match="should not exceed max_num_tokens"): - ids = [random.randint(10, 100) for _ in range(101)] - llm.generate([ids]) + try: + with pytest.raises(RequestError, + match="should not exceed max_num_tokens"): + ids = [random.randint(10, 100) for _ in range(101)] + llm.generate([ids]) + finally: + llm.shutdown() class FailingExecutorWorker(GenerationExecutorWorker): @@ -962,10 +965,13 @@ def test_max_num_token_check(self): kv_cache_config=global_kvcache_config, max_num_tokens=100) - with pytest.raises(ValueError, - match="should not exceed max_num_tokens"): - ids = [random.randint(10, 100) for _ in range(101)] - llm.generate([ids]) + 
try: + with pytest.raises(RequestError, + match="should not exceed max_num_tokens"): + ids = [random.randint(10, 100) for _ in range(101)] + llm.generate([ids]) + finally: + llm.shutdown() @skip_ray