diff --git a/cpp/kernels/fmha_v2/README.md b/cpp/kernels/fmha_v2/README.md index ce189f21875..c068452b368 100644 --- a/cpp/kernels/fmha_v2/README.md +++ b/cpp/kernels/fmha_v2/README.md @@ -20,7 +20,12 @@ the `setup.py` code: export TORCH_CUDA_ARCH_LIST=9.0 ENABLE_SM89_QMMA=1 ENABLE_HMMA_FP32=1 SCHEDULING_MODE=1 ENABLE_SM100=1 ENABLE_SM120=1 ``` -To generate subset of kernels, you can add conditions in setup.py. +To generate subset of kernels, you can add conditions in setup.py. Or set `FMHA_FILTER_ARCH` before calling setup.py: + +``` +# Build only for a specific arch (or list of architectures). Will not enable kernels that are disabled by default +export FMHA_FILTER_ARCH=90 +``` To generate the files and compile the kernels: ``` diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index d4a5beaa010..b79bef940dc 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import subprocess import pytest @@ -268,3 +282,29 @@ def test_trtllm_chunked_attention(chunked_attention_size, input_layout): -chunked-attention-size {chunked_attention_size} -paged-kv", shell=True, check=True) + + +# The test cases for sliding window attention. 
+@pytest.mark.parametrize( + 'sliding_window_size', [64, 127, 128, 129, 256, 512], + ids=[ + "sliding-window-size-64", "sliding-window-size-127", + "sliding-window-size-128", "sliding-window-size-129", + "sliding-window-size-256", "sliding-window-size-512" + ]) +@pytest.mark.parametrize( + 'mask_type', + ["-sliding-or-chunked-causal-mask", "-bidirectional-sliding-window-mask"]) +def test_trtllm_sliding_window_attention(sliding_window_size, mask_type): + if mask_type == "-bidirectional-sliding-window-mask": + sliding_window_size *= 2 + + subprocess.run(f"bin/fmha.exe -d 128 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \ + -sliding-window-size {sliding_window_size} {mask_type}", + shell=True, + check=True) + + subprocess.run(f"bin/fmha.exe -d 64 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \ + -sliding-window-size {sliding_window_size} {mask_type}", + shell=True, + check=True) diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 163d6d0c588..88cba8f793f 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -99,7 +99,8 @@ class AttentionMaskType(IntEnum): PADDING = 0 CAUSAL = 1 SLIDING_OR_CHUNKED_CAUSAL = 2 - CUSTOM_MASK = 3 + BIDIRECTIONAL_SLIDING_WINDOW = 3 + CUSTOM_MASK = 4 class InputLayout(IntEnum): @@ -738,6 +739,20 @@ def get_makefile_code(specs_names): /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; +using Kernel_traits_nl_bidirectional_sliding_window = fmha::{kernel_traits}< + fmha::{instruction_traits}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {noloop_step}, + {warps_m}, + {warps_n}, + {ctas_per_head}, + {kernel_flags} | 0x200 /* no_loop flag */, + /*bidirectional sliding window mask*/ 5, + /*bmm2_fp16_epilogue*/ true, + {output_dtype_}>; + using Kernel_traits_nl_custom_mask = fmha::{kernel_traits}< fmha::{instruction_traits}, {kv_loop_step}, @@ -748,7 +763,7 @@ def get_makefile_code(specs_names): {warps_n}, {ctas_per_head}, {kernel_flags} | 0x200 /* no_loop flag */, - /*custom mask*/ 5, + /*custom mask*/ 6, 
/*bmm2_fp16_epilogue*/ true, {output_dtype_}>; @@ -782,6 +797,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl(params); +}} + +#endif // bidirectional_sliding_window_mask + #if {custom_mask} // custom_mask extern "C" @@ -820,6 +845,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::PADDING ) {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -890,6 +924,20 @@ def get_makefile_code(specs_names): /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; +using Kernel_traits_nl_tiled_bidirectional_sliding_window = fmha::{kernel_traits}< + fmha::{instruction_traits}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {noloop_step}, + {warps_m}, + {warps_n}, + {ctas_per_head}, + {kernel_flags} | 0x200 /* no_loop flag */, + /*bidirectional sliding window mask*/ 5, + /*bmm2_fp16_epilogue*/ true, + {output_dtype_}>; + using Kernel_traits_nl_tiled_custom_mask = fmha::{kernel_traits}< fmha::{instruction_traits}, {kv_loop_step}, @@ -900,7 +948,7 @@ def get_makefile_code(specs_names): {warps_n}, {ctas_per_head}, {kernel_flags} | 0x200 /* no_loop flag */, - /*custom 
mask*/ 5, + /*custom mask*/ 6, /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; @@ -934,6 +982,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl_tiled({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl_tiled(params); +}} + +#endif // bidirectional_sliding_window_mask + #if {custom_mask} // custom_mask extern "C" @@ -973,6 +1031,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl_tiled<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl_tiled, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl_tiled<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::PADDING ) {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -1083,6 +1150,17 @@ def get_makefile_code(specs_names): 4, {kernel_flags}>; +using Kernel_traits_bidirectional_sliding_window = {kernel_traits}< + Traits_p, + Traits_o, + {seq_len}, + {head_size}, + {loop_step}, + {warps_m}, + {warps_n}, + 5, + {kernel_flags}>; + #if {use_tma} // use_tma #if {padding_mask} // padding_mask @@ -1115,6 +1193,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}(const __grid_constant__ {params_type} params){{ + 
fused_multihead_attention::device_{kernel_variant}_tma(params); +}} + +#endif // bidirectional_sliding_window_mask + #else #if {padding_mask} @@ -1144,10 +1232,21 @@ def get_makefile_code(specs_names): void {sliding_or_chunked_causal_kernel_name}(const __grid_constant__ {params_type} params){{ fused_multihead_attention::device_{kernel_variant}(params); }} -#endif #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}(const __grid_constant__ {params_type} params){{ + fused_multihead_attention::device_{kernel_variant}(params); +}} + +#endif // bidirectional_sliding_window_mask + +#endif + void {launcher_name}({fused_multihead_attention_params_v2_str} ¶ms, const Launch_params &launch_params, cudaStream_t stream){{ // setting TMA descriptors if needed. @@ -1259,6 +1358,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else {{ #if {padding_mask} // padding_mask constexpr int smem_size = Kernel_traits::BYTES_PER_SMEM; @@ -1308,6 +1416,17 @@ def get_makefile_code(specs_names): 4, {kernel_flags}>; +using Kernel_traits_bidirectional_sliding_window_nl = {kernel_traits}< + Traits_p, + Traits_o, + {seq_len}, + {head_size}, + {noloop_step}, + {warps_m}, + {warps_n}, + 5, + {kernel_flags}>; + #if {padding_mask} // padding_mask extern "C" @@ -1338,6 +1457,16 @@ def 
get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl(params); +}} + +#endif // bidirectional_sliding_window_mask + void {launcher_name}_nl({fused_multihead_attention_params_v2_str} ¶ms, const Launch_params& launch_params, cudaStream_t stream){{ constexpr int loop_iters = {seq_len} / {noloop_step}; @@ -1364,6 +1493,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -1487,6 +1625,27 @@ def get_makefile_code(specs_names): {enable_skip_softmax_flag}, {output_dtype_}>; +using Ktraits_bidirectional_sliding_window = {kernel_traits_header} + {loop_step}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {q_tile_buffers}, + {kv_tile_buffers}, + NUM_COMPUTE_GROUPS, + DMA2COMPUTE_DEPTH, + 3, + {heads_interleaved_flag}, + {has_alibi}, + {enable_mutex_flag}, + {scheduling_mode}, + {input_layout_flag}, + USE_TMA_STORE && false, + {enable_attn_logit_softcapping_flag}, + {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, + {output_dtype_}>; + using Ktraits_custom_mask = {kernel_traits_header} {loop_step}, {kv_loop_step}, @@ -1496,7 +1655,7 @@ def 
get_makefile_code(specs_names): {kv_tile_buffers}, NUM_COMPUTE_GROUPS, DMA2COMPUTE_DEPTH, - 3, + 4, {heads_interleaved_flag}, {has_alibi}, {enable_mutex_flag}, {scheduling_mode}, {input_layout_flag}, @@ -1658,6 +1817,56 @@ def get_makefile_code(specs_names): //////////////////////////////////////////////////////////////////////////////////////////////////// +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +using Shared_bidirectional_sliding_window = typename Ktraits_bidirectional_sliding_window::Shared; + +extern "C" +__global__ __launch_bounds__(Ktraits_bidirectional_sliding_window::THREADS, 1) +void {bidirectional_sliding_window_kernel_name}( + const __grid_constant__ {params_type} params){{ + + extern __shared__ char smem_[]; + char *smem_aligned = fmha::align_1024(smem_); + + Shared_bidirectional_sliding_window *shared = + reinterpret_cast(&smem_aligned[0]); + shared->init(threadIdx.x == 0); + __syncthreads(); + + // special trick to avoid warp_sync (leads to illegal instruction) + int warp_group = __shfl_sync(0xffffffff, threadIdx.x / 128, 0); + int tidx = threadIdx.x % 128; + + if( warp_group == NUM_COMPUTE_GROUPS ) {{ // dma + sched + + {setmaxnreg_dma_str} + uint32_t elect_one = tidx == 0; + + // Need all threads involved when the dma group needs to transpose the v tile explicitly. 
+ if constexpr ( Ktraits_bidirectional_sliding_window::DMA_GROUP_TRANSPOSE_V ) {{ + fmha::ws::DMA::Device dma_device(elect_one); + dma_device.{run_fct_name}(params, shared); + }} else {{ + fmha::ws::DMA::Device dma_device(elect_one); + if( tidx < 32 ) {{ + dma_device.{run_fct_name}(params, shared); + }} + }} + + }} else {{ // math + + {setmaxnreg_compute_str} + + fmha::ws::Compute compute; + compute.run(warp_group, tidx, shared, params); + }} +}} + +#endif // bidirectional_sliding_window_mask + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #if {custom_mask} // custom_mask using Shared_custom_mask = typename Ktraits_custom_mask::Shared; @@ -1784,6 +1993,15 @@ def get_makefile_code(specs_names): {sliding_or_chunked_causal_kernel_name} <<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}, + cudaFuncAttributeMaxDynamicSharedMemorySize, + SMEM_BYTES)); + + {bidirectional_sliding_window_kernel_name} + <<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::CUSTOM_MASK ) {{ #if {custom_mask} // custom_mask FMHA_CHECK_CUDA(cudaFuncSetAttribute({custom_mask_kernel_name}, @@ -1962,6 +2180,7 @@ def selected_mask_types(kspec): padding_mask = '1' causal_mask = '1' sliding_or_chunked_causal_mask = '1' + bidirectional_sliding_window_mask = '1' custom_mask = '1' # only generate certain needed combinations of input_layout and mask types for trt-llm. 
if "GENERATE_CUBIN" in os.environ: @@ -1969,15 +2188,18 @@ def selected_mask_types(kspec): # SageAttention only needs padding mask now causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' elif (kspec.head_size, kspec.head_size_v) == (192, 128): # MLA context phase only needs causal mask and padding mask (for chunked prefill) now sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' elif (kspec.head_size, kspec.head_size_v) == (576, 512): # MLA generation phase only needs padding mask (MtpMask) now causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' # encoder models (head_size = 32 / 64 / 128) need packed_qkv input layout + padding mask. elif kspec.input_layout == InputLayout.PACKED_QKV: @@ -1988,6 +2210,7 @@ def selected_mask_types(kspec): elif kspec.input_layout == InputLayout.CONTIGUOUS_Q_KV: causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' if kspec.head_size not in [32, 64, 72, 128]: padding_mask = '0' custom_mask = '0' @@ -2001,14 +2224,16 @@ def selected_mask_types(kspec): if (kspec.alibi and kspec.warp_specialization): padding_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' # enable_attn_logit_softcapping kernels only need causal mask or sliding_or_chunked_causal_mask. 
if kspec.enable_attn_logit_softcapping: padding_mask = '0' custom_mask = '0' + bidirectional_sliding_window_mask = '0' - return padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask + return padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask def get_kernel_code(kspec, kname, lname): @@ -2025,6 +2250,8 @@ def get_kernel_code(kspec, kname, lname): custom_mask_kernel_name = kname.replace('__placeholder__', '_custom_mask') sliding_or_chunked_causal_kernel_name = kname.replace( '__placeholder__', '_sliding_or_chunked_causal') + bidirectional_sliding_window_kernel_name = kname.replace( + '__placeholder__', '_bidirectional_sliding_window') kernel_name = kname.replace('__placeholder__', '') # FIXME: use separate parameters when generating cubins for trtllm. @@ -2107,12 +2334,12 @@ def get_kernel_code(kspec, kname, lname): flags |= 8192 # only generate certain needed combinations of input_layout and mask types for trt-llm. - padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask = \ + padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask = \ selected_mask_types(kspec) if any(selected_mask_flag == '1' for selected_mask_flag in selected_mask_types(kspec)): - padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask = \ + padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask = \ selected_mask_types(kspec) else: return None @@ -2894,6 +3121,11 @@ def get_kernel_traits_code(specs_names): snippet_flash_nl_tiled_sliding_or_chunked_causal = snippet_flash_nl_template.replace( '__placeholder__', '_sliding_or_chunked_causal').replace('_nl', '_nl_tiled') + snippet_flash_nl_bidirectional_sliding_window = snippet_flash_nl_template.replace( + '__placeholder__', '_bidirectional_sliding_window') + snippet_flash_nl_tiled_bidirectional_sliding_window = snippet_flash_nl_template.replace( + 
'__placeholder__', + '_bidirectional_sliding_window').replace('_nl', '_nl_tiled') snippet_flash_nl_custom_mask = snippet_flash_nl_template.replace( '__placeholder__', '_custom_mask') snippet_flash_nl_tiled_custom_mask = snippet_flash_nl_template.replace( @@ -2941,9 +3173,13 @@ def get_kernel_traits_code(specs_names): snippet_ws_template.replace('__placeholder__', '_sliding_or_chunked_causal').\ replace('mask_type', '2').\ replace('__use_tma_store__', 'false') + snippet_ws_bidirectional_sliding_window = \ + snippet_ws_template.replace('__placeholder__', '_bidirectional_sliding_window').\ + replace('mask_type', '3').\ + replace('__use_tma_store__', 'false') snippet_ws_custom_mask = \ snippet_ws_template.replace('__placeholder__', '_custom_mask').\ - replace('mask_type', '2').\ + replace('mask_type', '4').\ replace('__use_tma_store__', 'true') elif effective_sm >= 90: #GMMA no flash yet snippet_template = ''' {{ @@ -3007,7 +3243,8 @@ def get_kernel_traits_code(specs_names): padding_mask = int(selected_types[0]) causal_mask = int(selected_types[1]) sliding_or_chunked_causal_mask = int(selected_types[2]) - custom_mask = int(selected_types[3]) + bidirectional_sliding_window_mask = int(selected_types[3]) + custom_mask = int(selected_types[4]) if not padding_mask: snippet = None @@ -3027,6 +3264,10 @@ def get_kernel_traits_code(specs_names): snippet_ws_sliding_or_chunked_causal = None snippet_flash_nl_sliding_or_chunked_causal = None snippet_flash_nl_tiled_sliding_or_chunked_causal = None + if not bidirectional_sliding_window_mask: + snippet_ws_bidirectional_sliding_window = None + snippet_flash_nl_bidirectional_sliding_window = None + snippet_flash_nl_tiled_bidirectional_sliding_window = None if not custom_mask: snippet_ws_custom_mask = None snippet_flash_nl_custom_mask = None @@ -3047,12 +3288,16 @@ def get_kernel_traits_code(specs_names): print_kernel_specs.append(snippet_flash_nl_tiled_causal) print_kernel_specs.append( snippet_flash_nl_tiled_sliding_or_chunked_causal) 
+ print_kernel_specs.append( + snippet_flash_nl_tiled_bidirectional_sliding_window) print_kernel_specs.append(snippet_flash_nl_tiled_custom_mask) elif kspec.flash_attention and kspec.tiled == 0: print_kernel_specs.append(snippet_flash_nl) print_kernel_specs.append(snippet_flash_nl_causal) print_kernel_specs.append( snippet_flash_nl_sliding_or_chunked_causal) + print_kernel_specs.append( + snippet_flash_nl_bidirectional_sliding_window) print_kernel_specs.append(snippet_flash_nl_custom_mask) else: print_kernel_specs.append(snippet_nl) @@ -3066,6 +3311,7 @@ def get_kernel_traits_code(specs_names): print_kernel_specs.append(snippet_ws) print_kernel_specs.append(snippet_ws_causal) print_kernel_specs.append(snippet_ws_sliding_or_chunked_causal) + print_kernel_specs.append(snippet_ws_bidirectional_sliding_window) print_kernel_specs.append(snippet_ws_custom_mask) # remove none. print_kernel_specs = [ @@ -3133,13 +3379,16 @@ def get_cubin_header(kernel_traits, specs_names): '').replace('ldgsts_', '').replace('causal_', '').replace( 'alibi_', '').replace('softmax_', '').replace( 'sliding_or_chunked_', '').replace( - 'custom_mask_', '').replace('qkv_', '').replace( - 'q_kv_', '').replace('q_paged_kv_', '').replace( - 'q_k_v_', '').replace('ws_', '').replace( - 'softcapping_', - '').replace('sage_', '').replace( - 'skipSoftmax_', - '').replace('output_', '')) + 'bidirectional_sliding_window_', '').replace( + 'custom_mask_', '').replace('qkv_', '').replace( + 'q_kv_', + '').replace('q_paged_kv_', '').replace( + 'q_k_v_', + '').replace('ws_', '').replace( + 'softcapping_', + '').replace('sage_', '').replace( + 'skipSoftmax_', + '').replace('output_', '')) flash_attention = 'flash_attention' in kname warp_specialization = 'tma_ws' in kname toks = tname.split('_') @@ -3207,11 +3456,13 @@ def get_cubin_header(kernel_traits, specs_names): is_tiled = pythonBoolean2cpp['_tiled' in kname] # Attention mask type: - # padding (0), causal_mask (1), sliding_or_chunked_causal_mask (2), 
custom_mask (3). + # padding (0), causal_mask (1), sliding_or_chunked_causal_mask (2), bidirectional_sliding_window_mask (3), custom_mask (4). if '_custom_mask' in kname: attention_mask_type = AttentionMaskType.CUSTOM_MASK elif '_sliding_or_chunked_causal' in kname: attention_mask_type = AttentionMaskType.SLIDING_OR_CHUNKED_CAUSAL + elif '_bidirectional_sliding_window' in kname: + attention_mask_type = AttentionMaskType.BIDIRECTIONAL_SLIDING_WINDOW elif '_causal' in kname: attention_mask_type = AttentionMaskType.CAUSAL @@ -3269,7 +3520,9 @@ def get_lname_from_kname(kname: str) -> str: return 'nullptr' lname = kname.replace('_kernel', '') mask_types = [ - '_sliding_or_chunked_causal', '_custom_mask', '_causal' + '_sliding_or_chunked_causal', + '_bidirectional_sliding_window', '_custom_mask', + '_causal' ] for mask_type in mask_types: lname = lname.replace(mask_type, '') @@ -6687,6 +6940,11 @@ def enumerate_kernels(): and (kspec.head_size == 128 or kspec.head_size == 256 or not kspec.enable_attn_logit_softcapping)] # yapf: enable + # A separate more aggressive filter for building the fmha.exe binary. Can be ignored for building the cubins. + if "FMHA_FILTER_ARCH" in os.environ: + archs = [int(x) for x in os.environ["FMHA_FILTER_ARCH"].split(",")] + specs_names = [kspec for kspec in specs_names if kspec[0].sm in archs] + generate_files(specs_names) diff --git a/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h index ece561a6d21..90459572873 100644 --- a/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h @@ -49,7 +49,8 @@ template < int WARPS_N, // The version of the kernel. 
int VERSION_, - // The mask version of the kernel, (2 denotes dense mask, 3 denotes causal mask) + // The mask version of the kernel, (2 denotes dense mask, 3 denotes causal mask, 4 denotes sliding window causal + // mask, 5 denotes bidirectional sliding window mask) int MASK_VERSION_ = 2, // The flags to control the behaviour of LDGs. uint32_t FLAGS = 0x8u> @@ -111,7 +112,7 @@ struct FMHA_kernel_traits_hopper // Whether use causal mask or not. enum { - CAUSAL_MASK = MASK_VERSION_ >= 3 + CAUSAL_MASK = MASK_VERSION_ == 3 || MASK_VERSION_ == 4 }; // Whether use the sliding window attention mask or not. @@ -120,6 +121,12 @@ struct FMHA_kernel_traits_hopper SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention mask or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // Do we use LDGSTS for Q, K or V. If not, TMA is used! enum { diff --git a/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h index 3391cf3d28e..e4a54252bf5 100644 --- a/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h @@ -271,7 +271,8 @@ struct Kernel_traits_ VERSION = VERSION_ }; - // The mask version: padding (2), causal (3), sliding_window_causal (4), custom_mask (5). + // The mask version: padding (2), causal (3), sliding_window_causal (4), bidirectional_sliding_window (5), + // custom_mask (6). enum { MASK_VERSION = MASK_VERSION_ @@ -289,10 +290,16 @@ struct Kernel_traits_ SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // Whether use the custom mask or not. enum { - CUSTOM_MASK = MASK_VERSION_ == 5 + CUSTOM_MASK = MASK_VERSION_ == 6 }; // Do we use LDGSTS for Q, K or V. @@ -551,7 +558,7 @@ struct Kernel_traits_fmhca_ // Whether use causal mask or not. 
enum { - CAUSAL_MASK = MASK_VERSION >= 3 + CAUSAL_MASK = MASK_VERSION == 3 || MASK_VERSION == 4 }; // Whether use the sliding window attention or not. @@ -560,6 +567,12 @@ struct Kernel_traits_fmhca_ SLIDING_WINDOW_ATTENTION = MASK_VERSION == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION == 5 + }; + // Do we use LDGSTS for Q, K or V. enum { @@ -745,7 +758,7 @@ struct Kernel_traits_interleaved_v2_ // Whether use causal mask or not. enum { - CAUSAL_MASK = MASK_VERSION_ >= 3 + CAUSAL_MASK = MASK_VERSION_ == 3 || MASK_VERSION_ == 4 }; // Whether use the sliding window attention or not. @@ -754,6 +767,12 @@ struct Kernel_traits_interleaved_v2_ SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K enum { diff --git a/cpp/kernels/fmha_v2/src/fmha/mask.h b/cpp/kernels/fmha_v2/src/fmha/mask.h index fc490286a8b..04cf4afcca0 100644 --- a/cpp/kernels/fmha_v2/src/fmha/mask.h +++ b/cpp/kernels/fmha_v2/src/fmha/mask.h @@ -492,9 +492,62 @@ struct Mask : public Mask //////////////////////////////////////////////////////////////////////////////////////////////////// +// Assume we only pay attention to bidirectional sliding-window-size long sequence. +// v v v x x x x x x +// v v v v x x x x x +// v v v v v x x x x +// x v v v v v x x x +// x v v v v v x x x +// x x v v v v v x x +// x x x v v v v v x +// x x x x v v v v v +// x x x x x v v v v +// x x x x x x v v v + +template +struct Mask : public Mask +{ + // V5 mask is the bidirectional sliding window mask. + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. 
+ template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) + , seqlen_(block_info.actual_seqlen) + { + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const + { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const + { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (col >= max(0, row - Base::sliding_window_size_ / 2)) + && (col <= min(seqlen_ - 1, row + Base::sliding_window_size_ / 2)); + } + + // The sequence length. + int seqlen_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + // The custom mask (from global memory). template -struct Mask : public Mask +struct Mask : public Mask { using Base = Mask; @@ -958,6 +1011,46 @@ struct Mask_hopper : public Mask_hopper +struct Mask_hopper : public Mask_hopper +{ + + // V5 mask is the bidirectional sliding window mask. + using Base = Mask_hopper; + + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask_hopper(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) + , seqlen_(block_info.actual_seqlen) + { + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const + { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const + { + // Is it a valid position in the sequence? 
+ return col >= max(0, row - Base::sliding_window_size_ / 2) + && col <= min(seqlen_ - 1, row + Base::sliding_window_size_ / 2); + } + + // The sequence length. + int seqlen_; +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace fmha diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index bdc9b6d9dee..bf18d4921f6 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -116,6 +116,12 @@ struct Compute SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Are we applying alibi bias (drop FMA optimizations for accuracy reasons). enum { @@ -288,17 +294,30 @@ struct Compute // Is the chunked_attention used ? bool is_chunked_attention = params.log2_chunked_attention_size > 0; - // The left mask is needed when we attend to a specific sliding window or chunk. + // Handle sliding window or chunked attention masking if constexpr (SLIDING_OR_CHUNKED_ATTENTION) { - // The kv_left_mask_end is the start of the chunk. - kv_left_mask_end = div_up(is_chunked_attention - ? 
((tile_offset_end >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : (tile_offset_end + 1 - params.sliding_window_size), - STEP_KV); + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + // Handle bidirectional sliding window attention + kv_left_mask_end = div_up(tile_offset_end - params.sliding_window_size / 2, STEP_KV); + kv_right_mask_start + = min(kv_idx_end - 1, (tile_offset_start + params.sliding_window_size / 2 + 1) / STEP_KV); + } + else if (is_chunked_attention) + { + // Handle chunked attention + kv_left_mask_end = div_up( + ((tile_offset_end >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size), + STEP_KV); + } + else + { + kv_left_mask_end = div_up(tile_offset_end + 1 - params.sliding_window_size, STEP_KV); + } } - // The right mask is needed when causal mask (including sliding_window_attention or chunked attention) is used. + // The right mask is needed when causal mask is used. if constexpr (SKIP_CAUSAL_MASK_TILES) { kv_right_mask_start = tile_offset_start / STEP_KV; diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h index c1ebf5a7bba..d77c5414455 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h @@ -114,6 +114,12 @@ struct DMA SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Is heads interleaved ? enum { @@ -201,11 +207,27 @@ struct DMA // Skip initial kv tiles due to sliding_window_size if (SLIDING_OR_CHUNKED_ATTENTION) { - // The kv_offset_start. - int kv_offset_start = is_chunked_attention - ? 
((q_step_offset >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : max(0, q_step_offset + 1 - params.sliding_window_size); - kv_idx_start = kv_offset_start / STEP_KV; + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + int kv_offset_start = max(0, q_step_offset - params.sliding_window_size / 2); + int kv_offset_end = min(kv_steps * STEP_KV - 1, q_step_end + params.sliding_window_size / 2); + + // We do floor division plus 1 to get the correct kv_idx_end, this is because kv_idx_end is + // exclusive + kv_idx_start = kv_offset_start / STEP_KV; + kv_idx_end = kv_offset_end / STEP_KV + 1; + } + else if (is_chunked_attention) + { + int kv_offset_start + = ((q_step_offset >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size); + kv_idx_start = kv_offset_start / STEP_KV; + } + else + { + int kv_offset_start = max(0, q_step_offset + 1 - params.sliding_window_size); + kv_idx_start = kv_offset_start / STEP_KV; + } } // Early stop when causal mask is enabled. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index 465c9430cb0..9393b4fd4e3 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -76,6 +76,12 @@ struct Softmax_base SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Are we applying alibi bias (drop FMA optimizations for accuracy reasons). enum { @@ -134,7 +140,7 @@ struct Softmax_base // The corresponding row/col for each thread after MMA. // fixed 4x1 warp layout. quad_col_ = lane % 4; - if (CAUSAL_MASK) + if (CAUSAL_MASK || SLIDING_OR_CHUNKED_ATTENTION) { quad_row_ = warp * 16 + lane / 4; } @@ -149,9 +155,14 @@ struct Softmax_base // The attention chunk start. 
return (row >> log2_chunked_attention_size_) << log2_chunked_attention_size_; } + else if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + // The bidirectional sliding window start is the max of 0 and row - sliding_window_size/2. + return max(0, row - sliding_window_size_ / 2); + } else { - // The sliding window start is the max of 0 and row - sliding_window_size. + // The sliding window start is the max of 0 and row + 1 - sliding_window_size. return max(0, row + 1 - sliding_window_size_); } } @@ -286,14 +297,18 @@ struct Softmax_base valid_positions(mi, ni, v0, v1); // Causal mask. } - else if constexpr (CAUSAL_MASK) + else if constexpr (CAUSAL_MASK || SLIDING_OR_CHUNKED_ATTENTION) { // Causal Mask: we have to apply mask before getting max. int row = row_offset + quad_row_ + mi * 8; col = col_offset + quad_col_ * 2 + ni * 8; - // Mask for the two N elements. - v0 = (col <= row); - v1 = (col + 1 <= row); + + if constexpr (CAUSAL_MASK) + { + // Mask for the two N elements. + v0 &= (col <= row); + v1 &= (col + 1 <= row); + } // Attend to the specific sliding window or chunk. if constexpr (SLIDING_OR_CHUNKED_ATTENTION) @@ -301,6 +316,15 @@ struct Softmax_base int sliding_window_or_chunk_start = compute_sliding_window_or_chunk_start(row); v0 &= (col >= sliding_window_or_chunk_start); v1 &= (col + 1 >= sliding_window_or_chunk_start); + + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + assert(log2_chunked_attention_size_ == 0 + && "Bidirectional sliding window attention should not use chunked attention"); + int sliding_window_end = min(actual_seqlen - 1, row + sliding_window_size_ / 2); + v0 &= (col <= sliding_window_end); + v1 &= (col + 1 <= sliding_window_end); + } } // Dense(padding) mask. 
} diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h index f8d7004939c..966ff015627 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h @@ -51,8 +51,9 @@ template < int NUM_COMPUTE_GROUPS_, // The number of data warpgroups (TMA). int DMA2COMPUTE_DEPTH_, - // The attention mask type: padding (0), causal (1), sliding_window_causal (2), custom_mask (3). - // See fused_multihead_attention_kernel.h for description. + // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), + // bidirectional_sliding_window_attention (3), custom_mask (4). See fused_multihead_attention_kernel.h for + // description. int ATTENTION_MASK_TYPE_ = 0, // Is head interleaved ? // (head_interleaved means input [bxs, h, 3, d], otherwise [bx3, 3, h, d]). @@ -250,7 +251,8 @@ struct Kernel_traits WARP_GROUP_K = 1 }; - // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), custom_mask (3). + // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), + // bidirectional_sliding_window_attention (3), custom_mask (4). enum { CAUSAL_MASK = (ATTENTION_MASK_TYPE_ == 1 || ATTENTION_MASK_TYPE_ == 2) @@ -258,7 +260,12 @@ struct Kernel_traits enum { - SLIDING_OR_CHUNKED_ATTENTION = ATTENTION_MASK_TYPE_ == 2 + SLIDING_OR_CHUNKED_ATTENTION = ATTENTION_MASK_TYPE_ == 2 || ATTENTION_MASK_TYPE_ == 3 + }; + + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = ATTENTION_MASK_TYPE_ == 3 }; // Is head interleaved ? @@ -286,10 +293,10 @@ struct Kernel_traits ENABLE_BMM1_SOFTCAPPING_SCALE = ENABLE_BMM1_SOFTCAPPING_SCALE_ }; - // Use the custom mask input ( attention_mask_type == 3.) + // Use the custom mask input ( attention_mask_type == 4.) 
enum { - USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 3 + USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 4 }; // Are we enabling skip softmax attention feature? diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index 5a32f0a5116..f4ec62cd032 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -781,6 +781,10 @@ int main(int argc, char** argv) { attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; } + else if (!strcmp(argv[ii], "-bidirectional-sliding-window-mask")) + { + attention_mask_type = Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW; + } else if (!strcmp(argv[ii], "-custom-mask")) { attention_mask_type = Attention_mask_type::CUSTOM_MASK; @@ -943,13 +947,20 @@ int main(int argc, char** argv) { assert( chunked_attention_size == 0 && "chunked_attention_size should not be used when sliding_window_size is set"); - attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; + // Default to causal sliding window if the user did not explicitly set the mask type to bidirectional sliding + // window + if (attention_mask_type != Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW) + { + attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; + } } // Chunked attention. 
if (chunked_attention_size > 0) { assert((chunked_attention_size & (chunked_attention_size - 1)) == 0 && "chunked_attention_size has to be a power of 2"); + assert(attention_mask_type != Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW + && "Bidirectional sliding window attention should not use chunked attention"); attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; } @@ -1632,6 +1643,11 @@ int main(int argc, char** argv) valid = valid && (si >= std::max(int(so + 1 - sliding_window_size), 0)); } } + else if (attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW) + { + valid = valid && si >= std::max(int(so - sliding_window_size / 2), 0); + valid = valid && si <= std::min(int(so + sliding_window_size / 2), int(actual_seqlen - 1)); + } if (is_mtp) { // Only the last s_q tokens are used for verifying the results. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h index f71bd948676..32e2ed465ba 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h @@ -49,6 +49,8 @@ enum class Attention_mask_type CAUSAL, // Causal mask + attend to the specific sliding window or chunk. SLIDING_OR_CHUNKED_CAUSAL, + // Bidirectional sliding window attention. + BIDIRECTIONAL_SLIDING_WINDOW, // The custom mask input. 
CUSTOM_MASK, }; @@ -62,6 +64,7 @@ static inline std::string mask_type_to_string(Attention_mask_type mask_type) case Attention_mask_type::PADDING: return "padding"; case Attention_mask_type::CAUSAL: return "causal"; case Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL: return "sliding_or_chunked_causal"; + case Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW: return "bidirectional_sliding_window"; case Attention_mask_type::CUSTOM_MASK: return "custom_mask"; default: assert(false); return ""; } diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h index 467d614c1d0..2c38c1703e4 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h @@ -172,19 +172,42 @@ inline __device__ void device_flash_attention_nl(Params const& params) static_assert(MASK_LOOPS * Cta_tile_p::N == Cta_tile_p::M || Cta_tile_p::N >= Cta_tile_p::M, ""); // The start/end step of kv loops. - // Do we need to mask out the tokens that is far away from the beginning. + // Do we need to mask out the tokens that is not in the sliding window. bool const mask_sliding_window - = Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size; + = (Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size) + || (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + && binfo.actual_kv_seqlen > params.sliding_window_size / 2 + 1); // +1 to include self token + int const valid_seqlen = Kernel_traits::CAUSAL_MASK ? min(q_sequence_start + Cta_tile_p::M, binfo.actual_kv_seqlen) : binfo.actual_kv_seqlen; - int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; - int const kv_loop_start = mask_sliding_window - ? 
(max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; - int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; + int kv_loop_start = 0; + int kv_loop_end = fmha::div_up(valid_seqlen, int(Cta_tile_p::N)) * int(Cta_tile_p::N); + int sliding_window_mask_left = 0; + int sliding_window_mask_right = kv_loop_end; + if (mask_sliding_window) + { + if constexpr (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + kv_loop_start = (max(0, q_sequence_start - params.sliding_window_size / 2) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size / 2) / Cta_tile_p::N) + * Cta_tile_p::N; + + kv_loop_end = min(kv_loop_end, + (fmha::div_up(q_sequence_start + Cta_tile_p::M + params.sliding_window_size / 2, int(Cta_tile_p::N)) + * Cta_tile_p::N)); + sliding_window_mask_right = min(sliding_window_mask_right, + ((q_sequence_start + params.sliding_window_size / 2) / int(Cta_tile_p::N)) * Cta_tile_p::N); + } + else + { + kv_loop_start = (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) + * Cta_tile_p::N; + } + } static_assert(Cta_tile_p::M >= Cta_tile_p::N, ""); @@ -337,7 +360,8 @@ inline __device__ void device_flash_attention_nl(Params const& params) // Do we need to check if there are negative inf for softmax row_max ? enum { - CHECK_NEG_INF = Kernel_traits::SLIDING_WINDOW_ATTENTION || Kernel_traits::CUSTOM_MASK + CHECK_NEG_INF = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION || Kernel_traits::SLIDING_WINDOW_ATTENTION + || Kernel_traits::CUSTOM_MASK }; // Load the mask for that iteration. 
@@ -363,7 +387,8 @@ inline __device__ void device_flash_attention_nl(Params const& params) bool const first_step = (kv_loop == kv_loop_start); // It is possible that all tokens are masked out (sliding-window-attention). - bool const apply_sliding_window_mask = (mask_sliding_window && kv_loop <= sliding_window_mask_end); + bool const apply_sliding_window_mask + = (mask_sliding_window && (kv_loop <= sliding_window_mask_left || kv_loop >= sliding_window_mask_right)); bool const apply_mask = params.has_alibi || (kv_loop >= kv_mask_loop_start) || apply_sliding_window_mask; // Move mask offset. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h index 55ba07ed8c3..2f3f05a4f48 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h @@ -175,17 +175,40 @@ inline __device__ void device_flash_attention_nl_tiled(Params const& params) // The start/end step of kv loops. // Do we need to mask out the tokens that is not in the sliding window. bool const mask_sliding_window - = Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size; + = (Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size) + || (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + && binfo.actual_kv_seqlen > params.sliding_window_size / 2 + 1); // +1 to include self token + int const valid_seqlen = Kernel_traits::CAUSAL_MASK ? min(q_sequence_start + Cta_tile_p::M, binfo.actual_kv_seqlen) : binfo.actual_kv_seqlen; - int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; - int const kv_loop_start = mask_sliding_window - ? 
(max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; - int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; + int kv_loop_start = 0; + int kv_loop_end = fmha::div_up(valid_seqlen, int(Cta_tile_p::N)) * int(Cta_tile_p::N); + int sliding_window_mask_left = 0; + int sliding_window_mask_right = kv_loop_end; + if (mask_sliding_window) + { + if constexpr (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + kv_loop_start = (max(0, q_sequence_start - params.sliding_window_size / 2) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size / 2) / Cta_tile_p::N) + * Cta_tile_p::N; + + kv_loop_end = min(kv_loop_end, + (fmha::div_up(q_sequence_start + Cta_tile_p::M + params.sliding_window_size / 2, int(Cta_tile_p::N)) + * Cta_tile_p::N)); + sliding_window_mask_right = min(sliding_window_mask_right, + ((q_sequence_start + params.sliding_window_size / 2) / int(Cta_tile_p::N)) * Cta_tile_p::N); + } + else + { + kv_loop_start = (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) + * Cta_tile_p::N; + } + } // Move K and V tiles. // We need offset here since we split single k loops into finer granularity. @@ -301,7 +324,8 @@ inline __device__ void device_flash_attention_nl_tiled(Params const& params) bool const first_step = (kv_loop == kv_loop_start); // It is possible that all tokens are masked out (sliding-window-attention). 
- bool const apply_sliding_window_mask = (mask_sliding_window && kv_loop <= sliding_window_mask_end); + bool const apply_sliding_window_mask + = (mask_sliding_window && (kv_loop <= sliding_window_mask_left || kv_loop >= sliding_window_mask_right)); bool const apply_mask = params.has_alibi || (kv_loop >= kv_mask_loop_start) || apply_sliding_window_mask; // Declare the accumulators for the 1st gemm. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index 9679be86fcc..f129a973ac4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -69,6 +69,8 @@ enum class ContextAttentionMaskType CAUSAL, // Causal mask + attend to the specific sliding window or chunk. SLIDING_OR_CHUNKED_CAUSAL, + // Bidirectional sliding window attention. + BIDIRECTIONAL_SLIDING_WINDOW, // The custom mask input. 
CUSTOM_MASK }; diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index ac3f7b2e977..19799dcecf3 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3294,7 +3294,8 @@ def launchTestJobs(pipeline, testFilter) "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 2, 4], "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["auto:dgx-h100-x4", "l0_dgx_h100", 2, 2, 4], "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], - "DGX_H100-4_GPUs-PyTorch-Others-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], + "DGX_H100-4_GPUs-PyTorch-Others-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 2, 4], + "DGX_H100-4_GPUs-PyTorch-Others-2": ["auto:dgx-h100-x4", "l0_dgx_h100", 2, 2, 4], "DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], @@ -3305,7 +3306,9 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true], "DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true], "DGX_B200-PyTorch-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200", 2, 2, 1, 1, true], - "DGX_B200-4_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 3, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-2": ["auto:dgx-b200-flex", "l0_dgx_b200", 2, 3, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-3": ["auto:dgx-b200-flex", "l0_dgx_b200", 3, 3, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 2, 4, 1, true], diff --git a/requirements.txt b/requirements.txt index 
1b0455b72d3..678d640864e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -84,4 +84,4 @@ cuda-tile>=1.0.1 nvidia-cuda-tileiras>=13.1 etcd-sdk-python==0.0.7 python-multipart -smg-grpc-proto>=0.3.3 +smg-grpc-proto>=0.4.2 diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index 9ac6899f28f..861bbdd4bbc 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -1046,14 +1046,14 @@ test = ["flake8", "mypy", "pytest"] [[package]] name = "sphinxcontrib-mermaid" -version = "2.0.0" +version = "2.0.1" description = "Mermaid diagrams in your Sphinx-powered docs" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "sphinxcontrib_mermaid-2.0.0-py3-none-any.whl", hash = "sha256:59a73249bbee2c74b1a4db036f8e8899ade65982bdda6712cf22b4f4e9874bb5"}, - {file = "sphinxcontrib_mermaid-2.0.0.tar.gz", hash = "sha256:cf4f7d453d001132eaba5d1fdf53d42049f02e913213cf8337427483bfca26f4"}, + {file = "sphinxcontrib_mermaid-2.0.1-py3-none-any.whl", hash = "sha256:9dca7fbe827bad5e7e2b97c4047682cfd26e3e07398cfdc96c7a8842ae7f06e7"}, + {file = "sphinxcontrib_mermaid-2.0.1.tar.gz", hash = "sha256:a21a385a059a6cafd192aa3a586b14bf5c42721e229db67b459dc825d7f0a497"}, ] [package.dependencies] @@ -1222,4 +1222,4 @@ test = ["pytest (>=6.0.0)", "setuptools (>=77)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "f93bbd8da205c4e4374138aeea92fdc5a73d7764638e5f27c12351866592f2bb" +content-hash = "25155b7ceb59522a3d568a3a7f15a11aca6e1b2e7f17bde117f1b1b33be32945" diff --git a/security_scanning/docs/pyproject.toml b/security_scanning/docs/pyproject.toml index 40191f553af..da05d31bade 100644 --- a/security_scanning/docs/pyproject.toml +++ b/security_scanning/docs/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "sphinx-copybutton (>=0.5.2,<0.6.0)", "autodoc-pydantic (>=2.2.0,<3.0.0)", "sphinx-togglebutton (>=0.4.4,<0.5.0)", - "sphinxcontrib-mermaid (>=2.0.0,<3.0.0)" + 
"sphinxcontrib-mermaid (>=2.0.1,<3.0.0)" ] diff --git a/security_scanning/examples/apps/poetry.lock b/security_scanning/examples/apps/poetry.lock index ed6e03e3fcf..33f100adc22 100644 --- a/security_scanning/examples/apps/poetry.lock +++ b/security_scanning/examples/apps/poetry.lock @@ -275,14 +275,14 @@ files = [ [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = "openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/mmdit/poetry.lock b/security_scanning/examples/models/contrib/mmdit/poetry.lock index e2ff09a48d0..925d0e528a1 100644 --- a/security_scanning/examples/models/contrib/mmdit/poetry.lock +++ b/security_scanning/examples/models/contrib/mmdit/poetry.lock @@ -197,14 +197,14 @@ files = [ [[package]] name = "diffusers" -version = "0.36.0" +version = "0.37.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.10.0" groups = ["main"] files = [ - {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, - {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, + {file = "diffusers-0.37.0-py3-none-any.whl", hash = "sha256:7eab74bf896974250b5e1027cae813aba1004f02d97c9b44891b83713386aa08"}, + {file = "diffusers-0.37.0.tar.gz", hash = "sha256:408789af73898585f525afd07ca72b3955affea4216a669558e9f59b5b1fe704"}, ] [package.dependencies] @@ -220,14 +220,14 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "ftfy", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", 
"safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchsde", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] -test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "ftfy", "invisible-watermark (>=0.2.0)", "librosa", "parameterized", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchsde", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] diff --git a/security_scanning/examples/models/contrib/stdit/poetry.lock b/security_scanning/examples/models/contrib/stdit/poetry.lock index ea9387bbbb3..defb5de3256 100644 --- a/security_scanning/examples/models/contrib/stdit/poetry.lock +++ b/security_scanning/examples/models/contrib/stdit/poetry.lock @@ -2564,86 +2564,102 @@ files = [ [[package]] name = "wrapt" -version = "2.1.1" +version = 
"2.1.2" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "wrapt-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e927375e43fd5a985b27a8992327c22541b6dede1362fc79df337d26e23604f"}, - {file = "wrapt-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c99544b6a7d40ca22195563b6d8bc3986ee8bb82f272f31f0670fe9440c869"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2be3fa5f4efaf16ee7c77d0556abca35f5a18ad4ac06f0ef3904c3399010ce9"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67c90c1ae6489a6cb1a82058902caa8006706f7b4e8ff766f943e9d2c8e608d0"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05c0db35ccffd7480143e62df1e829d101c7b86944ae3be7e4869a7efa621f53"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0c2ec9f616755b2e1e0bf4d0961f59bb5c2e7a77407e7e2c38ef4f7d2fdde12c"}, - {file = "wrapt-2.1.1-cp310-cp310-win32.whl", hash = "sha256:203ba6b3f89e410e27dbd30ff7dccaf54dcf30fda0b22aa1b82d560c7f9fe9a1"}, - {file = "wrapt-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f9426d9cfc2f8732922fc96198052e55c09bb9db3ddaa4323a18e055807410e"}, - {file = "wrapt-2.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:69c26f51b67076b40714cff81bdd5826c0b10c077fb6b0678393a6a2f952a5fc"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c366434a7fb914c7a5de508ed735ef9c133367114e1a7cb91dfb5cd806a1549"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6a2068bd2e1e19e5a317c8c0b288267eec4e7347c36bc68a6e378a39f19ee7"}, - {file = "wrapt-2.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:891ab4713419217b2aed7dd106c9200f64e6a82226775a0d2ebd6bef2ebd1747"}, - {file = 
"wrapt-2.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8ef36a0df38d2dc9d907f6617f89e113c5892e0a35f58f45f75901af0ce7d81"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76e9af3ebd86f19973143d4d592cbf3e970cf3f66ddee30b16278c26ae34b8ab"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ff562067485ebdeaef2fa3fe9b1876bc4e7b73762e0a01406ad81e2076edcebf"}, - {file = "wrapt-2.1.1-cp311-cp311-win32.whl", hash = "sha256:9e60a30aa0909435ec4ea2a3c53e8e1b50ac9f640c0e9fe3f21fd248a22f06c5"}, - {file = "wrapt-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:7d79954f51fcf84e5ec4878ab4aea32610d70145c5bbc84b3370eabfb1e096c2"}, - {file = "wrapt-2.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:d3ffc6b0efe79e08fd947605fd598515aebefe45e50432dc3b5cd437df8b1ada"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab8e3793b239db021a18782a5823fcdea63b9fe75d0e340957f5828ef55fcc02"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c0300007836373d1c2df105b40777986accb738053a92fe09b615a7a4547e9f"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2b27c070fd1132ab23957bcd4ee3ba707a91e653a9268dc1afbd39b77b2799f7"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b0e36d845e8b6f50949b6b65fc6cd279f47a1944582ed4ec8258cd136d89a64"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aeea04a9889370fcfb1ef828c4cc583f36a875061505cd6cd9ba24d8b43cc36"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d88b46bb0dce9f74b6817bc1758ff2125e1ca9e1377d62ea35b6896142ab6825"}, - {file = "wrapt-2.1.1-cp312-cp312-win32.whl", hash = "sha256:63decff76ca685b5c557082dfbea865f3f5f6d45766a89bff8dc61d336348833"}, - {file = 
"wrapt-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:b828235d26c1e35aca4107039802ae4b1411be0fe0367dd5b7e4d90e562fcbcd"}, - {file = "wrapt-2.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:75128507413a9f1bcbe2db88fd18fbdbf80f264b82fa33a6996cdeaf01c52352"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9646e17fa7c3e2e7a87e696c7de66512c2b4f789a8db95c613588985a2e139"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:428cfc801925454395aa468ba7ddb3ed63dc0d881df7b81626cdd433b4e2b11b"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5797f65e4d58065a49088c3b32af5410751cd485e83ba89e5a45e2aa8905af98"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a2db44a71202c5ae4bb5f27c6d3afbc5b23053f2e7e78aa29704541b5dad789"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d5350c3590af09c1703dd60ec78a7370c0186e11eaafb9dda025a30eee6492d"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d9b076411bed964e752c01b49fd224cc385f3a96f520c797d38412d70d08359"}, - {file = "wrapt-2.1.1-cp313-cp313-win32.whl", hash = "sha256:0bb7207130ce6486727baa85373503bf3334cc28016f6928a0fa7e19d7ecdc06"}, - {file = "wrapt-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:cbfee35c711046b15147b0ae7db9b976f01c9520e6636d992cd9e69e5e2b03b1"}, - {file = "wrapt-2.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:7d2756061022aebbf57ba14af9c16e8044e055c22d38de7bf40d92b565ecd2b0"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4814a3e58bc6971e46baa910ecee69699110a2bf06c201e24277c65115a20c20"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:106c5123232ab9b9f4903692e1fa0bdc231510098f04c13c3081f8ad71c3d612"}, - {file = 
"wrapt-2.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1a40b83ff2535e6e56f190aff123821eea89a24c589f7af33413b9c19eb2c738"}, - {file = "wrapt-2.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:789cea26e740d71cf1882e3a42bb29052bc4ada15770c90072cb47bf73fb3dbf"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ba49c14222d5e5c0ee394495a8655e991dc06cbca5398153aefa5ac08cd6ccd7"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ac8cda531fe55be838a17c62c806824472bb962b3afa47ecbd59b27b78496f4e"}, - {file = "wrapt-2.1.1-cp313-cp313t-win32.whl", hash = "sha256:b8af75fe20d381dd5bcc9db2e86a86d7fcfbf615383a7147b85da97c1182225b"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:45c5631c9b6c792b78be2d7352129f776dd72c605be2c3a4e9be346be8376d83"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:da815b9263947ac98d088b6414ac83507809a1d385e4632d9489867228d6d81c"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aa1765054245bb01a37f615503290d4e207e3fd59226e78341afb587e9c1236"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:feff14b63a6d86c1eee33a57f77573649f2550935981625be7ff3cb7342efe05"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81fc5f22d5fcfdbabde96bb3f5379b9f4476d05c6d524d7259dc5dfb501d3281"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:951b228ecf66def855d22e006ab9a1fc12535111ae7db2ec576c728f8ddb39e8"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ddf582a95641b9a8c8bd643e83f34ecbbfe1b68bc3850093605e469ab680ae3"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:fc5c500966bf48913f795f1984704e6d452ba2414207b15e1f8c339a059d5b16"}, - {file = "wrapt-2.1.1-cp314-cp314-win32.whl", hash = "sha256:4aa4baadb1f94b71151b8e44a0c044f6af37396c3b8bcd474b78b49e2130a23b"}, - {file = "wrapt-2.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:860e9d3fd81816a9f4e40812f28be4439ab01f260603c749d14be3c0a1170d19"}, - {file = "wrapt-2.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3c59e103017a2c1ea0ddf589cbefd63f91081d7ce9d491d69ff2512bb1157e23"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9fa7c7e1bee9278fc4f5dd8275bc8d25493281a8ec6c61959e37cc46acf02007"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39c35e12e8215628984248bd9c8897ce0a474be2a773db207eb93414219d8469"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:94ded4540cac9125eaa8ddf5f651a7ec0da6f5b9f248fe0347b597098f8ec14c"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0af328373f97ed9bdfea24549ac1b944096a5a71b30e41c9b8b53ab3eec04a"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4ad839b55f0bf235f8e337ce060572d7a06592592f600f3a3029168e838469d3"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d89c49356e5e2a50fa86b40e0510082abcd0530f926cbd71cf25bee6b9d82d7"}, - {file = "wrapt-2.1.1-cp314-cp314t-win32.whl", hash = "sha256:f4c7dd22cf7f36aafe772f3d88656559205c3af1b7900adfccb70edeb0d2abc4"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f76bc12c583ab01e73ba0ea585465a41e48d968f6d1311b4daec4f8654e356e3"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7ea74fc0bec172f1ae5f3505b6655c541786a5cabe4bbc0d9723a56ac32eb9b9"}, - {file = "wrapt-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e03b3d486eb39f5d3f562839f59094dcee30c4039359ea15768dc2214d9e07c"}, - {file = 
"wrapt-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fdf3073f488ce4d929929b7799e3b8c52b220c9eb3f4a5a51e2dc0e8ff07881"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0cb4f59238c6625fae2eeb72278da31c9cfba0ff4d9cbe37446b73caa0e9bcf7"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f794a1c148871b714cb566f5466ec8288e0148a1c417550983864b3981737cd"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:95ef3866631c6da9ce1fc0f1e17b90c4c0aa6d041fc70a11bc90733aee122e1a"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:66bc1b2446f01cbbd3c56b79a3a8435bcd4178ac4e06b091913f7751a7f528b8"}, - {file = "wrapt-2.1.1-cp39-cp39-win32.whl", hash = "sha256:1b9e08e57cabc32972f7c956d10e85093c5da9019faa24faf411e7dd258e528c"}, - {file = "wrapt-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:e75ad48c3cca739f580b5e14c052993eb644c7fa5b4c90aa51193280b30875ae"}, - {file = "wrapt-2.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:9ccd657873b7f964711447d004563a2bc08d1476d7a1afcad310f3713e6f50f4"}, - {file = "wrapt-2.1.1-py3-none-any.whl", hash = "sha256:3b0f4629eb954394a3d7c7a1c8cca25f0b07cefe6aa8545e862e9778152de5b7"}, - {file = "wrapt-2.1.1.tar.gz", hash = "sha256:5fdcb09bf6db023d88f312bd0767594b414655d58090fc1c46b3414415f67fac"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a86d99a14f76facb269dc148590c01aaf47584071809a70da30555228158c"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a819e39017f95bf7aede768f75915635aa8f671f2993c036991b8d3bfe8dbb6f"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5681123e60aed0e64c7d44f72bbf8b4ce45f79d81467e2c4c728629f5baf06eb"}, + {file = 
"wrapt-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b8b28e97a44d21836259739ae76284e180b18abbb4dcfdff07a415cf1016c3e"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cef91c95a50596fcdc31397eb6955476f82ae8a3f5a8eabdc13611b60ee380ba"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dad63212b168de8569b1c512f4eac4b57f2c6934b30df32d6ee9534a79f1493f"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d307aa6888d5efab2c1cde09843d48c843990be13069003184b67d426d145394"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c87cf3f0c85e27b3ac7d9ad95da166bf8739ca215a8b171e8404a2d739897a45"}, + {file = "wrapt-2.1.2-cp310-cp310-win32.whl", hash = "sha256:d1c5fea4f9fe3762e2b905fdd67df51e4be7a73b7674957af2d2ade71a5c075d"}, + {file = "wrapt-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:d8f7740e1af13dff2684e4d56fe604a7e04d6c94e737a60568d8d4238b9a0c71"}, + {file = "wrapt-2.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:1c6cc827c00dc839350155f316f1f8b4b0c370f52b6a19e782e2bda89600c7dc"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15"}, + {file = "wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b"}, + {file = "wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1"}, + {file = "wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2"}, + {file = "wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0"}, + {file = "wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63"}, + {file = "wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413"}, + {file = "wrapt-2.1.2-cp313-cp313-win32.whl", hash = 
"sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6"}, + {file = "wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1"}, + {file = "wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a"}, + {file = "wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = 
"sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19"}, + {file = "wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508"}, + {file = "wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04"}, + {file = "wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3"}, + {file = "wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5e0fa9cc32300daf9eb09a1f5bdc6deb9a79defd70d5356ba453bcd50aef3742"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:710f6e5dfaf6a5d5c397d2d6758a78fecd9649deb21f1b645f5b57a328d63050"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:305d8a1755116bfdad5dda9e771dcb2138990a1d66e9edd81658816edf51aed1"}, + {file = 
"wrapt-2.1.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0d8fc30a43b5fe191cf2b1a0c82bab2571dadd38e7c0062ee87d6df858dd06e"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a5d516e22aedb7c9c1d47cba1c63160b1a6f61ec2f3948d127cd38d5cfbb556f"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:45914e8efbe4b9d5102fcf0e8e2e3258b83a5d5fba9f8f7b6d15681e9d29ffe0"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:478282ebd3795a089154fb16d3db360e103aa13d3b2ad30f8f6aac0d2207de0e"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3756219045f73fb28c5d7662778e4156fbd06cf823c4d2d4b19f97305e52819c"}, + {file = "wrapt-2.1.2-cp39-cp39-win32.whl", hash = "sha256:b8aefb4dbb18d904b96827435a763fa42fc1f08ea096a391710407a60983ced8"}, + {file = "wrapt-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e5aeab8fe15c3dff75cfee94260dcd9cded012d4ff06add036c28fae7718593b"}, + {file = "wrapt-2.1.2-cp39-cp39-win_arm64.whl", hash = "sha256:f069e113743a21a3defac6677f000068ebb931639f789b5b226598e247a4c89e"}, + {file = "wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8"}, + {file = "wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 3f3a9015e09..ea55a2676aa 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -1681,14 +1681,14 @@ files = [ [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", 
hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = "openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] @@ -2709,31 +2709,31 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.15.4" +version = "0.15.5" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" groups = ["main"] markers = "sys_platform != \"emscripten\"" files = [ - {file = "ruff-0.15.4-py3-none-linux_armv6l.whl", hash = "sha256:a1810931c41606c686bae8b5b9a8072adac2f611bb433c0ba476acba17a332e0"}, - {file = "ruff-0.15.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5a1632c66672b8b4d3e1d1782859e98d6e0b4e70829530666644286600a33992"}, - {file = "ruff-0.15.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a4386ba2cd6c0f4ff75252845906acc7c7c8e1ac567b7bc3d373686ac8c222ba"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2496488bdfd3732747558b6f95ae427ff066d1fcd054daf75f5a50674411e75"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f1c4893841ff2d54cbda1b2860fa3260173df5ddd7b95d370186f8a5e66a4ac"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:820b8766bd65503b6c30aaa6331e8ef3a6e564f7999c844e9a547c40179e440a"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9fb74bab47139c1751f900f857fa503987253c3ef89129b24ed375e72873e85"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f80c98765949c518142b3a50a5db89343aa90f2c2bf7799de9986498ae6176db"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:451a2e224151729b3b6c9ffb36aed9091b2996fe4bdbd11f47e27d8f2e8888ec"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a8f157f2e583c513c4f5f896163a93198297371f34c04220daf40d133fdd4f7f"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:917cc68503357021f541e69b35361c99387cdbbf99bd0ea4aa6f28ca99ff5338"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e9737c8161da79fd7cfec19f1e35620375bd8b2a50c3e77fa3d2c16f574105cc"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:291258c917539e18f6ba40482fe31d6f5ac023994ee11d7bdafd716f2aab8a68"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3f83c45911da6f2cd5936c436cf86b9f09f09165f033a99dcf7477e34041cbc3"}, - {file = "ruff-0.15.4-py3-none-win32.whl", hash = "sha256:65594a2d557d4ee9f02834fcdf0a28daa8b3b9f6cb2cb93846025a36db47ef22"}, - {file = "ruff-0.15.4-py3-none-win_amd64.whl", hash = "sha256:04196ad44f0df220c2ece5b0e959c2f37c777375ec744397d21d15b50a75264f"}, - {file = "ruff-0.15.4-py3-none-win_arm64.whl", hash = "sha256:60d5177e8cfc70e51b9c5fad936c634872a74209f934c1e79107d11787ad5453"}, - {file = "ruff-0.15.4.tar.gz", hash = "sha256:3412195319e42d634470cc97aa9803d07e9d5c9223b99bcb1518f0c725f26ae1"}, + {file = "ruff-0.15.5-py3-none-linux_armv6l.whl", hash = "sha256:4ae44c42281f42e3b06b988e442d344a5b9b72450ff3c892e30d11b29a96a57c"}, + {file = "ruff-0.15.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6edd3792d408ebcf61adabc01822da687579a1a023f297618ac27a5b51ef0080"}, + {file = "ruff-0.15.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:89f463f7c8205a9f8dea9d658d59eff49db05f88f89cc3047fb1a02d9f344010"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ba786a8295c6574c1116704cf0b9e6563de3432ac888d8f83685654fe528fd65"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd4b801e57955fe9f02b31d20375ab3a5c4415f2e5105b79fb94cf2642c91440"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391f7c73388f3d8c11b794dbbc2959a5b5afe66642c142a6effa90b45f6f5204"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dc18f30302e379fe1e998548b0f5e9f4dff907f52f73ad6da419ea9c19d66c8"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc6e7f90087e2d27f98dc34ed1b3ab7c8f0d273cc5431415454e22c0bd2a681"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1cb7169f53c1ddb06e71a9aebd7e98fc0fea936b39afb36d8e86d36ecc2636a"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9b037924500a31ee17389b5c8c4d88874cc6ea8e42f12e9c61a3d754ff72f1ca"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65bb414e5b4eadd95a8c1e4804f6772bbe8995889f203a01f77ddf2d790929dd"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d20aa469ae3b57033519c559e9bc9cd9e782842e39be05b50e852c7c981fa01d"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:15388dd28c9161cdb8eda68993533acc870aa4e646a0a277aa166de9ad5a8752"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b30da330cbd03bed0c21420b6b953158f60c74c54c5f4c1dabbdf3a57bf355d2"}, + {file = "ruff-0.15.5-py3-none-win32.whl", hash = "sha256:732e5ee1f98ba5b3679029989a06ca39a950cced52143a0ea82a2102cb592b74"}, + {file = "ruff-0.15.5-py3-none-win_amd64.whl", hash = "sha256:821d41c5fa9e19117616c35eaa3f4b75046ec76c65e7ae20a333e9a8696bc7fe"}, + {file = "ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b"}, + {file 
= "ruff-0.15.5.tar.gz", hash = "sha256:7c3601d3b6d76dce18c5c824fc8d06f4eef33d6df0c21ec7799510cde0f159a2"}, ] [[package]] @@ -3665,4 +3665,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "027f86b8ac2302108765cdc43dea3ccb6defb9a96e7429d9af8ce8c435b5f8ba" +content-hash = "c422d916b7a9ecf1a5857f50866e3160d65b7018e79bcaa5f3f83e7cd16d3ab4" diff --git a/security_scanning/examples/models/core/qwen/pyproject.toml b/security_scanning/examples/models/core/qwen/pyproject.toml index 963c16e2aaa..c94fe4ee5ee 100644 --- a/security_scanning/examples/models/core/qwen/pyproject.toml +++ b/security_scanning/examples/models/core/qwen/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "mdtex2html (>=1.3.2,<2.0.0)", "sse-starlette (>=3.3.2,<4.0.0)", "aiohttp-sse-client (>=0.2.1,<0.3.0)", - "openai (>=2.24.0,<3.0.0)" + "openai (>=2.26.0,<3.0.0)" ] diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index 7d8c51ae5c9..d0c117a1323 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -2258,14 +2258,14 @@ xml = ["lxml (>=5.3.0)"] [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] diff --git a/security_scanning/examples/ray_orchestrator/poetry.lock b/security_scanning/examples/ray_orchestrator/poetry.lock index 22941a93c3c..4dbc20315f1 100644 --- a/security_scanning/examples/ray_orchestrator/poetry.lock +++ b/security_scanning/examples/ray_orchestrator/poetry.lock @@ -1313,14 +1313,14 @@ files = [ [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] @@ -2183,86 +2183,102 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\"" [[package]] name = "wrapt" -version = "2.1.1" +version = "2.1.2" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "wrapt-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e927375e43fd5a985b27a8992327c22541b6dede1362fc79df337d26e23604f"}, - {file = "wrapt-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c99544b6a7d40ca22195563b6d8bc3986ee8bb82f272f31f0670fe9440c869"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2be3fa5f4efaf16ee7c77d0556abca35f5a18ad4ac06f0ef3904c3399010ce9"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67c90c1ae6489a6cb1a82058902caa8006706f7b4e8ff766f943e9d2c8e608d0"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05c0db35ccffd7480143e62df1e829d101c7b86944ae3be7e4869a7efa621f53"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0c2ec9f616755b2e1e0bf4d0961f59bb5c2e7a77407e7e2c38ef4f7d2fdde12c"}, - {file = "wrapt-2.1.1-cp310-cp310-win32.whl", hash = "sha256:203ba6b3f89e410e27dbd30ff7dccaf54dcf30fda0b22aa1b82d560c7f9fe9a1"}, - {file = "wrapt-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f9426d9cfc2f8732922fc96198052e55c09bb9db3ddaa4323a18e055807410e"}, - {file = "wrapt-2.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:69c26f51b67076b40714cff81bdd5826c0b10c077fb6b0678393a6a2f952a5fc"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c366434a7fb914c7a5de508ed735ef9c133367114e1a7cb91dfb5cd806a1549"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6a2068bd2e1e19e5a317c8c0b288267eec4e7347c36bc68a6e378a39f19ee7"}, - {file = "wrapt-2.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:891ab4713419217b2aed7dd106c9200f64e6a82226775a0d2ebd6bef2ebd1747"}, - {file = 
"wrapt-2.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8ef36a0df38d2dc9d907f6617f89e113c5892e0a35f58f45f75901af0ce7d81"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76e9af3ebd86f19973143d4d592cbf3e970cf3f66ddee30b16278c26ae34b8ab"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ff562067485ebdeaef2fa3fe9b1876bc4e7b73762e0a01406ad81e2076edcebf"}, - {file = "wrapt-2.1.1-cp311-cp311-win32.whl", hash = "sha256:9e60a30aa0909435ec4ea2a3c53e8e1b50ac9f640c0e9fe3f21fd248a22f06c5"}, - {file = "wrapt-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:7d79954f51fcf84e5ec4878ab4aea32610d70145c5bbc84b3370eabfb1e096c2"}, - {file = "wrapt-2.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:d3ffc6b0efe79e08fd947605fd598515aebefe45e50432dc3b5cd437df8b1ada"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab8e3793b239db021a18782a5823fcdea63b9fe75d0e340957f5828ef55fcc02"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c0300007836373d1c2df105b40777986accb738053a92fe09b615a7a4547e9f"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2b27c070fd1132ab23957bcd4ee3ba707a91e653a9268dc1afbd39b77b2799f7"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b0e36d845e8b6f50949b6b65fc6cd279f47a1944582ed4ec8258cd136d89a64"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aeea04a9889370fcfb1ef828c4cc583f36a875061505cd6cd9ba24d8b43cc36"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d88b46bb0dce9f74b6817bc1758ff2125e1ca9e1377d62ea35b6896142ab6825"}, - {file = "wrapt-2.1.1-cp312-cp312-win32.whl", hash = "sha256:63decff76ca685b5c557082dfbea865f3f5f6d45766a89bff8dc61d336348833"}, - {file = 
"wrapt-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:b828235d26c1e35aca4107039802ae4b1411be0fe0367dd5b7e4d90e562fcbcd"}, - {file = "wrapt-2.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:75128507413a9f1bcbe2db88fd18fbdbf80f264b82fa33a6996cdeaf01c52352"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9646e17fa7c3e2e7a87e696c7de66512c2b4f789a8db95c613588985a2e139"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:428cfc801925454395aa468ba7ddb3ed63dc0d881df7b81626cdd433b4e2b11b"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5797f65e4d58065a49088c3b32af5410751cd485e83ba89e5a45e2aa8905af98"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a2db44a71202c5ae4bb5f27c6d3afbc5b23053f2e7e78aa29704541b5dad789"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d5350c3590af09c1703dd60ec78a7370c0186e11eaafb9dda025a30eee6492d"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d9b076411bed964e752c01b49fd224cc385f3a96f520c797d38412d70d08359"}, - {file = "wrapt-2.1.1-cp313-cp313-win32.whl", hash = "sha256:0bb7207130ce6486727baa85373503bf3334cc28016f6928a0fa7e19d7ecdc06"}, - {file = "wrapt-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:cbfee35c711046b15147b0ae7db9b976f01c9520e6636d992cd9e69e5e2b03b1"}, - {file = "wrapt-2.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:7d2756061022aebbf57ba14af9c16e8044e055c22d38de7bf40d92b565ecd2b0"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4814a3e58bc6971e46baa910ecee69699110a2bf06c201e24277c65115a20c20"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:106c5123232ab9b9f4903692e1fa0bdc231510098f04c13c3081f8ad71c3d612"}, - {file = 
"wrapt-2.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1a40b83ff2535e6e56f190aff123821eea89a24c589f7af33413b9c19eb2c738"}, - {file = "wrapt-2.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:789cea26e740d71cf1882e3a42bb29052bc4ada15770c90072cb47bf73fb3dbf"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ba49c14222d5e5c0ee394495a8655e991dc06cbca5398153aefa5ac08cd6ccd7"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ac8cda531fe55be838a17c62c806824472bb962b3afa47ecbd59b27b78496f4e"}, - {file = "wrapt-2.1.1-cp313-cp313t-win32.whl", hash = "sha256:b8af75fe20d381dd5bcc9db2e86a86d7fcfbf615383a7147b85da97c1182225b"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:45c5631c9b6c792b78be2d7352129f776dd72c605be2c3a4e9be346be8376d83"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:da815b9263947ac98d088b6414ac83507809a1d385e4632d9489867228d6d81c"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aa1765054245bb01a37f615503290d4e207e3fd59226e78341afb587e9c1236"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:feff14b63a6d86c1eee33a57f77573649f2550935981625be7ff3cb7342efe05"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81fc5f22d5fcfdbabde96bb3f5379b9f4476d05c6d524d7259dc5dfb501d3281"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:951b228ecf66def855d22e006ab9a1fc12535111ae7db2ec576c728f8ddb39e8"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ddf582a95641b9a8c8bd643e83f34ecbbfe1b68bc3850093605e469ab680ae3"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:fc5c500966bf48913f795f1984704e6d452ba2414207b15e1f8c339a059d5b16"}, - {file = "wrapt-2.1.1-cp314-cp314-win32.whl", hash = "sha256:4aa4baadb1f94b71151b8e44a0c044f6af37396c3b8bcd474b78b49e2130a23b"}, - {file = "wrapt-2.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:860e9d3fd81816a9f4e40812f28be4439ab01f260603c749d14be3c0a1170d19"}, - {file = "wrapt-2.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3c59e103017a2c1ea0ddf589cbefd63f91081d7ce9d491d69ff2512bb1157e23"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9fa7c7e1bee9278fc4f5dd8275bc8d25493281a8ec6c61959e37cc46acf02007"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39c35e12e8215628984248bd9c8897ce0a474be2a773db207eb93414219d8469"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:94ded4540cac9125eaa8ddf5f651a7ec0da6f5b9f248fe0347b597098f8ec14c"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0af328373f97ed9bdfea24549ac1b944096a5a71b30e41c9b8b53ab3eec04a"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4ad839b55f0bf235f8e337ce060572d7a06592592f600f3a3029168e838469d3"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d89c49356e5e2a50fa86b40e0510082abcd0530f926cbd71cf25bee6b9d82d7"}, - {file = "wrapt-2.1.1-cp314-cp314t-win32.whl", hash = "sha256:f4c7dd22cf7f36aafe772f3d88656559205c3af1b7900adfccb70edeb0d2abc4"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f76bc12c583ab01e73ba0ea585465a41e48d968f6d1311b4daec4f8654e356e3"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7ea74fc0bec172f1ae5f3505b6655c541786a5cabe4bbc0d9723a56ac32eb9b9"}, - {file = "wrapt-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e03b3d486eb39f5d3f562839f59094dcee30c4039359ea15768dc2214d9e07c"}, - {file = 
"wrapt-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fdf3073f488ce4d929929b7799e3b8c52b220c9eb3f4a5a51e2dc0e8ff07881"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0cb4f59238c6625fae2eeb72278da31c9cfba0ff4d9cbe37446b73caa0e9bcf7"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f794a1c148871b714cb566f5466ec8288e0148a1c417550983864b3981737cd"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:95ef3866631c6da9ce1fc0f1e17b90c4c0aa6d041fc70a11bc90733aee122e1a"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:66bc1b2446f01cbbd3c56b79a3a8435bcd4178ac4e06b091913f7751a7f528b8"}, - {file = "wrapt-2.1.1-cp39-cp39-win32.whl", hash = "sha256:1b9e08e57cabc32972f7c956d10e85093c5da9019faa24faf411e7dd258e528c"}, - {file = "wrapt-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:e75ad48c3cca739f580b5e14c052993eb644c7fa5b4c90aa51193280b30875ae"}, - {file = "wrapt-2.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:9ccd657873b7f964711447d004563a2bc08d1476d7a1afcad310f3713e6f50f4"}, - {file = "wrapt-2.1.1-py3-none-any.whl", hash = "sha256:3b0f4629eb954394a3d7c7a1c8cca25f0b07cefe6aa8545e862e9778152de5b7"}, - {file = "wrapt-2.1.1.tar.gz", hash = "sha256:5fdcb09bf6db023d88f312bd0767594b414655d58090fc1c46b3414415f67fac"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a86d99a14f76facb269dc148590c01aaf47584071809a70da30555228158c"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a819e39017f95bf7aede768f75915635aa8f671f2993c036991b8d3bfe8dbb6f"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5681123e60aed0e64c7d44f72bbf8b4ce45f79d81467e2c4c728629f5baf06eb"}, + {file = 
"wrapt-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b8b28e97a44d21836259739ae76284e180b18abbb4dcfdff07a415cf1016c3e"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cef91c95a50596fcdc31397eb6955476f82ae8a3f5a8eabdc13611b60ee380ba"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dad63212b168de8569b1c512f4eac4b57f2c6934b30df32d6ee9534a79f1493f"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d307aa6888d5efab2c1cde09843d48c843990be13069003184b67d426d145394"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c87cf3f0c85e27b3ac7d9ad95da166bf8739ca215a8b171e8404a2d739897a45"}, + {file = "wrapt-2.1.2-cp310-cp310-win32.whl", hash = "sha256:d1c5fea4f9fe3762e2b905fdd67df51e4be7a73b7674957af2d2ade71a5c075d"}, + {file = "wrapt-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:d8f7740e1af13dff2684e4d56fe604a7e04d6c94e737a60568d8d4238b9a0c71"}, + {file = "wrapt-2.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:1c6cc827c00dc839350155f316f1f8b4b0c370f52b6a19e782e2bda89600c7dc"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15"}, + {file = "wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b"}, + {file = "wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1"}, + {file = "wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2"}, + {file = "wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0"}, + {file = "wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63"}, + {file = "wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413"}, + {file = "wrapt-2.1.2-cp313-cp313-win32.whl", hash = 
"sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6"}, + {file = "wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1"}, + {file = "wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a"}, + {file = "wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = 
"sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19"}, + {file = "wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508"}, + {file = "wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04"}, + {file = "wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3"}, + {file = "wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5e0fa9cc32300daf9eb09a1f5bdc6deb9a79defd70d5356ba453bcd50aef3742"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:710f6e5dfaf6a5d5c397d2d6758a78fecd9649deb21f1b645f5b57a328d63050"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:305d8a1755116bfdad5dda9e771dcb2138990a1d66e9edd81658816edf51aed1"}, + {file = 
"wrapt-2.1.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0d8fc30a43b5fe191cf2b1a0c82bab2571dadd38e7c0062ee87d6df858dd06e"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a5d516e22aedb7c9c1d47cba1c63160b1a6f61ec2f3948d127cd38d5cfbb556f"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:45914e8efbe4b9d5102fcf0e8e2e3258b83a5d5fba9f8f7b6d15681e9d29ffe0"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:478282ebd3795a089154fb16d3db360e103aa13d3b2ad30f8f6aac0d2207de0e"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3756219045f73fb28c5d7662778e4156fbd06cf823c4d2d4b19f97305e52819c"}, + {file = "wrapt-2.1.2-cp39-cp39-win32.whl", hash = "sha256:b8aefb4dbb18d904b96827435a763fa42fc1f08ea096a391710407a60983ced8"}, + {file = "wrapt-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e5aeab8fe15c3dff75cfee94260dcd9cded012d4ff06add036c28fae7718593b"}, + {file = "wrapt-2.1.2-cp39-cp39-win_arm64.whl", hash = "sha256:f069e113743a21a3defac6677f000068ebb931639f789b5b226598e247a4c89e"}, + {file = "wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8"}, + {file = "wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e"}, ] [package.extras] diff --git a/security_scanning/examples/serve/poetry.lock b/security_scanning/examples/serve/poetry.lock index 6350255c191..4a92644ea02 100644 --- a/security_scanning/examples/serve/poetry.lock +++ b/security_scanning/examples/serve/poetry.lock @@ -2458,14 +2458,14 @@ xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 222f60c2f9e..12ddd9e9954 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "e01c38f83a5ed13c0abad3a970e9344bfabf7453", - "timestamp": "2026-03-05T02:50:04Z" + "commit_hash": "e699f232511bde5ab7c15af72528484175771e7b", + "timestamp": "2026-03-06T02:46:54Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 01a480d3f29..383362e6228 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -1216,14 +1216,14 @@ vision = ["Pillow (>=9.4.0)"] [[package]] name = "diffusers" -version = "0.36.0" +version = "0.37.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.10.0" groups = ["main"] files = [ - {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, - {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, + {file = "diffusers-0.37.0-py3-none-any.whl", hash = "sha256:7eab74bf896974250b5e1027cae813aba1004f02d97c9b44891b83713386aa08"}, + {file = "diffusers-0.37.0.tar.gz", hash = "sha256:408789af73898585f525afd07ca72b3955affea4216a669558e9f59b5b1fe704"}, ] [package.dependencies] @@ -1239,14 +1239,14 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "ftfy", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", 
"safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchsde", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] -test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "ftfy", "invisible-watermark (>=0.2.0)", "librosa", "parameterized", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchsde", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] @@ -3874,14 +3874,14 @@ typing_extensions = ">=4.10" [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = 
"openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] @@ -6996,4 +6996,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "ec5d305a29833682cefb2c4c995403be74c8fdaeb6ac096f94480d4582f568a5" +content-hash = "2a21a66f0b0512caff14b587ff5a19a8d42d3ae9b5624172c4e68f8d07044c48" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index b8587dd5f95..ccb19badfc0 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "onnx-graphsurgeon (>=0.5.2)", "onnxscript (==0.5.4)", "graphviz (>=0.21,<0.22)", - "openai (>=2.24.0,<3.0.0)", + "openai (>=2.26.0,<3.0.0)", "polygraphy (>=0.49.26,<0.50.0)", "psutil (>=7.2.2,<8.0.0)", "nvidia-ml-py (>=13)", @@ -83,7 +83,7 @@ dependencies = [ "nvidia-cuda-tileiras (>=13.1)", "etcd-sdk-python (==0.0.7)", "python-multipart (>=0.0.22,<0.0.23)", - "smg-grpc-proto (>=0.3.3)" + "smg-grpc-proto (>=0.4.2)" ] diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index a7504a8b85d..19fd880c1dc 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -867,6 +867,12 @@ def get_valid_tactics( valid_tactics = [] for mma_tiler_mn, cluster_shape_mn in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. 
+ if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, @@ -1162,6 +1168,12 @@ def get_valid_tactics( for mma_tiler_mn, cluster_shape_mn, raster_along_m in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates, raster_along_m_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. + if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, @@ -1548,6 +1560,12 @@ def get_valid_tactics( valid_tactics = [] for mma_tiler_mn, cluster_shape_mn in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. 
+ if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index cd0cb71fbec..607b5d870e8 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -733,7 +733,7 @@ def _forward_chunk_impl( ) # Step 4b: Quantization AFTER dispatch - x, x_sf = self.backend.quantize_input(x) + x, x_sf = self.backend.quantize_input(x, post_quant_comm=False) else: # No communication, just quantize # (use non-post-quant-comm path for TRTLLMGenFusedMoE) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py index 9812c4ef868..1273262f5f4 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py @@ -513,11 +513,16 @@ def run_moe_nvfp4( self.hidden_size) assert moe_output.dtype == output_dtype + # After DeepEPLowLatency dispatch, token_selected_experts has shape + # [N, 1] instead of [N, top_k], because each row is already assigned + # to exactly one expert. Use the tensor shape as the effective top_k. + effective_top_k = token_selected_experts.size(-1) + tuner = AutoTuner.get() runner = CuteDslFusedMoENvfp4Runner( forward_impl=self.run_moe_nvfp4_impl, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, enable_finalize_fusion=self.use_fused_finalize, @@ -547,11 +552,15 @@ def run_moe_nvfp4_impl( ) -> torch.Tensor: output_dtype = torch.bfloat16 + # Use effective top_k from tensor shape rather than routing config. 
+ # After DeepEPLowLatency dispatch, each row maps to one expert (top_k=1). + effective_top_k = token_selected_experts.size(1) + tile_idx_to_expert_idx, tile_idx_to_mn_limit, expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, total_num_padded_tokens, num_non_exiting_tiles = torch.ops.trtllm.moe_sort( token_selected_experts=token_selected_experts, token_final_scales=token_final_scales, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, local_expert_offset=self.slot_start, local_num_experts=self.expert_size_per_partition, tile_tokens_dim=tile_size, @@ -574,7 +583,7 @@ def run_moe_nvfp4_impl( num_non_exiting_tiles=num_non_exiting_tiles, global_sf=self.fc2_input_scale, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, @@ -591,7 +600,7 @@ def run_moe_nvfp4_impl( permuted_idx_to_expanded_idx=permuted_idx_to_expanded_idx, num_non_exiting_tiles=num_non_exiting_tiles, tile_tokens_dim=tile_size, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, ep_size=self.mapping.moe_ep_size, enable_alltoall=enable_alltoall, ) @@ -612,7 +621,7 @@ def run_moe_nvfp4_impl( num_non_exiting_tiles=num_non_exiting_tiles, token_final_scales=token_final_scales, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, @@ -629,7 +638,7 @@ def run_moe_nvfp4_impl( tile_idx_to_group_idx=tile_idx_to_expert_idx, num_non_exiting_tiles=num_non_exiting_tiles, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, diff --git 
a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index 83aae9a06a5..5432445bc5f 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -76,14 +76,17 @@ class CutlassFusedMoE(MoE): "sm_constraint": ("min", 89), "dtypes": {torch.float16, torch.bfloat16, torch.float32}, }, - # FP8_BLOCK_SCALES: SM == 90 only + # FP8_BLOCK_SCALES: SM == 90 only (float16 not supported by kernel) QuantAlgo.FP8_BLOCK_SCALES: { "sm_constraint": ("exact", 90), - "dtypes": {torch.float16, torch.bfloat16, torch.float32}, + "dtypes": {torch.bfloat16}, }, - # NVFP4: SM in {100, 103} + # NVFP4: SM in {100, 103, 120, 121} + # SM 120 = desktop Blackwell (e.g. RTX 5090 / GB202) + # SM 121 = GB10 / DGX Spark + # C++ kernel: isValidSM120MOESpecialisation() supports FP4xFP4 and FP8xFP4 QuantAlgo.NVFP4: { - "sm_constraint": ("in", {100, 103}), + "sm_constraint": ("in", {100, 103, 120, 121}), "dtypes": {torch.float16, torch.bfloat16, torch.float8_e4m3fn}, }, # W4A8_AWQ: SM in {89, 90} only @@ -130,7 +133,7 @@ def can_implement( - Unquantized (FP16/BF16): SM >= 80 - FP8 per-tensor (QDQ): SM >= 89 - FP8_BLOCK_SCALES: SM == 90 only - - NVFP4: SM in {100, 103} + - NVFP4: SM in {100, 103, 120, 121} - W4A8_AWQ: SM in {89, 90} only - W8A16: SM >= 80 - W4A16_MXFP4: SM == 90 only diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 49c00f8c752..2a349c28e03 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -1900,7 +1900,7 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): [torch.stack(all_w3_scales), torch.stack(all_w1_scales)], dim=-2) - w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16).view(module.dtype) + w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16) w3_w1_s_shape = 
w3_w1_scales.shape w3_w1_scales_interleaved = w3_w1_scales.reshape( w3_w1_s_shape[0], w3_w1_s_shape[1], @@ -1928,8 +1928,7 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): w2_scales_shard, (0, pad_size_inter, 0, pad_size_hidden)) all_w2_scales.append(w2_scales_shard) - w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16).view( - module.dtype) + w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16) w2_s_shape = w2_scales.shape w2_scales_interleaved = w2_scales.reshape( w2_s_shape[0], w2_s_shape[1], diff --git a/tensorrt_llm/grpc/grpc_request_manager.py b/tensorrt_llm/grpc/grpc_request_manager.py index c18af48ba26..c0fa15af4f6 100644 --- a/tensorrt_llm/grpc/grpc_request_manager.py +++ b/tensorrt_llm/grpc/grpc_request_manager.py @@ -233,10 +233,11 @@ def create_sampling_params_from_proto( proto_config: pb2.SamplingConfig, output_config: pb2.OutputConfig, max_tokens: int, - end_id: Optional[int] = None, - pad_id: Optional[int] = None, - bad_words: Optional[List[pb2.TokenSequence]] = None, - stop_words: Optional[List[pb2.TokenSequence]] = None, + stop: Optional[List[str]] = None, + stop_token_ids: Optional[List[int]] = None, + ignore_eos: bool = False, + bad: Optional[List[str]] = None, + bad_token_ids: Optional[List[int]] = None, guided_decoding: Optional[pb2.GuidedDecodingParams] = None, embedding_bias: Optional[List[float]] = None, ) -> SamplingParams: @@ -246,10 +247,11 @@ def create_sampling_params_from_proto( proto_config: Protobuf SamplingConfig message output_config: Protobuf OutputConfig message max_tokens: Maximum tokens to generate - end_id: End-of-sequence token ID - pad_id: Padding token ID - bad_words: Bad word token sequences - stop_words: Stop word token sequences + stop: Stop strings (tokenized by TRT-LLM's _setup()) + stop_token_ids: Stop token IDs + ignore_eos: Whether to ignore end-of-sequence token + bad: Bad word strings (tokenized by TRT-LLM's _setup()) + bad_token_ids: Bad word token IDs guided_decoding: Guided 
decoding parameters embedding_bias: Embedding bias tensor @@ -317,13 +319,19 @@ def create_sampling_params_from_proto( if proto_config.HasField("no_repeat_ngram_size"): kwargs["no_repeat_ngram_size"] = proto_config.no_repeat_ngram_size - # End/pad tokens - if end_id is not None: - kwargs["end_id"] = end_id - if end_id == -1: - kwargs["ignore_eos"] = True - if pad_id is not None: - kwargs["pad_id"] = pad_id + # Stop sequences and ignore_eos (TRT-LLM's _setup() tokenizes stop strings) + if stop: + kwargs["stop"] = stop + if stop_token_ids: + kwargs["stop_token_ids"] = stop_token_ids + if ignore_eos: + kwargs["ignore_eos"] = True + + # Bad words (TRT-LLM's _setup() tokenizes bad word strings) + if bad: + kwargs["bad"] = bad + if bad_token_ids: + kwargs["bad_token_ids"] = bad_token_ids # Output configuration - logprobs if output_config.HasField("logprobs"): @@ -337,11 +345,6 @@ def create_sampling_params_from_proto( if output_config.exclude_input_from_output: kwargs["exclude_input_from_output"] = True - # Pre-tokenized stop/bad word sequences (set after construction since - # SamplingParams._stop_word_ids/_bad_word_ids are init=False fields) - stop_word_ids = [list(seq.token_ids) for seq in stop_words] if stop_words else None - bad_word_ids = [list(seq.token_ids) for seq in bad_words] if bad_words else None - # Embedding bias if embedding_bias: kwargs["embedding_bias"] = list(embedding_bias) @@ -363,13 +366,6 @@ def create_sampling_params_from_proto( params = SamplingParams(**kwargs) - # Set pre-tokenized stop/bad word IDs directly (these come pre-tokenized - # from the router, so we bypass the tokenizer-based setup path) - if stop_word_ids: - params._stop_word_ids = stop_word_ids - if bad_word_ids: - params._bad_word_ids = bad_word_ids - return params diff --git a/tensorrt_llm/grpc/grpc_servicer.py b/tensorrt_llm/grpc/grpc_servicer.py index 4ad8addd80d..5dbb8291348 100644 --- a/tensorrt_llm/grpc/grpc_servicer.py +++ b/tensorrt_llm/grpc/grpc_servicer.py @@ -97,10 
+97,11 @@ async def Generate( proto_config=request.sampling_config, output_config=request.output_config, max_tokens=request.max_tokens, - end_id=request.end_id if request.HasField("end_id") else None, - pad_id=request.pad_id if request.HasField("pad_id") else None, - bad_words=list(request.bad_words) if request.bad_words else None, - stop_words=list(request.stop_words) if request.stop_words else None, + stop=list(request.stop) if request.stop else None, + stop_token_ids=list(request.stop_token_ids) if request.stop_token_ids else None, + ignore_eos=request.ignore_eos, + bad=list(request.bad) if request.bad else None, + bad_token_ids=list(request.bad_token_ids) if request.bad_token_ids else None, guided_decoding=request.guided_decoding if request.HasField("guided_decoding") else None, @@ -485,15 +486,18 @@ def _complete_responses( complete = trtllm_service_pb2.GenerateComplete( output_token_ids=output_tokens, sequence_index=completion.index, - finish_reason=completion.finish_reason or "stop", + finish_reason=completion.finish_reason or "", prompt_tokens=len(prompt_token_ids), completion_tokens=len(output_tokens), cached_tokens=cached_tokens, ) - # Add stop reason if available - if hasattr(completion, "stop_reason") and completion.stop_reason: - complete.stop_reason = str(completion.stop_reason) + # Add matched stop if available (int token ID or str stop sequence) + if hasattr(completion, "stop_reason") and completion.stop_reason is not None: + if isinstance(completion.stop_reason, int): + complete.matched_token_id = completion.stop_reason + else: + complete.matched_stop_str = str(completion.stop_reason) # Add generation logprobs if available if completion.logprobs: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 60f0624fc3d..91cc83b32d5 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1565,17 
+1565,8 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, (False, False, False, True), (True, False, True, True), (True, True, True, True)]) @parametrize_with_ids("mtp", ["disable", "eagle", "vanilla"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph, - overlap_scheduler, torch_compile, - enable_configurable_moe, mocker): - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + overlap_scheduler, torch_compile): if torch_compile and mtp != "disable": pytest.skip("https://nvbugs/5252313") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) @@ -2001,28 +1992,9 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph, ids=["tp4", "ep4", "tp2pp2", "pp4"]) @parametrize_with_ids("mtp_nextn", [0, 2]) @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, tp_size, pp_size, ep_size, - torch_compile, mtp_nextn, moe_backend, - enable_configurable_moe, mocker): - # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend not in [ - "TRTLLM", "CUTLASS" - ]: - pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " - f"current backend is {moe_backend}") - - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ - "TRTLLM", "CUTLASS" - ] else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + torch_compile, 
mtp_nextn, moe_backend): sm_version = get_sm_version() if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") @@ -4119,27 +4091,9 @@ def test_nvfp4( ids=["latency", "ep2", "ep4"]) @pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"], ids=["fp8", "mxfp8"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - activation_dtype, enable_configurable_moe, mocker): - # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend not in [ - "TRTLLM", "CUTLASS" - ]: - pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " - f"current backend is {moe_backend}") - - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ - "TRTLLM", "CUTLASS" - ] else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + activation_dtype): if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100: pytest.skip( "CUTLASS or TRTLLM moe backend requires Blackwell or newer.") @@ -5230,17 +5184,8 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") - def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe, - mocker): + def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker): """Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP.""" - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = 
"1" if enable_configurable_moe == 1 else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 48f57261a98..cd399acb415 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2212,94 +2212,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): metafunc.parametrize("case", uts, ids=lambda x: x) -# Test cases that use enable_configurable_moe parameter and need ID conversion -TESTS_WITH_CONFIGURABLE_MOE = [ - "TestDeepSeekV3Lite::test_nvfp4_4gpus[", - "TestDeepSeekV3Lite::test_fp8_block_scales[", - "TestGPTOSS::test_w4_4gpus_online_eplb[", - "TestQwen3_30B_A3B::test_w4a8_mxfp4[", -] - - -def _convert_clean_to_original_moe_test_id(test_id): - """Convert clean MoE test ID back to original format for pytest collection. - - Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]" - - This is needed because the `enable_configurable_moe` parameter uses empty string - as ID when value is 0, resulting in test IDs like "test_foo[-param]". - We clean these up in pytest_collection_modifyitems, but pytest filters tests - during collection using the original IDs. So when user runs with clean test name, - we need to convert it back to match the original. 
- """ - if "test_llm_api_pytorch.py" not in test_id: - return test_id - - # Match pattern like "test_name[params]" and add leading dash after "[" - # But only if params don't already start with "-" or "enable_configurable_moe" - match = re.search(r"\[([^\]]+)\]", test_id) - if match: - params = match.group(1) - # Skip if already has leading dash or starts with enable_configurable_moe - if not params.startswith("-") and not params.startswith( - "enable_configurable_moe"): - # Add leading dash to params - new_params = "-" + params - test_id = test_id.replace(f"[{params}]", f"[{new_params}]") - - return test_id - - -def pytest_sessionstart(session): - """Convert clean MoE test IDs in config.args to original format for collection. - - This is needed because pytest filters tests during collection using original IDs. - When user runs with clean test name, we convert it back to match the original. - """ - args = session.config.args - for i, arg in enumerate(args): - if "test_llm_api_pytorch.py" in arg and "[" in arg: - # Only apply conversion to specific tests that use enable_configurable_moe - should_convert = any(test_name in arg - for test_name in TESTS_WITH_CONFIGURABLE_MOE) - if should_convert: - args[i] = _convert_clean_to_original_moe_test_id(arg) - - -def _clean_moe_test_ids(items): - """Clean up test IDs by removing leading/trailing dashes from parameter IDs. - - This is needed because `enable_configurable_moe` parameter can be empty, - resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". - We clean these up to "test_foo[True]" or "test_foo[abc]" so that: - 1. Test names in waive files and test lists remain unchanged - 2. 
Test reports look cleaner - """ - for item in items: - if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid: - # Only apply cleanup to specific tests that use enable_configurable_moe - should_cleanup = any(test_name in item.nodeid - for test_name in TESTS_WITH_CONFIGURABLE_MOE) - if should_cleanup: - original_nodeid = item.nodeid - original_name = item.name - nodeid = item.nodeid - name = item.name - - # Clean up leading/trailing dashes in nodeid - nodeid = nodeid.replace("[-", "[") - nodeid = nodeid.replace("-]", "]") - - # Clean up leading/trailing dashes in name - name = name.replace("[-", "[") - name = name.replace("-]", "]") - - if nodeid != original_nodeid: - item._nodeid = nodeid - if name != original_name: - item.name = name - - @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_collection_modifyitems(session, config, items): testlist_path = config.getoption("--test-list") @@ -2308,10 +2220,6 @@ def pytest_collection_modifyitems(session, config, items): perf_test = config.getoption("--perf") test_model_suites = config.getoption("--test-model-suites") - # TODO Once the MoE refactor is complete, this should be removed. - # This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists. 
- _clean_moe_test_ids(items) - if perf_test: global ALL_PYTEST_ITEMS ALL_PYTEST_ITEMS = None diff --git a/tests/integration/defs/test_fmha.py b/tests/integration/defs/test_fmha.py index c596da374f6..96ac2864966 100644 --- a/tests/integration/defs/test_fmha.py +++ b/tests/integration/defs/test_fmha.py @@ -3,6 +3,8 @@ from pathlib import Path from subprocess import run +from tests.unittest.utils.util import getSMVersion + def test_fmha(): build_run = partial(run, shell=True, check=True) @@ -14,6 +16,17 @@ def test_fmha(): try: os.chdir(fmha_v2_dir) + test_arch = getSMVersion() + # SM70 is deprecated in TRTLLM, so we don't need to test it + all_archs = [80, 86, 89, 90, 100, 120] + + # TODO Find a way to get this programmatically + # Filter out the architectures that are tested explicitly to not double up + tested_archs = [80, 86, 89, 90] + + # Select the family we belong to (e.g. 103 -> 100) + test_arch = max(filter(lambda x: x <= test_arch, all_archs)) + env = os.environ.copy() env.update({ "TORCH_CUDA_ARCH_LIST": "9.0", @@ -26,10 +39,23 @@ def test_fmha(): "1", # Do not run tests with skip-softmax feature. 
}) - build_run( - "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin") - build_run("python3 setup.py", env=env) - build_run("make -j 16", env=env) + # The test executable is too large if we build all the architectures, so we must build architectures individually + def build_arch(arch): + env["FMHA_FILTER_ARCH"] = str(arch) + build_run( + "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin") + build_run("python3 setup.py", env=env) + build_run("make -j 16", env=env) + + # As part of the A100 test we compile all the architectures we dont have executors for, even if we dont run them + if test_arch == 80: + build_only_on_archs = set(all_archs) - set(tested_archs) + + for arch in build_only_on_archs: + build_arch(arch) + + # Run the test of our current architecture + build_arch(test_arch) build_run("pytest fmha_test.py", env=env) finally: diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml index a7fc2569ea8..bfff4e577cd 100644 --- a/tests/integration/test_lists/test-db/l0_a100.yml +++ b/tests/integration/test_lists/test-db/l0_a100.yml @@ -105,4 +105,4 @@ l0_a100: stage: post_merge backend: fmha tests: - - test_fmha.py::test_fmha TIMEOUT (90) + - test_fmha.py::test_fmha TIMEOUT (120) # Longer timeout for A100 as it builds all the architectures diff --git a/tests/integration/test_lists/test-db/l0_b300.yml b/tests/integration/test_lists/test-db/l0_b300.yml index c866812d89b..7cd7fefc31b 100644 --- a/tests/integration/test_lists/test-db/l0_b300.yml +++ b/tests/integration/test_lists/test-db/l0_b300.yml @@ -21,7 +21,6 @@ l0_b300: - unittest/_torch/thop/serial - unittest/_torch/executor # 250s # ------------- modules (non-MoE) --------------- - - unittest/_torch/modules/test_mla_helix.py - unittest/_torch/modules/test_fused_add_rms_norm_quant.py - unittest/_torch/modules/test_fused_activation_quant.py - unittest/_torch/modules/test_awq_quantization.py @@ -33,7 +32,6 @@ l0_b300: # ------------- 
MoE components tests --------------- - unittest/_torch/modules/test_moe_load_balancer.py - unittest/_torch/modules/test_moe_routing.py - - unittest/_torch/modules/test_moe_host_sharer.py # ------------- legacy MoE tests --------------- - unittest/_torch/modules/test_fused_moe.py # ------------- MoE: test_moe_backend (by backend) --------------- @@ -41,11 +39,21 @@ l0_b300: - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "CUTEDSL" - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "DEEPGEMM" - # ------------- MoE: test_single_gpu (by backend) --------------- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTLASS" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "TRTLLM" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTEDSL" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "DEEPGEMM" + # ------------- MoE: test_single_gpu (specific quant per backend) --------------- + # CUTLASS backend: FP8, NVFP4, W4A8_MXFP4_MXFP8, W8A16 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=W8A16-routing=Renormalize] + # TRTLLM backend: NVFP4, FP8_BLOCK_SCALES, 
W4A8_NVFP4_FP8, W4A16_MXFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_NVFP4_FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] + # CUTEDSL backend: NVFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTEDSL-quant=NVFP4-routing=Renormalize] + # DEEPGEMM backend: FP8_BLOCK_SCALES + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=DEEPGEMM-quant=FP8_BLOCK_SCALES-routing=Renormalize] # ---- end MoE tests ---- - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 4821c6a3c14..d57e041023b 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -16,30 +16,6 @@ l0_dgx_b200: orchestrator: mpi tests: - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy - # ------------- legacy MoE tests --------------- - - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[NVLinkTwoSided] - # ------------- MoE: test_multi_gpu (by backend x quant) --------------- - # --- CUTLASS --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and None" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and NVFP4" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_MXFP4_MXFP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" - # --- TRTLLM --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and NVFP4 and not W4A8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and FP8_BLOCK_SCALES" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_NVFP4_FP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A16_MXFP4" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_MXFP4_MXFP8" - # --- CUTEDSL (NVFP4 only) --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTEDSL" - # --- DEEPGEMM (FP8_BLOCK_SCALES only) --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "DEEPGEMM" - # ------------- MoE: test_multi_gpu_eplb --------------- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - # ---- end MoE tests ---- - 
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True] - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True] - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_4gpu_mtp_ar TIMEOUT (60) @@ -60,6 +36,39 @@ l0_dgx_b200: - unittest/_torch/visual_gen/test_wan_i2v.py::TestWanI2VCombinedOptimizations::test_all_optimizations_combined - unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxParallelism::test_ulysses_2gpu_correctness - unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: pre_merge + backend: pytorch + orchestrator: mpi + tests: + # ------------- MoE: test_multi_gpu (by backend x quant) --------------- + # --- CUTLASS --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and NVFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_MXFP4_MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" + # --- TRTLLM --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and NVFP4 and not W4A8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and FP8_BLOCK_SCALES" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_NVFP4_FP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A16_MXFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and 
W4A8_MXFP4_MXFP8" + # --- CUTEDSL (NVFP4 only) --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTEDSL" + # --- DEEPGEMM (FP8_BLOCK_SCALES only) --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "DEEPGEMM" + # ------------- MoE: test_multi_gpu_eplb --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - condition: ranges: system_gpu_count: @@ -169,7 +178,6 @@ l0_dgx_b200: backend: pytorch orchestrator: mpi tests: - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml index 2b71bb5bac1..602d7112ec4 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml @@ -17,22 +17,32 @@ l0_dgx_b300: tests: - unittest/_torch/attention - unittest/_torch/executor - # ------------- modules (non-MoE) --------------- + # ------------- modules (multi-GPU) --------------- - unittest/_torch/modules/test_mla_helix.py - - unittest/_torch/modules/test_fused_add_rms_norm_quant.py - - unittest/_torch/modules/test_fused_activation_quant.py - - unittest/_torch/modules/test_awq_quantization.py - - unittest/_torch/modules/test_triton_linear.py - - unittest/_torch/modules/test_group_rmn_norm.py - - unittest/_torch/modules/test_rotary_embedding.py - - unittest/_torch/modules/mamba - - unittest/_torch/modules/tests_lora_modules - # ------------- MoE components 
tests --------------- - - unittest/_torch/modules/test_moe_load_balancer.py - - unittest/_torch/modules/test_moe_routing.py + # ------------- MoE components tests (multi-GPU) --------------- - unittest/_torch/modules/test_moe_host_sharer.py - # ------------- legacy MoE tests --------------- + # ------------- legacy MoE tests (multi-GPU) --------------- - unittest/_torch/modules/test_fused_moe.py + # ------------- MoE: multi-GPU module tests (DEP parallel, per backend per quant) --------------- + # CUTLASS backend: FP8, NVFP4, W4A8_MXFP4_MXFP8, W8A16 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=W8A16-routing=Renormalize] + # TRTLLM backend: NVFP4, FP8_BLOCK_SCALES, W4A8_NVFP4_FP8, W4A16_MXFP4, W4A8_MXFP4_MXFP8 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + - 
unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_NVFP4_FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + # CUTEDSL backend: NVFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTEDSL-quant=NVFP4-routing=Renormalize] + # DEEPGEMM backend: FP8_BLOCK_SCALES + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=DEEPGEMM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + # ------------- MoE: EPLB (Expert Load Balancing) tests --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] - unittest/_torch/modeling -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k 
"modeling_gpt_oss" diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 7ac527a94d7..31535f49e81 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -143,6 +143,30 @@ l0_dgx_h100: - disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] - disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] - disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*h100*' + linux_distribution_name: ubuntu* + terms: + stage: pre_merge + backend: pytorch + auto_trigger: others + orchestrator: mpi + tests: + # ------------- MoE: test_multi_gpu (by backend x quant) --------------- + # Only CUTLASS backend runs on H100 (SM90). TRTLLM/CUTEDSL/DEEPGEMM require SM100+. + # --- CUTLASS --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A16_MXFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_AWQ" + # ------------- MoE: test_multi_gpu_eplb --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - condition: ranges: system_gpu_count: @@ -160,12 +184,6 @@ l0_dgx_h100: tests: - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp1-bf16-trtllm-deepseekv3_lite] - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp4-bf16-trtllm-deepseekv3_lite] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] - - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[NVLinkTwoSided] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype1] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype1] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] diff --git a/tests/integration/test_lists/test-db/l0_gb10.yml b/tests/integration/test_lists/test-db/l0_gb10.yml index a749f27a54e..efc6a64530f 100644 --- a/tests/integration/test_lists/test-db/l0_gb10.yml +++ b/tests/integration/test_lists/test-db/l0_gb10.yml @@ -38,5 +38,5 @@ l0_gb10: # Below cases which are commented out due to they failed on gb10 # - unittest/_torch/modeling -k "modeling_mllama" - unittest/_torch/modeling -k "modeling_out_of_tree" - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0] - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.float16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - 
unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] diff --git a/tests/integration/test_lists/test-db/l0_gb202.yml b/tests/integration/test_lists/test-db/l0_gb202.yml index 0255ba1086d..84c63b7f7a5 100644 --- a/tests/integration/test_lists/test-db/l0_gb202.yml +++ b/tests/integration/test_lists/test-db/l0_gb202.yml @@ -17,8 +17,8 @@ l0_gb202: # ------------- PyTorch tests --------------- - unittest/_torch/modeling -k "modeling_mllama" - unittest/_torch/modeling -k "modeling_out_of_tree" - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype1] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.float16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573 - unittest/_torch/attention/test_attention_mla.py - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 3c3a4b0cbd4..6631322f8d9 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -36,6 +36,10 @@ l0_h100: - unittest/_torch/modules/test_moe_host_sharer.py # ------------- legacy MoE tests --------------- - unittest/_torch/modules/test_fused_moe.py + # ------------- MoE: test_moe_backend (by backend) --------------- + - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k 
"CUTLASS" + # ------------- MoE: test_single_gpu (by backend) --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTLASS" - unittest/_torch/multimodal - unittest/_torch/sampler - unittest/_torch/speculative -k "eagle3" diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml index 96ee52c85dc..a2c9e0fcb51 100644 --- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml +++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml @@ -18,8 +18,6 @@ l0_rtx_pro_6000: - unittest/_torch/modeling -k "modeling_out_of_tree" # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573 - unittest/_torch/attention/test_attention_mla.py - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype1] - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/unittest/_torch/modules/moe/moe_test_utils.py b/tests/unittest/_torch/modules/moe/moe_test_utils.py index 5db2ddbc020..69c6418559f 100644 --- a/tests/unittest/_torch/modules/moe/moe_test_utils.py +++ b/tests/unittest/_torch/modules/moe/moe_test_utils.py @@ -28,6 +28,7 @@ """ import logging +import os import time from dataclasses import dataclass from enum import Enum @@ -92,6 +93,41 @@ def __str__(self) -> str: # ============================================================================ # Skip Logic Functions # ============================================================================ +def _is_fp4_fp8_standalone_gemm_available() -> bool: + """Check 
if standalone fp4_fp8_gemm_trtllmgen kernel has compiled configs on this GPU. + + The W4A8_NVFP4_FP8 reference module (W4A8NVFP4FP8RefGatedMLPFusedMoE) uses + standalone fp4_fp8_gemm_trtllmgen GEMM calls via W4A8NVFP4FP8LinearMethod. + These standalone GEMM kernels may not have compiled configurations for all SM + versions, even when the fused MoE kernel (TRTLLMGenFusedMoE) works fine. + + Returns True if the standalone kernel is available, False otherwise. + Result is cached after first call. + """ + if hasattr(_is_fp4_fp8_standalone_gemm_available, "_cached_result"): + return _is_fp4_fp8_standalone_gemm_available._cached_result + + try: + import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils + + # Create minimal valid tensors for GEMM probe: + # mat1: (m, k) FP8, mat2: (n, k/2) FP4, scale: FP8, global_scale: FP32 + m, n, k = 1, 128, 128 + fp8_input = torch.zeros((m, k), dtype=torch.float8_e4m3fn, device="cuda") + fp4_weight = torch.zeros((n, k // 2), dtype=fp4_utils.float4_e2m1x2, device="cuda") + weight_scale = torch.ones((n * (k // 32),), dtype=torch.float8_e4m3fn, device="cuda") + global_scale = torch.ones((1,), dtype=torch.float32, device="cuda") + torch.ops.trtllm.fp4_fp8_gemm_trtllmgen( + fp8_input, fp4_weight, weight_scale, global_scale, torch.float16 + ) + result = True + except RuntimeError: + result = False + + _is_fp4_fp8_standalone_gemm_available._cached_result = result + return result + + def should_skip_trtllm( backend_type: MoeBackendType, quant_algo: Optional[QuantAlgo], @@ -99,6 +135,8 @@ def should_skip_trtllm( routing_method_cls=None, swiglu_gptoss_style: bool = False, comm_method: Optional[str] = None, + seq_len: Optional[int] = None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check TRTLLM Gen backend specific constraints. @@ -115,6 +153,8 @@ def should_skip_trtllm( swiglu_gptoss_style: Whether using swiglu gptoss style comm_method: Optional communication method (e.g. 
"DEEPEP", "DEEPEPLOWLATENCY") for multi-GPU EP mode checks + seq_len: Optional sequence length for seq_len-sensitive skip checks + moe_tp_size: MoE TP parallelism size (default: 1, no TP sharding) Returns: Skip reason string if test should be skipped, None otherwise @@ -226,6 +266,20 @@ def should_skip_trtllm( f"block_scale_interleave_reverse rows must be multiple of 128." ) + # -----------------Reference module constraints------------------ + # The W4A8_NVFP4_FP8 reference module (W4A8NVFP4FP8RefGatedMLPFusedMoE) uses + # standalone fp4_fp8_gemm_trtllmgen GEMM calls via W4A8NVFP4FP8LinearMethod. + # These standalone GEMM kernels may not have compiled configs for all SM versions, + # even though the fused MoE kernel (TRTLLMGenFusedMoE) works fine on those SMs. + # Skip if the standalone kernel is not available on the current GPU. + if quant_algo == QuantAlgo.W4A8_NVFP4_FP8: + if not _is_fp4_fp8_standalone_gemm_available(): + return ( + "W4A8_NVFP4_FP8 reference module requires standalone " + "fp4_fp8_gemm_trtllmgen kernel which is not available on this GPU. " + "The fused MoE kernel works but the reference GatedMLP cannot run." + ) + # -----------------Potential issues------------------ # These are known issues that need investigation. Skipping to avoid test failures # and CUDA errors that can cascade to subsequent tests. @@ -237,6 +291,30 @@ def should_skip_trtllm( "causes CUDA illegal memory access." ) + # Issue: NVFP4 with large expert count + large hidden_size + seq_len=1 + # has a single FP4BlockScaleMoERunner tactic with accuracy failure. + # Observed: e256_k8_h7168_i2048, seq=1, bfloat16 — tactic[204] with tile + # config [8, 83] produces 8.37% element mismatch (threshold: 3%). + # All other 207/208 tactics pass. seq=8 with the same config also passes + # (different tile behavior). The swiglu_gptoss_style variant passes too + # (uses relaxed tolerance: rtol=0.1, percent=0.95). 
+ # Root cause: FP4 quantization error accumulates in the large GEMM reduction + # dimension (h=7168) and the [8, 83] tile config hits an edge case at seq=1. + if ( + quant_algo == QuantAlgo.NVFP4 + and not swiglu_gptoss_style + and seq_len == 1 + and num_experts >= 256 + and model_config.hidden_size >= 7168 + ): + return ( + f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large model " + f"(num_experts={num_experts}, hidden_size={model_config.hidden_size}) " + f"and seq_len=1: 207/208 tactics pass but tactic[204] " + f"(FP4BlockScaleMoERunner tile [8, 83]) has 8.37% mismatch " + f"(threshold 3%). seq_len=8 passes all tactics." + ) + # Issue: NVFP4 with large intermediate_size has known accuracy issues if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336: return ( @@ -285,6 +363,43 @@ def should_skip_trtllm( f"Single-GPU tests pass; issue is in the kernel runner under EP." ) + # Issue: NVFP4 with large model configs crashes with CUDA illegal memory + # access in DeepEP mode (deep_ep.cpp:86). + # Verified: e60_k4_h2048_i1408 passes, e256_k8_h7168_i2048 crashes. + # The crash kills the entire pytest process, blocking all subsequent tests. + if ( + quant_algo == QuantAlgo.NVFP4 + and num_experts >= 256 + and model_config.hidden_size >= 7168 + ): + return ( + f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large model " + f"(num_experts={num_experts}, hidden_size={model_config.hidden_size}) " + f"crashes with CUDA illegal memory access in DeepEP mode " + f"(comm={comm_method}). Smaller configs pass." + ) + + # TP per-shard alignment: when moe_tp_size > 1, intermediate_size is sharded. 
+ # MXFP4 variants (W4A16_MXFP4, W4A8_MXFP4_MXFP8) auto-pad to 128 alignment, + # but other quants (FP8_BLOCK_SCALES, NVFP4, W4A8_NVFP4_FP8) crash: + # - FP8_BLOCK_SCALES: block scale tensor size mismatch + # (ceil(per_shard/128) vs floor(per_shard/128)) + # - NVFP4: unswizzle_sf shape '[-1, w3_w1, 128]' invalid + # - W4A8_NVFP4_FP8: No valid config for non-aligned N dimension + if moe_tp_size > 1 and intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + tp_crash_quants = { + QuantAlgo.FP8_BLOCK_SCALES, + QuantAlgo.NVFP4, + QuantAlgo.W4A8_NVFP4_FP8, + } + if quant_algo in tp_crash_quants and per_shard % 128 != 0: + return ( + f"TRTLLMGenFusedMoE {quant_algo}: per-shard intermediate_size=" + f"{per_shard} (= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) + return None @@ -294,6 +409,7 @@ def should_skip_cutedsl( model_config: "MoeModelConfig" = None, comm_method: Optional[str] = None, routing_method_cls=None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check CuteDSL backend specific constraints. @@ -304,42 +420,45 @@ def should_skip_cutedsl( if backend_type != MoeBackendType.CUTEDSL: return None - # DeepEPLowLatency _modify_output_to_adapt_fused_moe converts dispatch output - # to a format where token_selected_slots has shape [num_local_experts, tokens_per_expert] - # instead of [num_tokens, top_k]. CuteDSL moe_sort asserts - # token_selected_experts.size(1) == top_k, which fails with this format. - if comm_method == "DEEPEPLOWLATENCY": - return ( - "[Potential Bug] CuteDslFusedMoE is incompatible with DeepEPLowLatency: " - "DeepEPLowLatency _modify_output_to_adapt_fused_moe reshapes " - "token_selected_slots to [num_local_experts, tokens_per_expert] " - "(effectively top_k=1), but CuteDSL moe_sort requires " - "token_selected_experts.size(1) == top_k." 
- ) - if model_config is None: return None intermediate_size = model_config.intermediate_size - num_experts = model_config.num_experts - # NVFP4 with large intermediate_size has known accuracy issues + # NVFP4 with large intermediate_size has known accuracy issues (8.5% mismatch + # at i=14336, threshold 3%). Both CuteDSL and reference have FP4 intermediate + # storage, but produce DIFFERENT FP4 values due to: + # 1) SwiGLU precision: CuteDSL kernel uses approximate math ops for sigmoid + # (rcp_approx + exp2 fastmath, see utils.py:sigmoid_f32), while reference + # Triton kernel uses standard tl.sigmoid (see swiglu.py:42). + # 2) Precision chain: CuteDSL computes SwiGLU in FP32 (GEMM accumulator → + # FP32 SwiGLU → FP4), reference goes FP32 accumulator → BF16 → SwiGLU → + # BF16 → fp4_quantize. Two BF16 truncation points create different values. + # 3) FP4 quantization: CuteDSL uses rcp_approx for block scale reciprocal + # (blockscaled_...fusion.py:2588), fp4_quantize uses exact division. + # These per-element FP4 value differences accumulate through FC2 GEMM dot + # product (K=intermediate_size). CUTLASS avoids this entirely with a single + # fused kernel keeping BF16 intermediate precision. if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336: return ( - f"[Potential Bug] CuteDslFusedMoE NVFP4 with large intermediate_size " - f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336)." - ) - - # NVFP4 with prime num_experts causes CUDA_ERROR_ILLEGAL_ADDRESS - prime_experts_with_issues = {7, 13} - if quant_algo == QuantAlgo.NVFP4 and num_experts in prime_experts_with_issues: - return ( - f"[Potential Bug] CuteDslFusedMoE NVFP4 with prime num_experts={num_experts} " - f"causes CUDA_ERROR_ILLEGAL_ADDRESS due to autotuner cache bucket mapping." 
+ f"[Design Limitation] CuteDslFusedMoE NVFP4 with large " + f"intermediate_size has accuracy issues due to FP4 intermediate " + f"storage between FC1+SwiGLU and FC2 kernels " + f"(intermediate_size={intermediate_size} >= 14336, " + f"FC2 accumulates over K={intermediate_size} with 896+ blocks)." ) - # NVFP4 with Llama4Renormalize routing has significant accuracy issues on bfloat16. - # Observed mismatch up to 34.6% (threshold 2% at rtol=0.01, percent=0.98). + # NVFP4 with Llama4Renormalize routing has significant accuracy issues. + # Same root cause as the large intermediate_size skip above: CuteDSL and + # reference produce different FP4 intermediate values due to approximate + # math ops (rcp_approx, exp2 fastmath) and BF16 truncation differences. + # Llama4's sigmoid routing amplifies these differences: standard Renormalize + # uses softmax (weights sum to 1, per-expert errors averaged), while Llama4 + # uses sigmoid (weights independent in (0,1), per-expert errors summed + # without normalization). This amplifies FP4 value differences by ~top_k/2. + # Mismatch correlates with hidden_size (FC1 K dimension): h=512 passes, + # h=2048 fails 8-17%, h=7168 fails 24-35%. Observed: e60(9.4%), + # e64(16.5%), e256(34.6%), e384(30.9%) at threshold 3%. if routing_method_cls is not None: from tensorrt_llm._torch.modules.fused_moe import Llama4RenormalizeMoeRoutingMethod @@ -348,8 +467,20 @@ def should_skip_cutedsl( and routing_method_cls == Llama4RenormalizeMoeRoutingMethod ): return ( - "[Potential Bug] CuteDslFusedMoE NVFP4 with Llama4Renormalize " - "routing has significant accuracy issues (mismatch up to 34.6%%)." + "[Design Limitation] CuteDslFusedMoE NVFP4 with Llama4Renormalize " + "routing: FP4 intermediate errors amplified by non-normalized " + "sigmoid routing weights (mismatch up to 34.6%)." + ) + + # TP per-shard alignment: NVFP4 requires 128-aligned per-shard intermediate_size. + # fp4_utils.py asserts M % 128 == 0 where M = 2 * per_shard (combined w3_w1). 
+ if moe_tp_size > 1 and quant_algo == QuantAlgo.NVFP4 and intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"CuteDslFusedMoE NVFP4: per-shard intermediate_size=" + f"{per_shard} (= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned. fp4_utils asserts M % 128 == 0." ) return None @@ -360,6 +491,8 @@ def should_skip_cutlass( comm_method: Optional[str] = None, quant_algo: Optional[QuantAlgo] = None, model_config: "MoeModelConfig" = None, + moe_tp_size: int = 1, + dtype=None, ) -> Optional[str]: """ Check CUTLASS backend specific constraints for multi-GPU tests. @@ -370,25 +503,35 @@ def should_skip_cutlass( if backend_type != MoeBackendType.CUTLASS: return None - # Issue: CUTLASS + DeepEP + (W4A8_MXFP4_MXFP8 or W8A16) has significant accuracy - # issues in multi-GPU EP mode. Observed failures: - # - e32_k8_h7168_i2048, seq=8: mismatch 24-37% (rtol=0.15) - # - e8_k1_h512_i512, seq=1/8: mismatch 86-100% (rtol=0.10), results completely wrong - # NVLINK communication with the same configs passes. - # Root cause: likely data layout or all-to-all dispatch/combine issue in the - # DeepEP communication path for these quantization methods. - if comm_method in ("DEEPEP", "DEEPEPLOWLATENCY"): - deepep_accuracy_quant_algos = { - QuantAlgo.W4A8_MXFP4_MXFP8, + # TP per-shard alignment: W8A16, NVFP4, and W4A8_AWQ require 128-aligned + # per-shard intermediate_size. W8A16 fails in preprocess_weights_for_mixed_gemm + # (num_rows % rows_per_tile != 0). NVFP4 pads to 128-alignment + # (NVFP4_ROW_ALIGNMENT in quantization.py:2312) but zero-padding + + # blockwise quantization interaction causes ~6-7% mismatch. + # W4A8_AWQ (WInt4AFP8FusedMoEMethod) requires K dimensions to be multiples + # of 128 on SM90 for interleave factor selection (quantization.py:1310-1324). + # W4A8_MXFP4_MXFP8 uses MXFP4 auto-padding that handles this correctly. 
+ if moe_tp_size > 1 and model_config is not None: + tp_alignment_quants = { QuantAlgo.W8A16, + QuantAlgo.NVFP4, + QuantAlgo.W4A8_AWQ, } - if quant_algo in deepep_accuracy_quant_algos: - return ( - f"[Potential Bug] CutlassFusedMoE {quant_algo} has significant accuracy " - f"issues with DeepEP communication (comm={comm_method}). " - f"Mismatch up to 100% on small models (e8_k1). " - f"NVLINK communication with the same config passes." - ) + # FP8_BLOCK_SCALES has this issue only on Hopper (SM90) + if torch.cuda.get_device_capability(0) == (9, 0): + tp_alignment_quants.add(QuantAlgo.FP8_BLOCK_SCALES) + + if quant_algo in tp_alignment_quants: + intermediate_size = model_config.intermediate_size + if intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"CutlassFusedMoE {quant_algo}: per-shard " + f"intermediate_size={per_shard} " + f"(= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) return None @@ -398,6 +541,7 @@ def should_skip_deepgemm( comm_method: Optional[str] = None, quant_algo: Optional[QuantAlgo] = None, model_config: "MoeModelConfig" = None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check DeepGemm backend specific constraints. @@ -409,20 +553,50 @@ def should_skip_deepgemm( return None # Issue: DEEPGEMM + FP8_BLOCK_SCALES crashes with CUDA illegal memory access - # on large expert counts (e.g. e384_k8_h7168_i2048) during post_load_weights(). - # The crash occurs in get_col_major_tma_aligned_packed_tensor (fp8_utils.py) - # when resmoothing FP8 E8M0 scales on SM100f (Blackwell). - # Small configs (e.g. e60_k4_h2048_i1408) pass fine. + # in _resmooth_kernel (Triton JIT) during post_load_weights() FP8 E8M0 scale + # resmoothing on SM100f (Blackwell). Root cause is a Triton compiler/runtime + # bug on SM100f: the kernel crashes when total grid blocks exceed ~65K. 
+ # The crash depends on grid size, not just num_experts — Grok-1 (e8, h=6144, + # i=32768) crashes despite having only 8 experts because its weight tensors + # produce grids with 196K+ blocks. + # Weight shapes: w3_w1=[E, I*2, H], w2=[E, H, I] (from quantization.py) + # Grid for resmooth: (E, cdiv(M,128), cdiv(K,128)) + # Verified boundary: max_blocks <= 57344 passes, >= 98304 crashes. + # Threshold: 65536 blocks (64K). Affected: DeepSeek-V3, Kimi-K2, Grok-1. + _RESMOOTH_GRID_BLOCK_LIMIT = 65536 if quant_algo == QuantAlgo.FP8_BLOCK_SCALES and model_config is not None: - if model_config.num_experts > 128: + num_e = model_config.num_experts + hidden = model_config.hidden_size + inter = model_config.intermediate_size + + def _cdiv(x, y): + return (x + y - 1) // y + + w31_blocks = num_e * _cdiv(inter * 2, 128) * _cdiv(hidden, 128) + w2_blocks = num_e * _cdiv(hidden, 128) * _cdiv(inter, 128) + max_blocks = max(w31_blocks, w2_blocks) + if max_blocks > _RESMOOTH_GRID_BLOCK_LIMIT: return ( - f"[Potential Bug] DeepGemmFusedMoE FP8_BLOCK_SCALES crashes with " - f"CUDA illegal memory access on large expert count " - f"(num_experts={model_config.num_experts}). The crash occurs in " - f"get_col_major_tma_aligned_packed_tensor during " - f"post_load_weights() FP8 E8M0 scale resmoothing on SM100f." + f"[Triton Bug] DeepGemmFusedMoE FP8_BLOCK_SCALES crashes in " + f"_resmooth_kernel on SM100f when grid blocks exceed ~64K " + f"(max_blocks={max_blocks:,} > {_RESMOOTH_GRID_BLOCK_LIMIT:,}). " + f"Affected: E={num_e}, H={hidden}, I={inter}." ) + # TP per-shard alignment: FP8_BLOCK_SCALES requires 128-aligned per-shard + # intermediate_size for block scale tensor operations. 
+ if moe_tp_size > 1 and quant_algo == QuantAlgo.FP8_BLOCK_SCALES and model_config is not None: + intermediate_size = model_config.intermediate_size + if intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"DeepGemmFusedMoE FP8_BLOCK_SCALES: per-shard " + f"intermediate_size={per_shard} " + f"(= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) + return None @@ -430,6 +604,7 @@ def should_skip_multi_gpu( parallel_mode: str, model_config: "MoeModelConfig", world_size: int = 4, + comm_method: Optional[str] = None, ) -> Optional[str]: """ Check if a multi-GPU test should be skipped due to EP partitioning constraints. @@ -442,10 +617,20 @@ def should_skip_multi_gpu( parallel_mode: Parallelism strategy ("DEP", "TEP", "DTP", "TTP") model_config: MoE model configuration containing num_experts world_size: Total number of GPUs (default: 4) + comm_method: Optional communication method (e.g. "DEEPEP", "DEEPEPLOWLATENCY") Returns: Skip reason string if test should be skipped, None otherwise """ + # DEEPEPLOWLATENCY hangs on H100 (SM90) in CI multi-GPU tests. + if comm_method == "DEEPEPLOWLATENCY": + capability = torch.cuda.get_device_capability(0) + if capability == (9, 0): + return ( + "[CI Hang] DEEPEPLOWLATENCY hangs on H100 (SM90) in " + "multi-GPU tests. Skipping until the issue is resolved." + ) + # Only EP modes have ep_size = world_size; TP modes have ep_size = 1 if parallel_mode not in ("DEP", "TEP"): return None @@ -527,6 +712,7 @@ def get_quick_skip_reason( model_config: "MoeModelConfig", routing_method_cls=None, swiglu_gptoss_style: bool = False, + seq_len: Optional[int] = None, ) -> Optional[str]: """ Fast skip check that calls backend's can_implement() method. 
@@ -534,6 +720,7 @@ def get_quick_skip_reason( Unified version supporting both backend-level and module-level tests: - routing_method_cls: Used by test_moe_module.py for routing method compatibility checks - swiglu_gptoss_style: Used by test_moe_backend.py for SwiGLU parameter checks + - seq_len: Optional sequence length for seq_len-sensitive skip checks Returns: Skip reason string if test should be skipped, None otherwise @@ -559,7 +746,12 @@ def get_quick_skip_reason( skip_checks = [ lambda: should_skip_routing_method(routing_method_cls, model_config), lambda: should_skip_trtllm( - backend_type, quant_algo, model_config, routing_method_cls, swiglu_gptoss_style + backend_type, + quant_algo, + model_config, + routing_method_cls, + swiglu_gptoss_style, + seq_len=seq_len, ), lambda: should_skip_cutedsl( backend_type, quant_algo, model_config, routing_method_cls=routing_method_cls @@ -600,6 +792,45 @@ def get_quick_skip_reason( trtllm_logger.setLevel(original_level) +# ============================================================================ +# GPU Memory Check +# ============================================================================ +def skip_if_insufficient_gpu_memory( + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: torch.dtype = torch.float32, + overhead_factor: float = 4.0, +) -> None: + """ + Skip the current test if estimated GPU memory exceeds device capacity. + + Each expert has gate_up_proj [2*I, H] + down_proj [H, I] = 3*H*I elements. + The overhead_factor (default 4x) accounts for ref model + DUT model + + quantization scales/activations + CUDA allocator overhead. 
+ + Args: + num_experts: Number of MoE experts + hidden_size: Hidden dimension size + intermediate_size: Intermediate (FFN) dimension size + dtype: Weight data type for byte-size calculation + overhead_factor: Multiplier over single-model weight bytes + """ + if not torch.cuda.is_available(): + return + bytes_per_elem = torch.tensor([], dtype=dtype).element_size() + single_model_bytes = num_experts * 3 * hidden_size * intermediate_size * bytes_per_elem + estimated_total_bytes = int(single_model_bytes * overhead_factor) + gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory + if estimated_total_bytes > gpu_total_bytes: + pytest.skip( + f"Estimated memory {estimated_total_bytes / (1 << 30):.1f}GB " + f"exceeds GPU memory {gpu_total_bytes / (1 << 30):.1f}GB " + f"(num_experts={num_experts}, hidden_size={hidden_size}, " + f"intermediate_size={intermediate_size}, dtype={dtype})" + ) + + # ============================================================================ # Autotuner Tactic Replay # ============================================================================ @@ -667,6 +898,113 @@ def create_test_param(param_values, test_id, skip_reason=None): return pytest.param(*param_values, id=test_id) +# ============================================================================ +# CI Mode Detection +# ============================================================================ +_TRTLLM_TEST_MOE_CI_ENV = "TRTLLM_TEST_MOE_CI" +IS_CI_MODE = os.environ.get(_TRTLLM_TEST_MOE_CI_ENV, "1") == "1" + +# ============================================================================ +# CI Acceleration Skip Logic +# ============================================================================ + +# Routing methods that require full routing coverage in CI +_CI_ROUTING_METHODS = {"Renormalize", "DeepSeekV3"} + + +def should_skip_to_accelerate_ci( + backend_type: "MoeBackendType", + quant_algo: Optional[QuantAlgo], + model_config: "MoeModelConfig", + routing_method_cls=None, + 
dtype: Optional[torch.dtype] = None, + seq_len: Optional[int] = None, + swiglu_gptoss_style: bool = False, + parallel_mode: Optional[str] = None, +) -> Optional[str]: + """ + Skip low-information-density test combinations to accelerate CI. + + Only active when TRTLLM_TEST_MOE_CI=1 (default). When TRTLLM_TEST_MOE_CI=0, + all combinations run (local exhaustive testing). + + Rules applied (in order): + 0. Skip unquantized (quant=None) — quantized paths are the focus of CI + 1. e256 model: only DeepSeekV3 routing, bfloat16, seq=1, non-gptoss + 2. Multi-GPU: only DEP and TTP parallel modes + 3. Routing: full 6 routing methods only on (CUTLASS or TRTLLM) with NVFP4; + other backend+quant combos only run Renormalize + and DeepSeekV3. This rule is overridden by rule 1 for e256. + + Args: + backend_type: MoE backend type + quant_algo: Quantization algorithm + model_config: MoE model configuration + routing_method_cls: Routing method class (None means no routing filter) + dtype: Activation data type + seq_len: Sequence length + swiglu_gptoss_style: Whether using SwiGLU gptoss style + parallel_mode: Multi-GPU parallel mode (None for single-GPU tests) + + Returns: + Skip reason string if test should be skipped for CI, None otherwise + """ + if not IS_CI_MODE: + return None + + if model_config is None: + return None + + # --- Rule 0: Skip unquantized (quant=None) --- + if quant_algo is None: + return "[CI accel] Skip unquantized (quant=None) in CI" + + is_large_model = model_config.num_experts >= 256 and model_config.hidden_size >= 7168 + + # --- Rule 1: Large model (e256_k8_h7168_i2048) restrictions --- + if is_large_model: + if routing_method_cls is not None: + from tensorrt_llm._torch.modules.fused_moe import DeepSeekV3MoeRoutingMethod + + if routing_method_cls != DeepSeekV3MoeRoutingMethod: + routing_name = routing_method_cls.__name__ + return ( + f"[CI accel] Large model (num_experts={model_config.num_experts}) " + f"only tests DeepSeekV3 routing in CI (got 
{routing_name})" + ) + + if dtype is not None and dtype != torch.bfloat16: + return f"[CI accel] Large model only tests bfloat16 in CI (got {dtype})" + + if seq_len is not None and seq_len != 1: + return f"[CI accel] Large model only tests seq=1 in CI (got seq={seq_len})" + + if swiglu_gptoss_style: + return "[CI accel] Large model only tests non-gptoss in CI" + + # --- Rule 2: Multi-GPU parallel mode restrictions --- + if parallel_mode is not None and parallel_mode not in ("DEP", "TTP"): + return f"[CI accel] Only DEP and TTP parallel modes in CI (got {parallel_mode})" + + # --- Rule 3: Routing method restrictions per backend+quant --- + # Full routing coverage on: (CUTLASS, or TRTLLM) with NVFP4 + # Other combos: only Renormalize + DeepSeekV3 + # Rule 1 already handles e256 (DeepSeekV3 only), so this only applies to non-e256. + if not is_large_model and routing_method_cls is not None: + routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "") + if routing_name not in _CI_ROUTING_METHODS: + allows_full_routing = ( + backend_type == MoeBackendType.CUTLASS or backend_type == MoeBackendType.TRTLLM + ) and quant_algo == QuantAlgo.NVFP4 + if not allows_full_routing: + return ( + f"[CI accel] {backend_type.value}+{quant_algo} only tests " + f"Renormalize/DeepSeekV3 routing in CI (got {routing_name})" + ) + + return None + + # ============================================================================ # Timing Fixture # ============================================================================ @@ -729,6 +1067,7 @@ def iter_base_test_configs( model_config, routing_method_cls, swiglu_gptoss_style=swiglu_gptoss_style, + seq_len=seq_len, ) routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "") swiglu_id = ( diff --git a/tests/unittest/_torch/modules/moe/quantize_utils.py b/tests/unittest/_torch/modules/moe/quantize_utils.py index 24652a1c068..99c00be7928 100644 --- a/tests/unittest/_torch/modules/moe/quantize_utils.py +++ 
b/tests/unittest/_torch/modules/moe/quantize_utils.py @@ -24,6 +24,7 @@ per_block_cast_to_fp8_e8m0, per_token_cast_to_fp8_e8m0, ) +from _torch.modules.moe.moe_test_utils import skip_if_insufficient_gpu_memory from utils.util import check_accuracy from tensorrt_llm._torch.model_config import ModelConfig @@ -217,6 +218,10 @@ def __init__( model_config = ModelConfig() self.quant_config = model_config.quant_config + skip_if_insufficient_gpu_memory( + num_experts, hidden_size, intermediate_size, dtype or torch.float32 + ) + # Custom swiglu activation for swiglu_gptoss_style def custom_swiglu(x): gate, value = x.chunk(2, dim=-1) @@ -253,11 +258,18 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t final_hidden_states = torch.zeros( hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device ) + # FP8_BLOCK_SCALES linear kernel requires bfloat16 activation input + ref_requires_bf16 = ( + self.quant_config is not None + and self.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + ) for expert_id in range(self.num_experts): if not torch.any(selected_experts == expert_id): continue batch_idx, nth_expert = torch.where(selected_experts == expert_id) expert_inputs = hidden_states[batch_idx] + if ref_requires_bf16: + expert_inputs = expert_inputs.to(torch.bfloat16) output = self.experts[expert_id](expert_inputs) final_hidden_states[batch_idx] += ( routing_weights[batch_idx, nth_expert, None] * output.float() @@ -1948,6 +1960,24 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t else: weight_scale_key = "weight_scale_inv" + # For W4A8_CUSTOM mode, the fused kernel uses a GLOBAL max input_scale + # across all experts (not per-expert), because the kernel applies a single + # pre-quant scale to all tokens before dispatching to experts. + # The reference must match this behavior to produce identical results. 
+ if self.weight_loading_mode == MoEWeightLoadingMode.W4A8_CUSTOM: + all_fc31_input_scales = [] + all_fc2_input_scales = [] + for eid in range(self.num_experts): + p1 = self.weights[f"{eid}.w1.input_scale"].cuda() + p3 = self.weights[f"{eid}.w3.input_scale"].cuda() + all_fc31_input_scales.append(torch.max(p1, p3)) + all_fc2_input_scales.append(self.weights[f"{eid}.w2.input_scale"].cuda()) + global_fc31_input_scale = torch.stack(all_fc31_input_scales).max() + global_fc2_input_scale = torch.stack(all_fc2_input_scales).max() + else: + global_fc31_input_scale = None + global_fc2_input_scale = None + for expert_id in range(self.num_experts): mask = selected_experts == expert_id activated_tokens = mask.sum(1).bool() @@ -1970,12 +2000,16 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t # Fuse scales - must cat in same order as weights s3_s1 = torch.cat([s3, s1], dim=-1) - # Get input scales - p1 = self.weights[f"{expert_id}.w1.input_scale"].cuda() - p2 = self.weights[f"{expert_id}.w2.input_scale"].cuda() - p3 = self.weights[f"{expert_id}.w3.input_scale"].cuda() - # IMPORTANT: Use max for fused computation to ensure consistent quantization - p3_p1 = torch.max(p1, p3) + # Get input scales - use global max for W4A8_CUSTOM, per-expert for VANILLA + if global_fc31_input_scale is not None: + p3_p1 = global_fc31_input_scale + p2 = global_fc2_input_scale + else: + p1 = self.weights[f"{expert_id}.w1.input_scale"].cuda() + p2 = self.weights[f"{expert_id}.w2.input_scale"].cuda() + p3 = self.weights[f"{expert_id}.w3.input_scale"].cuda() + # IMPORTANT: Use max for fused computation to ensure consistent quantization + p3_p1 = torch.max(p1, p3) # Get pre_quant_scale (only for VANILLA mode) a1 = a2 = a3 = a1_a3 = None @@ -2023,7 +2057,12 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t return results.reshape(hidden_states.shape) def check_accuracy(self, output, ref_output): - torch.testing.assert_close(output, 
ref_output, rtol=1e-2, atol=0.1) + # W4A8_AWQ accumulates FP8 QDQ noise from two layers (fc31 + fc2). + # With higher top_k, more experts contribute per token, increasing + # the accumulated numerical noise in the final summation. + top_k = self.routing_method.top_k if hasattr(self.routing_method, "top_k") else 1 + atol = 0.1 * max(1, top_k / 4) + check_accuracy(output, ref_output, rtol=1e-2, atol=atol, percent=0.97) class W4A8AWQQuantizeUtil(BaseQuantizeUtil): @@ -2039,8 +2078,9 @@ def __init__( intermediate_size: int, hidden_size: int, quant_config: QuantConfig, + **kwargs, ): - super().__init__(num_experts, dtype, intermediate_size, hidden_size, quant_config) + super().__init__(num_experts, dtype, intermediate_size, hidden_size, quant_config, **kwargs) # These will be set in create_weights and used in create_ref_module self.weight_loading_mode = MoEWeightLoadingMode.W4A8_CUSTOM self.scaling_group_size = 128 diff --git a/tests/unittest/_torch/modules/moe/test_moe_backend.py b/tests/unittest/_torch/modules/moe/test_moe_backend.py index 65721e4b924..09339d21475 100644 --- a/tests/unittest/_torch/modules/moe/test_moe_backend.py +++ b/tests/unittest/_torch/modules/moe/test_moe_backend.py @@ -28,18 +28,20 @@ import itertools import logging -import os from typing import List, Optional import pytest import torch from _torch.modules.moe.moe_test_utils import ( + IS_CI_MODE, MoeBackendType, MoeModelConfig, create_test_param, get_backend_class, iter_base_test_configs, replay_tactics_and_check, + should_skip_to_accelerate_ci, + skip_if_insufficient_gpu_memory, supports_autotuner_capture, ) from _torch.modules.moe.quantize_utils import get_test_quant_params @@ -49,7 +51,7 @@ from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.fused_moe import RenormalizeMoeRoutingMethod from tensorrt_llm._torch.modules.fused_moe.create_moe import create_moe_backend -from tensorrt_llm._torch.modules.fused_moe.interface import MoE +from 
tensorrt_llm._torch.modules.fused_moe.interface import MoE, MoEWeightLoadingMode from tensorrt_llm._utils import mpi_rank from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantAlgo @@ -103,6 +105,7 @@ def create_test_backend( swiglu_alpha: Optional[torch.Tensor] = None, swiglu_beta: Optional[torch.Tensor] = None, swiglu_limit: Optional[torch.Tensor] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.VANILLA, ) -> MoE: """Create a MoE backend for testing.""" backend_cls = get_backend_class(backend_type) @@ -134,6 +137,7 @@ def create_test_backend( swiglu_alpha=swiglu_alpha, swiglu_beta=swiglu_beta, swiglu_limit=swiglu_limit, + weight_loading_mode=weight_loading_mode, ) @@ -226,30 +230,28 @@ def run_backend_moe( # Default runs the CI subset (TRTLLM_TEST_MOE_CI=1). # Set TRTLLM_TEST_MOE_CI=0 for the full local config matrix. CI_MOE_MODEL_CONFIGS = [ + # Real models (small/medium — tactic replay is model-size-independent, + # e256 is covered by test_moe_module integration tests) MoeModelConfig(60, 4, 2048, 1408), # Qwen1.5-MoE-A2.7B - MoeModelConfig(256, 8, 7168, 2048), # DeepSeek-V3 MoeModelConfig(128, 4, 2880, 2880), # GPT-OSS-120B MoeModelConfig(8, 1, 512, 512), # boundary: top_k=1, single expert activated + # Boundary tests for tactic correctness + MoeModelConfig(4, 4, 512, 512), # top_k=num_experts, all experts activated + MoeModelConfig(7, 2, 256, 512), # prime num_experts + MoeModelConfig(13, 3, 256, 512), # prime num_experts, odd top_k ] LOCAL_MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS + [ + MoeModelConfig(256, 8, 7168, 2048), # DeepSeek-V3 MoeModelConfig(8, 2, 4096, 14336), # Mixtral-8x7B MoeModelConfig(64, 6, 2048, 1408), # DeepSeek-MoE-16B / DeepSeek-V2-Lite MoeModelConfig(8, 2, 6144, 32768), # Grok-1 - # === Boundary Tests: num_experts / top_k === - MoeModelConfig(4, 4, 512, 512), # top_k=num_experts, all experts activated - MoeModelConfig(7, 2, 256, 512), # prime num_experts - MoeModelConfig(13, 
3, 256, 512), # prime num_experts, odd top_k # === Boundary Tests: small sizes === MoeModelConfig(4, 2, 64, 128), # very small hidden_size MoeModelConfig(4, 2, 128, 64), # intermediate < hidden ] -MOE_MODEL_CONFIGS = ( - CI_MOE_MODEL_CONFIGS - if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" - else LOCAL_MOE_MODEL_CONFIGS -) +MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS if IS_CI_MODE else LOCAL_MOE_MODEL_CONFIGS # Sequence lengths to test SEQ_LENS_TO_TEST = [1, 8] @@ -270,9 +272,7 @@ def run_backend_moe( (1.702, 1.0, 7.0), # gptoss style (GPT-OSS real values) ] -SWIGLU_COMBOS = ( - CI_SWIGLU_COMBOS if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" else LOCAL_SWIGLU_COMBOS -) +SWIGLU_COMBOS = CI_SWIGLU_COMBOS if IS_CI_MODE else LOCAL_SWIGLU_COMBOS def generate_test_params() -> List: @@ -381,7 +381,6 @@ def generate_test_params() -> List: # - 128-alignment requirements for quantization # # ============================================================================= -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.parametrize( "dtype_activation,backend_type,quant_algo,seq_len,model_config," "routing_method_cls,swiglu_alpha,swiglu_beta,swiglu_limit", @@ -412,10 +411,17 @@ def test_moe_backend( # Default values: alpha=1, beta=0, limit=inf swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") - # Note: Skip logic is now handled at parametrize level via get_quick_skip_reason() - # which calls backend's can_implement() and should_skip_* functions. - # This avoids entering test function for invalid combinations, significantly - # reducing test collection time (from ~17 min to ~5 sec for 3400+ skipped tests). 
+ ci_skip = should_skip_to_accelerate_ci( + backend_type=backend_type, + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype_activation, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + ) + if ci_skip: + pytest.skip(ci_skip) # Extract model parameters num_experts = model_config.num_experts @@ -423,6 +429,8 @@ def test_moe_backend( hidden_size = model_config.hidden_size intermediate_size = model_config.intermediate_size + skip_if_insufficient_gpu_memory(num_experts, hidden_size, intermediate_size, dtype_activation) + # Create mapping mapping = Mapping() mapping.rank = mpi_rank() @@ -464,6 +472,11 @@ def test_moe_backend( # Get swiglu tensors if swiglu_gptoss_style is enabled swiglu_tensors = quantize_util.get_swiglu_tensors() + # Determine weight loading mode based on quantization algorithm + weight_loading_mode = MoEWeightLoadingMode.VANILLA + if hasattr(quantize_util, "weight_loading_mode"): + weight_loading_mode = quantize_util.weight_loading_mode + # Create backend first (needed for MXFP4_MXFP8 to get shapes) backend = create_test_backend( backend_type=backend_type, @@ -478,6 +491,7 @@ def test_moe_backend( swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None, swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None, swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None, + weight_loading_mode=weight_loading_mode, ) # W4A8_MXFP4_MXFP8 requires different weights for backend and reference diff --git a/tests/unittest/_torch/modules/moe/test_moe_module.py b/tests/unittest/_torch/modules/moe/test_moe_module.py index a86a84a0dcd..099a9641fac 100644 --- a/tests/unittest/_torch/modules/moe/test_moe_module.py +++ b/tests/unittest/_torch/modules/moe/test_moe_module.py @@ -26,6 +26,7 @@ """ import copy +import functools import logging import os import pickle @@ -40,6 +41,7 @@ import pytest import torch from _torch.modules.moe.moe_test_utils import ( + 
IS_CI_MODE, MoeBackendType, MoeModelConfig, create_test_param, @@ -50,7 +52,9 @@ should_skip_cutlass, should_skip_deepgemm, should_skip_multi_gpu, + should_skip_to_accelerate_ci, should_skip_trtllm, + skip_if_insufficient_gpu_memory, supports_autotuner_capture, ) from _torch.modules.moe.quantize_utils import get_test_quant_params @@ -59,6 +63,7 @@ from transformers.configuration_utils import PretrainedConfig import tensorrt_llm.bindings.internal.runtime as _tbr +from tensorrt_llm._mnnvl_utils import MnnvlMemory from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.fused_moe import ( @@ -70,6 +75,8 @@ RenormalizeNaiveMoeRoutingMethod, create_moe, ) +from tensorrt_llm._torch.modules.fused_moe.communication.deep_ep_low_latency import DeepEPLowLatency +from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode from tensorrt_llm._torch.modules.fused_moe.moe_load_balancer import ( MoeLoadBalancer, MoeLoadBalancerIterContext, @@ -518,6 +525,12 @@ def _test_moe_worker_impl( # Get swiglu tensors if swiglu_gptoss_style is enabled swiglu_tensors = quantize_util.get_swiglu_tensors() + # Get weight_loading_mode from quantize_util if available + # (e.g., W4A8AWQQuantizeUtil uses W4A8_CUSTOM mode) + weight_loading_mode = getattr( + quantize_util, "weight_loading_mode", MoEWeightLoadingMode.VANILLA + ) + with moe_load_balancer: # Create and setup fused MoE module fused_moe = create_moe( @@ -528,6 +541,7 @@ def _test_moe_worker_impl( swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None, swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None, swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None, + weight_loading_mode=weight_loading_mode, ) fused_moe.load_weights([weights]) fused_moe.post_load_weights() @@ -724,11 +738,7 @@ def init_worker(custom_paths, comm_method_type): MoeModelConfig(4, 2, 128, 64), # 
intermediate < hidden ] -MOE_MODEL_CONFIGS = ( - CI_MOE_MODEL_CONFIGS - if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" - else LOCAL_MOE_MODEL_CONFIGS -) +MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS if IS_CI_MODE else LOCAL_MOE_MODEL_CONFIGS # Sequence lengths to test SEQ_LENS = [1, 8] @@ -786,23 +796,47 @@ def init_worker(custom_paths, comm_method_type): ] # Default runs CI subset. Set TRTLLM_TEST_MOE_CI=0 for full local matrix. -SWIGLU_COMBOS = ( - CI_SWIGLU_COMBOS if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" else LOCAL_SWIGLU_COMBOS -) +SWIGLU_COMBOS = CI_SWIGLU_COMBOS if IS_CI_MODE else LOCAL_SWIGLU_COMBOS + + +@functools.lru_cache(maxsize=1) +def _is_mnnvl_supported() -> bool: + """Cached check for MNNVL platform support (pynvml query is expensive).""" + return MnnvlMemory.supports_mnnvl() def _get_comm_method_skip_reason( comm_method: str, model_config: "MoeModelConfig", + dtype: Optional[torch.dtype] = None, ) -> Optional[str]: """ Check if a communication method is compatible with the given model config. Returns a skip reason string if incompatible, None otherwise. """ - from tensorrt_llm._torch.modules.fused_moe.communication.deep_ep_low_latency import ( - DeepEPLowLatency, - ) + # NVLink-based methods require MNNVL support (all NVLink links active). + # See: _mnnvl_utils.py:supports_mnnvl() -> support_nvlink(need_all_up=True) + # Without MNNVL, Communication.__init__() raises RuntimeError (base.py:53-58). + if comm_method in ("NVLINK_ONE_SIDED", "NVLINK_TWO_SIDED"): + if not _is_mnnvl_supported(): + return ( + f"{comm_method} requires MNNVL support (all NVLink links active). " + f"Not supported on this platform." + ) + + # DeepEP normal mode: is_workload_feasible (deep_ep.py:127) rejects + # non-bfloat16, causing a runtime fallback to AllGather. 
The fallback + # replaces self.comm, and when the old DeepEP object is GC'd its + # Buffer destructor calls intranode::barrier (deep_ep.cpp:90) which + # requires all ranks simultaneously -- non-deterministic GC timing + # across MPI ranks causes the barrier to timeout and crash. + if comm_method == "DEEPEP" and dtype is not None and dtype != torch.bfloat16: + return ( + f"DeepEP is_workload_feasible rejects dtype={dtype} " + f"(requires bfloat16), and the runtime fallback triggers an " + f"unsafe Buffer destruction that crashes all ranks." + ) if comm_method == "DEEPEPLOWLATENCY": if model_config.hidden_size not in DeepEPLowLatency.SUPPORTED_HIDDEN_SIZES: @@ -866,19 +900,42 @@ def generate_multi_gpu_test_params( ): # Check multi-GPU specific skip conditions (short-circuit on first match) if not skip_reason: + # TP modes shard intermediate_size; EP modes don't + moe_tp_size = 4 if parallel_mode in ("DTP", "TTP") else 1 for reason in ( - _get_comm_method_skip_reason(comm_method, model_config), + _get_comm_method_skip_reason(comm_method, model_config, dtype=dtype), should_skip_trtllm( - backend_type, quant_algo, model_config, comm_method=comm_method + backend_type, + quant_algo, + model_config, + comm_method=comm_method, + moe_tp_size=moe_tp_size, ), should_skip_cutlass( - backend_type, comm_method, quant_algo=quant_algo, model_config=model_config + backend_type, + comm_method, + quant_algo=quant_algo, + model_config=model_config, + moe_tp_size=moe_tp_size, + dtype=dtype, + ), + should_skip_cutedsl( + backend_type, + quant_algo, + model_config, + comm_method, + moe_tp_size=moe_tp_size, ), - should_skip_cutedsl(backend_type, quant_algo, model_config, comm_method), should_skip_deepgemm( - backend_type, comm_method, quant_algo=quant_algo, model_config=model_config + backend_type, + comm_method, + quant_algo=quant_algo, + model_config=model_config, + moe_tp_size=moe_tp_size, + ), + should_skip_multi_gpu( + parallel_mode, model_config, world_size=4, 
comm_method=comm_method ), - should_skip_multi_gpu(parallel_mode, model_config, world_size=4), ): if reason: skip_reason = reason @@ -973,7 +1030,6 @@ def generate_base_test_params( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.parametrize( "dtype,moe_backend,quant_algo,seq_len,model_config,routing_method_cls," "swiglu_alpha,swiglu_beta,swiglu_limit", @@ -999,6 +1055,26 @@ def test_configurable_moe_single_gpu( 3. Autotune captures and replays all tactics properly 4. swiglu_gptoss_style (SwiGLU with custom parameters) works correctly """ + swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") + ci_skip = should_skip_to_accelerate_ci( + backend_type=MoeBackendType(moe_backend), + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + ) + if ci_skip: + pytest.skip(ci_skip) + + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72 dtype_routing_logits = None @@ -1032,7 +1108,7 @@ def test_configurable_moe_single_gpu( comm_methods=COMM_METHODS, swiglu_combos=SWIGLU_COMBOS, model_configs=MOE_MODEL_CONFIGS, - seq_lens=SEQ_LENS, + seq_lens=[8] if IS_CI_MODE else SEQ_LENS, dtypes=DTYPES, backend_types=BACKEND_TYPES, quant_algos=QUANT_ALGOS, @@ -1040,7 +1116,6 @@ def test_configurable_moe_single_gpu( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize( "parallel_mode,comm_method_type,dtype,moe_backend,quant_algo,seq_len,model_config," @@ -1060,6 +1135,27 @@ def test_configurable_moe_multi_gpu( swiglu_beta, 
swiglu_limit, ): + swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") + ci_skip = should_skip_to_accelerate_ci( + backend_type=MoeBackendType(moe_backend), + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + parallel_mode=parallel_mode, + ) + if ci_skip: + pytest.skip(ci_skip) + + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72 dtype_routing_logits = None @@ -1245,6 +1341,10 @@ def generate_eplb_test_params( backend_type, quant_algo, dtype, model_config, routing_method_cls ) + # Check comm method platform compatibility (e.g. NVLink support) + if not skip_reason: + skip_reason = _get_comm_method_skip_reason(comm_method, model_config) + # Check EPLB-specific skip conditions if not skip_reason: skip_reason = _should_skip_EPLB( @@ -1288,7 +1388,6 @@ def generate_eplb_test_params( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.skipif( not _tbr.is_host_accessible_device_memory_supported(), @@ -1308,6 +1407,13 @@ def test_configurable_moe_multi_gpu_eplb( num_slots, routing_method_cls, ): + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + world_size = 4 _test_moe_multi_gpu( comm_method_type, diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 6bdf570457c..0453c66c6de 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -45,6 +45,10 @@ from 
tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig +# NOTE: Most tests in this file are deprecated and skipped. They are now covered by the +# unified MoE test framework in tests/unittest/_torch/modules/moe/test_moe_backend.py +# and test_moe_module.py. Add new MoE tests there instead of here. + cloudpickle.register_pickle_by_value(sys.modules[__name__]) cloudpickle.register_pickle_by_value(_torch.helpers) MPI.pickle.__init__( @@ -76,6 +80,10 @@ def round_up(x, alignment): return (x + alignment - 1) // alignment * alignment +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.parametrize( "moe_backend, dtype, experts, routing_cls, bias", product(["CUTLASS", "VANILLA", "TRITON"], [torch.float16, torch.bfloat16], @@ -195,6 +203,10 @@ def test_fused_moe(moe_backend, m //= 2 +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"]) @@ -215,6 +227,10 @@ def test_fused_moe_multi_gpu(moe_cls, ep_size): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("alltoall_method_type", [ @@ -328,6 +344,10 @@ def per_rank_test_fused_moe_alltoall(job_id): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("alltoall_method_type", [ @@ -510,6 +530,10 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_hopper @pytest.mark.parametrize( "moe_backend", @@ -698,6 +722,10 @@ def set_tensor_value_4(x, num_row, num_cols): x.copy_(repeated) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -853,6 +881,10 @@ def per_rank_test_fused_moe_alltoall_fp8_blockwise(job_id): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls", @@ -1038,6 +1070,10 @@ def grouped_gemm(a: torch.Tensor, b: torch.Tensor, a_sf: torch.Tensor, torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", @@ -1172,6 +1208,10 @@ def test_fused_moe_fp8_blockwise_cute_dsl(dtype, return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @skip_no_hopper @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", @@ -1304,6 +1344,10 @@ def test_fused_moe_fp8_blockwise_cutlass(dtype, return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_no_hopper @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -1337,6 +1381,10 @@ def test_fused_moe_fp8_blockwise_cutlass_multi_gpu(ep_size, routing_method, assert r is True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -1370,6 +1418,10 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method, assert r is True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("moe_backend", [ @@ -1383,6 +1435,10 @@ def test_fused_moe_nvfp4(dtype, moe_backend, finalize_fusion): run_fused_moe_nvfp4(dtype, moe_backend, finalize_fusion) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize("hidden_size, intermediate_size", [(2880, 2880)]) @pytest.mark.parametrize("swiglu_alpha", [1, 0.1], ids=lambda v: f"alpha{v}") @@ -1645,6 +1701,10 @@ def run_fused_moe_nvfp4(dtype, atol=atol) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @skip_pre_blackwell @pytest.mark.parametrize( "moe_backend", @@ -1782,6 +1842,10 @@ def test_fused_moe_w4a8_nvfp4_fp8(moe_backend): atol=0.5) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_neither_ada_nor_hopper_unittest @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize( @@ -2057,6 +2121,10 @@ def process_layer( torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "moe_backend", @@ -2308,6 +2376,10 @@ def prepare_weights(num_experts: int, torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.15) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [768, 2880]) @pytest.mark.parametrize( @@ -2615,6 +2687,10 @@ def mxfp4_to_fp32(tensor, scales): check_accuracy(output, ref_output, rtol=0.6, atol=0.6, percent=0.945) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("weight_dtype", [torch.int8]) def test_fused_moe_int8_woq_per_channel(dtype, weight_dtype): diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index 9c0d00bbe67..a1912def29b 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -22,6 +22,11 @@ import torch.nn.functional as F sys.path.append(os.path.join(os.path.dirname(__file__), '..')) + +# NOTE: Some tests in this file are deprecated and skipped. They are now covered by the +# unified MoE test framework in tests/unittest/_torch/modules/moe/test_moe_backend.py +# and test_moe_module.py. Add new MoE tests there instead of here. + from enum import Enum from utils.util import getSMVersion @@ -872,6 +877,10 @@ def are_groups_valid(top_k_groups, n_groups): return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -1006,6 +1015,10 @@ def run_moe_fp8_test(self, num_tokens: int, expert_info: Tuple[int, int, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -1939,6 +1952,10 @@ def run_moe_fp8_fp4_test(self, num_tokens: int, hidden_size: int, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -2164,6 +2181,10 @@ def test_moe_fp8_per_tensor_scale(num_tokens, hidden_size, intermediate_size, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() != 100, reason="The kernel only supports Blackwell. Current SM is %d." % diff --git a/tests/unittest/llmapi/test_grpc.py b/tests/unittest/llmapi/test_grpc.py index 08d712f6ab1..a74ec33e578 100644 --- a/tests/unittest/llmapi/test_grpc.py +++ b/tests/unittest/llmapi/test_grpc.py @@ -376,23 +376,17 @@ def test_all_sampling_config_fields(self): return_generation_logits=True, exclude_input_from_output=True, ) - stop_words = [ - pb2.TokenSequence(token_ids=[50256]), - pb2.TokenSequence(token_ids=[50257, 50258]), - ] - bad_words = [ - pb2.TokenSequence(token_ids=[100, 101]), - ] embedding_bias = [0.0] * 10 + [1.5, -1.5] params = create_sampling_params_from_proto( proto_config=proto_config, output_config=output_config, max_tokens=256, - end_id=50256, - pad_id=50257, - stop_words=stop_words, - bad_words=bad_words, + stop=["<|endoftext|>", "<|end|>"], + stop_token_ids=[50256], + ignore_eos=True, + bad=["badword1"], + bad_token_ids=[100, 101], embedding_bias=embedding_bias, ) @@ -432,20 +426,21 @@ def test_all_sampling_config_fields(self): # Other params assert params.max_tokens == 256 - assert params.end_id == 50256 - assert params.pad_id == 50257 assert params.detokenize is False # key optimization + assert params.ignore_eos is True - # Stop/bad words (set as pre-tokenized word IDs) - assert params._stop_word_ids == [[50256], [50257, 50258]] - assert params._bad_word_ids == [[100, 101]] + # Stop/bad words (passed as strings/token IDs for TRT-LLM's _setup() to tokenize) + assert params.stop == ["<|endoftext|>", "<|end|>"] + assert 
params.stop_token_ids == [50256] + assert params.bad == ["badword1"] + assert params.bad_token_ids == [100, 101] # Embedding bias converted to torch.Tensor assert params.embedding_bias is not None assert len(params.embedding_bias) == 12 - def test_end_id_minus_one_sets_ignore_eos(self): - """Test that end_id=-1 correctly sets ignore_eos=True.""" + def test_ignore_eos_flag(self): + """Test that ignore_eos=True correctly sets ignore_eos on SamplingParams.""" proto_config = pb2.SamplingConfig(temperature=0.7) output_config = pb2.OutputConfig() @@ -453,10 +448,9 @@ def test_end_id_minus_one_sets_ignore_eos(self): proto_config=proto_config, output_config=output_config, max_tokens=100, - end_id=-1, + ignore_eos=True, ) - assert params.end_id == -1 assert params.ignore_eos is True def test_defaults_when_fields_unset(self):