diff --git a/cpp/kernels/fmha_v2/README.md b/cpp/kernels/fmha_v2/README.md index ce189f21875..c068452b368 100644 --- a/cpp/kernels/fmha_v2/README.md +++ b/cpp/kernels/fmha_v2/README.md @@ -20,7 +20,12 @@ the `setup.py` code: export TORCH_CUDA_ARCH_LIST=9.0 ENABLE_SM89_QMMA=1 ENABLE_HMMA_FP32=1 SCHEDULING_MODE=1 ENABLE_SM100=1 ENABLE_SM120=1 ``` -To generate subset of kernels, you can add conditions in setup.py. +To generate subset of kernels, you can add conditions in setup.py. Or set `FMHA_FILTER_ARCH` before calling setup.py: + +``` +# Build only for a specific arch (or list of architectures). Will not enable kernels that are disabled by default +export FMHA_FILTER_ARCH=90 +``` To generate the files and compile the kernels: ``` diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py index d4a5beaa010..b79bef940dc 100644 --- a/cpp/kernels/fmha_v2/fmha_test.py +++ b/cpp/kernels/fmha_v2/fmha_test.py @@ -1,3 +1,17 @@ +# SPDX-FileCopyrightText: Copyright (c) 2020-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. import subprocess import pytest @@ -268,3 +282,29 @@ def test_trtllm_chunked_attention(chunked_attention_size, input_layout): -chunked-attention-size {chunked_attention_size} -paged-kv", shell=True, check=True) + + +# The test cases for sliding window attention. 
+@pytest.mark.parametrize( + 'sliding_window_size', [64, 127, 128, 129, 256, 512], + ids=[ + "sliding-window-size-64", "sliding-window-size-127", + "sliding-window-size-128", "sliding-window-size-129", + "sliding-window-size-256", "sliding-window-size-512" + ]) +@pytest.mark.parametrize( + 'mask_type', + ["-sliding-or-chunked-causal-mask", "-bidirectional-sliding-window-mask"]) +def test_trtllm_sliding_window_attention(sliding_window_size, mask_type): + if mask_type == "-bidirectional-sliding-window-mask": + sliding_window_size *= 2 + + subprocess.run(f"bin/fmha.exe -d 128 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \ + -sliding-window-size {sliding_window_size} {mask_type}", + shell=True, + check=True) + + subprocess.run(f"bin/fmha.exe -d 64 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \ + -sliding-window-size {sliding_window_size} {mask_type}", + shell=True, + check=True) diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 163d6d0c588..88cba8f793f 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -99,7 +99,8 @@ class AttentionMaskType(IntEnum): PADDING = 0 CAUSAL = 1 SLIDING_OR_CHUNKED_CAUSAL = 2 - CUSTOM_MASK = 3 + BIDIRECTIONAL_SLIDING_WINDOW = 3 + CUSTOM_MASK = 4 class InputLayout(IntEnum): @@ -738,6 +739,20 @@ def get_makefile_code(specs_names): /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; +using Kernel_traits_nl_bidirectional_sliding_window = fmha::{kernel_traits}< + fmha::{instruction_traits}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {noloop_step}, + {warps_m}, + {warps_n}, + {ctas_per_head}, + {kernel_flags} | 0x200 /* no_loop flag */, + /*bidirectional sliding window mask*/ 5, + /*bmm2_fp16_epilogue*/ true, + {output_dtype_}>; + using Kernel_traits_nl_custom_mask = fmha::{kernel_traits}< fmha::{instruction_traits}, {kv_loop_step}, @@ -748,7 +763,7 @@ def get_makefile_code(specs_names): {warps_n}, {ctas_per_head}, {kernel_flags} | 0x200 /* no_loop flag */, - /*custom mask*/ 5, + /*custom mask*/ 6, 
/*bmm2_fp16_epilogue*/ true, {output_dtype_}>; @@ -782,6 +797,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl(params); +}} + +#endif // bidirectional_sliding_window_mask + #if {custom_mask} // custom_mask extern "C" @@ -820,6 +845,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::PADDING ) {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -890,6 +924,20 @@ def get_makefile_code(specs_names): /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; +using Kernel_traits_nl_tiled_bidirectional_sliding_window = fmha::{kernel_traits}< + fmha::{instruction_traits}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {noloop_step}, + {warps_m}, + {warps_n}, + {ctas_per_head}, + {kernel_flags} | 0x200 /* no_loop flag */, + /*bidirectional sliding window mask*/ 5, + /*bmm2_fp16_epilogue*/ true, + {output_dtype_}>; + using Kernel_traits_nl_tiled_custom_mask = fmha::{kernel_traits}< fmha::{instruction_traits}, {kv_loop_step}, @@ -900,7 +948,7 @@ def get_makefile_code(specs_names): {warps_n}, {ctas_per_head}, {kernel_flags} | 0x200 /* no_loop flag */, - /*custom 
mask*/ 5, + /*custom mask*/ 6, /*bmm2_fp16_epilogue*/ true, {output_dtype_}>; @@ -934,6 +982,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl_tiled({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl_tiled(params); +}} + +#endif // bidirectional_sliding_window_mask + #if {custom_mask} // custom_mask extern "C" @@ -973,6 +1031,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl_tiled<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl_tiled, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl_tiled<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::PADDING ) {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -1083,6 +1150,17 @@ def get_makefile_code(specs_names): 4, {kernel_flags}>; +using Kernel_traits_bidirectional_sliding_window = {kernel_traits}< + Traits_p, + Traits_o, + {seq_len}, + {head_size}, + {loop_step}, + {warps_m}, + {warps_n}, + 5, + {kernel_flags}>; + #if {use_tma} // use_tma #if {padding_mask} // padding_mask @@ -1115,6 +1193,16 @@ def get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}(const __grid_constant__ {params_type} params){{ + 
fused_multihead_attention::device_{kernel_variant}_tma(params); +}} + +#endif // bidirectional_sliding_window_mask + #else #if {padding_mask} @@ -1144,10 +1232,21 @@ def get_makefile_code(specs_names): void {sliding_or_chunked_causal_kernel_name}(const __grid_constant__ {params_type} params){{ fused_multihead_attention::device_{kernel_variant}(params); }} -#endif #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}(const __grid_constant__ {params_type} params){{ + fused_multihead_attention::device_{kernel_variant}(params); +}} + +#endif // bidirectional_sliding_window_mask + +#endif + void {launcher_name}({fused_multihead_attention_params_v2_str} ¶ms, const Launch_params &launch_params, cudaStream_t stream){{ // setting TMA descriptors if needed. @@ -1259,6 +1358,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else {{ #if {padding_mask} // padding_mask constexpr int smem_size = Kernel_traits::BYTES_PER_SMEM; @@ -1308,6 +1416,17 @@ def get_makefile_code(specs_names): 4, {kernel_flags}>; +using Kernel_traits_bidirectional_sliding_window_nl = {kernel_traits}< + Traits_p, + Traits_o, + {seq_len}, + {head_size}, + {noloop_step}, + {warps_m}, + {warps_n}, + 5, + {kernel_flags}>; + #if {padding_mask} // padding_mask extern "C" @@ -1338,6 +1457,16 @@ def 
get_makefile_code(specs_names): #endif // sliding_or_chunked_causal_mask +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +extern "C" +__global__ +void {bidirectional_sliding_window_kernel_name}_nl({params_type} params){{ + fused_multihead_attention::device_{kernel_variant}_nl(params); +}} + +#endif // bidirectional_sliding_window_mask + void {launcher_name}_nl({fused_multihead_attention_params_v2_str} ¶ms, const Launch_params& launch_params, cudaStream_t stream){{ constexpr int loop_iters = {seq_len} / {noloop_step}; @@ -1364,6 +1493,15 @@ def get_makefile_code(specs_names): }} {sliding_or_chunked_causal_kernel_name}_nl<<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + if( smem_size >= 48*1024 ) {{ + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}_nl, + cudaFuncAttributeMaxDynamicSharedMemorySize, + smem_size)); + }} + {bidirectional_sliding_window_kernel_name}_nl<<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else {{ #if {padding_mask} // padding_mask if( smem_size >= 48*1024 ) {{ @@ -1487,6 +1625,27 @@ def get_makefile_code(specs_names): {enable_skip_softmax_flag}, {output_dtype_}>; +using Ktraits_bidirectional_sliding_window = {kernel_traits_header} + {loop_step}, + {kv_loop_step}, + {head_size}, + {head_size_v}, + {q_tile_buffers}, + {kv_tile_buffers}, + NUM_COMPUTE_GROUPS, + DMA2COMPUTE_DEPTH, + 3, + {heads_interleaved_flag}, + {has_alibi}, + {enable_mutex_flag}, + {scheduling_mode}, + {input_layout_flag}, + USE_TMA_STORE && false, + {enable_attn_logit_softcapping_flag}, + {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, + {output_dtype_}>; + using Ktraits_custom_mask = {kernel_traits_header} {loop_step}, {kv_loop_step}, @@ -1496,7 +1655,7 @@ def 
get_makefile_code(specs_names): {kv_tile_buffers}, NUM_COMPUTE_GROUPS, DMA2COMPUTE_DEPTH, - 3, + 4, {heads_interleaved_flag}, {has_alibi}, {enable_mutex_flag}, {scheduling_mode}, {input_layout_flag}, @@ -1658,6 +1817,56 @@ def get_makefile_code(specs_names): //////////////////////////////////////////////////////////////////////////////////////////////////// +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + +using Shared_bidirectional_sliding_window = typename Ktraits_bidirectional_sliding_window::Shared; + +extern "C" +__global__ __launch_bounds__(Ktraits_bidirectional_sliding_window::THREADS, 1) +void {bidirectional_sliding_window_kernel_name}( + const __grid_constant__ {params_type} params){{ + + extern __shared__ char smem_[]; + char *smem_aligned = fmha::align_1024(smem_); + + Shared_bidirectional_sliding_window *shared = + reinterpret_cast(&smem_aligned[0]); + shared->init(threadIdx.x == 0); + __syncthreads(); + + // special trick to avoid warp_sync (leads to illegal instruction) + int warp_group = __shfl_sync(0xffffffff, threadIdx.x / 128, 0); + int tidx = threadIdx.x % 128; + + if( warp_group == NUM_COMPUTE_GROUPS ) {{ // dma + sched + + {setmaxnreg_dma_str} + uint32_t elect_one = tidx == 0; + + // Need all threads involved when the dma group needs to transpose the v tile explicitly. 
+ if constexpr ( Ktraits_bidirectional_sliding_window::DMA_GROUP_TRANSPOSE_V ) {{ + fmha::ws::DMA::Device dma_device(elect_one); + dma_device.{run_fct_name}(params, shared); + }} else {{ + fmha::ws::DMA::Device dma_device(elect_one); + if( tidx < 32 ) {{ + dma_device.{run_fct_name}(params, shared); + }} + }} + + }} else {{ // math + + {setmaxnreg_compute_str} + + fmha::ws::Compute compute; + compute.run(warp_group, tidx, shared, params); + }} +}} + +#endif // bidirectional_sliding_window_mask + +//////////////////////////////////////////////////////////////////////////////////////////////////// + #if {custom_mask} // custom_mask using Shared_custom_mask = typename Ktraits_custom_mask::Shared; @@ -1784,6 +1993,15 @@ def get_makefile_code(specs_names): {sliding_or_chunked_causal_kernel_name} <<>>({params_str}); #endif // sliding_or_chunked_causal_mask + }} else if( launch_params.attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW ) {{ +#if {bidirectional_sliding_window_mask} // bidirectional_sliding_window_mask + FMHA_CHECK_CUDA(cudaFuncSetAttribute({bidirectional_sliding_window_kernel_name}, + cudaFuncAttributeMaxDynamicSharedMemorySize, + SMEM_BYTES)); + + {bidirectional_sliding_window_kernel_name} + <<>>({params_str}); +#endif // bidirectional_sliding_window_mask }} else if( launch_params.attention_mask_type == Attention_mask_type::CUSTOM_MASK ) {{ #if {custom_mask} // custom_mask FMHA_CHECK_CUDA(cudaFuncSetAttribute({custom_mask_kernel_name}, @@ -1962,6 +2180,7 @@ def selected_mask_types(kspec): padding_mask = '1' causal_mask = '1' sliding_or_chunked_causal_mask = '1' + bidirectional_sliding_window_mask = '1' custom_mask = '1' # only generate certain needed combinations of input_layout and mask types for trt-llm. 
if "GENERATE_CUBIN" in os.environ: @@ -1969,15 +2188,18 @@ def selected_mask_types(kspec): # SageAttention only needs padding mask now causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' elif (kspec.head_size, kspec.head_size_v) == (192, 128): # MLA context phase only needs causal mask and padding mask (for chunked prefill) now sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' elif (kspec.head_size, kspec.head_size_v) == (576, 512): # MLA generation phase only needs padding mask (MtpMask) now causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' # encoder models (head_size = 32 / 64 / 128) need packed_qkv input layout + padding mask. elif kspec.input_layout == InputLayout.PACKED_QKV: @@ -1988,6 +2210,7 @@ def selected_mask_types(kspec): elif kspec.input_layout == InputLayout.CONTIGUOUS_Q_KV: causal_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' if kspec.head_size not in [32, 64, 72, 128]: padding_mask = '0' custom_mask = '0' @@ -2001,14 +2224,16 @@ def selected_mask_types(kspec): if (kspec.alibi and kspec.warp_specialization): padding_mask = '0' sliding_or_chunked_causal_mask = '0' + bidirectional_sliding_window_mask = '0' custom_mask = '0' # enable_attn_logit_softcapping kernels only need causal mask or sliding_or_chunked_causal_mask. 
if kspec.enable_attn_logit_softcapping: padding_mask = '0' custom_mask = '0' + bidirectional_sliding_window_mask = '0' - return padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask + return padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask def get_kernel_code(kspec, kname, lname): @@ -2025,6 +2250,8 @@ def get_kernel_code(kspec, kname, lname): custom_mask_kernel_name = kname.replace('__placeholder__', '_custom_mask') sliding_or_chunked_causal_kernel_name = kname.replace( '__placeholder__', '_sliding_or_chunked_causal') + bidirectional_sliding_window_kernel_name = kname.replace( + '__placeholder__', '_bidirectional_sliding_window') kernel_name = kname.replace('__placeholder__', '') # FIXME: use separate parameters when generating cubins for trtllm. @@ -2107,12 +2334,12 @@ def get_kernel_code(kspec, kname, lname): flags |= 8192 # only generate certain needed combinations of input_layout and mask types for trt-llm. - padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask = \ + padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask = \ selected_mask_types(kspec) if any(selected_mask_flag == '1' for selected_mask_flag in selected_mask_types(kspec)): - padding_mask, causal_mask, sliding_or_chunked_causal_mask, custom_mask = \ + padding_mask, causal_mask, sliding_or_chunked_causal_mask, bidirectional_sliding_window_mask, custom_mask = \ selected_mask_types(kspec) else: return None @@ -2894,6 +3121,11 @@ def get_kernel_traits_code(specs_names): snippet_flash_nl_tiled_sliding_or_chunked_causal = snippet_flash_nl_template.replace( '__placeholder__', '_sliding_or_chunked_causal').replace('_nl', '_nl_tiled') + snippet_flash_nl_bidirectional_sliding_window = snippet_flash_nl_template.replace( + '__placeholder__', '_bidirectional_sliding_window') + snippet_flash_nl_tiled_bidirectional_sliding_window = snippet_flash_nl_template.replace( + 
'__placeholder__', + '_bidirectional_sliding_window').replace('_nl', '_nl_tiled') snippet_flash_nl_custom_mask = snippet_flash_nl_template.replace( '__placeholder__', '_custom_mask') snippet_flash_nl_tiled_custom_mask = snippet_flash_nl_template.replace( @@ -2941,9 +3173,13 @@ def get_kernel_traits_code(specs_names): snippet_ws_template.replace('__placeholder__', '_sliding_or_chunked_causal').\ replace('mask_type', '2').\ replace('__use_tma_store__', 'false') + snippet_ws_bidirectional_sliding_window = \ + snippet_ws_template.replace('__placeholder__', '_bidirectional_sliding_window').\ + replace('mask_type', '3').\ + replace('__use_tma_store__', 'false') snippet_ws_custom_mask = \ snippet_ws_template.replace('__placeholder__', '_custom_mask').\ - replace('mask_type', '2').\ + replace('mask_type', '4').\ replace('__use_tma_store__', 'true') elif effective_sm >= 90: #GMMA no flash yet snippet_template = ''' {{ @@ -3007,7 +3243,8 @@ def get_kernel_traits_code(specs_names): padding_mask = int(selected_types[0]) causal_mask = int(selected_types[1]) sliding_or_chunked_causal_mask = int(selected_types[2]) - custom_mask = int(selected_types[3]) + bidirectional_sliding_window_mask = int(selected_types[3]) + custom_mask = int(selected_types[4]) if not padding_mask: snippet = None @@ -3027,6 +3264,10 @@ def get_kernel_traits_code(specs_names): snippet_ws_sliding_or_chunked_causal = None snippet_flash_nl_sliding_or_chunked_causal = None snippet_flash_nl_tiled_sliding_or_chunked_causal = None + if not bidirectional_sliding_window_mask: + snippet_ws_bidirectional_sliding_window = None + snippet_flash_nl_bidirectional_sliding_window = None + snippet_flash_nl_tiled_bidirectional_sliding_window = None if not custom_mask: snippet_ws_custom_mask = None snippet_flash_nl_custom_mask = None @@ -3047,12 +3288,16 @@ def get_kernel_traits_code(specs_names): print_kernel_specs.append(snippet_flash_nl_tiled_causal) print_kernel_specs.append( snippet_flash_nl_tiled_sliding_or_chunked_causal) 
+ print_kernel_specs.append( + snippet_flash_nl_tiled_bidirectional_sliding_window) print_kernel_specs.append(snippet_flash_nl_tiled_custom_mask) elif kspec.flash_attention and kspec.tiled == 0: print_kernel_specs.append(snippet_flash_nl) print_kernel_specs.append(snippet_flash_nl_causal) print_kernel_specs.append( snippet_flash_nl_sliding_or_chunked_causal) + print_kernel_specs.append( + snippet_flash_nl_bidirectional_sliding_window) print_kernel_specs.append(snippet_flash_nl_custom_mask) else: print_kernel_specs.append(snippet_nl) @@ -3066,6 +3311,7 @@ def get_kernel_traits_code(specs_names): print_kernel_specs.append(snippet_ws) print_kernel_specs.append(snippet_ws_causal) print_kernel_specs.append(snippet_ws_sliding_or_chunked_causal) + print_kernel_specs.append(snippet_ws_bidirectional_sliding_window) print_kernel_specs.append(snippet_ws_custom_mask) # remove none. print_kernel_specs = [ @@ -3133,13 +3379,16 @@ def get_cubin_header(kernel_traits, specs_names): '').replace('ldgsts_', '').replace('causal_', '').replace( 'alibi_', '').replace('softmax_', '').replace( 'sliding_or_chunked_', '').replace( - 'custom_mask_', '').replace('qkv_', '').replace( - 'q_kv_', '').replace('q_paged_kv_', '').replace( - 'q_k_v_', '').replace('ws_', '').replace( - 'softcapping_', - '').replace('sage_', '').replace( - 'skipSoftmax_', - '').replace('output_', '')) + 'bidirectional_sliding_window_', '').replace( + 'custom_mask_', '').replace('qkv_', '').replace( + 'q_kv_', + '').replace('q_paged_kv_', '').replace( + 'q_k_v_', + '').replace('ws_', '').replace( + 'softcapping_', + '').replace('sage_', '').replace( + 'skipSoftmax_', + '').replace('output_', '')) flash_attention = 'flash_attention' in kname warp_specialization = 'tma_ws' in kname toks = tname.split('_') @@ -3207,11 +3456,13 @@ def get_cubin_header(kernel_traits, specs_names): is_tiled = pythonBoolean2cpp['_tiled' in kname] # Attention mask type: - # padding (0), causal_mask (1), sliding_or_chunked_causal_mask (2), 
custom_mask (3). + # padding (0), causal_mask (1), sliding_or_chunked_causal_mask (2), bidirectional_sliding_window_mask (3), custom_mask (4). if '_custom_mask' in kname: attention_mask_type = AttentionMaskType.CUSTOM_MASK elif '_sliding_or_chunked_causal' in kname: attention_mask_type = AttentionMaskType.SLIDING_OR_CHUNKED_CAUSAL + elif '_bidirectional_sliding_window' in kname: + attention_mask_type = AttentionMaskType.BIDIRECTIONAL_SLIDING_WINDOW elif '_causal' in kname: attention_mask_type = AttentionMaskType.CAUSAL @@ -3269,7 +3520,9 @@ def get_lname_from_kname(kname: str) -> str: return 'nullptr' lname = kname.replace('_kernel', '') mask_types = [ - '_sliding_or_chunked_causal', '_custom_mask', '_causal' + '_sliding_or_chunked_causal', + '_bidirectional_sliding_window', '_custom_mask', + '_causal' ] for mask_type in mask_types: lname = lname.replace(mask_type, '') @@ -6687,6 +6940,11 @@ def enumerate_kernels(): and (kspec.head_size == 128 or kspec.head_size == 256 or not kspec.enable_attn_logit_softcapping)] # yapf: enable + # A separate more aggressive filter for building the fmha.exe binary. Can be ignored for building the cubins. + if "FMHA_FILTER_ARCH" in os.environ: + archs = [int(x) for x in os.environ["FMHA_FILTER_ARCH"].split(",")] + specs_names = [kspec for kspec in specs_names if kspec[0].sm in archs] + generate_files(specs_names) diff --git a/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h index ece561a6d21..90459572873 100644 --- a/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/hopper/kernel_traits.h @@ -49,7 +49,8 @@ template < int WARPS_N, // The version of the kernel. 
int VERSION_, - // The mask version of the kernel, (2 denotes dense mask, 3 denotes causal mask) + // The mask version of the kernel, (2 denotes dense mask, 3 denotes causal mask, 4 denotes sliding window causal + // mask, 5 denotes bidirectional sliding window mask) int MASK_VERSION_ = 2, // The flags to control the behaviour of LDGs. uint32_t FLAGS = 0x8u> @@ -111,7 +112,7 @@ struct FMHA_kernel_traits_hopper // Whether use causal mask or not. enum { - CAUSAL_MASK = MASK_VERSION_ >= 3 + CAUSAL_MASK = MASK_VERSION_ == 3 || MASK_VERSION_ == 4 }; // Whether use the sliding window attention mask or not. @@ -120,6 +121,12 @@ struct FMHA_kernel_traits_hopper SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention mask or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // Do we use LDGSTS for Q, K or V. If not, TMA is used! enum { diff --git a/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h index 3391cf3d28e..e4a54252bf5 100644 --- a/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/kernel_traits.h @@ -271,7 +271,8 @@ struct Kernel_traits_ VERSION = VERSION_ }; - // The mask version: padding (2), causal (3), sliding_window_causal (4), custom_mask (5). + // The mask version: padding (2), causal (3), sliding_window_causal (4), bidirectional_sliding_window (5), + // custom_mask (6). enum { MASK_VERSION = MASK_VERSION_ @@ -289,10 +290,16 @@ struct Kernel_traits_ SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // Whether use the custom mask or not. enum { - CUSTOM_MASK = MASK_VERSION_ == 5 + CUSTOM_MASK = MASK_VERSION_ == 6 }; // Do we use LDGSTS for Q, K or V. @@ -551,7 +558,7 @@ struct Kernel_traits_fmhca_ // Whether use causal mask or not. 
enum { - CAUSAL_MASK = MASK_VERSION >= 3 + CAUSAL_MASK = MASK_VERSION == 3 || MASK_VERSION == 4 }; // Whether use the sliding window attention or not. @@ -560,6 +567,12 @@ struct Kernel_traits_fmhca_ SLIDING_WINDOW_ATTENTION = MASK_VERSION == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION == 5 + }; + // Do we use LDGSTS for Q, K or V. enum { @@ -745,7 +758,7 @@ struct Kernel_traits_interleaved_v2_ // Whether use causal mask or not. enum { - CAUSAL_MASK = MASK_VERSION_ >= 3 + CAUSAL_MASK = MASK_VERSION_ == 3 || MASK_VERSION_ == 4 }; // Whether use the sliding window attention or not. @@ -754,6 +767,12 @@ struct Kernel_traits_interleaved_v2_ SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 4 }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = MASK_VERSION_ == 5 + }; + // The number of CTAs per head for Cta_tile_p; equivalent to BMM1 split-K enum { diff --git a/cpp/kernels/fmha_v2/src/fmha/mask.h b/cpp/kernels/fmha_v2/src/fmha/mask.h index fc490286a8b..04cf4afcca0 100644 --- a/cpp/kernels/fmha_v2/src/fmha/mask.h +++ b/cpp/kernels/fmha_v2/src/fmha/mask.h @@ -492,9 +492,62 @@ struct Mask : public Mask //////////////////////////////////////////////////////////////////////////////////////////////////// +// Assume we only pay attention to bidirectional sliding-window-size long sequence. +// v v v x x x x x x +// v v v v x x x x x +// v v v v v x x x x +// x v v v v v x x x +// x v v v v v x x x +// x x v v v v v x x +// x x x v v v v v x +// x x x x v v v v v +// x x x x x v v v v +// x x x x x x v v v + +template +struct Mask : public Mask +{ + // V5 mask is the bidirectional sliding window mask. + using Base = Mask; + + // The shape of the MMA tile. + using Mma_tile = typename Base::Mma_tile; + + // Ctor. 
+ template + inline __device__ Mask(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) + , seqlen_(block_info.actual_seqlen) + { + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const + { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const + { + // Is it a valid position in the sequence, i.e. are we in the lower triangle? + return (col >= max(0, row - Base::sliding_window_size_ / 2)) + && (col <= min(seqlen_ - 1, row + Base::sliding_window_size_ / 2)); + } + + // The sequence length. + int seqlen_; +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + // The custom mask (from global memory). template -struct Mask : public Mask +struct Mask : public Mask { using Base = Mask; @@ -958,6 +1011,46 @@ struct Mask_hopper : public Mask_hopper +struct Mask_hopper : public Mask_hopper +{ + + // V5 mask is the bidirectional sliding window mask. + using Base = Mask_hopper; + + // The shape of the MMA tile. + using Mma_tile = typename Traits::template Mma_tile; + + // Ctor. + template + inline __device__ Mask_hopper(Params const& params, Block_info const& block_info, int tidx) + : Base(params, block_info, tidx) + , seqlen_(block_info.actual_seqlen) + { + } + + // Is a given position valid? + inline __device__ bool is_valid(int mi, int ni, int ii, int jj) const + { + int row, col; + this->get_row_col(row, col, mi, ni, ii, jj); + + // Is it a valid position in the sequence? + return is_valid(row, col); + } + + // Is a given position valid? + inline __device__ bool is_valid(int row, int col) const + { + // Is it a valid position in the sequence? 
+ return col >= max(0, row - Base::sliding_window_size_ / 2) + && col <= min(seqlen_ - 1, row + Base::sliding_window_size_ / 2); + } + + // The sequence length. + int seqlen_; +}; + //////////////////////////////////////////////////////////////////////////////////////////////////// } // namespace fmha diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index bdc9b6d9dee..bf18d4921f6 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -116,6 +116,12 @@ struct Compute SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Are we applying alibi bias (drop FMA optimizations for accuracy reasons). enum { @@ -288,17 +294,30 @@ struct Compute // Is the chunked_attention used ? bool is_chunked_attention = params.log2_chunked_attention_size > 0; - // The left mask is needed when we attend to a specific sliding window or chunk. + // Handle sliding window or chunked attention masking if constexpr (SLIDING_OR_CHUNKED_ATTENTION) { - // The kv_left_mask_end is the start of the chunk. - kv_left_mask_end = div_up(is_chunked_attention - ? 
((tile_offset_end >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : (tile_offset_end + 1 - params.sliding_window_size), - STEP_KV); + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + // Handle bidirectional sliding window attention + kv_left_mask_end = div_up(tile_offset_end - params.sliding_window_size / 2, STEP_KV); + kv_right_mask_start + = min(kv_idx_end - 1, (tile_offset_start + params.sliding_window_size / 2 + 1) / STEP_KV); + } + else if (is_chunked_attention) + { + // Handle chunked attention + kv_left_mask_end = div_up( + ((tile_offset_end >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size), + STEP_KV); + } + else + { + kv_left_mask_end = div_up(tile_offset_end + 1 - params.sliding_window_size, STEP_KV); + } } - // The right mask is needed when causal mask (including sliding_window_attention or chunked attention) is used. + // The right mask is needed when causal mask is used. if constexpr (SKIP_CAUSAL_MASK_TILES) { kv_right_mask_start = tile_offset_start / STEP_KV; diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h index c1ebf5a7bba..d77c5414455 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/dma.h @@ -114,6 +114,12 @@ struct DMA SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Is heads interleaved ? enum { @@ -201,11 +207,27 @@ struct DMA // Skip initial kv tiles due to sliding_window_size if (SLIDING_OR_CHUNKED_ATTENTION) { - // The kv_offset_start. - int kv_offset_start = is_chunked_attention - ? 
((q_step_offset >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size) - : max(0, q_step_offset + 1 - params.sliding_window_size); - kv_idx_start = kv_offset_start / STEP_KV; + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + int kv_offset_start = max(0, q_step_offset - params.sliding_window_size / 2); + int kv_offset_end = min(kv_steps * STEP_KV - 1, q_step_end + params.sliding_window_size / 2); + + // We do floor division plus 1 to get the correct kv_idx_end, this is because kv_idx_end is + // exclusive + kv_idx_start = kv_offset_start / STEP_KV; + kv_idx_end = kv_offset_end / STEP_KV + 1; + } + else if (is_chunked_attention) + { + int kv_offset_start + = ((q_step_offset >> params.log2_chunked_attention_size) << params.log2_chunked_attention_size); + kv_idx_start = kv_offset_start / STEP_KV; + } + else + { + int kv_offset_start = max(0, q_step_offset + 1 - params.sliding_window_size); + kv_idx_start = kv_offset_start / STEP_KV; + } } // Early stop when causal mask is enabled. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index 465c9430cb0..9393b4fd4e3 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -76,6 +76,12 @@ struct Softmax_base SLIDING_OR_CHUNKED_ATTENTION = Kernel_traits::SLIDING_OR_CHUNKED_ATTENTION }; + // Whether use the bidirectional sliding window attention or not. + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + }; + // Are we applying alibi bias (drop FMA optimizations for accuracy reasons). enum { @@ -134,7 +140,7 @@ struct Softmax_base // The corresponding row/col for each thread after MMA. // fixed 4x1 warp layout. quad_col_ = lane % 4; - if (CAUSAL_MASK) + if (CAUSAL_MASK || SLIDING_OR_CHUNKED_ATTENTION) { quad_row_ = warp * 16 + lane / 4; } @@ -149,9 +155,14 @@ struct Softmax_base // The attention chunk start. 
return (row >> log2_chunked_attention_size_) << log2_chunked_attention_size_; } + else if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + // The bidirectional sliding window start is the max of 0 and row - sliding_window_size/2. + return max(0, row - sliding_window_size_ / 2); + } else { - // The sliding window start is the max of 0 and row - sliding_window_size. + // The sliding window start is the max of 0 and row + 1 - sliding_window_size. return max(0, row + 1 - sliding_window_size_); } } @@ -286,14 +297,18 @@ struct Softmax_base valid_positions(mi, ni, v0, v1); // Causal mask. } - else if constexpr (CAUSAL_MASK) + else if constexpr (CAUSAL_MASK || SLIDING_OR_CHUNKED_ATTENTION) { // Causal Mask: we have to apply mask before getting max. int row = row_offset + quad_row_ + mi * 8; col = col_offset + quad_col_ * 2 + ni * 8; - // Mask for the two N elements. - v0 = (col <= row); - v1 = (col + 1 <= row); + + if constexpr (CAUSAL_MASK) + { + // Mask for the two N elements. + v0 &= (col <= row); + v1 &= (col + 1 <= row); + } // Attend to the specific sliding window or chunk. if constexpr (SLIDING_OR_CHUNKED_ATTENTION) @@ -301,6 +316,15 @@ struct Softmax_base int sliding_window_or_chunk_start = compute_sliding_window_or_chunk_start(row); v0 &= (col >= sliding_window_or_chunk_start); v1 &= (col + 1 >= sliding_window_or_chunk_start); + + if constexpr (BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + assert(log2_chunked_attention_size_ == 0 + && "Bidirectional sliding window attention should not use chunked attention"); + int sliding_window_end = min(actual_seqlen - 1, row + sliding_window_size_ / 2); + v0 &= (col <= sliding_window_end); + v1 &= (col + 1 <= sliding_window_end); + } } // Dense(padding) mask. 
} diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h index f8d7004939c..966ff015627 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h @@ -51,8 +51,9 @@ template < int NUM_COMPUTE_GROUPS_, // The number of data warpgroups (TMA). int DMA2COMPUTE_DEPTH_, - // The attention mask type: padding (0), causal (1), sliding_window_causal (2), custom_mask (3). - // See fused_multihead_attention_kernel.h for description. + // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), + // bidirectional_sliding_window_attention (3), custom_mask (4). See fused_multihead_attention_kernel.h for + // description. int ATTENTION_MASK_TYPE_ = 0, // Is head interleaved ? // (head_interleaved means input [bxs, h, 3, d], otherwise [bx3, 3, h, d]). @@ -250,7 +251,8 @@ struct Kernel_traits WARP_GROUP_K = 1 }; - // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), custom_mask (3). + // The attention mask type: padding (0), causal (1), sliding_or_chunked_attention (2), + // bidirectional_sliding_window_attention (3), custom_mask (4). enum { CAUSAL_MASK = (ATTENTION_MASK_TYPE_ == 1 || ATTENTION_MASK_TYPE_ == 2) @@ -258,7 +260,12 @@ struct Kernel_traits enum { - SLIDING_OR_CHUNKED_ATTENTION = ATTENTION_MASK_TYPE_ == 2 + SLIDING_OR_CHUNKED_ATTENTION = ATTENTION_MASK_TYPE_ == 2 || ATTENTION_MASK_TYPE_ == 3 + }; + + enum + { + BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION = ATTENTION_MASK_TYPE_ == 3 }; // Is head interleaved ? @@ -286,10 +293,10 @@ struct Kernel_traits ENABLE_BMM1_SOFTCAPPING_SCALE = ENABLE_BMM1_SOFTCAPPING_SCALE_ }; - // Use the custom mask input ( attention_mask_type == 3.) + // Use the custom mask input ( attention_mask_type == 4.) 
enum { - USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 3 + USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 4 }; // Are we enabling skip softmax attention feature? diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index 5a32f0a5116..f4ec62cd032 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -781,6 +781,10 @@ int main(int argc, char** argv) { attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; } + else if (!strcmp(argv[ii], "-bidirectional-sliding-window-mask")) + { + attention_mask_type = Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW; + } else if (!strcmp(argv[ii], "-custom-mask")) { attention_mask_type = Attention_mask_type::CUSTOM_MASK; @@ -943,13 +947,20 @@ int main(int argc, char** argv) { assert( chunked_attention_size == 0 && "chunked_attention_size should not be used when sliding_window_size is set"); - attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; + // Default to causal sliding window if the user did not explicitly set the mask type to bidirectional sliding + // window + if (attention_mask_type != Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW) + { + attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; + } } // Chunked attention. 
if (chunked_attention_size > 0) { assert((chunked_attention_size & (chunked_attention_size - 1)) == 0 && "chunked_attention_size has to be a power of 2"); + assert(attention_mask_type != Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW + && "Bidirectional sliding window attention should not use chunked attention"); attention_mask_type = Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL; } @@ -1632,6 +1643,11 @@ int main(int argc, char** argv) valid = valid && (si >= std::max(int(so + 1 - sliding_window_size), 0)); } } + else if (attention_mask_type == Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW) + { + valid = valid && si >= std::max(int(so - sliding_window_size / 2), 0); + valid = valid && si <= std::min(int(so + sliding_window_size / 2), int(actual_seqlen - 1)); + } if (is_mtp) { // Only the last s_q tokens are used for verifying the results. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h index f71bd948676..32e2ed465ba 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h @@ -49,6 +49,8 @@ enum class Attention_mask_type CAUSAL, // Causal mask + attend to the specific sliding window or chunk. SLIDING_OR_CHUNKED_CAUSAL, + // Bidirectional sliding window attention. + BIDIRECTIONAL_SLIDING_WINDOW, // The custom mask input. 
CUSTOM_MASK, }; @@ -62,6 +64,7 @@ static inline std::string mask_type_to_string(Attention_mask_type mask_type) case Attention_mask_type::PADDING: return "padding"; case Attention_mask_type::CAUSAL: return "causal"; case Attention_mask_type::SLIDING_OR_CHUNKED_CAUSAL: return "sliding_or_chunked_causal"; + case Attention_mask_type::BIDIRECTIONAL_SLIDING_WINDOW: return "bidirectional_sliding_window"; case Attention_mask_type::CUSTOM_MASK: return "custom_mask"; default: assert(false); return ""; } diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h index 467d614c1d0..2c38c1703e4 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop.h @@ -172,19 +172,42 @@ inline __device__ void device_flash_attention_nl(Params const& params) static_assert(MASK_LOOPS * Cta_tile_p::N == Cta_tile_p::M || Cta_tile_p::N >= Cta_tile_p::M, ""); // The start/end step of kv loops. - // Do we need to mask out the tokens that is far away from the beginning. + // Do we need to mask out the tokens that is not in the sliding window. bool const mask_sliding_window - = Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size; + = (Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size) + || (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + && binfo.actual_kv_seqlen > params.sliding_window_size / 2 + 1); // +1 to include self token + int const valid_seqlen = Kernel_traits::CAUSAL_MASK ? min(q_sequence_start + Cta_tile_p::M, binfo.actual_kv_seqlen) : binfo.actual_kv_seqlen; - int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; - int const kv_loop_start = mask_sliding_window - ? 
(max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; - int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; + int kv_loop_start = 0; + int kv_loop_end = fmha::div_up(valid_seqlen, int(Cta_tile_p::N)) * int(Cta_tile_p::N); + int sliding_window_mask_left = 0; + int sliding_window_mask_right = kv_loop_end; + if (mask_sliding_window) + { + if constexpr (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + kv_loop_start = (max(0, q_sequence_start - params.sliding_window_size / 2) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size / 2) / Cta_tile_p::N) + * Cta_tile_p::N; + + kv_loop_end = min(kv_loop_end, + (fmha::div_up(q_sequence_start + Cta_tile_p::M + params.sliding_window_size / 2, int(Cta_tile_p::N)) + * Cta_tile_p::N)); + sliding_window_mask_right = min(sliding_window_mask_right, + ((q_sequence_start + params.sliding_window_size / 2) / int(Cta_tile_p::N)) * Cta_tile_p::N); + } + else + { + kv_loop_start = (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) + * Cta_tile_p::N; + } + } static_assert(Cta_tile_p::M >= Cta_tile_p::N, ""); @@ -337,7 +360,8 @@ inline __device__ void device_flash_attention_nl(Params const& params) // Do we need to check if there are negative inf for softmax row_max ? enum { - CHECK_NEG_INF = Kernel_traits::SLIDING_WINDOW_ATTENTION || Kernel_traits::CUSTOM_MASK + CHECK_NEG_INF = Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION || Kernel_traits::SLIDING_WINDOW_ATTENTION + || Kernel_traits::CUSTOM_MASK }; // Load the mask for that iteration. 
@@ -363,7 +387,8 @@ inline __device__ void device_flash_attention_nl(Params const& params) bool const first_step = (kv_loop == kv_loop_start); // It is possible that all tokens are masked out (sliding-window-attention). - bool const apply_sliding_window_mask = (mask_sliding_window && kv_loop <= sliding_window_mask_end); + bool const apply_sliding_window_mask + = (mask_sliding_window && (kv_loop <= sliding_window_mask_left || kv_loop >= sliding_window_mask_right)); bool const apply_mask = params.has_alibi || (kv_loop >= kv_mask_loop_start) || apply_sliding_window_mask; // Move mask offset. diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h index 55ba07ed8c3..2f3f05a4f48 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_flash_attention_kernel_noloop_tiled.h @@ -175,17 +175,40 @@ inline __device__ void device_flash_attention_nl_tiled(Params const& params) // The start/end step of kv loops. // Do we need to mask out the tokens that is not in the sliding window. bool const mask_sliding_window - = Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size; + = (Kernel_traits::SLIDING_WINDOW_ATTENTION && binfo.actual_kv_seqlen > params.sliding_window_size) + || (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION + && binfo.actual_kv_seqlen > params.sliding_window_size / 2 + 1); // +1 to include self token + int const valid_seqlen = Kernel_traits::CAUSAL_MASK ? min(q_sequence_start + Cta_tile_p::M, binfo.actual_kv_seqlen) : binfo.actual_kv_seqlen; - int const kv_loop_end = ((valid_seqlen + Cta_tile_p::N - 1) / Cta_tile_p::N) * Cta_tile_p::N; - int const kv_loop_start = mask_sliding_window - ? 
(max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; - int const sliding_window_mask_end = mask_sliding_window - ? (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N - : 0; + int kv_loop_start = 0; + int kv_loop_end = fmha::div_up(valid_seqlen, int(Cta_tile_p::N)) * int(Cta_tile_p::N); + int sliding_window_mask_left = 0; + int sliding_window_mask_right = kv_loop_end; + if (mask_sliding_window) + { + if constexpr (Kernel_traits::BIDIRECTIONAL_SLIDING_WINDOW_ATTENTION) + { + kv_loop_start = (max(0, q_sequence_start - params.sliding_window_size / 2) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size / 2) / Cta_tile_p::N) + * Cta_tile_p::N; + + kv_loop_end = min(kv_loop_end, + (fmha::div_up(q_sequence_start + Cta_tile_p::M + params.sliding_window_size / 2, int(Cta_tile_p::N)) + * Cta_tile_p::N)); + sliding_window_mask_right = min(sliding_window_mask_right, + ((q_sequence_start + params.sliding_window_size / 2) / int(Cta_tile_p::N)) * Cta_tile_p::N); + } + else + { + kv_loop_start = (max(0, q_sequence_start + 1 - params.sliding_window_size) / Cta_tile_p::N) * Cta_tile_p::N; + sliding_window_mask_left + = (max(0, q_sequence_start + Cta_tile_p::M - params.sliding_window_size) / Cta_tile_p::N) + * Cta_tile_p::N; + } + } // Move K and V tiles. // We need offset here since we split single k loops into finer granularity. @@ -301,7 +324,8 @@ inline __device__ void device_flash_attention_nl_tiled(Params const& params) bool const first_step = (kv_loop == kv_loop_start); // It is possible that all tokens are masked out (sliding-window-attention). 
- bool const apply_sliding_window_mask = (mask_sliding_window && kv_loop <= sliding_window_mask_end); + bool const apply_sliding_window_mask + = (mask_sliding_window && (kv_loop <= sliding_window_mask_left || kv_loop >= sliding_window_mask_right)); bool const apply_mask = params.has_alibi || (kv_loop >= kv_mask_loop_start) || apply_sliding_window_mask; // Declare the accumulators for the 1st gemm. diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index 9679be86fcc..f129a973ac4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -69,6 +69,8 @@ enum class ContextAttentionMaskType CAUSAL, // Causal mask + attend to the specific sliding window or chunk. SLIDING_OR_CHUNKED_CAUSAL, + // Bidirectional sliding window attention. + BIDIRECTIONAL_SLIDING_WINDOW, // The custom mask input. 
CUSTOM_MASK }; diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy index ac3f7b2e977..19799dcecf3 100644 --- a/jenkins/L0_Test.groovy +++ b/jenkins/L0_Test.groovy @@ -3294,7 +3294,8 @@ def launchTestJobs(pipeline, testFilter) "DGX_H100-4_GPUs-PyTorch-DeepSeek-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 2, 4], "DGX_H100-4_GPUs-PyTorch-DeepSeek-2": ["auto:dgx-h100-x4", "l0_dgx_h100", 2, 2, 4], "DGX_H100-4_GPUs-PyTorch-GptOss-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], - "DGX_H100-4_GPUs-PyTorch-Others-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], + "DGX_H100-4_GPUs-PyTorch-Others-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 2, 4], + "DGX_H100-4_GPUs-PyTorch-Others-2": ["auto:dgx-h100-x4", "l0_dgx_h100", 2, 2, 4], "DGX_H100-4_GPUs-PyTorch-Ray-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], "DGX_H100-4_GPUs-AutoDeploy-Post-Merge-1": ["auto:dgx-h100-x4", "l0_dgx_h100", 1, 1, 4], @@ -3305,7 +3306,9 @@ def launchTestJobs(pipeline, testFilter) "DGX_B200-Triton-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 1, 1, 1, true], "DGX_B200-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_b200", 1, 2, 1, 1, true], "DGX_B200-PyTorch-Post-Merge-2": ["auto:dgx-b200-flex", "l0_b200", 2, 2, 1, 1, true], - "DGX_B200-4_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 3, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-2": ["auto:dgx-b200-flex", "l0_dgx_b200", 2, 3, 4, 1, true], + "DGX_B200-4_GPUs-PyTorch-3": ["auto:dgx-b200-flex", "l0_dgx_b200", 3, 3, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Ray-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-AutoDeploy-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 4, 1, true], "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 2, 4, 1, true], diff --git a/requirements.txt b/requirements.txt index 
1b0455b72d3..678d640864e 100644 --- a/requirements.txt +++ b/requirements.txt @@ -84,4 +84,4 @@ cuda-tile>=1.0.1 nvidia-cuda-tileiras>=13.1 etcd-sdk-python==0.0.7 python-multipart -smg-grpc-proto>=0.3.3 +smg-grpc-proto>=0.4.2 diff --git a/security_scanning/docs/poetry.lock b/security_scanning/docs/poetry.lock index 9ac6899f28f..861bbdd4bbc 100644 --- a/security_scanning/docs/poetry.lock +++ b/security_scanning/docs/poetry.lock @@ -1046,14 +1046,14 @@ test = ["flake8", "mypy", "pytest"] [[package]] name = "sphinxcontrib-mermaid" -version = "2.0.0" +version = "2.0.1" description = "Mermaid diagrams in your Sphinx-powered docs" optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "sphinxcontrib_mermaid-2.0.0-py3-none-any.whl", hash = "sha256:59a73249bbee2c74b1a4db036f8e8899ade65982bdda6712cf22b4f4e9874bb5"}, - {file = "sphinxcontrib_mermaid-2.0.0.tar.gz", hash = "sha256:cf4f7d453d001132eaba5d1fdf53d42049f02e913213cf8337427483bfca26f4"}, + {file = "sphinxcontrib_mermaid-2.0.1-py3-none-any.whl", hash = "sha256:9dca7fbe827bad5e7e2b97c4047682cfd26e3e07398cfdc96c7a8842ae7f06e7"}, + {file = "sphinxcontrib_mermaid-2.0.1.tar.gz", hash = "sha256:a21a385a059a6cafd192aa3a586b14bf5c42721e229db67b459dc825d7f0a497"}, ] [package.dependencies] @@ -1222,4 +1222,4 @@ test = ["pytest (>=6.0.0)", "setuptools (>=77)"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "f93bbd8da205c4e4374138aeea92fdc5a73d7764638e5f27c12351866592f2bb" +content-hash = "25155b7ceb59522a3d568a3a7f15a11aca6e1b2e7f17bde117f1b1b33be32945" diff --git a/security_scanning/docs/pyproject.toml b/security_scanning/docs/pyproject.toml index 40191f553af..da05d31bade 100644 --- a/security_scanning/docs/pyproject.toml +++ b/security_scanning/docs/pyproject.toml @@ -15,7 +15,7 @@ dependencies = [ "sphinx-copybutton (>=0.5.2,<0.6.0)", "autodoc-pydantic (>=2.2.0,<3.0.0)", "sphinx-togglebutton (>=0.4.4,<0.5.0)", - "sphinxcontrib-mermaid (>=2.0.0,<3.0.0)" + 
"sphinxcontrib-mermaid (>=2.0.1,<3.0.0)" ] diff --git a/security_scanning/examples/apps/poetry.lock b/security_scanning/examples/apps/poetry.lock index ed6e03e3fcf..33f100adc22 100644 --- a/security_scanning/examples/apps/poetry.lock +++ b/security_scanning/examples/apps/poetry.lock @@ -275,14 +275,14 @@ files = [ [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = "openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] diff --git a/security_scanning/examples/models/contrib/mmdit/poetry.lock b/security_scanning/examples/models/contrib/mmdit/poetry.lock index e2ff09a48d0..925d0e528a1 100644 --- a/security_scanning/examples/models/contrib/mmdit/poetry.lock +++ b/security_scanning/examples/models/contrib/mmdit/poetry.lock @@ -197,14 +197,14 @@ files = [ [[package]] name = "diffusers" -version = "0.36.0" +version = "0.37.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.10.0" groups = ["main"] files = [ - {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, - {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, + {file = "diffusers-0.37.0-py3-none-any.whl", hash = "sha256:7eab74bf896974250b5e1027cae813aba1004f02d97c9b44891b83713386aa08"}, + {file = "diffusers-0.37.0.tar.gz", hash = "sha256:408789af73898585f525afd07ca72b3955affea4216a669558e9f59b5b1fe704"}, ] [package.dependencies] @@ -220,14 +220,14 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "ftfy", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", 
"safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchsde", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] -test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "ftfy", "invisible-watermark (>=0.2.0)", "librosa", "parameterized", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchsde", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] diff --git a/security_scanning/examples/models/contrib/stdit/poetry.lock b/security_scanning/examples/models/contrib/stdit/poetry.lock index ea9387bbbb3..defb5de3256 100644 --- a/security_scanning/examples/models/contrib/stdit/poetry.lock +++ b/security_scanning/examples/models/contrib/stdit/poetry.lock @@ -2564,86 +2564,102 @@ files = [ [[package]] name = "wrapt" -version = "2.1.1" +version = 
"2.1.2" description = "Module for decorators, wrappers and monkey patching." optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "wrapt-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e927375e43fd5a985b27a8992327c22541b6dede1362fc79df337d26e23604f"}, - {file = "wrapt-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c99544b6a7d40ca22195563b6d8bc3986ee8bb82f272f31f0670fe9440c869"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2be3fa5f4efaf16ee7c77d0556abca35f5a18ad4ac06f0ef3904c3399010ce9"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67c90c1ae6489a6cb1a82058902caa8006706f7b4e8ff766f943e9d2c8e608d0"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05c0db35ccffd7480143e62df1e829d101c7b86944ae3be7e4869a7efa621f53"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0c2ec9f616755b2e1e0bf4d0961f59bb5c2e7a77407e7e2c38ef4f7d2fdde12c"}, - {file = "wrapt-2.1.1-cp310-cp310-win32.whl", hash = "sha256:203ba6b3f89e410e27dbd30ff7dccaf54dcf30fda0b22aa1b82d560c7f9fe9a1"}, - {file = "wrapt-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f9426d9cfc2f8732922fc96198052e55c09bb9db3ddaa4323a18e055807410e"}, - {file = "wrapt-2.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:69c26f51b67076b40714cff81bdd5826c0b10c077fb6b0678393a6a2f952a5fc"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c366434a7fb914c7a5de508ed735ef9c133367114e1a7cb91dfb5cd806a1549"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6a2068bd2e1e19e5a317c8c0b288267eec4e7347c36bc68a6e378a39f19ee7"}, - {file = "wrapt-2.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:891ab4713419217b2aed7dd106c9200f64e6a82226775a0d2ebd6bef2ebd1747"}, - {file = 
"wrapt-2.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8ef36a0df38d2dc9d907f6617f89e113c5892e0a35f58f45f75901af0ce7d81"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76e9af3ebd86f19973143d4d592cbf3e970cf3f66ddee30b16278c26ae34b8ab"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ff562067485ebdeaef2fa3fe9b1876bc4e7b73762e0a01406ad81e2076edcebf"}, - {file = "wrapt-2.1.1-cp311-cp311-win32.whl", hash = "sha256:9e60a30aa0909435ec4ea2a3c53e8e1b50ac9f640c0e9fe3f21fd248a22f06c5"}, - {file = "wrapt-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:7d79954f51fcf84e5ec4878ab4aea32610d70145c5bbc84b3370eabfb1e096c2"}, - {file = "wrapt-2.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:d3ffc6b0efe79e08fd947605fd598515aebefe45e50432dc3b5cd437df8b1ada"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab8e3793b239db021a18782a5823fcdea63b9fe75d0e340957f5828ef55fcc02"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c0300007836373d1c2df105b40777986accb738053a92fe09b615a7a4547e9f"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2b27c070fd1132ab23957bcd4ee3ba707a91e653a9268dc1afbd39b77b2799f7"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b0e36d845e8b6f50949b6b65fc6cd279f47a1944582ed4ec8258cd136d89a64"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aeea04a9889370fcfb1ef828c4cc583f36a875061505cd6cd9ba24d8b43cc36"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d88b46bb0dce9f74b6817bc1758ff2125e1ca9e1377d62ea35b6896142ab6825"}, - {file = "wrapt-2.1.1-cp312-cp312-win32.whl", hash = "sha256:63decff76ca685b5c557082dfbea865f3f5f6d45766a89bff8dc61d336348833"}, - {file = 
"wrapt-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:b828235d26c1e35aca4107039802ae4b1411be0fe0367dd5b7e4d90e562fcbcd"}, - {file = "wrapt-2.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:75128507413a9f1bcbe2db88fd18fbdbf80f264b82fa33a6996cdeaf01c52352"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9646e17fa7c3e2e7a87e696c7de66512c2b4f789a8db95c613588985a2e139"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:428cfc801925454395aa468ba7ddb3ed63dc0d881df7b81626cdd433b4e2b11b"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5797f65e4d58065a49088c3b32af5410751cd485e83ba89e5a45e2aa8905af98"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a2db44a71202c5ae4bb5f27c6d3afbc5b23053f2e7e78aa29704541b5dad789"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d5350c3590af09c1703dd60ec78a7370c0186e11eaafb9dda025a30eee6492d"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d9b076411bed964e752c01b49fd224cc385f3a96f520c797d38412d70d08359"}, - {file = "wrapt-2.1.1-cp313-cp313-win32.whl", hash = "sha256:0bb7207130ce6486727baa85373503bf3334cc28016f6928a0fa7e19d7ecdc06"}, - {file = "wrapt-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:cbfee35c711046b15147b0ae7db9b976f01c9520e6636d992cd9e69e5e2b03b1"}, - {file = "wrapt-2.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:7d2756061022aebbf57ba14af9c16e8044e055c22d38de7bf40d92b565ecd2b0"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4814a3e58bc6971e46baa910ecee69699110a2bf06c201e24277c65115a20c20"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:106c5123232ab9b9f4903692e1fa0bdc231510098f04c13c3081f8ad71c3d612"}, - {file = 
"wrapt-2.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1a40b83ff2535e6e56f190aff123821eea89a24c589f7af33413b9c19eb2c738"}, - {file = "wrapt-2.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:789cea26e740d71cf1882e3a42bb29052bc4ada15770c90072cb47bf73fb3dbf"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ba49c14222d5e5c0ee394495a8655e991dc06cbca5398153aefa5ac08cd6ccd7"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ac8cda531fe55be838a17c62c806824472bb962b3afa47ecbd59b27b78496f4e"}, - {file = "wrapt-2.1.1-cp313-cp313t-win32.whl", hash = "sha256:b8af75fe20d381dd5bcc9db2e86a86d7fcfbf615383a7147b85da97c1182225b"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:45c5631c9b6c792b78be2d7352129f776dd72c605be2c3a4e9be346be8376d83"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:da815b9263947ac98d088b6414ac83507809a1d385e4632d9489867228d6d81c"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aa1765054245bb01a37f615503290d4e207e3fd59226e78341afb587e9c1236"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:feff14b63a6d86c1eee33a57f77573649f2550935981625be7ff3cb7342efe05"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81fc5f22d5fcfdbabde96bb3f5379b9f4476d05c6d524d7259dc5dfb501d3281"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:951b228ecf66def855d22e006ab9a1fc12535111ae7db2ec576c728f8ddb39e8"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ddf582a95641b9a8c8bd643e83f34ecbbfe1b68bc3850093605e469ab680ae3"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:fc5c500966bf48913f795f1984704e6d452ba2414207b15e1f8c339a059d5b16"}, - {file = "wrapt-2.1.1-cp314-cp314-win32.whl", hash = "sha256:4aa4baadb1f94b71151b8e44a0c044f6af37396c3b8bcd474b78b49e2130a23b"}, - {file = "wrapt-2.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:860e9d3fd81816a9f4e40812f28be4439ab01f260603c749d14be3c0a1170d19"}, - {file = "wrapt-2.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3c59e103017a2c1ea0ddf589cbefd63f91081d7ce9d491d69ff2512bb1157e23"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9fa7c7e1bee9278fc4f5dd8275bc8d25493281a8ec6c61959e37cc46acf02007"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39c35e12e8215628984248bd9c8897ce0a474be2a773db207eb93414219d8469"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:94ded4540cac9125eaa8ddf5f651a7ec0da6f5b9f248fe0347b597098f8ec14c"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0af328373f97ed9bdfea24549ac1b944096a5a71b30e41c9b8b53ab3eec04a"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4ad839b55f0bf235f8e337ce060572d7a06592592f600f3a3029168e838469d3"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d89c49356e5e2a50fa86b40e0510082abcd0530f926cbd71cf25bee6b9d82d7"}, - {file = "wrapt-2.1.1-cp314-cp314t-win32.whl", hash = "sha256:f4c7dd22cf7f36aafe772f3d88656559205c3af1b7900adfccb70edeb0d2abc4"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f76bc12c583ab01e73ba0ea585465a41e48d968f6d1311b4daec4f8654e356e3"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7ea74fc0bec172f1ae5f3505b6655c541786a5cabe4bbc0d9723a56ac32eb9b9"}, - {file = "wrapt-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e03b3d486eb39f5d3f562839f59094dcee30c4039359ea15768dc2214d9e07c"}, - {file = 
"wrapt-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fdf3073f488ce4d929929b7799e3b8c52b220c9eb3f4a5a51e2dc0e8ff07881"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0cb4f59238c6625fae2eeb72278da31c9cfba0ff4d9cbe37446b73caa0e9bcf7"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f794a1c148871b714cb566f5466ec8288e0148a1c417550983864b3981737cd"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:95ef3866631c6da9ce1fc0f1e17b90c4c0aa6d041fc70a11bc90733aee122e1a"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:66bc1b2446f01cbbd3c56b79a3a8435bcd4178ac4e06b091913f7751a7f528b8"}, - {file = "wrapt-2.1.1-cp39-cp39-win32.whl", hash = "sha256:1b9e08e57cabc32972f7c956d10e85093c5da9019faa24faf411e7dd258e528c"}, - {file = "wrapt-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:e75ad48c3cca739f580b5e14c052993eb644c7fa5b4c90aa51193280b30875ae"}, - {file = "wrapt-2.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:9ccd657873b7f964711447d004563a2bc08d1476d7a1afcad310f3713e6f50f4"}, - {file = "wrapt-2.1.1-py3-none-any.whl", hash = "sha256:3b0f4629eb954394a3d7c7a1c8cca25f0b07cefe6aa8545e862e9778152de5b7"}, - {file = "wrapt-2.1.1.tar.gz", hash = "sha256:5fdcb09bf6db023d88f312bd0767594b414655d58090fc1c46b3414415f67fac"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a86d99a14f76facb269dc148590c01aaf47584071809a70da30555228158c"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a819e39017f95bf7aede768f75915635aa8f671f2993c036991b8d3bfe8dbb6f"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5681123e60aed0e64c7d44f72bbf8b4ce45f79d81467e2c4c728629f5baf06eb"}, + {file = 
"wrapt-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b8b28e97a44d21836259739ae76284e180b18abbb4dcfdff07a415cf1016c3e"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cef91c95a50596fcdc31397eb6955476f82ae8a3f5a8eabdc13611b60ee380ba"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dad63212b168de8569b1c512f4eac4b57f2c6934b30df32d6ee9534a79f1493f"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d307aa6888d5efab2c1cde09843d48c843990be13069003184b67d426d145394"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c87cf3f0c85e27b3ac7d9ad95da166bf8739ca215a8b171e8404a2d739897a45"}, + {file = "wrapt-2.1.2-cp310-cp310-win32.whl", hash = "sha256:d1c5fea4f9fe3762e2b905fdd67df51e4be7a73b7674957af2d2ade71a5c075d"}, + {file = "wrapt-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:d8f7740e1af13dff2684e4d56fe604a7e04d6c94e737a60568d8d4238b9a0c71"}, + {file = "wrapt-2.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:1c6cc827c00dc839350155f316f1f8b4b0c370f52b6a19e782e2bda89600c7dc"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15"}, + {file = "wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b"}, + {file = "wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1"}, + {file = "wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2"}, + {file = "wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0"}, + {file = "wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63"}, + {file = "wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413"}, + {file = "wrapt-2.1.2-cp313-cp313-win32.whl", hash = 
"sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6"}, + {file = "wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1"}, + {file = "wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a"}, + {file = "wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = 
"sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19"}, + {file = "wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508"}, + {file = "wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04"}, + {file = "wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3"}, + {file = "wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5e0fa9cc32300daf9eb09a1f5bdc6deb9a79defd70d5356ba453bcd50aef3742"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:710f6e5dfaf6a5d5c397d2d6758a78fecd9649deb21f1b645f5b57a328d63050"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:305d8a1755116bfdad5dda9e771dcb2138990a1d66e9edd81658816edf51aed1"}, + {file = 
"wrapt-2.1.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0d8fc30a43b5fe191cf2b1a0c82bab2571dadd38e7c0062ee87d6df858dd06e"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a5d516e22aedb7c9c1d47cba1c63160b1a6f61ec2f3948d127cd38d5cfbb556f"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:45914e8efbe4b9d5102fcf0e8e2e3258b83a5d5fba9f8f7b6d15681e9d29ffe0"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:478282ebd3795a089154fb16d3db360e103aa13d3b2ad30f8f6aac0d2207de0e"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3756219045f73fb28c5d7662778e4156fbd06cf823c4d2d4b19f97305e52819c"}, + {file = "wrapt-2.1.2-cp39-cp39-win32.whl", hash = "sha256:b8aefb4dbb18d904b96827435a763fa42fc1f08ea096a391710407a60983ced8"}, + {file = "wrapt-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e5aeab8fe15c3dff75cfee94260dcd9cded012d4ff06add036c28fae7718593b"}, + {file = "wrapt-2.1.2-cp39-cp39-win_arm64.whl", hash = "sha256:f069e113743a21a3defac6677f000068ebb931639f789b5b226598e247a4c89e"}, + {file = "wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8"}, + {file = "wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e"}, ] [package.extras] diff --git a/security_scanning/examples/models/core/qwen/poetry.lock b/security_scanning/examples/models/core/qwen/poetry.lock index 3f3a9015e09..ea55a2676aa 100644 --- a/security_scanning/examples/models/core/qwen/poetry.lock +++ b/security_scanning/examples/models/core/qwen/poetry.lock @@ -1681,14 +1681,14 @@ files = [ [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", 
hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = "openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] @@ -2709,31 +2709,31 @@ six = ">=1.14.0" [[package]] name = "ruff" -version = "0.15.4" +version = "0.15.5" description = "An extremely fast Python linter and code formatter, written in Rust." optional = false python-versions = ">=3.7" groups = ["main"] markers = "sys_platform != \"emscripten\"" files = [ - {file = "ruff-0.15.4-py3-none-linux_armv6l.whl", hash = "sha256:a1810931c41606c686bae8b5b9a8072adac2f611bb433c0ba476acba17a332e0"}, - {file = "ruff-0.15.4-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:5a1632c66672b8b4d3e1d1782859e98d6e0b4e70829530666644286600a33992"}, - {file = "ruff-0.15.4-py3-none-macosx_11_0_arm64.whl", hash = "sha256:a4386ba2cd6c0f4ff75252845906acc7c7c8e1ac567b7bc3d373686ac8c222ba"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = "sha256:b2496488bdfd3732747558b6f95ae427ff066d1fcd054daf75f5a50674411e75"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:3f1c4893841ff2d54cbda1b2860fa3260173df5ddd7b95d370186f8a5e66a4ac"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:820b8766bd65503b6c30aaa6331e8ef3a6e564f7999c844e9a547c40179e440a"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:c9fb74bab47139c1751f900f857fa503987253c3ef89129b24ed375e72873e85"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = 
"sha256:f80c98765949c518142b3a50a5db89343aa90f2c2bf7799de9986498ae6176db"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:451a2e224151729b3b6c9ffb36aed9091b2996fe4bdbd11f47e27d8f2e8888ec"}, - {file = "ruff-0.15.4-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:a8f157f2e583c513c4f5f896163a93198297371f34c04220daf40d133fdd4f7f"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:917cc68503357021f541e69b35361c99387cdbbf99bd0ea4aa6f28ca99ff5338"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:e9737c8161da79fd7cfec19f1e35620375bd8b2a50c3e77fa3d2c16f574105cc"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_i686.whl", hash = "sha256:291258c917539e18f6ba40482fe31d6f5ac023994ee11d7bdafd716f2aab8a68"}, - {file = "ruff-0.15.4-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:3f83c45911da6f2cd5936c436cf86b9f09f09165f033a99dcf7477e34041cbc3"}, - {file = "ruff-0.15.4-py3-none-win32.whl", hash = "sha256:65594a2d557d4ee9f02834fcdf0a28daa8b3b9f6cb2cb93846025a36db47ef22"}, - {file = "ruff-0.15.4-py3-none-win_amd64.whl", hash = "sha256:04196ad44f0df220c2ece5b0e959c2f37c777375ec744397d21d15b50a75264f"}, - {file = "ruff-0.15.4-py3-none-win_arm64.whl", hash = "sha256:60d5177e8cfc70e51b9c5fad936c634872a74209f934c1e79107d11787ad5453"}, - {file = "ruff-0.15.4.tar.gz", hash = "sha256:3412195319e42d634470cc97aa9803d07e9d5c9223b99bcb1518f0c725f26ae1"}, + {file = "ruff-0.15.5-py3-none-linux_armv6l.whl", hash = "sha256:4ae44c42281f42e3b06b988e442d344a5b9b72450ff3c892e30d11b29a96a57c"}, + {file = "ruff-0.15.5-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:6edd3792d408ebcf61adabc01822da687579a1a023f297618ac27a5b51ef0080"}, + {file = "ruff-0.15.5-py3-none-macosx_11_0_arm64.whl", hash = "sha256:89f463f7c8205a9f8dea9d658d59eff49db05f88f89cc3047fb1a02d9f344010"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_aarch64.manylinux2014_aarch64.whl", hash = 
"sha256:ba786a8295c6574c1116704cf0b9e6563de3432ac888d8f83685654fe528fd65"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_armv7l.manylinux2014_armv7l.whl", hash = "sha256:fd4b801e57955fe9f02b31d20375ab3a5c4415f2e5105b79fb94cf2642c91440"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_i686.manylinux2014_i686.whl", hash = "sha256:391f7c73388f3d8c11b794dbbc2959a5b5afe66642c142a6effa90b45f6f5204"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_ppc64le.manylinux2014_ppc64le.whl", hash = "sha256:8dc18f30302e379fe1e998548b0f5e9f4dff907f52f73ad6da419ea9c19d66c8"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_s390x.manylinux2014_s390x.whl", hash = "sha256:1cc6e7f90087e2d27f98dc34ed1b3ab7c8f0d273cc5431415454e22c0bd2a681"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_17_x86_64.manylinux2014_x86_64.whl", hash = "sha256:c1cb7169f53c1ddb06e71a9aebd7e98fc0fea936b39afb36d8e86d36ecc2636a"}, + {file = "ruff-0.15.5-py3-none-manylinux_2_31_riscv64.whl", hash = "sha256:9b037924500a31ee17389b5c8c4d88874cc6ea8e42f12e9c61a3d754ff72f1ca"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_aarch64.whl", hash = "sha256:65bb414e5b4eadd95a8c1e4804f6772bbe8995889f203a01f77ddf2d790929dd"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_armv7l.whl", hash = "sha256:d20aa469ae3b57033519c559e9bc9cd9e782842e39be05b50e852c7c981fa01d"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_i686.whl", hash = "sha256:15388dd28c9161cdb8eda68993533acc870aa4e646a0a277aa166de9ad5a8752"}, + {file = "ruff-0.15.5-py3-none-musllinux_1_2_x86_64.whl", hash = "sha256:b30da330cbd03bed0c21420b6b953158f60c74c54c5f4c1dabbdf3a57bf355d2"}, + {file = "ruff-0.15.5-py3-none-win32.whl", hash = "sha256:732e5ee1f98ba5b3679029989a06ca39a950cced52143a0ea82a2102cb592b74"}, + {file = "ruff-0.15.5-py3-none-win_amd64.whl", hash = "sha256:821d41c5fa9e19117616c35eaa3f4b75046ec76c65e7ae20a333e9a8696bc7fe"}, + {file = "ruff-0.15.5-py3-none-win_arm64.whl", hash = "sha256:b498d1c60d2fe5c10c45ec3f698901065772730b411f164ae270bb6bfcc4740b"}, + {file 
= "ruff-0.15.5.tar.gz", hash = "sha256:7c3601d3b6d76dce18c5c824fc8d06f4eef33d6df0c21ec7799510cde0f159a2"}, ] [[package]] @@ -3665,4 +3665,4 @@ propcache = ">=0.2.1" [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "027f86b8ac2302108765cdc43dea3ccb6defb9a96e7429d9af8ce8c435b5f8ba" +content-hash = "c422d916b7a9ecf1a5857f50866e3160d65b7018e79bcaa5f3f83e7cd16d3ab4" diff --git a/security_scanning/examples/models/core/qwen/pyproject.toml b/security_scanning/examples/models/core/qwen/pyproject.toml index 963c16e2aaa..c94fe4ee5ee 100644 --- a/security_scanning/examples/models/core/qwen/pyproject.toml +++ b/security_scanning/examples/models/core/qwen/pyproject.toml @@ -19,7 +19,7 @@ dependencies = [ "mdtex2html (>=1.3.2,<2.0.0)", "sse-starlette (>=3.3.2,<4.0.0)", "aiohttp-sse-client (>=0.2.1,<0.3.0)", - "openai (>=2.24.0,<3.0.0)" + "openai (>=2.26.0,<3.0.0)" ] diff --git a/security_scanning/examples/models/core/whisper/poetry.lock b/security_scanning/examples/models/core/whisper/poetry.lock index 7d8c51ae5c9..d0c117a1323 100644 --- a/security_scanning/examples/models/core/whisper/poetry.lock +++ b/security_scanning/examples/models/core/whisper/poetry.lock @@ -2258,14 +2258,14 @@ xml = ["lxml (>=5.3.0)"] [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] diff --git a/security_scanning/examples/ray_orchestrator/poetry.lock b/security_scanning/examples/ray_orchestrator/poetry.lock index 22941a93c3c..4dbc20315f1 100644 --- a/security_scanning/examples/ray_orchestrator/poetry.lock +++ b/security_scanning/examples/ray_orchestrator/poetry.lock @@ -1313,14 +1313,14 @@ files = [ [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] @@ -2183,86 +2183,102 @@ typing-extensions = {version = ">=4.13.2", markers = "python_version < \"3.11\"" [[package]] name = "wrapt" -version = "2.1.1" +version = "2.1.2" description = "Module for decorators, wrappers and monkey patching." 
optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "wrapt-2.1.1-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:7e927375e43fd5a985b27a8992327c22541b6dede1362fc79df337d26e23604f"}, - {file = "wrapt-2.1.1-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:e1c99544b6a7d40ca22195563b6d8bc3986ee8bb82f272f31f0670fe9440c869"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:b2be3fa5f4efaf16ee7c77d0556abca35f5a18ad4ac06f0ef3904c3399010ce9"}, - {file = "wrapt-2.1.1-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:67c90c1ae6489a6cb1a82058902caa8006706f7b4e8ff766f943e9d2c8e608d0"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:05c0db35ccffd7480143e62df1e829d101c7b86944ae3be7e4869a7efa621f53"}, - {file = "wrapt-2.1.1-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:0c2ec9f616755b2e1e0bf4d0961f59bb5c2e7a77407e7e2c38ef4f7d2fdde12c"}, - {file = "wrapt-2.1.1-cp310-cp310-win32.whl", hash = "sha256:203ba6b3f89e410e27dbd30ff7dccaf54dcf30fda0b22aa1b82d560c7f9fe9a1"}, - {file = "wrapt-2.1.1-cp310-cp310-win_amd64.whl", hash = "sha256:6f9426d9cfc2f8732922fc96198052e55c09bb9db3ddaa4323a18e055807410e"}, - {file = "wrapt-2.1.1-cp310-cp310-win_arm64.whl", hash = "sha256:69c26f51b67076b40714cff81bdd5826c0b10c077fb6b0678393a6a2f952a5fc"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:6c366434a7fb914c7a5de508ed735ef9c133367114e1a7cb91dfb5cd806a1549"}, - {file = "wrapt-2.1.1-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:5d6a2068bd2e1e19e5a317c8c0b288267eec4e7347c36bc68a6e378a39f19ee7"}, - {file = "wrapt-2.1.1-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:891ab4713419217b2aed7dd106c9200f64e6a82226775a0d2ebd6bef2ebd1747"}, - {file = 
"wrapt-2.1.1-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:c8ef36a0df38d2dc9d907f6617f89e113c5892e0a35f58f45f75901af0ce7d81"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:76e9af3ebd86f19973143d4d592cbf3e970cf3f66ddee30b16278c26ae34b8ab"}, - {file = "wrapt-2.1.1-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:ff562067485ebdeaef2fa3fe9b1876bc4e7b73762e0a01406ad81e2076edcebf"}, - {file = "wrapt-2.1.1-cp311-cp311-win32.whl", hash = "sha256:9e60a30aa0909435ec4ea2a3c53e8e1b50ac9f640c0e9fe3f21fd248a22f06c5"}, - {file = "wrapt-2.1.1-cp311-cp311-win_amd64.whl", hash = "sha256:7d79954f51fcf84e5ec4878ab4aea32610d70145c5bbc84b3370eabfb1e096c2"}, - {file = "wrapt-2.1.1-cp311-cp311-win_arm64.whl", hash = "sha256:d3ffc6b0efe79e08fd947605fd598515aebefe45e50432dc3b5cd437df8b1ada"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ab8e3793b239db021a18782a5823fcdea63b9fe75d0e340957f5828ef55fcc02"}, - {file = "wrapt-2.1.1-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:7c0300007836373d1c2df105b40777986accb738053a92fe09b615a7a4547e9f"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2b27c070fd1132ab23957bcd4ee3ba707a91e653a9268dc1afbd39b77b2799f7"}, - {file = "wrapt-2.1.1-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:8b0e36d845e8b6f50949b6b65fc6cd279f47a1944582ed4ec8258cd136d89a64"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:4aeea04a9889370fcfb1ef828c4cc583f36a875061505cd6cd9ba24d8b43cc36"}, - {file = "wrapt-2.1.1-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:d88b46bb0dce9f74b6817bc1758ff2125e1ca9e1377d62ea35b6896142ab6825"}, - {file = "wrapt-2.1.1-cp312-cp312-win32.whl", hash = "sha256:63decff76ca685b5c557082dfbea865f3f5f6d45766a89bff8dc61d336348833"}, - {file = 
"wrapt-2.1.1-cp312-cp312-win_amd64.whl", hash = "sha256:b828235d26c1e35aca4107039802ae4b1411be0fe0367dd5b7e4d90e562fcbcd"}, - {file = "wrapt-2.1.1-cp312-cp312-win_arm64.whl", hash = "sha256:75128507413a9f1bcbe2db88fd18fbdbf80f264b82fa33a6996cdeaf01c52352"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:ce9646e17fa7c3e2e7a87e696c7de66512c2b4f789a8db95c613588985a2e139"}, - {file = "wrapt-2.1.1-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:428cfc801925454395aa468ba7ddb3ed63dc0d881df7b81626cdd433b4e2b11b"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5797f65e4d58065a49088c3b32af5410751cd485e83ba89e5a45e2aa8905af98"}, - {file = "wrapt-2.1.1-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:5a2db44a71202c5ae4bb5f27c6d3afbc5b23053f2e7e78aa29704541b5dad789"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:8d5350c3590af09c1703dd60ec78a7370c0186e11eaafb9dda025a30eee6492d"}, - {file = "wrapt-2.1.1-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:2d9b076411bed964e752c01b49fd224cc385f3a96f520c797d38412d70d08359"}, - {file = "wrapt-2.1.1-cp313-cp313-win32.whl", hash = "sha256:0bb7207130ce6486727baa85373503bf3334cc28016f6928a0fa7e19d7ecdc06"}, - {file = "wrapt-2.1.1-cp313-cp313-win_amd64.whl", hash = "sha256:cbfee35c711046b15147b0ae7db9b976f01c9520e6636d992cd9e69e5e2b03b1"}, - {file = "wrapt-2.1.1-cp313-cp313-win_arm64.whl", hash = "sha256:7d2756061022aebbf57ba14af9c16e8044e055c22d38de7bf40d92b565ecd2b0"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:4814a3e58bc6971e46baa910ecee69699110a2bf06c201e24277c65115a20c20"}, - {file = "wrapt-2.1.1-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:106c5123232ab9b9f4903692e1fa0bdc231510098f04c13c3081f8ad71c3d612"}, - {file = 
"wrapt-2.1.1-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:1a40b83ff2535e6e56f190aff123821eea89a24c589f7af33413b9c19eb2c738"}, - {file = "wrapt-2.1.1-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:789cea26e740d71cf1882e3a42bb29052bc4ada15770c90072cb47bf73fb3dbf"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ba49c14222d5e5c0ee394495a8655e991dc06cbca5398153aefa5ac08cd6ccd7"}, - {file = "wrapt-2.1.1-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:ac8cda531fe55be838a17c62c806824472bb962b3afa47ecbd59b27b78496f4e"}, - {file = "wrapt-2.1.1-cp313-cp313t-win32.whl", hash = "sha256:b8af75fe20d381dd5bcc9db2e86a86d7fcfbf615383a7147b85da97c1182225b"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_amd64.whl", hash = "sha256:45c5631c9b6c792b78be2d7352129f776dd72c605be2c3a4e9be346be8376d83"}, - {file = "wrapt-2.1.1-cp313-cp313t-win_arm64.whl", hash = "sha256:da815b9263947ac98d088b6414ac83507809a1d385e4632d9489867228d6d81c"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:9aa1765054245bb01a37f615503290d4e207e3fd59226e78341afb587e9c1236"}, - {file = "wrapt-2.1.1-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:feff14b63a6d86c1eee33a57f77573649f2550935981625be7ff3cb7342efe05"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:81fc5f22d5fcfdbabde96bb3f5379b9f4476d05c6d524d7259dc5dfb501d3281"}, - {file = "wrapt-2.1.1-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:951b228ecf66def855d22e006ab9a1fc12535111ae7db2ec576c728f8ddb39e8"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:0ddf582a95641b9a8c8bd643e83f34ecbbfe1b68bc3850093605e469ab680ae3"}, - {file = "wrapt-2.1.1-cp314-cp314-musllinux_1_2_x86_64.whl", hash = 
"sha256:fc5c500966bf48913f795f1984704e6d452ba2414207b15e1f8c339a059d5b16"}, - {file = "wrapt-2.1.1-cp314-cp314-win32.whl", hash = "sha256:4aa4baadb1f94b71151b8e44a0c044f6af37396c3b8bcd474b78b49e2130a23b"}, - {file = "wrapt-2.1.1-cp314-cp314-win_amd64.whl", hash = "sha256:860e9d3fd81816a9f4e40812f28be4439ab01f260603c749d14be3c0a1170d19"}, - {file = "wrapt-2.1.1-cp314-cp314-win_arm64.whl", hash = "sha256:3c59e103017a2c1ea0ddf589cbefd63f91081d7ce9d491d69ff2512bb1157e23"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:9fa7c7e1bee9278fc4f5dd8275bc8d25493281a8ec6c61959e37cc46acf02007"}, - {file = "wrapt-2.1.1-cp314-cp314t-macosx_11_0_arm64.whl", hash = "sha256:39c35e12e8215628984248bd9c8897ce0a474be2a773db207eb93414219d8469"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:94ded4540cac9125eaa8ddf5f651a7ec0da6f5b9f248fe0347b597098f8ec14c"}, - {file = "wrapt-2.1.1-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:da0af328373f97ed9bdfea24549ac1b944096a5a71b30e41c9b8b53ab3eec04a"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4ad839b55f0bf235f8e337ce060572d7a06592592f600f3a3029168e838469d3"}, - {file = "wrapt-2.1.1-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:0d89c49356e5e2a50fa86b40e0510082abcd0530f926cbd71cf25bee6b9d82d7"}, - {file = "wrapt-2.1.1-cp314-cp314t-win32.whl", hash = "sha256:f4c7dd22cf7f36aafe772f3d88656559205c3af1b7900adfccb70edeb0d2abc4"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_amd64.whl", hash = "sha256:f76bc12c583ab01e73ba0ea585465a41e48d968f6d1311b4daec4f8654e356e3"}, - {file = "wrapt-2.1.1-cp314-cp314t-win_arm64.whl", hash = "sha256:7ea74fc0bec172f1ae5f3505b6655c541786a5cabe4bbc0d9723a56ac32eb9b9"}, - {file = "wrapt-2.1.1-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:9e03b3d486eb39f5d3f562839f59094dcee30c4039359ea15768dc2214d9e07c"}, - {file = 
"wrapt-2.1.1-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:0fdf3073f488ce4d929929b7799e3b8c52b220c9eb3f4a5a51e2dc0e8ff07881"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:0cb4f59238c6625fae2eeb72278da31c9cfba0ff4d9cbe37446b73caa0e9bcf7"}, - {file = "wrapt-2.1.1-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:7f794a1c148871b714cb566f5466ec8288e0148a1c417550983864b3981737cd"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:95ef3866631c6da9ce1fc0f1e17b90c4c0aa6d041fc70a11bc90733aee122e1a"}, - {file = "wrapt-2.1.1-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:66bc1b2446f01cbbd3c56b79a3a8435bcd4178ac4e06b091913f7751a7f528b8"}, - {file = "wrapt-2.1.1-cp39-cp39-win32.whl", hash = "sha256:1b9e08e57cabc32972f7c956d10e85093c5da9019faa24faf411e7dd258e528c"}, - {file = "wrapt-2.1.1-cp39-cp39-win_amd64.whl", hash = "sha256:e75ad48c3cca739f580b5e14c052993eb644c7fa5b4c90aa51193280b30875ae"}, - {file = "wrapt-2.1.1-cp39-cp39-win_arm64.whl", hash = "sha256:9ccd657873b7f964711447d004563a2bc08d1476d7a1afcad310f3713e6f50f4"}, - {file = "wrapt-2.1.1-py3-none-any.whl", hash = "sha256:3b0f4629eb954394a3d7c7a1c8cca25f0b07cefe6aa8545e862e9778152de5b7"}, - {file = "wrapt-2.1.1.tar.gz", hash = "sha256:5fdcb09bf6db023d88f312bd0767594b414655d58090fc1c46b3414415f67fac"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_10_9_x86_64.whl", hash = "sha256:4b7a86d99a14f76facb269dc148590c01aaf47584071809a70da30555228158c"}, + {file = "wrapt-2.1.2-cp310-cp310-macosx_11_0_arm64.whl", hash = "sha256:a819e39017f95bf7aede768f75915635aa8f671f2993c036991b8d3bfe8dbb6f"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:5681123e60aed0e64c7d44f72bbf8b4ce45f79d81467e2c4c728629f5baf06eb"}, + {file = 
"wrapt-2.1.2-cp310-cp310-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:2b8b28e97a44d21836259739ae76284e180b18abbb4dcfdff07a415cf1016c3e"}, + {file = "wrapt-2.1.2-cp310-cp310-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:cef91c95a50596fcdc31397eb6955476f82ae8a3f5a8eabdc13611b60ee380ba"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_aarch64.whl", hash = "sha256:dad63212b168de8569b1c512f4eac4b57f2c6934b30df32d6ee9534a79f1493f"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_riscv64.whl", hash = "sha256:d307aa6888d5efab2c1cde09843d48c843990be13069003184b67d426d145394"}, + {file = "wrapt-2.1.2-cp310-cp310-musllinux_1_2_x86_64.whl", hash = "sha256:c87cf3f0c85e27b3ac7d9ad95da166bf8739ca215a8b171e8404a2d739897a45"}, + {file = "wrapt-2.1.2-cp310-cp310-win32.whl", hash = "sha256:d1c5fea4f9fe3762e2b905fdd67df51e4be7a73b7674957af2d2ade71a5c075d"}, + {file = "wrapt-2.1.2-cp310-cp310-win_amd64.whl", hash = "sha256:d8f7740e1af13dff2684e4d56fe604a7e04d6c94e737a60568d8d4238b9a0c71"}, + {file = "wrapt-2.1.2-cp310-cp310-win_arm64.whl", hash = "sha256:1c6cc827c00dc839350155f316f1f8b4b0c370f52b6a19e782e2bda89600c7dc"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_10_9_x86_64.whl", hash = "sha256:96159a0ee2b0277d44201c3b5be479a9979cf154e8c82fa5df49586a8e7679bb"}, + {file = "wrapt-2.1.2-cp311-cp311-macosx_11_0_arm64.whl", hash = "sha256:98ba61833a77b747901e9012072f038795de7fc77849f1faa965464f3f87ff2d"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:767c0dbbe76cae2a60dd2b235ac0c87c9cccf4898aef8062e57bead46b5f6894"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:9c691a6bc752c0cc4711cc0c00896fcd0f116abc253609ef64ef930032821842"}, + {file = "wrapt-2.1.2-cp311-cp311-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = 
"sha256:f3b7d73012ea75aee5844de58c88f44cf62d0d62711e39da5a82824a7c4626a8"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_aarch64.whl", hash = "sha256:577dff354e7acd9d411eaf4bfe76b724c89c89c8fc9b7e127ee28c5f7bcb25b6"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_riscv64.whl", hash = "sha256:3d7b6fd105f8b24e5bd23ccf41cb1d1099796524bcc6f7fbb8fe576c44befbc9"}, + {file = "wrapt-2.1.2-cp311-cp311-musllinux_1_2_x86_64.whl", hash = "sha256:866abdbf4612e0b34764922ef8b1c5668867610a718d3053d59e24a5e5fcfc15"}, + {file = "wrapt-2.1.2-cp311-cp311-win32.whl", hash = "sha256:5a0a0a3a882393095573344075189eb2d566e0fd205a2b6414e9997b1b800a8b"}, + {file = "wrapt-2.1.2-cp311-cp311-win_amd64.whl", hash = "sha256:64a07a71d2730ba56f11d1a4b91f7817dc79bc134c11516b75d1921a7c6fcda1"}, + {file = "wrapt-2.1.2-cp311-cp311-win_arm64.whl", hash = "sha256:b89f095fe98bc12107f82a9f7d570dc83a0870291aeb6b1d7a7d35575f55d98a"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_10_13_x86_64.whl", hash = "sha256:ff2aad9c4cda28a8f0653fc2d487596458c2a3f475e56ba02909e950a9efa6a9"}, + {file = "wrapt-2.1.2-cp312-cp312-macosx_11_0_arm64.whl", hash = "sha256:6433ea84e1cfacf32021d2a4ee909554ade7fd392caa6f7c13f1f4bf7b8e8748"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:c20b757c268d30d6215916a5fa8461048d023865d888e437fab451139cad6c8e"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:79847b83eb38e70d93dc392c7c5b587efe65b3e7afcc167aa8abd5d60e8761c8"}, + {file = "wrapt-2.1.2-cp312-cp312-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:f8fba1bae256186a83d1875b2b1f4e2d1242e8fac0f58ec0d7e41b26967b965c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_aarch64.whl", hash = "sha256:e3d3b35eedcf5f7d022291ecd7533321c4775f7b9cd0050a31a68499ba45757c"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_riscv64.whl", hash = 
"sha256:6f2c5390460de57fa9582bc8a1b7a6c86e1a41dfad74c5225fc07044c15cc8d1"}, + {file = "wrapt-2.1.2-cp312-cp312-musllinux_1_2_x86_64.whl", hash = "sha256:7dfa9f2cf65d027b951d05c662cc99ee3bd01f6e4691ed39848a7a5fffc902b2"}, + {file = "wrapt-2.1.2-cp312-cp312-win32.whl", hash = "sha256:eba8155747eb2cae4a0b913d9ebd12a1db4d860fc4c829d7578c7b989bd3f2f0"}, + {file = "wrapt-2.1.2-cp312-cp312-win_amd64.whl", hash = "sha256:1c51c738d7d9faa0b3601708e7e2eda9bf779e1b601dce6c77411f2a1b324a63"}, + {file = "wrapt-2.1.2-cp312-cp312-win_arm64.whl", hash = "sha256:c8e46ae8e4032792eb2f677dbd0d557170a8e5524d22acc55199f43efedd39bf"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_10_13_x86_64.whl", hash = "sha256:787fd6f4d67befa6fe2abdffcbd3de2d82dfc6fb8a6d850407c53332709d030b"}, + {file = "wrapt-2.1.2-cp313-cp313-macosx_11_0_arm64.whl", hash = "sha256:4bdf26e03e6d0da3f0e9422fd36bcebf7bc0eeb55fdf9c727a09abc6b9fe472e"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:bbac24d879aa22998e87f6b3f481a5216311e7d53c7db87f189a7a0266dafffb"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:16997dfb9d67addc2e3f41b62a104341e80cac52f91110dece393923c0ebd5ca"}, + {file = "wrapt-2.1.2-cp313-cp313-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:162e4e2ba7542da9027821cb6e7c5e068d64f9a10b5f15512ea28e954893a267"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_aarch64.whl", hash = "sha256:f29c827a8d9936ac320746747a016c4bc66ef639f5cd0d32df24f5eacbf9c69f"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_riscv64.whl", hash = "sha256:a9dd9813825f7ecb018c17fd147a01845eb330254dff86d3b5816f20f4d6aaf8"}, + {file = "wrapt-2.1.2-cp313-cp313-musllinux_1_2_x86_64.whl", hash = "sha256:6f8dbdd3719e534860d6a78526aafc220e0241f981367018c2875178cf83a413"}, + {file = "wrapt-2.1.2-cp313-cp313-win32.whl", hash = 
"sha256:5c35b5d82b16a3bc6e0a04349b606a0582bc29f573786aebe98e0c159bc48db6"}, + {file = "wrapt-2.1.2-cp313-cp313-win_amd64.whl", hash = "sha256:f8bc1c264d8d1cf5b3560a87bbdd31131573eb25f9f9447bb6252b8d4c44a3a1"}, + {file = "wrapt-2.1.2-cp313-cp313-win_arm64.whl", hash = "sha256:3beb22f674550d5634642c645aba4c72a2c66fb185ae1aebe1e955fae5a13baf"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_10_13_x86_64.whl", hash = "sha256:0fc04bc8664a8bc4c8e00b37b5355cffca2535209fba1abb09ae2b7c76ddf82b"}, + {file = "wrapt-2.1.2-cp313-cp313t-macosx_11_0_arm64.whl", hash = "sha256:a9b9d50c9af998875a1482a038eb05755dfd6fe303a313f6a940bb53a83c3f18"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:2d3ff4f0024dd224290c0eabf0240f1bfc1f26363431505fb1b0283d3b08f11d"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:3278c471f4468ad544a691b31bb856374fbdefb7fee1a152153e64019379f015"}, + {file = "wrapt-2.1.2-cp313-cp313t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a8914c754d3134a3032601c6984db1c576e6abaf3fc68094bb8ab1379d75ff92"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_aarch64.whl", hash = "sha256:ff95d4264e55839be37bafe1536db2ab2de19da6b65f9244f01f332b5286cfbf"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_riscv64.whl", hash = "sha256:76405518ca4e1b76fbb1b9f686cff93aebae03920cc55ceeec48ff9f719c5f67"}, + {file = "wrapt-2.1.2-cp313-cp313t-musllinux_1_2_x86_64.whl", hash = "sha256:c0be8b5a74c5824e9359b53e7e58bef71a729bacc82e16587db1c4ebc91f7c5a"}, + {file = "wrapt-2.1.2-cp313-cp313t-win32.whl", hash = "sha256:f01277d9a5fc1862f26f7626da9cf443bebc0abd2f303f41c5e995b15887dabd"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_amd64.whl", hash = "sha256:84ce8f1c2104d2f6daa912b1b5b039f331febfeee74f8042ad4e04992bd95c8f"}, + {file = "wrapt-2.1.2-cp313-cp313t-win_arm64.whl", hash = 
"sha256:a93cd767e37faeddbe07d8fc4212d5cba660af59bdb0f6372c93faaa13e6e679"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_10_15_x86_64.whl", hash = "sha256:1370e516598854e5b4366e09ce81e08bfe94d42b0fd569b88ec46cc56d9164a9"}, + {file = "wrapt-2.1.2-cp314-cp314-macosx_11_0_arm64.whl", hash = "sha256:6de1a3851c27e0bd6a04ca993ea6f80fc53e6c742ee1601f486c08e9f9b900a9"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:de9f1a2bbc5ac7f6012ec24525bdd444765a2ff64b5985ac6e0692144838542e"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:970d57ed83fa040d8b20c52fe74a6ae7e3775ae8cff5efd6a81e06b19078484c"}, + {file = "wrapt-2.1.2-cp314-cp314-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:3969c56e4563c375861c8df14fa55146e81ac11c8db49ea6fb7f2ba58bc1ff9a"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_aarch64.whl", hash = "sha256:57d7c0c980abdc5f1d98b11a2aa3bb159790add80258c717fa49a99921456d90"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_riscv64.whl", hash = "sha256:776867878e83130c7a04237010463372e877c1c994d449ca6aaafeab6aab2586"}, + {file = "wrapt-2.1.2-cp314-cp314-musllinux_1_2_x86_64.whl", hash = "sha256:fab036efe5464ec3291411fabb80a7a39e2dd80bae9bcbeeca5087fdfa891e19"}, + {file = "wrapt-2.1.2-cp314-cp314-win32.whl", hash = "sha256:e6ed62c82ddf58d001096ae84ce7f833db97ae2263bff31c9b336ba8cfe3f508"}, + {file = "wrapt-2.1.2-cp314-cp314-win_amd64.whl", hash = "sha256:467e7c76315390331c67073073d00662015bb730c566820c9ca9b54e4d67fd04"}, + {file = "wrapt-2.1.2-cp314-cp314-win_arm64.whl", hash = "sha256:da1f00a557c66225d53b095a97eace0fc5349e3bfda28fa34ffae238978ee575"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_10_15_x86_64.whl", hash = "sha256:62503ffbc2d3a69891cf29beeaccdb4d5e0a126e2b6a851688d4777e01428dbb"}, + {file = "wrapt-2.1.2-cp314-cp314t-macosx_11_0_arm64.whl", hash = 
"sha256:c7e6cd120ef837d5b6f860a6ea3745f8763805c418bb2f12eeb1fa6e25f22d22"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:3769a77df8e756d65fbc050333f423c01ae012b4f6731aaf70cf2bef61b34596"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:a76d61a2e851996150ba0f80582dd92a870643fa481f3b3846f229de88caf044"}, + {file = "wrapt-2.1.2-cp314-cp314t-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:6f97edc9842cf215312b75fe737ee7c8adda75a89979f8e11558dfff6343cc4b"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_aarch64.whl", hash = "sha256:4006c351de6d5007aa33a551f600404ba44228a89e833d2fadc5caa5de8edfbf"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_riscv64.whl", hash = "sha256:a9372fc3639a878c8e7d87e1556fa209091b0a66e912c611e3f833e2c4202be2"}, + {file = "wrapt-2.1.2-cp314-cp314t-musllinux_1_2_x86_64.whl", hash = "sha256:3144b027ff30cbd2fca07c0a87e67011adb717eb5f5bd8496325c17e454257a3"}, + {file = "wrapt-2.1.2-cp314-cp314t-win32.whl", hash = "sha256:3b8d15e52e195813efe5db8cec156eebe339aaf84222f4f4f051a6c01f237ed7"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_amd64.whl", hash = "sha256:08ffa54146a7559f5b8df4b289b46d963a8e74ed16ba3687f99896101a3990c5"}, + {file = "wrapt-2.1.2-cp314-cp314t-win_arm64.whl", hash = "sha256:72aaa9d0d8e4ed0e2e98019cea47a21f823c9dd4b43c7b77bba6679ffcca6a00"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_10_9_x86_64.whl", hash = "sha256:5e0fa9cc32300daf9eb09a1f5bdc6deb9a79defd70d5356ba453bcd50aef3742"}, + {file = "wrapt-2.1.2-cp39-cp39-macosx_11_0_arm64.whl", hash = "sha256:710f6e5dfaf6a5d5c397d2d6758a78fecd9649deb21f1b645f5b57a328d63050"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux1_x86_64.manylinux_2_28_x86_64.manylinux_2_5_x86_64.whl", hash = "sha256:305d8a1755116bfdad5dda9e771dcb2138990a1d66e9edd81658816edf51aed1"}, + {file = 
"wrapt-2.1.2-cp39-cp39-manylinux2014_aarch64.manylinux_2_17_aarch64.manylinux_2_28_aarch64.whl", hash = "sha256:f0d8fc30a43b5fe191cf2b1a0c82bab2571dadd38e7c0062ee87d6df858dd06e"}, + {file = "wrapt-2.1.2-cp39-cp39-manylinux_2_31_riscv64.manylinux_2_39_riscv64.whl", hash = "sha256:a5d516e22aedb7c9c1d47cba1c63160b1a6f61ec2f3948d127cd38d5cfbb556f"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_aarch64.whl", hash = "sha256:45914e8efbe4b9d5102fcf0e8e2e3258b83a5d5fba9f8f7b6d15681e9d29ffe0"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_riscv64.whl", hash = "sha256:478282ebd3795a089154fb16d3db360e103aa13d3b2ad30f8f6aac0d2207de0e"}, + {file = "wrapt-2.1.2-cp39-cp39-musllinux_1_2_x86_64.whl", hash = "sha256:3756219045f73fb28c5d7662778e4156fbd06cf823c4d2d4b19f97305e52819c"}, + {file = "wrapt-2.1.2-cp39-cp39-win32.whl", hash = "sha256:b8aefb4dbb18d904b96827435a763fa42fc1f08ea096a391710407a60983ced8"}, + {file = "wrapt-2.1.2-cp39-cp39-win_amd64.whl", hash = "sha256:e5aeab8fe15c3dff75cfee94260dcd9cded012d4ff06add036c28fae7718593b"}, + {file = "wrapt-2.1.2-cp39-cp39-win_arm64.whl", hash = "sha256:f069e113743a21a3defac6677f000068ebb931639f789b5b226598e247a4c89e"}, + {file = "wrapt-2.1.2-py3-none-any.whl", hash = "sha256:b8fd6fa2b2c4e7621808f8c62e8317f4aae56e59721ad933bac5239d913cf0e8"}, + {file = "wrapt-2.1.2.tar.gz", hash = "sha256:3996a67eecc2c68fd47b4e3c564405a5777367adfd9b8abb58387b63ee83b21e"}, ] [package.extras] diff --git a/security_scanning/examples/serve/poetry.lock b/security_scanning/examples/serve/poetry.lock index 6350255c191..4a92644ea02 100644 --- a/security_scanning/examples/serve/poetry.lock +++ b/security_scanning/examples/serve/poetry.lock @@ -2458,14 +2458,14 @@ xmp = ["defusedxml"] [[package]] name = "platformdirs" -version = "4.9.2" +version = "4.9.4" description = "A small Python package for determining appropriate platform-specific dirs, e.g. a `user data dir`." 
optional = false python-versions = ">=3.10" groups = ["main"] files = [ - {file = "platformdirs-4.9.2-py3-none-any.whl", hash = "sha256:9170634f126f8efdae22fb58ae8a0eaa86f38365bc57897a6c4f781d1f5875bd"}, - {file = "platformdirs-4.9.2.tar.gz", hash = "sha256:9a33809944b9db043ad67ca0db94b14bf452cc6aeaac46a88ea55b26e2e9d291"}, + {file = "platformdirs-4.9.4-py3-none-any.whl", hash = "sha256:68a9a4619a666ea6439f2ff250c12a853cd1cbd5158d258bd824a7df6be2f868"}, + {file = "platformdirs-4.9.4.tar.gz", hash = "sha256:1ec356301b7dc906d83f371c8f487070e99d3ccf9e501686456394622a01a934"}, ] [[package]] diff --git a/security_scanning/metadata.json b/security_scanning/metadata.json index 222f60c2f9e..12ddd9e9954 100644 --- a/security_scanning/metadata.json +++ b/security_scanning/metadata.json @@ -1,4 +1,4 @@ { - "commit_hash": "e01c38f83a5ed13c0abad3a970e9344bfabf7453", - "timestamp": "2026-03-05T02:50:04Z" + "commit_hash": "e699f232511bde5ab7c15af72528484175771e7b", + "timestamp": "2026-03-06T02:46:54Z" } diff --git a/security_scanning/poetry.lock b/security_scanning/poetry.lock index 01a480d3f29..383362e6228 100644 --- a/security_scanning/poetry.lock +++ b/security_scanning/poetry.lock @@ -1216,14 +1216,14 @@ vision = ["Pillow (>=9.4.0)"] [[package]] name = "diffusers" -version = "0.36.0" +version = "0.37.0" description = "State-of-the-art diffusion in PyTorch and JAX." 
optional = false -python-versions = ">=3.8.0" +python-versions = ">=3.10.0" groups = ["main"] files = [ - {file = "diffusers-0.36.0-py3-none-any.whl", hash = "sha256:525d42abc74bfc3b2db594999961295c054b48ef40a11724dacf50e6abd1af98"}, - {file = "diffusers-0.36.0.tar.gz", hash = "sha256:a9cde8721b415bde6a678f2d02abb85396487e1b0e0d2b4abb462d14a9825ab0"}, + {file = "diffusers-0.37.0-py3-none-any.whl", hash = "sha256:7eab74bf896974250b5e1027cae813aba1004f02d97c9b44891b83713386aa08"}, + {file = "diffusers-0.37.0.tar.gz", hash = "sha256:408789af73898585f525afd07ca72b3955affea4216a669558e9f59b5b1fe704"}, ] [package.dependencies] @@ -1239,14 +1239,14 @@ safetensors = ">=0.3.1" [package.extras] bitsandbytes = ["accelerate (>=0.31.0)", "bitsandbytes (>=0.43.3)"] -dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] +dev = ["GitPython (<3.1.19)", "Jinja2", "Jinja2", "accelerate (>=0.31.0)", "accelerate (>=0.31.0)", "compel (==0.1.8)", "datasets", "datasets", "flax (>=0.4.1)", "ftfy", "hf-doc-builder (>=0.3.0)", "hf-doc-builder (>=0.3.0)", "invisible-watermark (>=0.2.0)", "isort (>=5.5.4)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)", "librosa", "parameterized", "peft (>=0.17.0)", "phonemizer", "protobuf (>=3.20.3,<4)", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "ruff (==0.9.10)", 
"safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tensorboard", "tiktoken (>=0.7.0)", "timm", "torch (>=1.4)", "torchsde", "torchvision", "transformers (>=4.41.2)", "urllib3 (<=2.0.0)"] docs = ["hf-doc-builder (>=0.3.0)"] flax = ["flax (>=0.4.1)", "jax (>=0.4.1)", "jaxlib (>=0.4.1)"] gguf = ["accelerate (>=0.31.0)", "gguf (>=0.10.0)"] nvidia-modelopt = ["nvidia_modelopt[hf] (>=0.33.1)"] optimum-quanto = ["accelerate (>=0.31.0)", "optimum_quanto (>=0.2.6)"] quality = ["hf-doc-builder (>=0.3.0)", "isort (>=5.5.4)", "ruff (==0.9.10)", "urllib3 (<=2.0.0)"] -test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "invisible-watermark (>=0.2.0)", "k-diffusion (==0.0.12)", "librosa", "parameterized", "phonemizer", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchvision", "transformers (>=4.41.2)"] +test = ["GitPython (<3.1.19)", "Jinja2", "compel (==0.1.8)", "datasets", "ftfy", "invisible-watermark (>=0.2.0)", "librosa", "parameterized", "phonemizer", "protobuf (>=3.20.3,<4)", "pytest", "pytest-timeout", "pytest-xdist", "requests-mock (==1.10.0)", "safetensors (>=0.3.1)", "scipy", "sentencepiece (>=0.1.91,!=0.1.92)", "tiktoken (>=0.7.0)", "torchsde", "torchvision", "transformers (>=4.41.2)"] torch = ["accelerate (>=0.31.0)", "torch (>=1.4)"] torchao = ["accelerate (>=0.31.0)", "torchao (>=0.7.0)"] training = ["Jinja2", "accelerate (>=0.31.0)", "datasets", "peft (>=0.17.0)", "protobuf (>=3.20.3,<4)", "tensorboard", "timm"] @@ -3874,14 +3874,14 @@ typing_extensions = ">=4.10" [[package]] name = "openai" -version = "2.24.0" +version = "2.26.0" description = "The official Python library for the openai API" optional = false python-versions = ">=3.9" groups = ["main"] files = [ - {file = "openai-2.24.0-py3-none-any.whl", hash = "sha256:fed30480d7d6c884303287bde864980a4b137b60553ffbcf9ab4a233b7a73d94"}, - {file = 
"openai-2.24.0.tar.gz", hash = "sha256:1e5769f540dbd01cb33bc4716a23e67b9d695161a734aff9c5f925e2bf99a673"}, + {file = "openai-2.26.0-py3-none-any.whl", hash = "sha256:6151bf8f83802f036117f06cc8a57b3a4da60da9926826cc96747888b57f394f"}, + {file = "openai-2.26.0.tar.gz", hash = "sha256:b41f37c140ae0034a6e92b0c509376d907f3a66109935fba2c1b471a7c05a8fb"}, ] [package.dependencies] @@ -6996,4 +6996,4 @@ type = ["pytest-mypy"] [metadata] lock-version = "2.1" python-versions = ">=3.10,<3.13" -content-hash = "ec5d305a29833682cefb2c4c995403be74c8fdaeb6ac096f94480d4582f568a5" +content-hash = "2a21a66f0b0512caff14b587ff5a19a8d42d3ae9b5624172c4e68f8d07044c48" diff --git a/security_scanning/pyproject.toml b/security_scanning/pyproject.toml index b8587dd5f95..ccb19badfc0 100644 --- a/security_scanning/pyproject.toml +++ b/security_scanning/pyproject.toml @@ -20,7 +20,7 @@ dependencies = [ "onnx-graphsurgeon (>=0.5.2)", "onnxscript (==0.5.4)", "graphviz (>=0.21,<0.22)", - "openai (>=2.24.0,<3.0.0)", + "openai (>=2.26.0,<3.0.0)", "polygraphy (>=0.49.26,<0.50.0)", "psutil (>=7.2.2,<8.0.0)", "nvidia-ml-py (>=13)", @@ -83,7 +83,7 @@ dependencies = [ "nvidia-cuda-tileiras (>=13.1)", "etcd-sdk-python (==0.0.7)", "python-multipart (>=0.0.22,<0.0.23)", - "smg-grpc-proto (>=0.3.3)" + "smg-grpc-proto (>=0.4.2)" ] diff --git a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py index a7504a8b85d..19fd880c1dc 100644 --- a/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py +++ b/tensorrt_llm/_torch/custom_ops/cute_dsl_custom_ops.py @@ -867,6 +867,12 @@ def get_valid_tactics( valid_tactics = [] for mma_tiler_mn, cluster_shape_mn in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. 
+ if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, @@ -1162,6 +1168,12 @@ def get_valid_tactics( for mma_tiler_mn, cluster_shape_mn, raster_along_m in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates, raster_along_m_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. + if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, @@ -1548,6 +1560,12 @@ def get_valid_tactics( valid_tactics = [] for mma_tiler_mn, cluster_shape_mn in itertools.product( mma_tiler_mn_candidates, cluster_shape_mn_candidates): + # Skip tactics where the cluster shape exceeds available + # tiles. Launching more cluster CTAs than tiles causes + # out-of-bounds memory access in the CuteDSL kernel. 
+ if (ceil_div(m, mma_tiler_mn[0]) < cluster_shape_mn[0] + or ceil_div(n, mma_tiler_mn[1]) < cluster_shape_mn[1]): + continue if self.__class__.kernel_class.can_implement( ab_dtype=cutlass.Float4E2M1FN, sf_dtype=cutlass.Float8E4M3FN, diff --git a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py index cd0cb71fbec..607b5d870e8 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py +++ b/tensorrt_llm/_torch/modules/fused_moe/configurable_moe.py @@ -733,7 +733,7 @@ def _forward_chunk_impl( ) # Step 4b: Quantization AFTER dispatch - x, x_sf = self.backend.quantize_input(x) + x, x_sf = self.backend.quantize_input(x, post_quant_comm=False) else: # No communication, just quantize # (use non-post-quant-comm path for TRTLLMGenFusedMoE) diff --git a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py index 9812c4ef868..1273262f5f4 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cute_dsl.py @@ -513,11 +513,16 @@ def run_moe_nvfp4( self.hidden_size) assert moe_output.dtype == output_dtype + # After DeepEPLowLatency dispatch, token_selected_experts has shape + # [N, 1] instead of [N, top_k], because each row is already assigned + # to exactly one expert. Use the tensor shape as the effective top_k. + effective_top_k = token_selected_experts.size(-1) + tuner = AutoTuner.get() runner = CuteDslFusedMoENvfp4Runner( forward_impl=self.run_moe_nvfp4_impl, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, enable_finalize_fusion=self.use_fused_finalize, @@ -547,11 +552,15 @@ def run_moe_nvfp4_impl( ) -> torch.Tensor: output_dtype = torch.bfloat16 + # Use effective top_k from tensor shape rather than routing config. 
+ # After DeepEPLowLatency dispatch, each row maps to one expert (top_k=1). + effective_top_k = token_selected_experts.size(1) + tile_idx_to_expert_idx, tile_idx_to_mn_limit, expanded_idx_to_permuted_idx, permuted_idx_to_expanded_idx, total_num_padded_tokens, num_non_exiting_tiles = torch.ops.trtllm.moe_sort( token_selected_experts=token_selected_experts, token_final_scales=token_final_scales, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, local_expert_offset=self.slot_start, local_num_experts=self.expert_size_per_partition, tile_tokens_dim=tile_size, @@ -574,7 +583,7 @@ def run_moe_nvfp4_impl( num_non_exiting_tiles=num_non_exiting_tiles, global_sf=self.fc2_input_scale, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, @@ -591,7 +600,7 @@ def run_moe_nvfp4_impl( permuted_idx_to_expanded_idx=permuted_idx_to_expanded_idx, num_non_exiting_tiles=num_non_exiting_tiles, tile_tokens_dim=tile_size, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, ep_size=self.mapping.moe_ep_size, enable_alltoall=enable_alltoall, ) @@ -612,7 +621,7 @@ def run_moe_nvfp4_impl( num_non_exiting_tiles=num_non_exiting_tiles, token_final_scales=token_final_scales, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, @@ -629,7 +638,7 @@ def run_moe_nvfp4_impl( tile_idx_to_group_idx=tile_idx_to_expert_idx, num_non_exiting_tiles=num_non_exiting_tiles, num_experts=self.num_slots, - top_k=self.routing_method.experts_per_token, + top_k=effective_top_k, num_local_experts=self.expert_size_per_partition, local_expert_offset=self.slot_start, tile_size=tile_size, diff --git 
a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py index 83aae9a06a5..5432445bc5f 100755 --- a/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py +++ b/tensorrt_llm/_torch/modules/fused_moe/fused_moe_cutlass.py @@ -76,14 +76,17 @@ class CutlassFusedMoE(MoE): "sm_constraint": ("min", 89), "dtypes": {torch.float16, torch.bfloat16, torch.float32}, }, - # FP8_BLOCK_SCALES: SM == 90 only + # FP8_BLOCK_SCALES: SM == 90 only (float16 not supported by kernel) QuantAlgo.FP8_BLOCK_SCALES: { "sm_constraint": ("exact", 90), - "dtypes": {torch.float16, torch.bfloat16, torch.float32}, + "dtypes": {torch.bfloat16}, }, - # NVFP4: SM in {100, 103} + # NVFP4: SM in {100, 103, 120, 121} + # SM 120 = desktop Blackwell (e.g. RTX 5090 / GB202) + # SM 121 = GB10 / DGX Spark + # C++ kernel: isValidSM120MOESpecialisation() supports FP4xFP4 and FP8xFP4 QuantAlgo.NVFP4: { - "sm_constraint": ("in", {100, 103}), + "sm_constraint": ("in", {100, 103, 120, 121}), "dtypes": {torch.float16, torch.bfloat16, torch.float8_e4m3fn}, }, # W4A8_AWQ: SM in {89, 90} only @@ -130,7 +133,7 @@ def can_implement( - Unquantized (FP16/BF16): SM >= 80 - FP8 per-tensor (QDQ): SM >= 89 - FP8_BLOCK_SCALES: SM == 90 only - - NVFP4: SM in {100, 103} + - NVFP4: SM in {100, 103, 120, 121} - W4A8_AWQ: SM in {89, 90} only - W8A16: SM >= 80 - W4A16_MXFP4: SM == 90 only diff --git a/tensorrt_llm/_torch/modules/fused_moe/quantization.py b/tensorrt_llm/_torch/modules/fused_moe/quantization.py index 49c00f8c752..2a349c28e03 100644 --- a/tensorrt_llm/_torch/modules/fused_moe/quantization.py +++ b/tensorrt_llm/_torch/modules/fused_moe/quantization.py @@ -1900,7 +1900,7 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): [torch.stack(all_w3_scales), torch.stack(all_w1_scales)], dim=-2) - w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16).view(module.dtype) + w3_w1_scales = all_w3_w1_scales.to(torch.bfloat16) w3_w1_s_shape = 
w3_w1_scales.shape w3_w1_scales_interleaved = w3_w1_scales.reshape( w3_w1_s_shape[0], w3_w1_s_shape[1], @@ -1928,8 +1928,7 @@ def load_quant_scales(self, module: torch.nn.Module, weights: Dict): w2_scales_shard, (0, pad_size_inter, 0, pad_size_hidden)) all_w2_scales.append(w2_scales_shard) - w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16).view( - module.dtype) + w2_scales = torch.stack(all_w2_scales).to(torch.bfloat16) w2_s_shape = w2_scales.shape w2_scales_interleaved = w2_scales.reshape( w2_s_shape[0], w2_s_shape[1], diff --git a/tensorrt_llm/grpc/grpc_request_manager.py b/tensorrt_llm/grpc/grpc_request_manager.py index c18af48ba26..c0fa15af4f6 100644 --- a/tensorrt_llm/grpc/grpc_request_manager.py +++ b/tensorrt_llm/grpc/grpc_request_manager.py @@ -233,10 +233,11 @@ def create_sampling_params_from_proto( proto_config: pb2.SamplingConfig, output_config: pb2.OutputConfig, max_tokens: int, - end_id: Optional[int] = None, - pad_id: Optional[int] = None, - bad_words: Optional[List[pb2.TokenSequence]] = None, - stop_words: Optional[List[pb2.TokenSequence]] = None, + stop: Optional[List[str]] = None, + stop_token_ids: Optional[List[int]] = None, + ignore_eos: bool = False, + bad: Optional[List[str]] = None, + bad_token_ids: Optional[List[int]] = None, guided_decoding: Optional[pb2.GuidedDecodingParams] = None, embedding_bias: Optional[List[float]] = None, ) -> SamplingParams: @@ -246,10 +247,11 @@ def create_sampling_params_from_proto( proto_config: Protobuf SamplingConfig message output_config: Protobuf OutputConfig message max_tokens: Maximum tokens to generate - end_id: End-of-sequence token ID - pad_id: Padding token ID - bad_words: Bad word token sequences - stop_words: Stop word token sequences + stop: Stop strings (tokenized by TRT-LLM's _setup()) + stop_token_ids: Stop token IDs + ignore_eos: Whether to ignore end-of-sequence token + bad: Bad word strings (tokenized by TRT-LLM's _setup()) + bad_token_ids: Bad word token IDs guided_decoding: Guided 
decoding parameters embedding_bias: Embedding bias tensor @@ -317,13 +319,19 @@ def create_sampling_params_from_proto( if proto_config.HasField("no_repeat_ngram_size"): kwargs["no_repeat_ngram_size"] = proto_config.no_repeat_ngram_size - # End/pad tokens - if end_id is not None: - kwargs["end_id"] = end_id - if end_id == -1: - kwargs["ignore_eos"] = True - if pad_id is not None: - kwargs["pad_id"] = pad_id + # Stop sequences and ignore_eos (TRT-LLM's _setup() tokenizes stop strings) + if stop: + kwargs["stop"] = stop + if stop_token_ids: + kwargs["stop_token_ids"] = stop_token_ids + if ignore_eos: + kwargs["ignore_eos"] = True + + # Bad words (TRT-LLM's _setup() tokenizes bad word strings) + if bad: + kwargs["bad"] = bad + if bad_token_ids: + kwargs["bad_token_ids"] = bad_token_ids # Output configuration - logprobs if output_config.HasField("logprobs"): @@ -337,11 +345,6 @@ def create_sampling_params_from_proto( if output_config.exclude_input_from_output: kwargs["exclude_input_from_output"] = True - # Pre-tokenized stop/bad word sequences (set after construction since - # SamplingParams._stop_word_ids/_bad_word_ids are init=False fields) - stop_word_ids = [list(seq.token_ids) for seq in stop_words] if stop_words else None - bad_word_ids = [list(seq.token_ids) for seq in bad_words] if bad_words else None - # Embedding bias if embedding_bias: kwargs["embedding_bias"] = list(embedding_bias) @@ -363,13 +366,6 @@ def create_sampling_params_from_proto( params = SamplingParams(**kwargs) - # Set pre-tokenized stop/bad word IDs directly (these come pre-tokenized - # from the router, so we bypass the tokenizer-based setup path) - if stop_word_ids: - params._stop_word_ids = stop_word_ids - if bad_word_ids: - params._bad_word_ids = bad_word_ids - return params diff --git a/tensorrt_llm/grpc/grpc_servicer.py b/tensorrt_llm/grpc/grpc_servicer.py index 4ad8addd80d..5dbb8291348 100644 --- a/tensorrt_llm/grpc/grpc_servicer.py +++ b/tensorrt_llm/grpc/grpc_servicer.py @@ -97,10 
+97,11 @@ async def Generate( proto_config=request.sampling_config, output_config=request.output_config, max_tokens=request.max_tokens, - end_id=request.end_id if request.HasField("end_id") else None, - pad_id=request.pad_id if request.HasField("pad_id") else None, - bad_words=list(request.bad_words) if request.bad_words else None, - stop_words=list(request.stop_words) if request.stop_words else None, + stop=list(request.stop) if request.stop else None, + stop_token_ids=list(request.stop_token_ids) if request.stop_token_ids else None, + ignore_eos=request.ignore_eos, + bad=list(request.bad) if request.bad else None, + bad_token_ids=list(request.bad_token_ids) if request.bad_token_ids else None, guided_decoding=request.guided_decoding if request.HasField("guided_decoding") else None, @@ -485,15 +486,18 @@ def _complete_responses( complete = trtllm_service_pb2.GenerateComplete( output_token_ids=output_tokens, sequence_index=completion.index, - finish_reason=completion.finish_reason or "stop", + finish_reason=completion.finish_reason or "", prompt_tokens=len(prompt_token_ids), completion_tokens=len(output_tokens), cached_tokens=cached_tokens, ) - # Add stop reason if available - if hasattr(completion, "stop_reason") and completion.stop_reason: - complete.stop_reason = str(completion.stop_reason) + # Add matched stop if available (int token ID or str stop sequence) + if hasattr(completion, "stop_reason") and completion.stop_reason is not None: + if isinstance(completion.stop_reason, int): + complete.matched_token_id = completion.stop_reason + else: + complete.matched_stop_str = str(completion.stop_reason) # Add generation logprobs if available if completion.logprobs: diff --git a/tests/integration/defs/accuracy/test_llm_api_pytorch.py b/tests/integration/defs/accuracy/test_llm_api_pytorch.py index 60f0624fc3d..91cc83b32d5 100644 --- a/tests/integration/defs/accuracy/test_llm_api_pytorch.py +++ b/tests/integration/defs/accuracy/test_llm_api_pytorch.py @@ -1565,17 
+1565,8 @@ def test_bfloat16_4gpus(self, tp_size, pp_size, ep_size, mtp_nextn, (False, False, False, True), (True, False, True, True), (True, True, True, True)]) @parametrize_with_ids("mtp", ["disable", "eagle", "vanilla"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_fp8_block_scales(self, mtp, fp8kv, attention_dp, cuda_graph, - overlap_scheduler, torch_compile, - enable_configurable_moe, mocker): - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + overlap_scheduler, torch_compile): if torch_compile and mtp != "disable": pytest.skip("https://nvbugs/5252313") kv_cache_config = KvCacheConfig(free_gpu_memory_fraction=0.75) @@ -2001,28 +1992,9 @@ def test_nvfp4_batch_waiting(self, torch_compile, fp8kv, cuda_graph, ids=["tp4", "ep4", "tp2pp2", "pp4"]) @parametrize_with_ids("mtp_nextn", [0, 2]) @parametrize_with_ids("moe_backend", ["CUTLASS", "TRTLLM", "CUTEDSL"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_nvfp4_4gpus(self, fp8kv, attention_dp, cuda_graph, overlap_scheduler, tp_size, pp_size, ep_size, - torch_compile, mtp_nextn, moe_backend, - enable_configurable_moe, mocker): - # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend not in [ - "TRTLLM", "CUTLASS" - ]: - pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " - f"current backend is {moe_backend}") - - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ - "TRTLLM", "CUTLASS" - ] else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + torch_compile, 
mtp_nextn, moe_backend): sm_version = get_sm_version() if moe_backend == "TRTLLM" and sm_version in (120, 121): pytest.skip(f"{moe_backend} backend does not support SM 120 or 121") @@ -4119,27 +4091,9 @@ def test_nvfp4( ids=["latency", "ep2", "ep4"]) @pytest.mark.parametrize("activation_dtype", ["static_fp8", "mxfp8"], ids=["fp8", "mxfp8"]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") def test_w4a8_mxfp4(self, moe_backend, tp_size, pp_size, ep_size, attention_dp, cuda_graph, overlap_scheduler, - activation_dtype, enable_configurable_moe, mocker): - # Handle ENABLE_CONFIGURABLE_MOE environment variable - if enable_configurable_moe == 1 and moe_backend not in [ - "TRTLLM", "CUTLASS" - ]: - pytest.skip( - f"ENABLE_CONFIGURABLE_MOE=1 is only supported with TRTLLM and CUTLASS backend, " - f"current backend is {moe_backend}") - - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = "1" if enable_configurable_moe == 1 and moe_backend in [ - "TRTLLM", "CUTLASS" - ] else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - + activation_dtype): if moe_backend in ["CUTLASS", "TRTLLM"] and get_sm_version() < 100: pytest.skip( "CUTLASS or TRTLLM moe backend requires Blackwell or newer.") @@ -5230,17 +5184,8 @@ def test_eagle3_2gpus(self, moe_backend, one_model, overlap_scheduler, @pytest.mark.parametrize( "kv_cache_dtype", ["auto", pytest.param("fp8", marks=skip_pre_blackwell)]) - @pytest.mark.parametrize("enable_configurable_moe", [0, 1], - ids=lambda x: "" - if x == 0 else "enable_configurable_moe") - def test_w4_4gpus_online_eplb(self, kv_cache_dtype, enable_configurable_moe, - mocker): + def test_w4_4gpus_online_eplb(self, kv_cache_dtype, mocker): """Test GPTOSS with online expert parallel load balancer using TRTLLM backend and attention DP.""" - # Patch MpiPoolSession to propagate env vars to MPI worker processes - env_value = 
"1" if enable_configurable_moe == 1 else "0" - patch_mpi_pool_session_for_env(mocker, - {"ENABLE_CONFIGURABLE_MOE": env_value}) - mocker.patch.object(GSM8K, "MAX_OUTPUT_LEN", 8192) mocker.patch.dict(GSM8K.EVALUATE_KWARGS, {"scores_filter": "exact_match,flexible-extract"}) diff --git a/tests/integration/defs/conftest.py b/tests/integration/defs/conftest.py index 48f57261a98..cd399acb415 100644 --- a/tests/integration/defs/conftest.py +++ b/tests/integration/defs/conftest.py @@ -2212,94 +2212,6 @@ def pytest_generate_tests(metafunc: pytest.Metafunc): metafunc.parametrize("case", uts, ids=lambda x: x) -# Test cases that use enable_configurable_moe parameter and need ID conversion -TESTS_WITH_CONFIGURABLE_MOE = [ - "TestDeepSeekV3Lite::test_nvfp4_4gpus[", - "TestDeepSeekV3Lite::test_fp8_block_scales[", - "TestGPTOSS::test_w4_4gpus_online_eplb[", - "TestQwen3_30B_A3B::test_w4a8_mxfp4[", -] - - -def _convert_clean_to_original_moe_test_id(test_id): - """Convert clean MoE test ID back to original format for pytest collection. - - Example: "test_llm_api_pytorch.py::test_foo[param]" -> "test_llm_api_pytorch.py::test_foo[-param]" - - This is needed because the `enable_configurable_moe` parameter uses empty string - as ID when value is 0, resulting in test IDs like "test_foo[-param]". - We clean these up in pytest_collection_modifyitems, but pytest filters tests - during collection using the original IDs. So when user runs with clean test name, - we need to convert it back to match the original. 
- """ - if "test_llm_api_pytorch.py" not in test_id: - return test_id - - # Match pattern like "test_name[params]" and add leading dash after "[" - # But only if params don't already start with "-" or "enable_configurable_moe" - match = re.search(r"\[([^\]]+)\]", test_id) - if match: - params = match.group(1) - # Skip if already has leading dash or starts with enable_configurable_moe - if not params.startswith("-") and not params.startswith( - "enable_configurable_moe"): - # Add leading dash to params - new_params = "-" + params - test_id = test_id.replace(f"[{params}]", f"[{new_params}]") - - return test_id - - -def pytest_sessionstart(session): - """Convert clean MoE test IDs in config.args to original format for collection. - - This is needed because pytest filters tests during collection using original IDs. - When user runs with clean test name, we convert it back to match the original. - """ - args = session.config.args - for i, arg in enumerate(args): - if "test_llm_api_pytorch.py" in arg and "[" in arg: - # Only apply conversion to specific tests that use enable_configurable_moe - should_convert = any(test_name in arg - for test_name in TESTS_WITH_CONFIGURABLE_MOE) - if should_convert: - args[i] = _convert_clean_to_original_moe_test_id(arg) - - -def _clean_moe_test_ids(items): - """Clean up test IDs by removing leading/trailing dashes from parameter IDs. - - This is needed because `enable_configurable_moe` parameter can be empty, - resulting in ugly test IDs like "test_foo[-True]" or "test_foo[--abc]". - We clean these up to "test_foo[True]" or "test_foo[abc]" so that: - 1. Test names in waive files and test lists remain unchanged - 2. 
Test reports look cleaner - """ - for item in items: - if "test_llm_api_pytorch.py" in item.nodeid and "[" in item.nodeid: - # Only apply cleanup to specific tests that use enable_configurable_moe - should_cleanup = any(test_name in item.nodeid - for test_name in TESTS_WITH_CONFIGURABLE_MOE) - if should_cleanup: - original_nodeid = item.nodeid - original_name = item.name - nodeid = item.nodeid - name = item.name - - # Clean up leading/trailing dashes in nodeid - nodeid = nodeid.replace("[-", "[") - nodeid = nodeid.replace("-]", "]") - - # Clean up leading/trailing dashes in name - name = name.replace("[-", "[") - name = name.replace("-]", "]") - - if nodeid != original_nodeid: - item._nodeid = nodeid - if name != original_name: - item.name = name - - @pytest.hookimpl(tryfirst=True, hookwrapper=True) def pytest_collection_modifyitems(session, config, items): testlist_path = config.getoption("--test-list") @@ -2308,10 +2220,6 @@ def pytest_collection_modifyitems(session, config, items): perf_test = config.getoption("--perf") test_model_suites = config.getoption("--test-model-suites") - # TODO Once the MoE refactor is complete, this should be removed. - # This is a temporary WAR to minimize the impact of the MoE refactor on the existing test lists. 
- _clean_moe_test_ids(items) - if perf_test: global ALL_PYTEST_ITEMS ALL_PYTEST_ITEMS = None diff --git a/tests/integration/defs/test_fmha.py b/tests/integration/defs/test_fmha.py index c596da374f6..96ac2864966 100644 --- a/tests/integration/defs/test_fmha.py +++ b/tests/integration/defs/test_fmha.py @@ -3,6 +3,8 @@ from pathlib import Path from subprocess import run +from tests.unittest.utils.util import getSMVersion + def test_fmha(): build_run = partial(run, shell=True, check=True) @@ -14,6 +16,17 @@ def test_fmha(): try: os.chdir(fmha_v2_dir) + test_arch = getSMVersion() + # SM70 is deprecated in TRTLLM, so we don't need to test it + all_archs = [80, 86, 89, 90, 100, 120] + + # TODO Find a way to get this programmatically + # Filter out the architectures that are tested explicitly to not double up + tested_archs = [80, 86, 89, 90] + + # Select the family we belong to (e.g. 103 -> 100) + test_arch = max(filter(lambda x: x <= test_arch, all_archs)) + env = os.environ.copy() env.update({ "TORCH_CUDA_ARCH_LIST": "9.0", @@ -26,10 +39,23 @@ def test_fmha(): "1", # Do not run tests with skip-softmax feature. 
}) - build_run( - "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin") - build_run("python3 setup.py", env=env) - build_run("make -j 16", env=env) + # The test executable is too large if we build all the architectures, so we must build architectures individually + def build_arch(arch): + env["FMHA_FILTER_ARCH"] = str(arch) + build_run( + "rm -rf generated temp obj .pytest_cache __pycache__ bin cubin") + build_run("python3 setup.py", env=env) + build_run("make -j 16", env=env) + + # As part of the A100 test we compile all the architectures we dont have executors for, even if we dont run them + if test_arch == 80: + build_only_on_archs = set(all_archs) - set(tested_archs) + + for arch in build_only_on_archs: + build_arch(arch) + + # Run the test of our current architecture + build_arch(test_arch) build_run("pytest fmha_test.py", env=env) finally: diff --git a/tests/integration/test_lists/test-db/l0_a100.yml b/tests/integration/test_lists/test-db/l0_a100.yml index a7fc2569ea8..bfff4e577cd 100644 --- a/tests/integration/test_lists/test-db/l0_a100.yml +++ b/tests/integration/test_lists/test-db/l0_a100.yml @@ -105,4 +105,4 @@ l0_a100: stage: post_merge backend: fmha tests: - - test_fmha.py::test_fmha TIMEOUT (90) + - test_fmha.py::test_fmha TIMEOUT (120) # Longer timeout for A100 as it builds all the architectures diff --git a/tests/integration/test_lists/test-db/l0_b300.yml b/tests/integration/test_lists/test-db/l0_b300.yml index c866812d89b..7cd7fefc31b 100644 --- a/tests/integration/test_lists/test-db/l0_b300.yml +++ b/tests/integration/test_lists/test-db/l0_b300.yml @@ -21,7 +21,6 @@ l0_b300: - unittest/_torch/thop/serial - unittest/_torch/executor # 250s # ------------- modules (non-MoE) --------------- - - unittest/_torch/modules/test_mla_helix.py - unittest/_torch/modules/test_fused_add_rms_norm_quant.py - unittest/_torch/modules/test_fused_activation_quant.py - unittest/_torch/modules/test_awq_quantization.py @@ -33,7 +32,6 @@ l0_b300: # ------------- 
MoE components tests --------------- - unittest/_torch/modules/test_moe_load_balancer.py - unittest/_torch/modules/test_moe_routing.py - - unittest/_torch/modules/test_moe_host_sharer.py # ------------- legacy MoE tests --------------- - unittest/_torch/modules/test_fused_moe.py # ------------- MoE: test_moe_backend (by backend) --------------- @@ -41,11 +39,21 @@ l0_b300: - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "TRTLLM" - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "CUTEDSL" - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k "DEEPGEMM" - # ------------- MoE: test_single_gpu (by backend) --------------- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTLASS" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "TRTLLM" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTEDSL" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "DEEPGEMM" + # ------------- MoE: test_single_gpu (specific quant per backend) --------------- + # CUTLASS backend: FP8, NVFP4, W4A8_MXFP4_MXFP8, W8A16 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTLASS-quant=W8A16-routing=Renormalize] + # TRTLLM backend: NVFP4, FP8_BLOCK_SCALES, 
W4A8_NVFP4_FP8, W4A16_MXFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_NVFP4_FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] + # CUTEDSL backend: NVFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=CUTEDSL-quant=NVFP4-routing=Renormalize] + # DEEPGEMM backend: FP8_BLOCK_SCALES + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e60_k4_h2048_i1408-seq=1-dtype=torch.bfloat16-backend=DEEPGEMM-quant=FP8_BLOCK_SCALES-routing=Renormalize] # ---- end MoE tests ---- - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8B::test_nvfp4 - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_nvfp4[moe_backend=TRTLLM-mtp_nextn=0-fp8kv=True-attention_dp=False-cuda_graph=True-overlap_scheduler=True-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b200.yml b/tests/integration/test_lists/test-db/l0_dgx_b200.yml index 4821c6a3c14..d57e041023b 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b200.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b200.yml @@ -16,30 +16,6 @@ l0_dgx_b200: orchestrator: mpi tests: - unittest/_torch/misc/test_autotuner.py::test_autotuner_distributed_strategy - # ------------- legacy MoE tests --------------- - - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEPLowLatency] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[NVLinkTwoSided] - # ------------- MoE: test_multi_gpu (by backend x quant) --------------- - # --- CUTLASS --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and None" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and NVFP4" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_MXFP4_MXFP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" - # --- TRTLLM --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and NVFP4 and not W4A8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and FP8_BLOCK_SCALES" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_NVFP4_FP8" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A16_MXFP4" - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_MXFP4_MXFP8" - # --- CUTEDSL (NVFP4 only) --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTEDSL" - # --- DEEPGEMM (FP8_BLOCK_SCALES only) --- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "DEEPGEMM" - # ------------- MoE: test_multi_gpu_eplb --------------- - - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - # ---- end MoE tests ---- - 
accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-False-True-True] - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_auto_dtype_4gpus[4-4-True-True-True] - accuracy/test_llm_api_pytorch.py::TestNemotronV3Super::test_nvfp4_4gpu_mtp_ar TIMEOUT (60) @@ -60,6 +36,39 @@ l0_dgx_b200: - unittest/_torch/visual_gen/test_wan_i2v.py::TestWanI2VCombinedOptimizations::test_all_optimizations_combined - unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxParallelism::test_ulysses_2gpu_correctness - unittest/_torch/visual_gen/test_flux_pipeline.py::TestFluxCombinedOptimizations::test_all_optimizations_combined +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*b200*' + linux_distribution_name: ubuntu* + cpu: x86_64 + terms: + stage: pre_merge + backend: pytorch + orchestrator: mpi + tests: + # ------------- MoE: test_multi_gpu (by backend x quant) --------------- + # --- CUTLASS --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and NVFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_MXFP4_MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" + # --- TRTLLM --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and NVFP4 and not W4A8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and FP8_BLOCK_SCALES" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A8_NVFP4_FP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and W4A16_MXFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "TRTLLM and 
W4A8_MXFP4_MXFP8" + # --- CUTEDSL (NVFP4 only) --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTEDSL" + # --- DEEPGEMM (FP8_BLOCK_SCALES only) --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "DEEPGEMM" + # ------------- MoE: test_multi_gpu_eplb --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - condition: ranges: system_gpu_count: @@ -169,7 +178,6 @@ l0_dgx_b200: backend: pytorch orchestrator: mpi tests: - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall_fp4[DeepEP] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_bfloat16_4gpus[tp4-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[tp4-fp8kv=False-attn_backend=FLASHINFER-torch_compile=False] - accuracy/test_llm_api_pytorch.py::TestLlama3_1_8BInstruct::test_fp8_4gpus[pp4-fp8kv=False-attn_backend=TRTLLM-torch_compile=False] diff --git a/tests/integration/test_lists/test-db/l0_dgx_b300.yml b/tests/integration/test_lists/test-db/l0_dgx_b300.yml index 2b71bb5bac1..602d7112ec4 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_b300.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_b300.yml @@ -17,22 +17,32 @@ l0_dgx_b300: tests: - unittest/_torch/attention - unittest/_torch/executor - # ------------- modules (non-MoE) --------------- + # ------------- modules (multi-GPU) --------------- - unittest/_torch/modules/test_mla_helix.py - - unittest/_torch/modules/test_fused_add_rms_norm_quant.py - - unittest/_torch/modules/test_fused_activation_quant.py - - unittest/_torch/modules/test_awq_quantization.py - - unittest/_torch/modules/test_triton_linear.py - - unittest/_torch/modules/test_group_rmn_norm.py - - unittest/_torch/modules/test_rotary_embedding.py - - unittest/_torch/modules/mamba - - unittest/_torch/modules/tests_lora_modules - # ------------- MoE components 
tests --------------- - - unittest/_torch/modules/test_moe_load_balancer.py - - unittest/_torch/modules/test_moe_routing.py + # ------------- MoE components tests (multi-GPU) --------------- - unittest/_torch/modules/test_moe_host_sharer.py - # ------------- legacy MoE tests --------------- + # ------------- legacy MoE tests (multi-GPU) --------------- - unittest/_torch/modules/test_fused_moe.py + # ------------- MoE: multi-GPU module tests (DEP parallel, per backend per quant) --------------- + # CUTLASS backend: FP8, NVFP4, W4A8_MXFP4_MXFP8, W8A16 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=W8A16-routing=Renormalize] + # TRTLLM backend: NVFP4, FP8_BLOCK_SCALES, W4A8_NVFP4_FP8, W4A16_MXFP4, W4A8_MXFP4_MXFP8 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + - 
unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_NVFP4_FP8-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e256_k8_h7168_i2048-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A8_MXFP4_MXFP8-routing=Renormalize] + # CUTEDSL backend: NVFP4 + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=CUTEDSL-quant=NVFP4-routing=Renormalize] + # DEEPGEMM backend: FP8_BLOCK_SCALES + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu[parallel=DEP-comm=DEEPEP-e60_k4_h2048_i1408-seq=8-dtype=torch.bfloat16-backend=DEEPGEMM-quant=FP8_BLOCK_SCALES-routing=Renormalize] + # ------------- MoE: EPLB (Expert Load Balancing) tests --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=TRTLLM-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb[parallel=DEP-comm=NVLINK_ONE_SIDED-e8_k2_h512_i512-slots=16-dtype=torch.bfloat16-backend=TRTLLM-quant=W4A16_MXFP4-routing=Renormalize] - unittest/_torch/modeling -k "modeling_llama" - unittest/_torch/modeling -k "modeling_mixtral" - unittest/_torch/modeling -k 
"modeling_gpt_oss" diff --git a/tests/integration/test_lists/test-db/l0_dgx_h100.yml b/tests/integration/test_lists/test-db/l0_dgx_h100.yml index 7ac527a94d7..31535f49e81 100644 --- a/tests/integration/test_lists/test-db/l0_dgx_h100.yml +++ b/tests/integration/test_lists/test-db/l0_dgx_h100.yml @@ -143,6 +143,30 @@ l0_dgx_h100: - disaggregated/test_auto_scaling.py::test_worker_restart[http-load_balancing] - disaggregated/test_auto_scaling.py::test_minimal_instances[http-round_robin] - disaggregated/test_auto_scaling.py::test_disagg_server_restart[http-round_robin] +- condition: + ranges: + system_gpu_count: + gte: 4 + lte: 4 + wildcards: + gpu: + - '*h100*' + linux_distribution_name: ubuntu* + terms: + stage: pre_merge + backend: pytorch + auto_trigger: others + orchestrator: mpi + tests: + # ------------- MoE: test_multi_gpu (by backend x quant) --------------- + # Only CUTLASS backend runs on H100 (SM90). TRTLLM/CUTEDSL/DEEPGEMM require SM100+. + # --- CUTLASS --- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and FP8 and not MXFP8" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W8A16" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A16_MXFP4" + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu -k "CUTLASS and W4A8_AWQ" + # ------------- MoE: test_multi_gpu_eplb --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_multi_gpu_eplb - condition: ranges: system_gpu_count: @@ -160,12 +184,6 @@ l0_dgx_h100: tests: - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp1-bf16-trtllm-deepseekv3_lite] - unittest/_torch/multi_gpu_modeling/test_deepseek.py::test_deepseek_streaming[tp4-bf16-trtllm-deepseekv3_lite] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[DeepEP] - - 
unittest/_torch/modules/test_fused_moe.py::test_fused_moe_alltoall[NVLinkTwoSided] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.VANILLA-dtype1] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_w4afp8[MoEWeightLoadingMode.W4A8_CUSTOM-dtype1] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[tp4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=0-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] - accuracy/test_llm_api_pytorch.py::TestDeepSeekV3Lite::test_fp8_block_scales_4gpus[ep4-mtp_nextn=2-fp8kv=True-attention_dp=True-cuda_graph=True-overlap_scheduler=True-torch_compile=False-sampler_async_worker=False] diff --git a/tests/integration/test_lists/test-db/l0_gb10.yml b/tests/integration/test_lists/test-db/l0_gb10.yml index a749f27a54e..efc6a64530f 100644 --- a/tests/integration/test_lists/test-db/l0_gb10.yml +++ b/tests/integration/test_lists/test-db/l0_gb10.yml @@ -38,5 +38,5 @@ l0_gb10: # Below cases which are commented out due to they failed on gb10 # - unittest/_torch/modeling -k "modeling_mllama" - unittest/_torch/modeling -k "modeling_out_of_tree" - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype0] - # - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[CUTLASS-dtype1] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.float16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - 
unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] diff --git a/tests/integration/test_lists/test-db/l0_gb202.yml b/tests/integration/test_lists/test-db/l0_gb202.yml index 0255ba1086d..84c63b7f7a5 100644 --- a/tests/integration/test_lists/test-db/l0_gb202.yml +++ b/tests/integration/test_lists/test-db/l0_gb202.yml @@ -17,8 +17,8 @@ l0_gb202: # ------------- PyTorch tests --------------- - unittest/_torch/modeling -k "modeling_mllama" - unittest/_torch/modeling -k "modeling_out_of_tree" - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype1] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.float16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu[e8_k1_h512_i512-seq=8-dtype=torch.bfloat16-backend=CUTLASS-quant=NVFP4-routing=Renormalize] # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573 - unittest/_torch/attention/test_attention_mla.py - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] diff --git a/tests/integration/test_lists/test-db/l0_h100.yml b/tests/integration/test_lists/test-db/l0_h100.yml index 3c3a4b0cbd4..6631322f8d9 100644 --- a/tests/integration/test_lists/test-db/l0_h100.yml +++ b/tests/integration/test_lists/test-db/l0_h100.yml @@ -36,6 +36,10 @@ l0_h100: - unittest/_torch/modules/test_moe_host_sharer.py # ------------- legacy MoE tests --------------- - unittest/_torch/modules/test_fused_moe.py + # ------------- MoE: test_moe_backend (by backend) --------------- + - unittest/_torch/modules/moe/test_moe_backend.py::test_moe_backend -k 
"CUTLASS" + # ------------- MoE: test_single_gpu (by backend) --------------- + - unittest/_torch/modules/moe/test_moe_module.py::test_configurable_moe_single_gpu -k "CUTLASS" - unittest/_torch/multimodal - unittest/_torch/sampler - unittest/_torch/speculative -k "eagle3" diff --git a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml index 96ee52c85dc..a2c9e0fcb51 100644 --- a/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml +++ b/tests/integration/test_lists/test-db/l0_rtx_pro_6000.yml @@ -18,8 +18,6 @@ l0_rtx_pro_6000: - unittest/_torch/modeling -k "modeling_out_of_tree" # - unittest/_torch/modeling -k "modeling_qwen" # https://nvbugs/5234573 - unittest/_torch/attention/test_attention_mla.py - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype0] - - unittest/_torch/modules/test_fused_moe.py::test_fused_moe_nvfp4[enable_finalize_fusion-CUTLASS-dtype1] - test_e2e.py::test_ptp_quickstart_bert[VANILLA-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - test_e2e.py::test_ptp_quickstart_bert[TRTLLM-BertForSequenceClassification-bert/bert-base-uncased-yelp-polarity] - test_e2e.py::test_ptp_quickstart_advanced[Llama3.1-8B-BF16-llama-3.1-model/Meta-Llama-3.1-8B] diff --git a/tests/unittest/_torch/modules/moe/moe_test_utils.py b/tests/unittest/_torch/modules/moe/moe_test_utils.py index 5db2ddbc020..69c6418559f 100644 --- a/tests/unittest/_torch/modules/moe/moe_test_utils.py +++ b/tests/unittest/_torch/modules/moe/moe_test_utils.py @@ -28,6 +28,7 @@ """ import logging +import os import time from dataclasses import dataclass from enum import Enum @@ -92,6 +93,41 @@ def __str__(self) -> str: # ============================================================================ # Skip Logic Functions # ============================================================================ +def _is_fp4_fp8_standalone_gemm_available() -> bool: + """Check 
if standalone fp4_fp8_gemm_trtllmgen kernel has compiled configs on this GPU. + + The W4A8_NVFP4_FP8 reference module (W4A8NVFP4FP8RefGatedMLPFusedMoE) uses + standalone fp4_fp8_gemm_trtllmgen GEMM calls via W4A8NVFP4FP8LinearMethod. + These standalone GEMM kernels may not have compiled configurations for all SM + versions, even when the fused MoE kernel (TRTLLMGenFusedMoE) works fine. + + Returns True if the standalone kernel is available, False otherwise. + Result is cached after first call. + """ + if hasattr(_is_fp4_fp8_standalone_gemm_available, "_cached_result"): + return _is_fp4_fp8_standalone_gemm_available._cached_result + + try: + import tensorrt_llm.quantization.utils.fp4_utils as fp4_utils + + # Create minimal valid tensors for GEMM probe: + # mat1: (m, k) FP8, mat2: (n, k/2) FP4, scale: FP8, global_scale: FP32 + m, n, k = 1, 128, 128 + fp8_input = torch.zeros((m, k), dtype=torch.float8_e4m3fn, device="cuda") + fp4_weight = torch.zeros((n, k // 2), dtype=fp4_utils.float4_e2m1x2, device="cuda") + weight_scale = torch.ones((n * (k // 32),), dtype=torch.float8_e4m3fn, device="cuda") + global_scale = torch.ones((1,), dtype=torch.float32, device="cuda") + torch.ops.trtllm.fp4_fp8_gemm_trtllmgen( + fp8_input, fp4_weight, weight_scale, global_scale, torch.float16 + ) + result = True + except RuntimeError: + result = False + + _is_fp4_fp8_standalone_gemm_available._cached_result = result + return result + + def should_skip_trtllm( backend_type: MoeBackendType, quant_algo: Optional[QuantAlgo], @@ -99,6 +135,8 @@ def should_skip_trtllm( routing_method_cls=None, swiglu_gptoss_style: bool = False, comm_method: Optional[str] = None, + seq_len: Optional[int] = None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check TRTLLM Gen backend specific constraints. @@ -115,6 +153,8 @@ def should_skip_trtllm( swiglu_gptoss_style: Whether using swiglu gptoss style comm_method: Optional communication method (e.g. 
"DEEPEP", "DEEPEPLOWLATENCY") for multi-GPU EP mode checks + seq_len: Optional sequence length for seq_len-sensitive skip checks + moe_tp_size: MoE TP parallelism size (default: 1, no TP sharding) Returns: Skip reason string if test should be skipped, None otherwise @@ -226,6 +266,20 @@ def should_skip_trtllm( f"block_scale_interleave_reverse rows must be multiple of 128." ) + # -----------------Reference module constraints------------------ + # The W4A8_NVFP4_FP8 reference module (W4A8NVFP4FP8RefGatedMLPFusedMoE) uses + # standalone fp4_fp8_gemm_trtllmgen GEMM calls via W4A8NVFP4FP8LinearMethod. + # These standalone GEMM kernels may not have compiled configs for all SM versions, + # even though the fused MoE kernel (TRTLLMGenFusedMoE) works fine on those SMs. + # Skip if the standalone kernel is not available on the current GPU. + if quant_algo == QuantAlgo.W4A8_NVFP4_FP8: + if not _is_fp4_fp8_standalone_gemm_available(): + return ( + "W4A8_NVFP4_FP8 reference module requires standalone " + "fp4_fp8_gemm_trtllmgen kernel which is not available on this GPU. " + "The fused MoE kernel works but the reference GatedMLP cannot run." + ) + # -----------------Potential issues------------------ # These are known issues that need investigation. Skipping to avoid test failures # and CUDA errors that can cascade to subsequent tests. @@ -237,6 +291,30 @@ def should_skip_trtllm( "causes CUDA illegal memory access." ) + # Issue: NVFP4 with large expert count + large hidden_size + seq_len=1 + # has a single FP4BlockScaleMoERunner tactic with accuracy failure. + # Observed: e256_k8_h7168_i2048, seq=1, bfloat16 — tactic[204] with tile + # config [8, 83] produces 8.37% element mismatch (threshold: 3%). + # All other 207/208 tactics pass. seq=8 with the same config also passes + # (different tile behavior). The swiglu_gptoss_style variant passes too + # (uses relaxed tolerance: rtol=0.1, percent=0.95). 
+ # Root cause: FP4 quantization error accumulates in the large GEMM reduction + # dimension (h=7168) and the [8, 83] tile config hits an edge case at seq=1. + if ( + quant_algo == QuantAlgo.NVFP4 + and not swiglu_gptoss_style + and seq_len == 1 + and num_experts >= 256 + and model_config.hidden_size >= 7168 + ): + return ( + f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large model " + f"(num_experts={num_experts}, hidden_size={model_config.hidden_size}) " + f"and seq_len=1: 207/208 tactics pass but tactic[204] " + f"(FP4BlockScaleMoERunner tile [8, 83]) has 8.37% mismatch " + f"(threshold 3%). seq_len=8 passes all tactics." + ) + # Issue: NVFP4 with large intermediate_size has known accuracy issues if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336: return ( @@ -285,6 +363,43 @@ def should_skip_trtllm( f"Single-GPU tests pass; issue is in the kernel runner under EP." ) + # Issue: NVFP4 with large model configs crashes with CUDA illegal memory + # access in DeepEP mode (deep_ep.cpp:86). + # Verified: e60_k4_h2048_i1408 passes, e256_k8_h7168_i2048 crashes. + # The crash kills the entire pytest process, blocking all subsequent tests. + if ( + quant_algo == QuantAlgo.NVFP4 + and num_experts >= 256 + and model_config.hidden_size >= 7168 + ): + return ( + f"[Potential Bug] TRTLLMGenFusedMoE NVFP4 with large model " + f"(num_experts={num_experts}, hidden_size={model_config.hidden_size}) " + f"crashes with CUDA illegal memory access in DeepEP mode " + f"(comm={comm_method}). Smaller configs pass." + ) + + # TP per-shard alignment: when moe_tp_size > 1, intermediate_size is sharded. 
+ # MXFP4 variants (W4A16_MXFP4, W4A8_MXFP4_MXFP8) auto-pad to 128 alignment, + # but other quants (FP8_BLOCK_SCALES, NVFP4, W4A8_NVFP4_FP8) crash: + # - FP8_BLOCK_SCALES: block scale tensor size mismatch + # (ceil(per_shard/128) vs floor(per_shard/128)) + # - NVFP4: unswizzle_sf shape '[-1, w3_w1, 128]' invalid + # - W4A8_NVFP4_FP8: No valid config for non-aligned N dimension + if moe_tp_size > 1 and intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + tp_crash_quants = { + QuantAlgo.FP8_BLOCK_SCALES, + QuantAlgo.NVFP4, + QuantAlgo.W4A8_NVFP4_FP8, + } + if quant_algo in tp_crash_quants and per_shard % 128 != 0: + return ( + f"TRTLLMGenFusedMoE {quant_algo}: per-shard intermediate_size=" + f"{per_shard} (= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) + return None @@ -294,6 +409,7 @@ def should_skip_cutedsl( model_config: "MoeModelConfig" = None, comm_method: Optional[str] = None, routing_method_cls=None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check CuteDSL backend specific constraints. @@ -304,42 +420,45 @@ def should_skip_cutedsl( if backend_type != MoeBackendType.CUTEDSL: return None - # DeepEPLowLatency _modify_output_to_adapt_fused_moe converts dispatch output - # to a format where token_selected_slots has shape [num_local_experts, tokens_per_expert] - # instead of [num_tokens, top_k]. CuteDSL moe_sort asserts - # token_selected_experts.size(1) == top_k, which fails with this format. - if comm_method == "DEEPEPLOWLATENCY": - return ( - "[Potential Bug] CuteDslFusedMoE is incompatible with DeepEPLowLatency: " - "DeepEPLowLatency _modify_output_to_adapt_fused_moe reshapes " - "token_selected_slots to [num_local_experts, tokens_per_expert] " - "(effectively top_k=1), but CuteDSL moe_sort requires " - "token_selected_experts.size(1) == top_k." 
- ) - if model_config is None: return None intermediate_size = model_config.intermediate_size - num_experts = model_config.num_experts - # NVFP4 with large intermediate_size has known accuracy issues + # NVFP4 with large intermediate_size has known accuracy issues (8.5% mismatch + # at i=14336, threshold 3%). Both CuteDSL and reference have FP4 intermediate + # storage, but produce DIFFERENT FP4 values due to: + # 1) SwiGLU precision: CuteDSL kernel uses approximate math ops for sigmoid + # (rcp_approx + exp2 fastmath, see utils.py:sigmoid_f32), while reference + # Triton kernel uses standard tl.sigmoid (see swiglu.py:42). + # 2) Precision chain: CuteDSL computes SwiGLU in FP32 (GEMM accumulator → + # FP32 SwiGLU → FP4), reference goes FP32 accumulator → BF16 → SwiGLU → + # BF16 → fp4_quantize. Two BF16 truncation points create different values. + # 3) FP4 quantization: CuteDSL uses rcp_approx for block scale reciprocal + # (blockscaled_...fusion.py:2588), fp4_quantize uses exact division. + # These per-element FP4 value differences accumulate through FC2 GEMM dot + # product (K=intermediate_size). CUTLASS avoids this entirely with a single + # fused kernel keeping BF16 intermediate precision. if quant_algo == QuantAlgo.NVFP4 and intermediate_size >= 14336: return ( - f"[Potential Bug] CuteDslFusedMoE NVFP4 with large intermediate_size " - f"has known accuracy issues (intermediate_size={intermediate_size} >= 14336)." - ) - - # NVFP4 with prime num_experts causes CUDA_ERROR_ILLEGAL_ADDRESS - prime_experts_with_issues = {7, 13} - if quant_algo == QuantAlgo.NVFP4 and num_experts in prime_experts_with_issues: - return ( - f"[Potential Bug] CuteDslFusedMoE NVFP4 with prime num_experts={num_experts} " - f"causes CUDA_ERROR_ILLEGAL_ADDRESS due to autotuner cache bucket mapping." 
+ f"[Design Limitation] CuteDslFusedMoE NVFP4 with large " + f"intermediate_size has accuracy issues due to FP4 intermediate " + f"storage between FC1+SwiGLU and FC2 kernels " + f"(intermediate_size={intermediate_size} >= 14336, " + f"FC2 accumulates over K={intermediate_size} with 896+ blocks)." ) - # NVFP4 with Llama4Renormalize routing has significant accuracy issues on bfloat16. - # Observed mismatch up to 34.6% (threshold 2% at rtol=0.01, percent=0.98). + # NVFP4 with Llama4Renormalize routing has significant accuracy issues. + # Same root cause as the large intermediate_size skip above: CuteDSL and + # reference produce different FP4 intermediate values due to approximate + # math ops (rcp_approx, exp2 fastmath) and BF16 truncation differences. + # Llama4's sigmoid routing amplifies these differences: standard Renormalize + # uses softmax (weights sum to 1, per-expert errors averaged), while Llama4 + # uses sigmoid (weights independent in (0,1), per-expert errors summed + # without normalization). This amplifies FP4 value differences by ~top_k/2. + # Mismatch correlates with hidden_size (FC1 K dimension): h=512 passes, + # h=2048 fails 8-17%, h=7168 fails 24-35%. Observed: e60(9.4%), + # e64(16.5%), e256(34.6%), e384(30.9%) at threshold 3%. if routing_method_cls is not None: from tensorrt_llm._torch.modules.fused_moe import Llama4RenormalizeMoeRoutingMethod @@ -348,8 +467,20 @@ def should_skip_cutedsl( and routing_method_cls == Llama4RenormalizeMoeRoutingMethod ): return ( - "[Potential Bug] CuteDslFusedMoE NVFP4 with Llama4Renormalize " - "routing has significant accuracy issues (mismatch up to 34.6%%)." + "[Design Limitation] CuteDslFusedMoE NVFP4 with Llama4Renormalize " + "routing: FP4 intermediate errors amplified by non-normalized " + "sigmoid routing weights (mismatch up to 34.6%)." + ) + + # TP per-shard alignment: NVFP4 requires 128-aligned per-shard intermediate_size. + # fp4_utils.py asserts M % 128 == 0 where M = 2 * per_shard (combined w3_w1). 
+ if moe_tp_size > 1 and quant_algo == QuantAlgo.NVFP4 and intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"CuteDslFusedMoE NVFP4: per-shard intermediate_size=" + f"{per_shard} (= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned. fp4_utils asserts M % 128 == 0." ) return None @@ -360,6 +491,8 @@ def should_skip_cutlass( comm_method: Optional[str] = None, quant_algo: Optional[QuantAlgo] = None, model_config: "MoeModelConfig" = None, + moe_tp_size: int = 1, + dtype=None, ) -> Optional[str]: """ Check CUTLASS backend specific constraints for multi-GPU tests. @@ -370,25 +503,35 @@ def should_skip_cutlass( if backend_type != MoeBackendType.CUTLASS: return None - # Issue: CUTLASS + DeepEP + (W4A8_MXFP4_MXFP8 or W8A16) has significant accuracy - # issues in multi-GPU EP mode. Observed failures: - # - e32_k8_h7168_i2048, seq=8: mismatch 24-37% (rtol=0.15) - # - e8_k1_h512_i512, seq=1/8: mismatch 86-100% (rtol=0.10), results completely wrong - # NVLINK communication with the same configs passes. - # Root cause: likely data layout or all-to-all dispatch/combine issue in the - # DeepEP communication path for these quantization methods. - if comm_method in ("DEEPEP", "DEEPEPLOWLATENCY"): - deepep_accuracy_quant_algos = { - QuantAlgo.W4A8_MXFP4_MXFP8, + # TP per-shard alignment: W8A16, NVFP4, and W4A8_AWQ require 128-aligned + # per-shard intermediate_size. W8A16 fails in preprocess_weights_for_mixed_gemm + # (num_rows % rows_per_tile != 0). NVFP4 pads to 128-alignment + # (NVFP4_ROW_ALIGNMENT in quantization.py:2312) but zero-padding + + # blockwise quantization interaction causes ~6-7% mismatch. + # W4A8_AWQ (WInt4AFP8FusedMoEMethod) requires K dimensions to be multiples + # of 128 on SM90 for interleave factor selection (quantization.py:1310-1324). + # W4A8_MXFP4_MXFP8 uses MXFP4 auto-padding that handles this correctly. 
+ if moe_tp_size > 1 and model_config is not None: + tp_alignment_quants = { QuantAlgo.W8A16, + QuantAlgo.NVFP4, + QuantAlgo.W4A8_AWQ, } - if quant_algo in deepep_accuracy_quant_algos: - return ( - f"[Potential Bug] CutlassFusedMoE {quant_algo} has significant accuracy " - f"issues with DeepEP communication (comm={comm_method}). " - f"Mismatch up to 100% on small models (e8_k1). " - f"NVLINK communication with the same config passes." - ) + # FP8_BLOCK_SCALES has this issue only on Hopper (SM90) + if torch.cuda.get_device_capability(0) == (9, 0): + tp_alignment_quants.add(QuantAlgo.FP8_BLOCK_SCALES) + + if quant_algo in tp_alignment_quants: + intermediate_size = model_config.intermediate_size + if intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"CutlassFusedMoE {quant_algo}: per-shard " + f"intermediate_size={per_shard} " + f"(= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) return None @@ -398,6 +541,7 @@ def should_skip_deepgemm( comm_method: Optional[str] = None, quant_algo: Optional[QuantAlgo] = None, model_config: "MoeModelConfig" = None, + moe_tp_size: int = 1, ) -> Optional[str]: """ Check DeepGemm backend specific constraints. @@ -409,20 +553,50 @@ def should_skip_deepgemm( return None # Issue: DEEPGEMM + FP8_BLOCK_SCALES crashes with CUDA illegal memory access - # on large expert counts (e.g. e384_k8_h7168_i2048) during post_load_weights(). - # The crash occurs in get_col_major_tma_aligned_packed_tensor (fp8_utils.py) - # when resmoothing FP8 E8M0 scales on SM100f (Blackwell). - # Small configs (e.g. e60_k4_h2048_i1408) pass fine. + # in _resmooth_kernel (Triton JIT) during post_load_weights() FP8 E8M0 scale + # resmoothing on SM100f (Blackwell). Root cause is a Triton compiler/runtime + # bug on SM100f: the kernel crashes when total grid blocks exceed ~65K. 
+ # The crash depends on grid size, not just num_experts — Grok-1 (e8, h=6144, + # i=32768) crashes despite having only 8 experts because its weight tensors + # produce grids with 196K+ blocks. + # Weight shapes: w3_w1=[E, I*2, H], w2=[E, H, I] (from quantization.py) + # Grid for resmooth: (E, cdiv(M,128), cdiv(K,128)) + # Verified boundary: max_blocks <= 57344 passes, >= 98304 crashes. + # Threshold: 65536 blocks (64K). Affected: DeepSeek-V3, Kimi-K2, Grok-1. + _RESMOOTH_GRID_BLOCK_LIMIT = 65536 if quant_algo == QuantAlgo.FP8_BLOCK_SCALES and model_config is not None: - if model_config.num_experts > 128: + num_e = model_config.num_experts + hidden = model_config.hidden_size + inter = model_config.intermediate_size + + def _cdiv(x, y): + return (x + y - 1) // y + + w31_blocks = num_e * _cdiv(inter * 2, 128) * _cdiv(hidden, 128) + w2_blocks = num_e * _cdiv(hidden, 128) * _cdiv(inter, 128) + max_blocks = max(w31_blocks, w2_blocks) + if max_blocks > _RESMOOTH_GRID_BLOCK_LIMIT: return ( - f"[Potential Bug] DeepGemmFusedMoE FP8_BLOCK_SCALES crashes with " - f"CUDA illegal memory access on large expert count " - f"(num_experts={model_config.num_experts}). The crash occurs in " - f"get_col_major_tma_aligned_packed_tensor during " - f"post_load_weights() FP8 E8M0 scale resmoothing on SM100f." + f"[Triton Bug] DeepGemmFusedMoE FP8_BLOCK_SCALES crashes in " + f"_resmooth_kernel on SM100f when grid blocks exceed ~64K " + f"(max_blocks={max_blocks:,} > {_RESMOOTH_GRID_BLOCK_LIMIT:,}). " + f"Affected: E={num_e}, H={hidden}, I={inter}." ) + # TP per-shard alignment: FP8_BLOCK_SCALES requires 128-aligned per-shard + # intermediate_size for block scale tensor operations. 
+ if moe_tp_size > 1 and quant_algo == QuantAlgo.FP8_BLOCK_SCALES and model_config is not None: + intermediate_size = model_config.intermediate_size + if intermediate_size % moe_tp_size == 0: + per_shard = intermediate_size // moe_tp_size + if per_shard % 128 != 0: + return ( + f"DeepGemmFusedMoE FP8_BLOCK_SCALES: per-shard " + f"intermediate_size={per_shard} " + f"(= {intermediate_size} / {moe_tp_size}) is not " + f"128-aligned." + ) + return None @@ -430,6 +604,7 @@ def should_skip_multi_gpu( parallel_mode: str, model_config: "MoeModelConfig", world_size: int = 4, + comm_method: Optional[str] = None, ) -> Optional[str]: """ Check if a multi-GPU test should be skipped due to EP partitioning constraints. @@ -442,10 +617,20 @@ def should_skip_multi_gpu( parallel_mode: Parallelism strategy ("DEP", "TEP", "DTP", "TTP") model_config: MoE model configuration containing num_experts world_size: Total number of GPUs (default: 4) + comm_method: Optional communication method (e.g. "DEEPEP", "DEEPEPLOWLATENCY") Returns: Skip reason string if test should be skipped, None otherwise """ + # DEEPEPLOWLATENCY hangs on H100 (SM90) in CI multi-GPU tests. + if comm_method == "DEEPEPLOWLATENCY": + capability = torch.cuda.get_device_capability(0) + if capability == (9, 0): + return ( + "[CI Hang] DEEPEPLOWLATENCY hangs on H100 (SM90) in " + "multi-GPU tests. Skipping until the issue is resolved." + ) + # Only EP modes have ep_size = world_size; TP modes have ep_size = 1 if parallel_mode not in ("DEP", "TEP"): return None @@ -527,6 +712,7 @@ def get_quick_skip_reason( model_config: "MoeModelConfig", routing_method_cls=None, swiglu_gptoss_style: bool = False, + seq_len: Optional[int] = None, ) -> Optional[str]: """ Fast skip check that calls backend's can_implement() method. 
@@ -534,6 +720,7 @@ def get_quick_skip_reason( Unified version supporting both backend-level and module-level tests: - routing_method_cls: Used by test_moe_module.py for routing method compatibility checks - swiglu_gptoss_style: Used by test_moe_backend.py for SwiGLU parameter checks + - seq_len: Optional sequence length for seq_len-sensitive skip checks Returns: Skip reason string if test should be skipped, None otherwise @@ -559,7 +746,12 @@ def get_quick_skip_reason( skip_checks = [ lambda: should_skip_routing_method(routing_method_cls, model_config), lambda: should_skip_trtllm( - backend_type, quant_algo, model_config, routing_method_cls, swiglu_gptoss_style + backend_type, + quant_algo, + model_config, + routing_method_cls, + swiglu_gptoss_style, + seq_len=seq_len, ), lambda: should_skip_cutedsl( backend_type, quant_algo, model_config, routing_method_cls=routing_method_cls @@ -600,6 +792,45 @@ def get_quick_skip_reason( trtllm_logger.setLevel(original_level) +# ============================================================================ +# GPU Memory Check +# ============================================================================ +def skip_if_insufficient_gpu_memory( + num_experts: int, + hidden_size: int, + intermediate_size: int, + dtype: torch.dtype = torch.float32, + overhead_factor: float = 4.0, +) -> None: + """ + Skip the current test if estimated GPU memory exceeds device capacity. + + Each expert has gate_up_proj [2*I, H] + down_proj [H, I] = 3*H*I elements. + The overhead_factor (default 4x) accounts for ref model + DUT model + + quantization scales/activations + CUDA allocator overhead. 
+ + Args: + num_experts: Number of MoE experts + hidden_size: Hidden dimension size + intermediate_size: Intermediate (FFN) dimension size + dtype: Weight data type for byte-size calculation + overhead_factor: Multiplier over single-model weight bytes + """ + if not torch.cuda.is_available(): + return + bytes_per_elem = torch.tensor([], dtype=dtype).element_size() + single_model_bytes = num_experts * 3 * hidden_size * intermediate_size * bytes_per_elem + estimated_total_bytes = int(single_model_bytes * overhead_factor) + gpu_total_bytes = torch.cuda.get_device_properties(0).total_memory + if estimated_total_bytes > gpu_total_bytes: + pytest.skip( + f"Estimated memory {estimated_total_bytes / (1 << 30):.1f}GB " + f"exceeds GPU memory {gpu_total_bytes / (1 << 30):.1f}GB " + f"(num_experts={num_experts}, hidden_size={hidden_size}, " + f"intermediate_size={intermediate_size}, dtype={dtype})" + ) + + # ============================================================================ # Autotuner Tactic Replay # ============================================================================ @@ -667,6 +898,113 @@ def create_test_param(param_values, test_id, skip_reason=None): return pytest.param(*param_values, id=test_id) +# ============================================================================ +# CI Mode Detection +# ============================================================================ +_TRTLLM_TEST_MOE_CI_ENV = "TRTLLM_TEST_MOE_CI" +IS_CI_MODE = os.environ.get(_TRTLLM_TEST_MOE_CI_ENV, "1") == "1" + +# ============================================================================ +# CI Acceleration Skip Logic +# ============================================================================ + +# Routing methods that require full routing coverage in CI +_CI_ROUTING_METHODS = {"Renormalize", "DeepSeekV3"} + + +def should_skip_to_accelerate_ci( + backend_type: "MoeBackendType", + quant_algo: Optional[QuantAlgo], + model_config: "MoeModelConfig", + routing_method_cls=None, + 
dtype: Optional[torch.dtype] = None, + seq_len: Optional[int] = None, + swiglu_gptoss_style: bool = False, + parallel_mode: Optional[str] = None, +) -> Optional[str]: + """ + Skip low-information-density test combinations to accelerate CI. + + Only active when TRTLLM_TEST_MOE_CI=1 (default). When TRTLLM_TEST_MOE_CI=0, + all combinations run (local exhaustive testing). + + Rules applied (in order): + 0. Skip unquantized (quant=None) — quantized paths are the focus of CI + 1. e256 model: only DeepSeekV3 routing, bfloat16, seq=1, non-gptoss + 2. Multi-GPU: only DEP and TTP parallel modes + 3. Routing: full 6 routing methods only on (CUTLASS or TRTLLM) with NVFP4; + other backend+quant combos only run Renormalize + and DeepSeekV3. This rule is overridden by rule 1 for e256. + + Args: + backend_type: MoE backend type + quant_algo: Quantization algorithm + model_config: MoE model configuration + routing_method_cls: Routing method class (None means no routing filter) + dtype: Activation data type + seq_len: Sequence length + swiglu_gptoss_style: Whether using SwiGLU gptoss style + parallel_mode: Multi-GPU parallel mode (None for single-GPU tests) + + Returns: + Skip reason string if test should be skipped for CI, None otherwise + """ + if not IS_CI_MODE: + return None + + if model_config is None: + return None + + # --- Rule 0: Skip unquantized (quant=None) --- + if quant_algo is None: + return "[CI accel] Skip unquantized (quant=None) in CI" + + is_large_model = model_config.num_experts >= 256 and model_config.hidden_size >= 7168 + + # --- Rule 1: Large model (e256_k8_h7168_i2048) restrictions --- + if is_large_model: + if routing_method_cls is not None: + from tensorrt_llm._torch.modules.fused_moe import DeepSeekV3MoeRoutingMethod + + if routing_method_cls != DeepSeekV3MoeRoutingMethod: + routing_name = routing_method_cls.__name__ + return ( + f"[CI accel] Large model (num_experts={model_config.num_experts}) " + f"only tests DeepSeekV3 routing in CI (got 
{routing_name})" + ) + + if dtype is not None and dtype != torch.bfloat16: + return f"[CI accel] Large model only tests bfloat16 in CI (got {dtype})" + + if seq_len is not None and seq_len != 1: + return f"[CI accel] Large model only tests seq=1 in CI (got seq={seq_len})" + + if swiglu_gptoss_style: + return "[CI accel] Large model only tests non-gptoss in CI" + + # --- Rule 2: Multi-GPU parallel mode restrictions --- + if parallel_mode is not None and parallel_mode not in ("DEP", "TTP"): + return f"[CI accel] Only DEP and TTP parallel modes in CI (got {parallel_mode})" + + # --- Rule 3: Routing method restrictions per backend+quant --- + # Full routing coverage on: (CUTLASS, or TRTLLM) with NVFP4 + # Other combos: only Renormalize + DeepSeekV3 + # Rule 1 already handles e256 (DeepSeekV3 only), so this only applies to non-e256. + if not is_large_model and routing_method_cls is not None: + routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "") + if routing_name not in _CI_ROUTING_METHODS: + allows_full_routing = ( + backend_type == MoeBackendType.CUTLASS or backend_type == MoeBackendType.TRTLLM + ) and quant_algo == QuantAlgo.NVFP4 + if not allows_full_routing: + return ( + f"[CI accel] {backend_type.value}+{quant_algo} only tests " + f"Renormalize/DeepSeekV3 routing in CI (got {routing_name})" + ) + + return None + + # ============================================================================ # Timing Fixture # ============================================================================ @@ -729,6 +1067,7 @@ def iter_base_test_configs( model_config, routing_method_cls, swiglu_gptoss_style=swiglu_gptoss_style, + seq_len=seq_len, ) routing_name = routing_method_cls.__name__.replace("MoeRoutingMethod", "") swiglu_id = ( diff --git a/tests/unittest/_torch/modules/moe/quantize_utils.py b/tests/unittest/_torch/modules/moe/quantize_utils.py index 24652a1c068..99c00be7928 100644 --- a/tests/unittest/_torch/modules/moe/quantize_utils.py +++ 
b/tests/unittest/_torch/modules/moe/quantize_utils.py @@ -24,6 +24,7 @@ per_block_cast_to_fp8_e8m0, per_token_cast_to_fp8_e8m0, ) +from _torch.modules.moe.moe_test_utils import skip_if_insufficient_gpu_memory from utils.util import check_accuracy from tensorrt_llm._torch.model_config import ModelConfig @@ -217,6 +218,10 @@ def __init__( model_config = ModelConfig() self.quant_config = model_config.quant_config + skip_if_insufficient_gpu_memory( + num_experts, hidden_size, intermediate_size, dtype or torch.float32 + ) + # Custom swiglu activation for swiglu_gptoss_style def custom_swiglu(x): gate, value = x.chunk(2, dim=-1) @@ -253,11 +258,18 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t final_hidden_states = torch.zeros( hidden_states.shape, dtype=hidden_states.dtype, device=hidden_states.device ) + # FP8_BLOCK_SCALES linear kernel requires bfloat16 activation input + ref_requires_bf16 = ( + self.quant_config is not None + and self.quant_config.quant_algo == QuantAlgo.FP8_BLOCK_SCALES + ) for expert_id in range(self.num_experts): if not torch.any(selected_experts == expert_id): continue batch_idx, nth_expert = torch.where(selected_experts == expert_id) expert_inputs = hidden_states[batch_idx] + if ref_requires_bf16: + expert_inputs = expert_inputs.to(torch.bfloat16) output = self.experts[expert_id](expert_inputs) final_hidden_states[batch_idx] += ( routing_weights[batch_idx, nth_expert, None] * output.float() @@ -1948,6 +1960,24 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t else: weight_scale_key = "weight_scale_inv" + # For W4A8_CUSTOM mode, the fused kernel uses a GLOBAL max input_scale + # across all experts (not per-expert), because the kernel applies a single + # pre-quant scale to all tokens before dispatching to experts. + # The reference must match this behavior to produce identical results. 
+ if self.weight_loading_mode == MoEWeightLoadingMode.W4A8_CUSTOM: + all_fc31_input_scales = [] + all_fc2_input_scales = [] + for eid in range(self.num_experts): + p1 = self.weights[f"{eid}.w1.input_scale"].cuda() + p3 = self.weights[f"{eid}.w3.input_scale"].cuda() + all_fc31_input_scales.append(torch.max(p1, p3)) + all_fc2_input_scales.append(self.weights[f"{eid}.w2.input_scale"].cuda()) + global_fc31_input_scale = torch.stack(all_fc31_input_scales).max() + global_fc2_input_scale = torch.stack(all_fc2_input_scales).max() + else: + global_fc31_input_scale = None + global_fc2_input_scale = None + for expert_id in range(self.num_experts): mask = selected_experts == expert_id activated_tokens = mask.sum(1).bool() @@ -1970,12 +2000,16 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t # Fuse scales - must cat in same order as weights s3_s1 = torch.cat([s3, s1], dim=-1) - # Get input scales - p1 = self.weights[f"{expert_id}.w1.input_scale"].cuda() - p2 = self.weights[f"{expert_id}.w2.input_scale"].cuda() - p3 = self.weights[f"{expert_id}.w3.input_scale"].cuda() - # IMPORTANT: Use max for fused computation to ensure consistent quantization - p3_p1 = torch.max(p1, p3) + # Get input scales - use global max for W4A8_CUSTOM, per-expert for VANILLA + if global_fc31_input_scale is not None: + p3_p1 = global_fc31_input_scale + p2 = global_fc2_input_scale + else: + p1 = self.weights[f"{expert_id}.w1.input_scale"].cuda() + p2 = self.weights[f"{expert_id}.w2.input_scale"].cuda() + p3 = self.weights[f"{expert_id}.w3.input_scale"].cuda() + # IMPORTANT: Use max for fused computation to ensure consistent quantization + p3_p1 = torch.max(p1, p3) # Get pre_quant_scale (only for VANILLA mode) a1 = a2 = a3 = a1_a3 = None @@ -2023,7 +2057,12 @@ def forward(self, hidden_states: torch.Tensor, router_logits: torch.Tensor) -> t return results.reshape(hidden_states.shape) def check_accuracy(self, output, ref_output): - torch.testing.assert_close(output, 
ref_output, rtol=1e-2, atol=0.1) + # W4A8_AWQ accumulates FP8 QDQ noise from two layers (fc31 + fc2). + # With higher top_k, more experts contribute per token, increasing + # the accumulated numerical noise in the final summation. + top_k = self.routing_method.top_k if hasattr(self.routing_method, "top_k") else 1 + atol = 0.1 * max(1, top_k / 4) + check_accuracy(output, ref_output, rtol=1e-2, atol=atol, percent=0.97) class W4A8AWQQuantizeUtil(BaseQuantizeUtil): @@ -2039,8 +2078,9 @@ def __init__( intermediate_size: int, hidden_size: int, quant_config: QuantConfig, + **kwargs, ): - super().__init__(num_experts, dtype, intermediate_size, hidden_size, quant_config) + super().__init__(num_experts, dtype, intermediate_size, hidden_size, quant_config, **kwargs) # These will be set in create_weights and used in create_ref_module self.weight_loading_mode = MoEWeightLoadingMode.W4A8_CUSTOM self.scaling_group_size = 128 diff --git a/tests/unittest/_torch/modules/moe/test_moe_backend.py b/tests/unittest/_torch/modules/moe/test_moe_backend.py index 65721e4b924..09339d21475 100644 --- a/tests/unittest/_torch/modules/moe/test_moe_backend.py +++ b/tests/unittest/_torch/modules/moe/test_moe_backend.py @@ -28,18 +28,20 @@ import itertools import logging -import os from typing import List, Optional import pytest import torch from _torch.modules.moe.moe_test_utils import ( + IS_CI_MODE, MoeBackendType, MoeModelConfig, create_test_param, get_backend_class, iter_base_test_configs, replay_tactics_and_check, + should_skip_to_accelerate_ci, + skip_if_insufficient_gpu_memory, supports_autotuner_capture, ) from _torch.modules.moe.quantize_utils import get_test_quant_params @@ -49,7 +51,7 @@ from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.fused_moe import RenormalizeMoeRoutingMethod from tensorrt_llm._torch.modules.fused_moe.create_moe import create_moe_backend -from tensorrt_llm._torch.modules.fused_moe.interface import MoE +from 
tensorrt_llm._torch.modules.fused_moe.interface import MoE, MoEWeightLoadingMode from tensorrt_llm._utils import mpi_rank from tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantAlgo @@ -103,6 +105,7 @@ def create_test_backend( swiglu_alpha: Optional[torch.Tensor] = None, swiglu_beta: Optional[torch.Tensor] = None, swiglu_limit: Optional[torch.Tensor] = None, + weight_loading_mode: MoEWeightLoadingMode = MoEWeightLoadingMode.VANILLA, ) -> MoE: """Create a MoE backend for testing.""" backend_cls = get_backend_class(backend_type) @@ -134,6 +137,7 @@ def create_test_backend( swiglu_alpha=swiglu_alpha, swiglu_beta=swiglu_beta, swiglu_limit=swiglu_limit, + weight_loading_mode=weight_loading_mode, ) @@ -226,30 +230,28 @@ def run_backend_moe( # Default runs the CI subset (TRTLLM_TEST_MOE_CI=1). # Set TRTLLM_TEST_MOE_CI=0 for the full local config matrix. CI_MOE_MODEL_CONFIGS = [ + # Real models (small/medium — tactic replay is model-size-independent, + # e256 is covered by test_moe_module integration tests) MoeModelConfig(60, 4, 2048, 1408), # Qwen1.5-MoE-A2.7B - MoeModelConfig(256, 8, 7168, 2048), # DeepSeek-V3 MoeModelConfig(128, 4, 2880, 2880), # GPT-OSS-120B MoeModelConfig(8, 1, 512, 512), # boundary: top_k=1, single expert activated + # Boundary tests for tactic correctness + MoeModelConfig(4, 4, 512, 512), # top_k=num_experts, all experts activated + MoeModelConfig(7, 2, 256, 512), # prime num_experts + MoeModelConfig(13, 3, 256, 512), # prime num_experts, odd top_k ] LOCAL_MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS + [ + MoeModelConfig(256, 8, 7168, 2048), # DeepSeek-V3 MoeModelConfig(8, 2, 4096, 14336), # Mixtral-8x7B MoeModelConfig(64, 6, 2048, 1408), # DeepSeek-MoE-16B / DeepSeek-V2-Lite MoeModelConfig(8, 2, 6144, 32768), # Grok-1 - # === Boundary Tests: num_experts / top_k === - MoeModelConfig(4, 4, 512, 512), # top_k=num_experts, all experts activated - MoeModelConfig(7, 2, 256, 512), # prime num_experts - MoeModelConfig(13, 
3, 256, 512), # prime num_experts, odd top_k # === Boundary Tests: small sizes === MoeModelConfig(4, 2, 64, 128), # very small hidden_size MoeModelConfig(4, 2, 128, 64), # intermediate < hidden ] -MOE_MODEL_CONFIGS = ( - CI_MOE_MODEL_CONFIGS - if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" - else LOCAL_MOE_MODEL_CONFIGS -) +MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS if IS_CI_MODE else LOCAL_MOE_MODEL_CONFIGS # Sequence lengths to test SEQ_LENS_TO_TEST = [1, 8] @@ -270,9 +272,7 @@ def run_backend_moe( (1.702, 1.0, 7.0), # gptoss style (GPT-OSS real values) ] -SWIGLU_COMBOS = ( - CI_SWIGLU_COMBOS if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" else LOCAL_SWIGLU_COMBOS -) +SWIGLU_COMBOS = CI_SWIGLU_COMBOS if IS_CI_MODE else LOCAL_SWIGLU_COMBOS def generate_test_params() -> List: @@ -381,7 +381,6 @@ def generate_test_params() -> List: # - 128-alignment requirements for quantization # # ============================================================================= -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.parametrize( "dtype_activation,backend_type,quant_algo,seq_len,model_config," "routing_method_cls,swiglu_alpha,swiglu_beta,swiglu_limit", @@ -412,10 +411,17 @@ def test_moe_backend( # Default values: alpha=1, beta=0, limit=inf swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") - # Note: Skip logic is now handled at parametrize level via get_quick_skip_reason() - # which calls backend's can_implement() and should_skip_* functions. - # This avoids entering test function for invalid combinations, significantly - # reducing test collection time (from ~17 min to ~5 sec for 3400+ skipped tests). 
+ ci_skip = should_skip_to_accelerate_ci( + backend_type=backend_type, + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype_activation, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + ) + if ci_skip: + pytest.skip(ci_skip) # Extract model parameters num_experts = model_config.num_experts @@ -423,6 +429,8 @@ def test_moe_backend( hidden_size = model_config.hidden_size intermediate_size = model_config.intermediate_size + skip_if_insufficient_gpu_memory(num_experts, hidden_size, intermediate_size, dtype_activation) + # Create mapping mapping = Mapping() mapping.rank = mpi_rank() @@ -464,6 +472,11 @@ def test_moe_backend( # Get swiglu tensors if swiglu_gptoss_style is enabled swiglu_tensors = quantize_util.get_swiglu_tensors() + # Determine weight loading mode based on quantization algorithm + weight_loading_mode = MoEWeightLoadingMode.VANILLA + if hasattr(quantize_util, "weight_loading_mode"): + weight_loading_mode = quantize_util.weight_loading_mode + # Create backend first (needed for MXFP4_MXFP8 to get shapes) backend = create_test_backend( backend_type=backend_type, @@ -478,6 +491,7 @@ def test_moe_backend( swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None, swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None, swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None, + weight_loading_mode=weight_loading_mode, ) # W4A8_MXFP4_MXFP8 requires different weights for backend and reference diff --git a/tests/unittest/_torch/modules/moe/test_moe_module.py b/tests/unittest/_torch/modules/moe/test_moe_module.py index a86a84a0dcd..099a9641fac 100644 --- a/tests/unittest/_torch/modules/moe/test_moe_module.py +++ b/tests/unittest/_torch/modules/moe/test_moe_module.py @@ -26,6 +26,7 @@ """ import copy +import functools import logging import os import pickle @@ -40,6 +41,7 @@ import pytest import torch from _torch.modules.moe.moe_test_utils import ( + 
IS_CI_MODE, MoeBackendType, MoeModelConfig, create_test_param, @@ -50,7 +52,9 @@ should_skip_cutlass, should_skip_deepgemm, should_skip_multi_gpu, + should_skip_to_accelerate_ci, should_skip_trtllm, + skip_if_insufficient_gpu_memory, supports_autotuner_capture, ) from _torch.modules.moe.quantize_utils import get_test_quant_params @@ -59,6 +63,7 @@ from transformers.configuration_utils import PretrainedConfig import tensorrt_llm.bindings.internal.runtime as _tbr +from tensorrt_llm._mnnvl_utils import MnnvlMemory from tensorrt_llm._torch.autotuner import AutoTuner, autotune from tensorrt_llm._torch.model_config import ModelConfig from tensorrt_llm._torch.modules.fused_moe import ( @@ -70,6 +75,8 @@ RenormalizeNaiveMoeRoutingMethod, create_moe, ) +from tensorrt_llm._torch.modules.fused_moe.communication.deep_ep_low_latency import DeepEPLowLatency +from tensorrt_llm._torch.modules.fused_moe.interface import MoEWeightLoadingMode from tensorrt_llm._torch.modules.fused_moe.moe_load_balancer import ( MoeLoadBalancer, MoeLoadBalancerIterContext, @@ -518,6 +525,12 @@ def _test_moe_worker_impl( # Get swiglu tensors if swiglu_gptoss_style is enabled swiglu_tensors = quantize_util.get_swiglu_tensors() + # Get weight_loading_mode from quantize_util if available + # (e.g., W4A8AWQQuantizeUtil uses W4A8_CUSTOM mode) + weight_loading_mode = getattr( + quantize_util, "weight_loading_mode", MoEWeightLoadingMode.VANILLA + ) + with moe_load_balancer: # Create and setup fused MoE module fused_moe = create_moe( @@ -528,6 +541,7 @@ def _test_moe_worker_impl( swiglu_alpha=swiglu_tensors["swiglu_alpha"] if swiglu_tensors else None, swiglu_beta=swiglu_tensors["swiglu_beta"] if swiglu_tensors else None, swiglu_limit=swiglu_tensors["swiglu_limit"] if swiglu_tensors else None, + weight_loading_mode=weight_loading_mode, ) fused_moe.load_weights([weights]) fused_moe.post_load_weights() @@ -724,11 +738,7 @@ def init_worker(custom_paths, comm_method_type): MoeModelConfig(4, 2, 128, 64), # 
intermediate < hidden ] -MOE_MODEL_CONFIGS = ( - CI_MOE_MODEL_CONFIGS - if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" - else LOCAL_MOE_MODEL_CONFIGS -) +MOE_MODEL_CONFIGS = CI_MOE_MODEL_CONFIGS if IS_CI_MODE else LOCAL_MOE_MODEL_CONFIGS # Sequence lengths to test SEQ_LENS = [1, 8] @@ -786,23 +796,47 @@ def init_worker(custom_paths, comm_method_type): ] # Default runs CI subset. Set TRTLLM_TEST_MOE_CI=0 for full local matrix. -SWIGLU_COMBOS = ( - CI_SWIGLU_COMBOS if os.environ.get("TRTLLM_TEST_MOE_CI", "1") == "1" else LOCAL_SWIGLU_COMBOS -) +SWIGLU_COMBOS = CI_SWIGLU_COMBOS if IS_CI_MODE else LOCAL_SWIGLU_COMBOS + + +@functools.lru_cache(maxsize=1) +def _is_mnnvl_supported() -> bool: + """Cached check for MNNVL platform support (pynvml query is expensive).""" + return MnnvlMemory.supports_mnnvl() def _get_comm_method_skip_reason( comm_method: str, model_config: "MoeModelConfig", + dtype: Optional[torch.dtype] = None, ) -> Optional[str]: """ Check if a communication method is compatible with the given model config. Returns a skip reason string if incompatible, None otherwise. """ - from tensorrt_llm._torch.modules.fused_moe.communication.deep_ep_low_latency import ( - DeepEPLowLatency, - ) + # NVLink-based methods require MNNVL support (all NVLink links active). + # See: _mnnvl_utils.py:supports_mnnvl() -> support_nvlink(need_all_up=True) + # Without MNNVL, Communication.__init__() raises RuntimeError (base.py:53-58). + if comm_method in ("NVLINK_ONE_SIDED", "NVLINK_TWO_SIDED"): + if not _is_mnnvl_supported(): + return ( + f"{comm_method} requires MNNVL support (all NVLink links active). " + f"Not supported on this platform." + ) + + # DeepEP normal mode: is_workload_feasible (deep_ep.py:127) rejects + # non-bfloat16, causing a runtime fallback to AllGather. 
The fallback + # replaces self.comm, and when the old DeepEP object is GC'd its + # Buffer destructor calls intranode::barrier (deep_ep.cpp:90) which + # requires all ranks simultaneously -- non-deterministic GC timing + # across MPI ranks causes the barrier to timeout and crash. + if comm_method == "DEEPEP" and dtype is not None and dtype != torch.bfloat16: + return ( + f"DeepEP is_workload_feasible rejects dtype={dtype} " + f"(requires bfloat16), and the runtime fallback triggers an " + f"unsafe Buffer destruction that crashes all ranks." + ) if comm_method == "DEEPEPLOWLATENCY": if model_config.hidden_size not in DeepEPLowLatency.SUPPORTED_HIDDEN_SIZES: @@ -866,19 +900,42 @@ def generate_multi_gpu_test_params( ): # Check multi-GPU specific skip conditions (short-circuit on first match) if not skip_reason: + # TP modes shard intermediate_size; EP modes don't + moe_tp_size = 4 if parallel_mode in ("DTP", "TTP") else 1 for reason in ( - _get_comm_method_skip_reason(comm_method, model_config), + _get_comm_method_skip_reason(comm_method, model_config, dtype=dtype), should_skip_trtllm( - backend_type, quant_algo, model_config, comm_method=comm_method + backend_type, + quant_algo, + model_config, + comm_method=comm_method, + moe_tp_size=moe_tp_size, ), should_skip_cutlass( - backend_type, comm_method, quant_algo=quant_algo, model_config=model_config + backend_type, + comm_method, + quant_algo=quant_algo, + model_config=model_config, + moe_tp_size=moe_tp_size, + dtype=dtype, + ), + should_skip_cutedsl( + backend_type, + quant_algo, + model_config, + comm_method, + moe_tp_size=moe_tp_size, ), - should_skip_cutedsl(backend_type, quant_algo, model_config, comm_method), should_skip_deepgemm( - backend_type, comm_method, quant_algo=quant_algo, model_config=model_config + backend_type, + comm_method, + quant_algo=quant_algo, + model_config=model_config, + moe_tp_size=moe_tp_size, + ), + should_skip_multi_gpu( + parallel_mode, model_config, world_size=4, 
comm_method=comm_method ), - should_skip_multi_gpu(parallel_mode, model_config, world_size=4), ): if reason: skip_reason = reason @@ -973,7 +1030,6 @@ def generate_base_test_params( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.parametrize( "dtype,moe_backend,quant_algo,seq_len,model_config,routing_method_cls," "swiglu_alpha,swiglu_beta,swiglu_limit", @@ -999,6 +1055,26 @@ def test_configurable_moe_single_gpu( 3. Autotune captures and replays all tactics properly 4. swiglu_gptoss_style (SwiGLU with custom parameters) works correctly """ + swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") + ci_skip = should_skip_to_accelerate_ci( + backend_type=MoeBackendType(moe_backend), + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + ) + if ci_skip: + pytest.skip(ci_skip) + + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72 dtype_routing_logits = None @@ -1032,7 +1108,7 @@ def test_configurable_moe_single_gpu( comm_methods=COMM_METHODS, swiglu_combos=SWIGLU_COMBOS, model_configs=MOE_MODEL_CONFIGS, - seq_lens=SEQ_LENS, + seq_lens=[8] if IS_CI_MODE else SEQ_LENS, dtypes=DTYPES, backend_types=BACKEND_TYPES, quant_algos=QUANT_ALGOS, @@ -1040,7 +1116,6 @@ def test_configurable_moe_single_gpu( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize( "parallel_mode,comm_method_type,dtype,moe_backend,quant_algo,seq_len,model_config," @@ -1060,6 +1135,27 @@ def test_configurable_moe_multi_gpu( swiglu_beta, 
swiglu_limit, ): + swiglu_gptoss_style = swiglu_alpha != 1 or swiglu_beta != 0 or swiglu_limit != float("inf") + ci_skip = should_skip_to_accelerate_ci( + backend_type=MoeBackendType(moe_backend), + quant_algo=quant_algo, + model_config=model_config, + routing_method_cls=routing_method_cls, + dtype=dtype, + seq_len=seq_len, + swiglu_gptoss_style=swiglu_gptoss_style, + parallel_mode=parallel_mode, + ) + if ci_skip: + pytest.skip(ci_skip) + + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + # DeepSeekV3 routing requires float32 routing_logits for TRTLLM backend # See: cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp:70-72 dtype_routing_logits = None @@ -1245,6 +1341,10 @@ def generate_eplb_test_params( backend_type, quant_algo, dtype, model_config, routing_method_cls ) + # Check comm method platform compatibility (e.g. NVLink support) + if not skip_reason: + skip_reason = _get_comm_method_skip_reason(comm_method, model_config) + # Check EPLB-specific skip conditions if not skip_reason: skip_reason = _should_skip_EPLB( @@ -1288,7 +1388,6 @@ def generate_eplb_test_params( ) -@pytest.mark.skip(reason="Temporarily skipped due to the long time to run the test") @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.skipif( not _tbr.is_host_accessible_device_memory_supported(), @@ -1308,6 +1407,13 @@ def test_configurable_moe_multi_gpu_eplb( num_slots, routing_method_cls, ): + skip_if_insufficient_gpu_memory( + model_config.num_experts, + model_config.hidden_size, + model_config.intermediate_size, + dtype, + ) + world_size = 4 _test_moe_multi_gpu( comm_method_type, diff --git a/tests/unittest/_torch/modules/test_fused_moe.py b/tests/unittest/_torch/modules/test_fused_moe.py index 6bdf570457c..0453c66c6de 100644 --- a/tests/unittest/_torch/modules/test_fused_moe.py +++ b/tests/unittest/_torch/modules/test_fused_moe.py @@ -45,6 +45,10 @@ from 
tensorrt_llm.mapping import Mapping from tensorrt_llm.models.modeling_utils import QuantAlgo, QuantConfig +# NOTE: Most tests in this file are deprecated and skipped. They are now covered by the +# unified MoE test framework in tests/unittest/_torch/modules/moe/test_moe_backend.py +# and test_moe_module.py. Add new MoE tests there instead of here. + cloudpickle.register_pickle_by_value(sys.modules[__name__]) cloudpickle.register_pickle_by_value(_torch.helpers) MPI.pickle.__init__( @@ -76,6 +80,10 @@ def round_up(x, alignment): return (x + alignment - 1) // alignment * alignment +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.parametrize( "moe_backend, dtype, experts, routing_cls, bias", product(["CUTLASS", "VANILLA", "TRITON"], [torch.float16, torch.bfloat16], @@ -195,6 +203,10 @@ def test_fused_moe(moe_backend, m //= 2 +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("moe_cls", ["CUTLASS", "VANILLA"]) @@ -215,6 +227,10 @@ def test_fused_moe_multi_gpu(moe_cls, ep_size): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("alltoall_method_type", [ @@ -328,6 +344,10 @@ def per_rank_test_fused_moe_alltoall(job_id): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @pytest.mark.parametrize("alltoall_method_type", [ @@ -510,6 +530,10 @@ def per_rank_test_fused_moe_alltoall(job_id, weights, x_list): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_hopper @pytest.mark.parametrize( "moe_backend", @@ -698,6 +722,10 @@ def set_tensor_value_4(x, num_row, num_cols): x.copy_(repeated) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -853,6 +881,10 @@ def per_rank_test_fused_moe_alltoall_fp8_blockwise(job_id): assert r is None +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls", @@ -1038,6 +1070,10 @@ def grouped_gemm(a: torch.Tensor, b: torch.Tensor, a_sf: torch.Tensor, torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", @@ -1172,6 +1208,10 @@ def test_fused_moe_fp8_blockwise_cute_dsl(dtype, return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @skip_no_hopper @pytest.mark.parametrize( "dtype, num_experts, seq_len, hidden_size, RoutingMethodCls, WeightLoadingMode", @@ -1304,6 +1344,10 @@ def test_fused_moe_fp8_blockwise_cutlass(dtype, return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_no_hopper @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -1337,6 +1381,10 @@ def test_fused_moe_fp8_blockwise_cutlass_multi_gpu(ep_size, routing_method, assert r is True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.skipif(torch.cuda.device_count() < 4, reason="needs 4 GPUs to run this test") @@ -1370,6 +1418,10 @@ def test_fused_moe_fp8_blockwise_cute_dsl_multi_gpu(ep_size, routing_method, assert r is True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("moe_backend", [ @@ -1383,6 +1435,10 @@ def test_fused_moe_nvfp4(dtype, moe_backend, finalize_fusion): run_fused_moe_nvfp4(dtype, moe_backend, finalize_fusion) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize("hidden_size, intermediate_size", [(2880, 2880)]) @pytest.mark.parametrize("swiglu_alpha", [1, 0.1], ids=lambda v: f"alpha{v}") @@ -1645,6 +1701,10 @@ def run_fused_moe_nvfp4(dtype, atol=atol) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @skip_pre_blackwell @pytest.mark.parametrize( "moe_backend", @@ -1782,6 +1842,10 @@ def test_fused_moe_w4a8_nvfp4_fp8(moe_backend): atol=0.5) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_neither_ada_nor_hopper_unittest @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize( @@ -2057,6 +2121,10 @@ def process_layer( torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.1) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @skip_pre_blackwell @pytest.mark.parametrize( "moe_backend", @@ -2308,6 +2376,10 @@ def prepare_weights(num_experts: int, torch.testing.assert_close(output, ref_output, rtol=1e-2, atol=0.15) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.parametrize("dtype", [torch.bfloat16]) @pytest.mark.parametrize("hidden_size", [768, 2880]) @pytest.mark.parametrize( @@ -2615,6 +2687,10 @@ def mxfp4_to_fp32(tensor, scales): check_accuracy(output, ref_output, rtol=0.6, atol=0.6, percent=0.945) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.parametrize("dtype", [torch.float16, torch.bfloat16]) @pytest.mark.parametrize("weight_dtype", [torch.int8]) def test_fused_moe_int8_woq_per_channel(dtype, weight_dtype): diff --git a/tests/unittest/_torch/thop/serial/test_moe.py b/tests/unittest/_torch/thop/serial/test_moe.py index 9c0d00bbe67..a1912def29b 100644 --- a/tests/unittest/_torch/thop/serial/test_moe.py +++ b/tests/unittest/_torch/thop/serial/test_moe.py @@ -22,6 +22,11 @@ import torch.nn.functional as F sys.path.append(os.path.join(os.path.dirname(__file__), '..')) + +# NOTE: Some tests in this file are deprecated and skipped. They are now covered by the +# unified MoE test framework in tests/unittest/_torch/modules/moe/test_moe_backend.py +# and test_moe_module.py. Add new MoE tests there instead of here. + from enum import Enum from utils.util import getSMVersion @@ -872,6 +877,10 @@ def are_groups_valid(top_k_groups, n_groups): return True +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -1006,6 +1015,10 @@ def run_moe_fp8_test(self, num_tokens: int, expert_info: Tuple[int, int, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -1939,6 +1952,10 @@ def run_moe_fp8_fp4_test(self, num_tokens: int, hidden_size: int, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." 
+) @pytest.mark.skipif( getSMVersion() < 100 or getSMVersion() >= 110, reason="The kernel only supports Blackwell. Current SM is %d." % @@ -2164,6 +2181,10 @@ def test_moe_fp8_per_tensor_scale(num_tokens, hidden_size, intermediate_size, percent=0.925) +@pytest.mark.skip( + reason= + "Deprecated: covered by tests/unittest/_torch/modules/moe/test_moe_backend.py and test_moe_module.py. Add new tests there." +) @pytest.mark.skipif( getSMVersion() != 100, reason="The kernel only supports Blackwell. Current SM is %d." % diff --git a/tests/unittest/llmapi/test_grpc.py b/tests/unittest/llmapi/test_grpc.py index 08d712f6ab1..a74ec33e578 100644 --- a/tests/unittest/llmapi/test_grpc.py +++ b/tests/unittest/llmapi/test_grpc.py @@ -376,23 +376,17 @@ def test_all_sampling_config_fields(self): return_generation_logits=True, exclude_input_from_output=True, ) - stop_words = [ - pb2.TokenSequence(token_ids=[50256]), - pb2.TokenSequence(token_ids=[50257, 50258]), - ] - bad_words = [ - pb2.TokenSequence(token_ids=[100, 101]), - ] embedding_bias = [0.0] * 10 + [1.5, -1.5] params = create_sampling_params_from_proto( proto_config=proto_config, output_config=output_config, max_tokens=256, - end_id=50256, - pad_id=50257, - stop_words=stop_words, - bad_words=bad_words, + stop=["<|endoftext|>", "<|end|>"], + stop_token_ids=[50256], + ignore_eos=True, + bad=["badword1"], + bad_token_ids=[100, 101], embedding_bias=embedding_bias, ) @@ -432,20 +426,21 @@ def test_all_sampling_config_fields(self): # Other params assert params.max_tokens == 256 - assert params.end_id == 50256 - assert params.pad_id == 50257 assert params.detokenize is False # key optimization + assert params.ignore_eos is True - # Stop/bad words (set as pre-tokenized word IDs) - assert params._stop_word_ids == [[50256], [50257, 50258]] - assert params._bad_word_ids == [[100, 101]] + # Stop/bad words (passed as strings/token IDs for TRT-LLM's _setup() to tokenize) + assert params.stop == ["<|endoftext|>", "<|end|>"] + assert 
params.stop_token_ids == [50256] + assert params.bad == ["badword1"] + assert params.bad_token_ids == [100, 101] # Embedding bias converted to torch.Tensor assert params.embedding_bias is not None assert len(params.embedding_bias) == 12 - def test_end_id_minus_one_sets_ignore_eos(self): - """Test that end_id=-1 correctly sets ignore_eos=True.""" + def test_ignore_eos_flag(self): + """Test that ignore_eos=True correctly sets ignore_eos on SamplingParams.""" proto_config = pb2.SamplingConfig(temperature=0.7) output_config = pb2.OutputConfig() @@ -453,10 +448,9 @@ def test_end_id_minus_one_sets_ignore_eos(self): proto_config=proto_config, output_config=output_config, max_tokens=100, - end_id=-1, + ignore_eos=True, ) - assert params.end_id == -1 assert params.ignore_eos is True def test_defaults_when_fields_unset(self):