xinhe-nv · pull · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026 · Mar 6, 2026
diff --git a/cpp/kernels/fmha_v2/README.md b/cpp/kernels/fmha_v2/README.md
@@ -20,7 +20,12 @@ the `setup.py` code:
 export TORCH_CUDA_ARCH_LIST=9.0 ENABLE_SM89_QMMA=1 ENABLE_HMMA_FP32=1 SCHEDULING_MODE=1 ENABLE_SM100=1 ENABLE_SM120=1
 ```
 
-To generate subset of kernels, you can add conditions in setup.py.
+To generate a subset of kernels, you can add conditions in setup.py, or set `FMHA_FILTER_ARCH` before calling setup.py:
+
+```
+# Build only for a specific architecture (or a list of architectures). This does not enable kernels that are disabled by default.
+export FMHA_FILTER_ARCH=90
+```
 
 To generate the files and compile the kernels:
 ```

diff --git a/cpp/kernels/fmha_v2/fmha_test.py b/cpp/kernels/fmha_v2/fmha_test.py
@@ -1,3 +1,17 @@
+# SPDX-FileCopyrightText: Copyright (c) 2020-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 import subprocess
 
 import pytest
@@ -268,3 +282,29 @@ def test_trtllm_chunked_attention(chunked_attention_size, input_layout):
             -chunked-attention-size {chunked_attention_size} -paged-kv",
             shell=True,
             check=True)
+
+
+# Sliding window attention tests: every (window size, mask type) case runs bin/fmha.exe twice, with -d 128 and then -d 64.
+@pytest.mark.parametrize(
+    'sliding_window_size', [64, 127, 128, 129, 256, 512],
+    ids=[
+        "sliding-window-size-64", "sliding-window-size-127",
+        "sliding-window-size-128", "sliding-window-size-129",
+        "sliding-window-size-256", "sliding-window-size-512"
+    ])
+@pytest.mark.parametrize(
+    'mask_type',
+    ["-sliding-or-chunked-causal-mask", "-bidirectional-sliding-window-mask"])
+def test_trtllm_sliding_window_attention(sliding_window_size, mask_type):
+    if mask_type == "-bidirectional-sliding-window-mask":
+        sliding_window_size *= 2  # doubled for the bidirectional mask; NOTE(review): presumably so the window spans both directions - confirm
+
+    subprocess.run(f"bin/fmha.exe -d 128 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \
+        -sliding-window-size {sliding_window_size} {mask_type}",
+                   shell=True,
+                   check=True)  # check=True: a non-zero exit status raises CalledProcessError, failing the test
+
+    subprocess.run(f"bin/fmha.exe -d 64 -b 2 -h 5 -s 2048 -min-s 1024 -bf16 \
+        -sliding-window-size {sliding_window_size} {mask_type}",
+                   shell=True,
+                   check=True)  # same command line as above except for -d 64