
Commit 71a579a

fix build

1 parent 4a03086

File tree

2 files changed (+4 lines, −9 lines)


onnxruntime/contrib_ops/cuda/bert/xqa/utils.cuh

Lines changed: 2 additions & 2 deletions

@@ -660,11 +660,11 @@ __device__ inline mha::tuple<uint32_t, uint32_t, decltype(bounds)..., uint32_t>
   return mha::tuple_cat(mha::tuple<uint32_t>(i0 % bound0), carryLE<bound1, bounds...>(i1 + i0 / bound0, i..., iLast));
 }
 
-__device__ __host__ inline void assertClose(float a, float b, float threshold = 0.01f) {
+__device__ __host__ inline void assertClose([[maybe_unused]] float a, [[maybe_unused]] float b, [[maybe_unused]] float threshold = 0.01f) {
   assert(abs(a - b) < threshold);
 }
 
-__device__ __host__ inline void assertClose(half a, half b, float threshold = 0.01f) {
+__device__ __host__ inline void assertClose([[maybe_unused]] half a, [[maybe_unused]] half b, [[maybe_unused]] float threshold = 0.01f) {
   assertClose(__half2float(a), __half2float(b), threshold);
 }

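Why this change would fix a build: when NDEBUG is defined (release builds), assert(...) expands to nothing, leaving the assertClose parameters unreferenced; with unused-parameter warnings promoted to errors (e.g. -Werror), that breaks compilation, and the C++17 [[maybe_unused]] attribute suppresses the warning. That diagnosis is an inference from the diff, not stated in the commit; below is a minimal host-only sketch of the pattern (hypothetical file name and compiler flags, not from the repository):

// Hypothetical standalone repro, not from the repository; assumes C++17.
// Release build: g++ -std=c++17 -DNDEBUG -Wall -Wextra -Werror close.cpp
#include <cassert>
#include <cmath>

// Under -DNDEBUG, assert(...) compiles to nothing, so without
// [[maybe_unused]] the parameters a, b, and threshold would be
// unreferenced and -Werror=unused-parameter would fail the build.
inline void assertClose([[maybe_unused]] float a, [[maybe_unused]] float b,
                        [[maybe_unused]] float threshold = 0.01f) {
  assert(std::abs(a - b) < threshold);
}

int main() {
  assertClose(1.0f, 1.005f);  // no-op in release, checked in debug builds
  return 0;
}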
onnxruntime/test/python/transformers/test_gqa.py

Lines changed: 2 additions & 7 deletions

The temporary enable_quantized_kv_tests flag is removed; both quantized-KV test classes are now gated solely on Flash Attention availability.
@@ -60,8 +60,6 @@
 enable_debug_print = False
 
 enable_deterministic_check = True
-
-enable_quantized_kv_tests = True
 # #################################################################################################
 # Configuration and Helper Classes
 # #################################################################################################
@@ -2010,10 +2008,7 @@ def test_gqa_past_flash_attention_bf16(self, name, config):
         )
 
 
-@unittest.skipIf(
-    not has_flash_attention() or not enable_quantized_kv_tests,
-    "Flash Attention is not available, skipping tests.",
-)
+@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.")
 class TestFlashGQABF16QuantizedKV(unittest.TestCase):
     def manual_seed(self):
         # Reset random seeds before each test to ensure test isolation
@@ -2245,7 +2240,7 @@ def gqa_xqa_test_cases():
     yield name, config, torch_type, ort_type
 
 
-@unittest.skipIf(not enable_quantized_kv_tests, "Quantized KV is not enabled, skipping tests.")
+@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.")
 class TestXQAQuantizedParity(unittest.TestCase):
     """Tests that verify fused kernels produce the same results as unfused kernels."""
 