
Commit 2e831a7

clean up quick build flag
1 parent 242ee30

3 files changed: +3, -26 lines

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h

Lines changed: 0 additions & 9 deletions

@@ -145,15 +145,6 @@ bool is_supported(const cudaDeviceProp& dprops, size_t head_size, size_t num_hea
 template <typename T>
 bool is_supported(const cudaDeviceProp& dprops, size_t head_size, size_t num_heads, size_t num_heads_k) {
 #ifdef ORT_QUICK_BUILD
-
-#if ORT_QUICK_BUILD == 1
-  // In quick build mode, only fp16 flash attention is built
-  constexpr bool is_bf16 = std::is_same<T, onnxruntime::BFloat16>::value;
-  if (is_bf16) {
-    return false;
-  }
-#endif
-
   if (head_size != 128) {
     return false;
   }
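
Note: with the ORT_QUICK_BUILD == 1 branch gone, the quick-build gate in is_supported reduces to the head-size check alone; bf16 is no longer rejected at this level. A minimal sketch of the resulting behavior (an illustrative stand-alone function, not the ORT source):

#include <cstddef>

// Illustrative sketch only: after this commit the quick-build gate in
// is_supported checks head size alone; the element type (fp16 vs. bf16)
// is no longer consulted.
bool quick_build_gate(std::size_t head_size) {
  // Quick build compiles only the hdim128 flash-attention kernels.
  if (head_size != 128) {
    return false;
  }
  return true;  // the real function performs further checks, omitted here
}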

onnxruntime/contrib_ops/cuda/bert/flash_attention/static_switch.h

Lines changed: 0 additions & 9 deletions

@@ -66,14 +66,6 @@
 #define LOCAL_SWITCH BOOL_SWITCH
 #endif
 
-#if ORT_QUICK_BUILD == 1
-// Quick build mode: only fp16 kernels are compiled
-#define FP16_SWITCH(COND, ...) \
-  [&] { \
-    using elem_type = cutlass::half_t; \
-    return __VA_ARGS__(); \
-  }()
-#else
 #define FP16_SWITCH(COND, ...) \
   [&] { \
     if (COND) { \
@@ -84,7 +76,6 @@
       return __VA_ARGS__(); \
     } \
   }()
-#endif
 
 #ifdef ORT_QUICK_BUILD
 // Quick build mode: only hdim128 kernels are compiled
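
The FP16_SWITCH that survives is the usual static-switch idiom from the flash-attention sources: an immediately invoked lambda that binds elem_type per branch at compile time and forwards to the callable passed in. Below is a self-contained sketch of that idiom; the plain structs stand in for the CUTLASS element types, and the bfloat16_t else branch is an assumption, since the hunk above does not show it.

#include <cstdio>

// Plain stand-ins for the CUTLASS element types; illustrative only.
struct half_t { static constexpr const char* name = "fp16"; };
struct bfloat16_t { static constexpr const char* name = "bf16"; };

// Same shape as the retained FP16_SWITCH: an immediately invoked lambda
// lets the macro yield a value, and each branch binds elem_type at
// compile time before invoking the callable pasted in via __VA_ARGS__.
#define FP16_SWITCH(COND, ...)      \
  [&] {                             \
    if (COND) {                     \
      using elem_type = half_t;     \
      return __VA_ARGS__();         \
    } else {                        \
      using elem_type = bfloat16_t; \
      return __VA_ARGS__();         \
    }                               \
  }()

int main() {
  bool is_fp16 = false;
  // The lambda body is expanded into both branches, so elem_type resolves
  // to whichever alias the taken branch declared; this prints "bf16".
  FP16_SWITCH(is_fp16, [&] {
    std::printf("dispatching %s kernels\n", elem_type::name);
    return 0;
  });
  return 0;
}

Taking the callable through __VA_ARGS__ rather than a named parameter lets call sites contain commas without extra parentheses, which is why the BOOL_SWITCH family of macros is variadic.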

onnxruntime/test/python/transformers/test_gqa.py

Lines changed: 3 additions & 8 deletions

@@ -52,9 +52,6 @@
 # When quick build is used, flash attention only supports head_size=128
 quick_build = ", quick-build=" in get_build_info()
 
-# When quick build mode is 1, bf16 is excluded
-quick_build_exclude_bf16 = ", quick-build=1, " in get_build_info()
-
 enable_debug_print = quick_build
 
 enable_deterministic_check = True
@@ -2057,9 +2054,7 @@ def has_cuda_device(min_capability: int = 80):
     return major * 10 + minor >= min_capability
 
 
-def has_flash_attention(bf16: bool = False):
-    if bf16 and quick_build_exclude_bf16:
-        return False
+def has_flash_attention():
     return has_cuda_device(80)
 
 
@@ -2151,7 +2146,7 @@ def test_gqa_quantized_prompt(self, name, config):
         )
 
 
-@unittest.skipIf(not has_flash_attention(bf16=True), "Flash Attention is not available, skipping tests.")
+@unittest.skipIf(not has_flash_attention(), "Flash Attention is not available, skipping tests.")
 class TestFlashGQABF16(unittest.TestCase):
     @parameterized.expand(gqa_cuda_prompt_test_cases())
     def test_gqa_prompt_flash_attention_bf16(self, name, config):
@@ -2199,7 +2194,7 @@ def test_gqa_past_flash_attention_bf16(self, name, config):
 
 
 @unittest.skipIf(
-    not has_flash_attention(bf16=True) or not enable_quantized_kv_tests,
+    not has_flash_attention() or not enable_quantized_kv_tests,
     "Flash Attention is not available, skipping tests.",
 )
 class TestFlashGQABF16QuantizedKV(unittest.TestCase):
