Commit 541d5da
[CUDA] Speed up flash attention build (#26924)
## Summary

This pull request significantly reduces the build time for Flash Attention by removing support for the less common head dimensions 160 and 224. It also adds a quick-build option, `--cmake_extra_defines onnxruntime_QUICK_BUILD=ON`, which builds the flash attention kernels only for float16 and head dimension 128 to speed up development iteration.

## Key Changes

### 1. Flash Attention Build Optimization
- **Removed Head Dimensions:** Deleted source files and kernel instantiations for head dimensions **160** and **224** (both FP16 and BF16). These dimensions are rarely used, and removing them reduces the number of kernels to compile, thereby speeding up the build.
- **Updated Dispatch Logic:** Modified `static_switch.h` and `flash_api.h` to remove the dispatch cases for `kHeadDim = 160` and `kHeadDim = 224`.

### 2. Test Enhancements
- **GQA Tests:** Updated `onnxruntime/test/python/transformers/test_gqa.py` to detect whether the package is a quick build. If it is, only the supported data type (float16) and head dimension (128) are tested for flash attention. The tests also use `has_flash_attention(bf16=True)` when checking for Flash Attention availability in BF16 tests, so they are skipped appropriately if the BF16 kernels are not compiled or available.

## Impact
- **Build Time:** Faster compilation of the CUDA provider due to fewer Flash Attention kernels.
- **Functionality:** Head dimensions 160 and 224 are no longer supported for Flash Attention. Models using these head dimensions fall back to the next supported head dimension, such as 192 or 256.

## Verification
- Validated that the build completes successfully with the reduced kernel set.
- `test_gqa.py` passes or skips correctly based on hardware support.
- Built the onnxruntime-gpu package with the `--cmake_extra_defines onnxruntime_QUICK_BUILD=ON` option and confirmed that the build info contains "quick-build=1", for example with the following Python script:

```python
import onnxruntime

print(onnxruntime.get_build_info())
```

The output looks like:

```
ORT Build Info: git-branch=main, git-commit-id=ecf164a945, quick-build=1, build type=Release
```
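As a hedged sketch of the detection described in the test changes above (the helper names here are hypothetical and not the actual `test_gqa.py` code), a test can recognize a quick-build package by looking for the `quick-build=1` marker in the build info and restrict the flash attention configurations it exercises:

```python
# Hypothetical sketch, not the actual test_gqa.py code: detect a quick-build
# package from the build info string and decide which flash attention
# configurations to exercise.
import onnxruntime


def is_quick_build() -> bool:
    # The CMake change appends "quick-build=1, " to ORT_BUILD_INFO, so the
    # marker shows up in get_build_info() for quick-build packages.
    return "quick-build=1" in onnxruntime.get_build_info()


def flash_config_supported(dtype: str, head_dim: int) -> bool:
    # In quick build mode, only fp16 kernels with head dimension 128 are compiled.
    if is_quick_build():
        return dtype == "float16" and head_dim == 128
    return True


if __name__ == "__main__":
    print("quick build:", is_quick_build())
    print("bfloat16 / head_dim=64 supported:", flash_config_supported("bfloat16", 64))
```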
1 parent 751af64 commit 541d5da

19 files changed, +89 −170 lines changed

cmake/CMakeLists.txt

Lines changed: 9 additions & 0 deletions
@@ -102,6 +102,7 @@ cmake_dependent_option(onnxruntime_USE_FLASH_ATTENTION "Build flash attention ke
 option(onnxruntime_USE_LEAN_ATTENTION "Build lean attention kernel for scaled dot product attention" OFF)
 cmake_dependent_option(onnxruntime_USE_MEMORY_EFFICIENT_ATTENTION "Build memory efficient attention kernel for scaled dot product attention" ON "onnxruntime_USE_CUDA" OFF)
 option(onnxruntime_USE_FPA_INTB_GEMM "Build FpA IntB gemm cuda kernels" OFF)
+option(onnxruntime_QUICK_BUILD "Speed up build by skipping some kernels for faster development" OFF)
 
 option(onnxruntime_BUILD_FOR_NATIVE_MACHINE "Enable this option for turning on optimization specific to this machine" OFF)
 option(onnxruntime_USE_AVX "Use AVX instructions" OFF)
@@ -789,6 +790,11 @@ if (onnxruntime_USE_CUDA)
     message( STATUS "Enable FpA IntB Gemm for CUDA EP")
     list(APPEND ORT_PROVIDER_FLAGS -DUSE_FPA_INTB_GEMM=1)
   endif()
+
+  if (onnxruntime_QUICK_BUILD)
+    message( STATUS "Quick build mode: Flash attention limited to fp16 only")
+    list(APPEND ORT_PROVIDER_FLAGS -DORT_QUICK_BUILD=1)
+  endif()
 endif()
 
 if (onnxruntime_USE_CUDA_INTERFACE AND (NOT onnxruntime_USE_CUDA))
@@ -1442,6 +1448,9 @@ if (Git_FOUND)
                   OUTPUT_VARIABLE ORT_GIT_BRANCH)
   string(STRIP "${ORT_GIT_BRANCH}" ORT_GIT_BRANCH)
   string(APPEND ORT_BUILD_INFO "git-branch=${ORT_GIT_BRANCH}, git-commit-id=${ORT_GIT_COMMIT}, ")
+  if (onnxruntime_QUICK_BUILD)
+    string(APPEND ORT_BUILD_INFO "quick-build=1, ")
+  endif()
 endif()
 string(APPEND ORT_BUILD_INFO "build type=${CMAKE_BUILD_TYPE}")
 configure_file(onnxruntime_config.h.in ${CMAKE_CURRENT_BINARY_DIR}/onnxruntime_config.h)

cmake/onnxruntime_providers_cpu.cmake

Lines changed: 10 additions & 0 deletions
@@ -25,6 +25,16 @@ file(GLOB_RECURSE onnxruntime_cuda_contrib_ops_cu_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/contrib_ops/cuda/*.cuh"
 )
 
+# Quick build mode: Filter out non-hdim128 flash attention kernels for faster development iteration
+if(onnxruntime_QUICK_BUILD)
+  message(STATUS "Quick build mode enabled: Only building hdim128 fp16 flash attention kernels")
+  # Filter non-hdim128 kernels
+  list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "flash_fwd.*hdim(32|64|96|160|192|224|256)")
+  # Filter all bfloat16 kernels (only keep fp16)
+  list(FILTER onnxruntime_cuda_contrib_ops_cu_srcs EXCLUDE REGEX "flash_fwd.*_bf16")
+endif()
+
+
 
 file(GLOB_RECURSE onnxruntime_js_contrib_ops_cc_srcs CONFIGURE_DEPENDS
   "${ONNXRUNTIME_ROOT}/contrib_ops/js/*.h"

onnxruntime/contrib_ops/cuda/bert/attention.cc

Lines changed: 4 additions & 4 deletions
@@ -116,10 +116,10 @@ Status Attention<T>::ComputeInternal(OpKernelContext* context) const {
       nullptr == present &&
       parameters.hidden_size == parameters.v_hidden_size &&
       nullptr == mask_index &&
-      onnxruntime::flash::is_supported(device_prop,
-                                       parameters.head_size,
-                                       parameters.num_heads,
-                                       parameters.num_heads);
+      onnxruntime::flash::is_supported<T>(device_prop,
+                                          parameters.head_size,
+                                          parameters.num_heads,
+                                          parameters.num_heads);
   // When input is packed QKV format, TensorRT kernel might be faster when sequence length <= 512.
   if (use_flash_attention && parameters.sequence_length < kernel_options_->MinSeqLenForFlashAttentionPackedQkv()) {
     use_flash_attention = false;

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_api.h

Lines changed: 22 additions & 2 deletions
@@ -32,6 +32,9 @@
 
 #include "core/providers/cuda/cuda_common.h"
 #include <tuple>
+#include <type_traits>
+#include <cutlass/numeric_types.h>
+#include <cuda_bf16.h>
 
 namespace onnxruntime {
 namespace flash {
@@ -89,8 +92,8 @@ Status mha_varlen_fwd(const cudaDeviceProp& dprops,
 Status mha_fwd_kvcache(const cudaDeviceProp& dprops,
                        cudaStream_t stream,
                        void* q,       // batch_size x seqlen_q x num_heads x head_size
-                       void* kcache,  // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x x head_size
-                       void* vcache,  // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x x head_size
+                       void* kcache,  // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x head_size
+                       void* vcache,  // batch_size x seqlen_k x num_heads_k x head_size or batch_size x num_heads_k seqlen_k x head_size
                        void* k,       // batch_size x seqlen_k_new x num_heads_k x head_size
                        void* v,       // batch_size x seqlen_k_new x num_heads_k x head_size
                        void* out,     // batch_size x seqlen_q x num_heads x head_size
@@ -131,6 +134,23 @@ std::tuple<size_t, size_t, size_t> get_num_splits_and_buffer_sizes(size_t batch_
 
 bool is_supported(const cudaDeviceProp& dprops, size_t head_size, size_t num_heads, size_t num_heads_k);
 
+// Template version that checks for bf16 type in quick build mode
+template <typename T>
+bool is_supported(const cudaDeviceProp& dprops, size_t head_size, size_t num_heads, size_t num_heads_k) {
+#ifdef ORT_QUICK_BUILD
+  // In quick build mode, only fp16 flash attention is built
+  constexpr bool is_bf16 = std::is_same<T, onnxruntime::BFloat16>::value;
+  if (is_bf16) {
+    return false;
+  }
+
+  if (head_size != 128) {
+    return false;
+  }
+#endif
+  return is_supported(dprops, head_size, num_heads, num_heads_k);
+}
+
 }  // namespace flash
 }  // namespace onnxruntime
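For Python test authors, here is a rough Python mirror of the quick-build branch in `is_supported<T>` above (illustrative only, not an onnxruntime API): bf16 and any head size other than 128 are rejected before the regular device check runs.

```python
# Illustrative mirror of the ORT_QUICK_BUILD branch in is_supported<T>;
# regular_check stands in for the original is_supported() device/head-size check.
def flash_attention_supported(dtype: str, head_size: int,
                              quick_build: bool, regular_check: bool) -> bool:
    if quick_build:
        if dtype == "bfloat16":
            return False  # only fp16 kernels are compiled in quick build mode
        if head_size != 128:
            return False  # only hdim128 kernels are compiled in quick build mode
    return regular_check


print(flash_attention_supported("bfloat16", 128, quick_build=True, regular_check=True))  # False
print(flash_attention_supported("float16", 128, quick_build=True, regular_check=True))   # True
print(flash_attention_supported("float16", 64, quick_build=False, regular_check=True))   # True
```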

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim160_bf16_sm80.cu

Lines changed: 0 additions & 18 deletions
This file was deleted.

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim160_fp16_sm80.cu

Lines changed: 0 additions & 18 deletions
This file was deleted.

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim224_bf16_sm80.cu

Lines changed: 0 additions & 18 deletions
This file was deleted.

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_hdim224_fp16_sm80.cu

Lines changed: 0 additions & 18 deletions
This file was deleted.

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_launch_template.h

Lines changed: 0 additions & 5 deletions
@@ -70,12 +70,9 @@ void run_flash_fwd(Flash_fwd_params& params, cudaStream_t stream) {
   // If head dim > 128, set IsEvenMNConst to false to reduce number of templates
   // If Is_local, set Is_causal to false
   auto kernel = &flash_fwd_kernel < Kernel_traits, Is_causal, Is_local && !Is_causal, Has_alibi, IsEvenMNConst && IsEvenKConst && !Is_local && Kernel_traits::kHeadDim <= 128, IsEvenKConst, Is_softcap, false > ;
-  // auto kernel = &flash_fwd_kernel<Kernel_traits, Is_causal, IsEvenMNConst, true, ReturnSoftmaxConst>;
   if (smem_size >= 48 * 1024) {
     cudaFuncSetAttribute(
         kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(smem_size));
-    // ORT_ENFORCE(cudaFuncSetAttribute(
-    //     kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size));
   }
   // int ctas_per_sm;
   // cudaError status_ = cudaOccupancyMaxActiveBlocksPerMultiprocessor(
@@ -112,8 +109,6 @@ void run_flash_splitkv_fwd(Flash_fwd_params& params, cudaStream_t stream) {
   auto kernel = &flash_fwd_splitkv_kernel < Kernel_traits, Is_causal, Is_Local_Const && !Is_causal, Has_alibi,
       IsEvenMNConst && !Append_KV_Const && IsEvenKConst && !Is_Local_Const && Kernel_traits::kHeadDim <= 128,
       IsEvenKConst, Is_softcap, SplitConst, Append_KV_Const >;
-  // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, true, Split, Append_KV_Const>;
-  // auto kernel = &flash_fwd_splitkv_kernel<Kernel_traits, Is_causal, false, IsEvenKConst>;
   if (smem_size >= 48 * 1024) {
     cudaFuncSetAttribute(
         kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, static_cast<int>(smem_size));

onnxruntime/contrib_ops/cuda/bert/flash_attention/flash_fwd_split_hdim160_bf16_sm80.cu

Lines changed: 0 additions & 15 deletions
This file was deleted.
