Skip to content

Commit 847af74

Browse files
authored
Merge pull request #1 from zhou-yuxin/add-skip-softmax
Add skip softmax
2 parents 2ea7730 + 0e5974f commit 847af74

10 files changed

Lines changed: 411 additions & 95 deletions

File tree

csrc/fmha_v2/fmha/warpspec/compute.h

Lines changed: 38 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -179,7 +179,7 @@ struct Compute {
179179
USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \
180180
: (q_step_idx * STEP_Q + head_info.q_tile_offset), \
181181
kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \
182-
kv_step_idx == kv_idx_end - 1);
182+
&shared->skip_softmax_votes[kv_step_idx & 1][warpgroup_id], kv_step_idx == kv_idx_end - 1);
183183

184184
////////////////////////////////////////////////////////////////////////////////////////////////
185185

@@ -277,6 +277,12 @@ struct Compute {
277277
int const actual_kv_seqlen =
278278
SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen;
279279

280+
// Update threshold of Skip-Softmax
281+
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) {
282+
softmax.skip_softmax_threshold =
283+
params.skip_softmax_threshold_scale_factor / actual_kv_seqlen;
284+
}
285+
280286
// Calculate the alibi head_scaling_factor.
281287
float alibi_head_scale = APPLY_ALIBI ? get_alibi_head_scaling_factor<AlibiParams>(
282288
head_info.bidh, params.alibi_params)
@@ -411,6 +417,12 @@ struct Compute {
411417
}
412418
}
413419
}
420+
#ifdef SKIP_SOFTMAX_STAT
421+
if (tidx == 0) {
422+
atomicAdd(params.skip_softmax_total_blocks, softmax.total_blocks);
423+
atomicAdd(params.skip_softmax_skipped_blocks, softmax.skipped_blocks);
424+
}
425+
#endif
414426
}
415427

416428
////////////////////////////////////////////////////////////////////////////////////////////////
@@ -421,7 +433,14 @@ struct Compute {
421433
float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M], int const tidx,
422434
int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset,
423435
int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr,
424-
Circular_buffer_kv_reader& cbr_v, OrderedMutexAccessor& mutex, bool complete = false) {
436+
Circular_buffer_kv_reader& cbr_v, OrderedMutexAccessor& mutex, uint32_t* skip_softmax_vote,
437+
bool complete = false) {
438+
// Skip-softmax vote initialization
439+
if (tidx == 0) {
440+
// Note that we need a named_barrier_wait in compute_single_tile to make sure init is before
441+
// voting.
442+
*skip_softmax_vote = 1;
443+
}
425444
// load the scales of K/V from global memory
426445
#define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \
427446
if constexpr (block_size > 0) { \
@@ -453,6 +472,10 @@ struct Compute {
453472
// Ctile_p is only used once by each n step.
454473
ctile_p.clear();
455474

475+
// If skip_softmax is enabled, make sure there is no racing between the initialization and
476+
// writing of skip_softmax_vote.
477+
named_barrier_wait(Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + threadIdx.x / 128, 128);
478+
456479
// BMM1 (Q x K').
457480
warpgroup_arrive();
458481

@@ -513,8 +536,19 @@ struct Compute {
513536
softmax.apply_alibi_and_mask<APPLY_MASK>(ctile_p, params.alibi_params, alibi_head_scale,
514537
actual_kv_seqlen, row_offset, col_offset);
515538

516-
// Softmax Exp, max/sum, and update scales.
517-
softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum);
539+
// Softmax Exp, max/sum, and update scales. If returns false we skip the rest.
540+
if (!softmax.compute_and_update_scale<IS_FIRST_COL>(p_max, p_sum, skip_softmax_vote)) {
541+
if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1) {
542+
// Notify another warpgroup to execute QGMMA.
543+
mutex.named_bar_arrive();
544+
}
545+
// Need to wait V, otherwise compute-sanitizer synccheck will fail.
546+
int ready2 = cbr_v.peek();
547+
if (!ready2) {
548+
cbr_v.wait();
549+
}
550+
return;
551+
}
518552

519553
// experiments show that here is the best place to load scales of V
520554
float scales_v[SAGE_BLOCKS_PER_STEP_V];

csrc/fmha_v2/fmha/warpspec/epilogue.h

Lines changed: 119 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,8 @@
1616
#include <fmha/traits.h>
1717
#include <fmha/utils.h>
1818

19+
#include "fmha/hopper/arrive_wait.h"
20+
1921
namespace fmha {
2022
namespace ws {
2123

@@ -71,6 +73,9 @@ struct Softmax_base {
7173
// Whether we need to check if local_max could be -inf or not.
7274
enum { CHECK_IF_NEG_INF_EXISTS = SLIDING_OR_CHUNKED_ATTENTION || USE_CUSTOM_MASK };
7375

76+
// There are 2 warpgroups, so named barrier IDs 0x3 and 0x4 are used
77+
enum { SKIP_SOFTMAX_BARRIER = Kernel_traits::SKIP_SOFTMAX_BARRIER_ID };
78+
7479
// Ctor.
7580
template <typename Params>
7681
inline __device__ Softmax_base(Params params, int tidx)
@@ -80,7 +85,12 @@ struct Softmax_base {
8085
sliding_window_size_(params.sliding_window_size),
8186
log2_chunked_attention_size_(params.log2_chunked_attention_size),
8287
packed_mask_ptr_{reinterpret_cast<uint32_t*>(params.packed_mask_ptr)},
83-
params_packed_mask_stride_in_bytes_{params.packed_mask_stride_in_bytes} {
88+
params_packed_mask_stride_in_bytes_{params.packed_mask_stride_in_bytes},
89+
#ifdef SKIP_SOFTMAX_STAT
90+
total_blocks(0),
91+
skipped_blocks(0),
92+
#endif
93+
skip_softmax_threshold(0) {
8494
int warp = tidx / 32;
8595
int lane = tidx % 32;
8696
// The corresponding row/col for each thread after MMA.
@@ -253,25 +263,67 @@ struct Softmax_base {
253263
}
254264

255265
// Calculate max/sum, and update flash-attention scales.
266+
// Returns false if skipped due to skip-softmax attention feature.
256267
template <bool IS_FIRST_COL>
257-
inline __device__ void compute_and_update_scale(float (&global_max)[Mma_tile_p::CORES_M],
258-
float (&global_sum)[Mma_tile_p::CORES_M]) {
268+
inline __device__ bool compute_and_update_scale(float (&global_max)[Mma_tile_p::CORES_M],
269+
float (&global_sum)[Mma_tile_p::CORES_M],
270+
uint32_t* skip_softmax_vote) {
259271
float const scale = reinterpret_cast<float const&>(scale_bmm1_);
260272

273+
// whether this warpgroup skips the softmax
274+
constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
275+
bool skip = may_skip;
276+
261277
// Row-wise max of current tile.
262278
#pragma unroll
263279
for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++) {
264-
if (IS_FIRST_COL) {
265-
local_max_[mi] = elt_[mi][0];
266-
} else {
267-
local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
268-
}
280+
local_max_[mi] = elt_[mi][0];
269281
#pragma unroll
270282
for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++) {
271283
local_max_[mi] = fmaxf(local_max_[mi], elt_[mi][ni]);
272284
}
273285
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
274286
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
287+
288+
if constexpr (may_skip) {
289+
// AND(&) the CORES_M results, then `skip` means whether to skip
290+
// the CORES_M(=2) rows
291+
if constexpr (!EXP2F_OPTIMIZATION) {
292+
skip &= expf(local_max_[mi] - global_max[mi]) < skip_softmax_threshold;
293+
} else {
294+
skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < skip_softmax_threshold;
295+
}
296+
}
297+
298+
if (!IS_FIRST_COL) {
299+
local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
300+
}
301+
}
302+
303+
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) {
304+
#ifdef SKIP_SOFTMAX_STAT
305+
total_blocks++;
306+
#endif
307+
if constexpr (may_skip) {
308+
// AND(&) the results together in a warp, then `skip` means whether to skip
309+
// all the 16 rows managed by this warp.
310+
// each 4 threads (e.g. T0~T3) have the same `skip`, only 0x11111111 is needed
311+
// instead of 0xffffffff. But the perf is the same.
312+
skip = __all_sync(0xffffffff, skip);
313+
if (threadIdx.x % 32 == 0) {
314+
// The leader of each warp votes.
315+
atomicAnd(skip_softmax_vote, uint32_t(skip));
316+
}
317+
// WG0 uses 0x3 barrier, WG1 uses 0x4 barrier
318+
named_barrier_wait(SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
319+
skip = *((uint32_t volatile*)skip_softmax_vote);
320+
if (skip) {
321+
#ifdef SKIP_SOFTMAX_STAT
322+
skipped_blocks++;
323+
#endif
324+
return false;
325+
}
326+
}
275327
}
276328

277329
// Softmax Exp.
@@ -339,6 +391,7 @@ struct Softmax_base {
339391
global_max[mi] = max_new;
340392
}
341393
}
394+
return true;
342395
}
343396

344397
// Update flash attention scales and pack elements for BMM2.
@@ -407,6 +460,13 @@ struct Softmax_base {
407460
float correction_[Mma_tile_p::CORES_M];
408461
// The packed mask.
409462
uint4 packed_mask_;
463+
// Skip softmax when exp(local_max - global_max) < skip_softmax_threshold.
464+
float skip_softmax_threshold;
465+
#ifdef SKIP_SOFTMAX_STAT
466+
// Statistics of skip-softmax
467+
uint32_t total_blocks;
468+
uint32_t skipped_blocks;
469+
#endif
410470
};
411471

412472
////////////////////////////////////////////////////////////////////////////////////////////////////
@@ -676,29 +736,72 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
676736
inline __device__ Softmax(Params const& params, int tidx) : Base(params, tidx) {}
677737

678738
// Calculate max/sum, and update flash-attention scales.
739+
// Returns false if skipped due to skip-softmax attention feature.
679740
template <bool IS_FIRST_COL>
680-
inline __device__ void compute_and_update_scale(float (&global_max)[Mma_tile_p::CORES_M],
681-
float (&global_sum)[Mma_tile_p::CORES_M]) {
741+
inline __device__ bool compute_and_update_scale(float (&global_max)[Mma_tile_p::CORES_M],
742+
float (&global_sum)[Mma_tile_p::CORES_M],
743+
uint32_t* skip_softmax_vote) {
682744
float const scale = reinterpret_cast<float const&>(this->scale_bmm1_);
683745
float(&local_max_)[Mma_tile_p::CORES_M] = this->local_max_;
684746
float(&local_sum_)[Mma_tile_p::CORES_M] = this->local_sum_;
685747
float(&correction_)[Mma_tile_p::CORES_M] = this->correction_;
686748
float(&elt_)[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2] = this->elt_;
687749

750+
// whether this warpgroup skips the softmax
751+
constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL;
752+
bool skip = may_skip;
753+
688754
// Row-wise max of current tile.
689755
#pragma unroll
690756
for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++) {
691-
if (IS_FIRST_COL) {
692-
local_max_[mi] = elt_[mi][0];
693-
} else {
694-
local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]);
695-
}
757+
local_max_[mi] = elt_[mi][0];
696758
#pragma unroll
697759
for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++) {
698760
local_max_[mi] = fmaxf(local_max_[mi], elt_[mi][ni]);
699761
}
700762
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]);
701763
local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]);
764+
// AND(&) the CORES_M results, then `skip` means whether to skip
765+
// the CORES_M(=2) rows
766+
if constexpr (may_skip) {
767+
// AND(&) the CORES_M results, then `skip` means whether to skip
768+
// the CORES_M(=2) rows
769+
if constexpr (!EXP2F_OPTIMIZATION) {
770+
skip &= expf(local_max_[mi] - global_max[mi]) < this->skip_softmax_threshold;
771+
} else {
772+
skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < this->skip_softmax_threshold;
773+
}
774+
}
775+
if (!IS_FIRST_COL) {
776+
local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]);
777+
}
778+
}
779+
780+
if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) {
781+
#ifdef SKIP_SOFTMAX_STAT
782+
this->total_blocks++;
783+
#endif
784+
785+
if constexpr (may_skip) {
786+
// AND(&) the results together in a warp, then `skip` means whether to skip
787+
// all the 16 rows managed by this warp.
788+
// each 4 threads (e.g. T0~T3) have the same `skip`, only 0x11111111 is needed
789+
// instead of 0xffffffff. But the perf is the same.
790+
skip = __all_sync(0xffffffff, skip);
791+
if (threadIdx.x % 32 == 0) {
792+
// The leader of each warp votes.
793+
atomicAnd(skip_softmax_vote, uint32_t(skip));
794+
}
795+
// WG0 uses 0x3 barrier, WG1 uses 0x4 barrier
796+
named_barrier_wait(Base::SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128);
797+
skip = *((uint32_t volatile*)skip_softmax_vote);
798+
if (skip) {
799+
#ifdef SKIP_SOFTMAX_STAT
800+
this->skipped_blocks++;
801+
#endif
802+
return false;
803+
}
804+
}
702805
}
703806

704807
// Softmax Exp.
@@ -774,6 +877,7 @@ struct Softmax<Hopper_qgmma_e4m3_fp32_traits, Kernel_traits>
774877
global_max[mi] = max_new;
775878
}
776879
}
880+
return true;
777881
}
778882

779883
// Update flash attention scales and pack elements for BMM2.

csrc/fmha_v2/fmha/warpspec/kernel_traits.h

Lines changed: 32 additions & 12 deletions
Original file line numberDiff line numberDiff line change
@@ -65,6 +65,8 @@ template <
6565
bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false,
6666
// Save softmax stats ?
6767
bool RETURN_SOFTMAX_STATS_ = false,
68+
// Enable skip softmax attention feature
69+
bool ENABLE_SKIP_SOFTMAX_ = false,
6870
// The output type (only used by fp8 kernels).
6971
typename OutputType = typename Instruction_traits<STEP_Q_, STEP_KV_, 0, false, false>::A_type,
7072
// The sage attention block size for Q, K and V
@@ -189,6 +191,9 @@ struct Kernel_traits {
189191
// Use the custom mask input ( attention_mask_type == 3.)
190192
enum { USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 3 };
191193

194+
// Are we enabling skip softmax attention feature?
195+
enum { ENABLE_SKIP_SOFTMAX = ENABLE_SKIP_SOFTMAX_ };
196+
192197
static_assert(!USE_CUSTOM_MASK || STEP_KV == 64 || STEP_KV == 128 || STEP_KV == 256,
193198
"Not implemented!");
194199

@@ -250,6 +255,8 @@ struct Kernel_traits {
250255
// Named barrier ids
251256
static constexpr int DMA_SYNC_BARRIER_ID = 0x1;
252257
static constexpr int MMA_SYNC_BARRIER_ID = 0x2;
258+
// There are 2 warpgroups so 0x3 and 0x4 are used for skip-softmax
259+
static constexpr int SKIP_SOFTMAX_BARRIER_ID = 0x3;
253260

254261
// How many threads get involved in the dma group.
255262
enum { NUM_THREADS_IN_DMA_GROUP = DMA_GROUP_TRANSPOSE_V ? 128 : (PAGED_KV_INPUT ? 1 : 32) };
@@ -383,6 +390,11 @@ struct Kernel_traits {
383390
// Mutex
384391
OrderedMutex compute_mutex;
385392

393+
// 4 warps in a warpgroup vote to an atomic variable in shared memory
394+
// to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive
395+
// KV_STEPS.
396+
uint32_t skip_softmax_votes[2][NUM_COMPUTE_GROUPS];
397+
386398
inline __device__ void init(int tid0) {
387399
#pragma unroll
388400
for (int i = 0; i < NUM_COMPUTE_GROUPS; i++) {
@@ -439,24 +451,27 @@ template < // The step size in query sequence dimension (M of BMM1 and BMM2).
439451
bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false,
440452
// Save softmax stats ?
441453
bool RETURN_SOFTMAX_STATS_ = false,
454+
// Enable skip softmax attention feature
455+
bool ENABLE_SKIP_SOFTMAX_ = false,
442456
// The output type (only used by fp8 kernels).
443457
typename OutputType = e4m3_t,
444458
// The sage attention block size for Q, K and V
445459
int SAGE_BLOCK_SIZE_Q_ = 0, int SAGE_BLOCK_SIZE_K_ = 0, int SAGE_BLOCK_SIZE_V_ = 0>
446460
struct Kernel_traits_Hopper_qgmma_e4m3_fp32
447-
: public Kernel_traits<Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_,
448-
KV_BUFFERS_, NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_,
449-
ATTENTION_MASK_TYPE_, HEADS_INTERLEAVED_, APPLY_ALIBI_, ENABLE_MUTEX_,
450-
SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_,
451-
ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_, OutputType,
452-
SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_> {
461+
: public Kernel_traits<
462+
Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_, KV_BUFFERS_,
463+
NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_, ATTENTION_MASK_TYPE_, HEADS_INTERLEAVED_,
464+
APPLY_ALIBI_, ENABLE_MUTEX_, SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_,
465+
ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_, ENABLE_SKIP_SOFTMAX_, OutputType,
466+
SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_> {
453467
// Base class.
454-
using Base = Kernel_traits<Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_,
455-
KV_BUFFERS_, NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_,
456-
ATTENTION_MASK_TYPE_, HEADS_INTERLEAVED_, APPLY_ALIBI_, ENABLE_MUTEX_,
457-
SCHEDULING_MODE_, INPUT_LAYOUT_, USE_TMA_STORE_,
458-
ENABLE_BMM1_SOFTCAPPING_SCALE_, RETURN_SOFTMAX_STATS_, OutputType,
459-
SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>;
468+
using Base =
469+
Kernel_traits<Hopper_qgmma_e4m3_fp32_traits, STEP_Q_, STEP_KV_, D_, DV_, Q_BUFFERS_,
470+
KV_BUFFERS_, NUM_COMPUTE_GROUPS_, DMA2COMPUTE_DEPTH_, ATTENTION_MASK_TYPE_,
471+
HEADS_INTERLEAVED_, APPLY_ALIBI_, ENABLE_MUTEX_, SCHEDULING_MODE_,
472+
INPUT_LAYOUT_, USE_TMA_STORE_, ENABLE_BMM1_SOFTCAPPING_SCALE_,
473+
RETURN_SOFTMAX_STATS_, ENABLE_SKIP_SOFTMAX_, OutputType, SAGE_BLOCK_SIZE_Q_,
474+
SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>;
460475

461476
enum { USE_TMA_STORE = USE_TMA_STORE_ };
462477

@@ -549,6 +564,11 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32
549564
// Mutex
550565
OrderedMutex compute_mutex;
551566

567+
// 4 warps in a warpgroup vote to an atomic variable in shared memory
568+
// to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive
569+
// STEP_KVs.
570+
uint32_t skip_softmax_votes[2][Base::NUM_COMPUTE_GROUPS];
571+
552572
inline __device__ void init(int tid0) {
553573
#pragma unroll
554574
for (int i = 0; i < Base::NUM_COMPUTE_GROUPS; i++) {

0 commit comments

Comments
 (0)