Commit 4be9fd3

Flash FNA bwd allclosed, added Python frontend, all tests pass
Parent: 95c7250

28 files changed: +3172 -581 lines

csrc/CMakeLists.txt

Lines changed: 2 additions & 0 deletions
@@ -161,6 +161,7 @@ file(GLOB AUTOGEN_REFERENCE ./autogen/src/cuda/reference/*.cu)
 file(GLOB AUTOGEN_FNA ./autogen/src/cuda/fna/*.cu)
 file(GLOB AUTOGEN_FMHA ./autogen/src/cuda/fmha/*.cu)
 file(GLOB AUTOGEN_FLASH_FNA ./autogen/src/cuda/flash_fna/*.cu)
+file(GLOB AUTOGEN_FLASH_FNA_BWD ./autogen/src/cuda/flash_fna_bwd/*.cu)
 file(GLOB AUTOGEN_FLASH_FMHA ./autogen/src/cuda/flash_fmha/*.cu)
 file(GLOB AUTOGEN_FLASH_FMHA_BWD ./autogen/src/cuda/flash_fmha_bwd/*.cu)
 if(${NATTEN_WITH_HOPPER_FNA})

@@ -177,6 +178,7 @@ file(GLOB ALL_SOURCES
   ${AUTOGEN_FNA}
   ${AUTOGEN_FMHA}
   ${AUTOGEN_FLASH_FNA}
+  ${AUTOGEN_FLASH_FNA_BWD}
   ${AUTOGEN_FLASH_FMHA}
   ${AUTOGEN_FLASH_FMHA_BWD}
   ${AUTOGEN_BLACKWELL_FNA}
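These globs pull the new backward-pass autogen directory into the build, mirroring the existing FLASH_FMHA_BWD entries. The generated sources themselves are not part of this diff; a hypothetical sketch of what one such unit might contain, where the function name, tile shapes, and Causal encoding are all assumptions, not taken from this commit:

    // Hypothetical autogen instantiation unit under autogen/src/cuda/flash_fna_bwd/.
    #include <natten/cuda/flash_fna/flash_fna_backward.cuh>

    namespace natten::cuda::flash_fna {

    void flash_fna_bwd_2d_sm80_half_headdim64( // assumed naming scheme
        Flash_fna_bwd_params<cute::tuple<int, int>> params, cudaStream_t stream) {
      FlashFnaBackwardKernel<
          /*Arch=*/80, /*Element=*/cutlass::half_t, /*HeadDim=*/64,
          /*kBlockM=*/64, /*kBlockN=*/128,
          /*NADim=*/cute::tuple<int, int>,                      // 2-D problem (assumed)
          /*QTileShape=*/cute::Shape<cute::_8, cute::_8>,       // 8x8 = 64 queries (assumed)
          /*KVTileShape=*/cute::Shape<cute::_8, cute::_16>,     // 8x16 = 128 keys (assumed)
          /*Causal=*/cute::tuple<cute::false_type, cute::false_type>, // per-dim flags (assumed)
          /*Deterministic=*/false>{}
          .run(params, stream);
    }

    } // namespace natten::cuda::flash_fna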

csrc/include/natten/cuda/flash_fna/flash_fna_backward.cuh

Lines changed: 8 additions & 2 deletions
@@ -86,10 +86,12 @@ constexpr Config get_config() {
 }


-template <int Arch, typename Element, int HeadDim, int kBlockM, int kBlockN, bool Deterministic>
+template <int Arch, typename Element, int HeadDim, int kBlockM, int kBlockN,
+          class NADim, class QTileShape, class KVTileShape, class Causal,
+          bool Deterministic>
 struct FlashFnaBackwardKernel {

-  void run(Flash_fna_bwd_params params, cudaStream_t stream) {
+  void run(Flash_fna_bwd_params<NADim> params, cudaStream_t stream) {

     static constexpr Config config = get_config<HeadDim, Arch>();

@@ -99,6 +101,10 @@ struct FlashFnaBackwardKernel {
       /* kBlockM= */ kBlockM,
       /* kBlockN= */ kBlockN,
       /* Element= */ Element,
+      /* NADim= */ NADim,
+      /* QTileShape= */ QTileShape,
+      /* KVTileShape= */ KVTileShape,
+      /* Causal= */ Causal,
       /* Deterministic= */ Deterministic,
       /* GQA= */ false,
       /* Stages_dO= */ config.Stages_dO,
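The backward kernel is now specialized on the neighborhood-attention geometry at compile time. A minimal sketch of what the four new parameters could bind to for a 2-D problem (all concrete choices below are assumptions for illustration; the real ones come from the autogen layer):

    // Illustrative bindings only; not taken from this commit.
    using NADim       = cute::tuple<int, int>;            // runtime extent per spatial dim
    using QTileShape  = cute::Shape<cute::_8, cute::_8>;  // 8x8 = 64 queries per M-tile
    using KVTileShape = cute::Shape<cute::_8, cute::_16>; // 8x16 = 128 keys per N-tile
    using Causal      = cute::tuple<cute::false_type,     // compile-time causal flag
                                    cute::false_type>;    // per spatial dimension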

csrc/include/natten/cuda/flash_fna/flash_kernel/block.h

Lines changed: 61 additions & 0 deletions
@@ -58,6 +58,67 @@ struct NABlockMN {

     return {kv_start, kv_diff_tiles};
   }
+
+  static
+  CUTLASS_DEVICE
+  cute::tuple<NADim, NADim> get_m_block_min_max(
+      SeqlenInfo_t const& seqlen_info,
+      int const n_block, int const bidb,
+      cutlass::FastDivmod const& qhead_per_khead_divmod,
+      // NA Args
+      NADim kv_shape, NADim qkv_shape,
+      NADim window_size, NADim window_left, NADim window_right, NADim stride
+  ) {
+
+    auto stride_group_offset = get_bwd_stride_offset(stride);
+
+    auto q_tile_shape = QTileShape{};
+    auto kv_tile_shape = KVTileShape{};
+
+    auto kv_tiled = ceil_div(kv_shape, kv_tile_shape);
+
+    // Map KV index back to coord
+    auto kv_tile_coord = idx2crd(n_block, kv_tiled);
+    auto kv_coord = tuple_mul(kv_tile_coord, kv_tile_shape);
+
+    auto kv_tile_offset_last = idx2crd(size(kv_tile_shape) - 1, kv_tile_shape);
+    auto kv_coord_last = tuple_add(kv_coord, kv_tile_offset_last);
+
+    // Q start and end instead of KV, like in the forward pass
+    auto q_start_actual = get_bwd_window_start<Causal>(
+        kv_coord,
+        stride_group_offset,
+        window_left,
+        window_right,
+        window_size,
+        stride,
+        qkv_shape);
+
+    auto last_q_start_actual = get_bwd_window_start<Causal>(
+        kv_coord_last,
+        stride_group_offset,
+        window_left,
+        window_right,
+        window_size,
+        stride,
+        qkv_shape);
+    auto q_end_actual = get_bwd_window_end<Causal>(
+        kv_coord_last,
+        stride_group_offset,
+        window_left,
+        window_right,
+        window_size,
+        stride,
+        qkv_shape);
+
+    auto q_start = floor_tuple(q_start_actual, q_tile_shape);
+    auto q_end = ceil_tuple(q_end_actual, q_tile_shape);
+
+    auto q_diff = tuple_sub(q_end, q_start);
+    auto q_diff_tiles = ceil_div(q_diff, q_tile_shape);
+
+    return {q_start, q_diff_tiles};
+  }
 };

 template <class SeqlenInfo_t, int kBlockM, int kBlockN, bool PackGQA=false>
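In words: each backward N-block (a KV tile) must visit exactly the Q tiles whose attention windows reach it, with the true start and end rounded outward to tile boundaries. A 1-D, stride-1, non-causal scalar sketch of the same computation, with illustrative names, ignoring dilation and the window recentring real neighborhood attention applies near boundaries:

    #include <algorithm>

    struct QBlockRange {
      int q_start_tile;  // first Q tile that touches this KV tile
      int num_q_tiles;   // how many Q tiles to visit
    };

    QBlockRange m_block_min_max_1d(int n_block, int kv_tile, int q_tile,
                                   int window_left, int window_right, int seqlen) {
      int kv_first = n_block * kv_tile;                            // first KV index in tile
      int kv_last  = std::min(kv_first + kv_tile - 1, seqlen - 1); // last KV index in tile
      // Backward inverts the forward window: KV position k receives contributions
      // from queries in [k - window_right, k + window_left], clamped to the sequence.
      int q_start = std::max(kv_first - window_right, 0);
      int q_end   = std::min(kv_last + window_left, seqlen - 1);
      int first_tile = q_start / q_tile;           // floor_tuple analogue
      int last_tile  = (q_end + q_tile) / q_tile;  // ceil_tuple analogue (exclusive bound)
      return {first_tile, last_tile - first_tile};
    }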
Lines changed: 234 additions & 0 deletions (new file)
@@ -0,0 +1,234 @@
+
+
+#pragma once
+
+#include <cute/tensor.hpp>
+#include "cute/util/debug.hpp"
+#include "cute/util/print.hpp"
+
+#include "cutlass/fast_math.h" // For cutlass::FastDivmod
+
+#include "utils.h"
+#include "na_utils.h"
+// #include "natten/cuda/flash_fmha/utils.h"
+
+namespace natten {
+namespace cuda {
+namespace flash_fna {
+
+using namespace cute;
+
+template <int kBlockM, int kBlockN, class NADim, class QTileShape, class KVTileShape, class Causal,
+          bool PackGQA, typename TiledMma, class IterMapType, bool SwapAB=false>
+struct BwdNAMask {
+  static_assert(!(PackGQA && SwapAB), "Cannot be both PackGQA and SwapAB");
+  int const thread_idx;
+  int const seqlen_q, seqlen_k;
+
+  NADim window_size;
+  NADim window_left;
+  NADim window_right;
+  NADim stride;
+  NADim qkv_shape;
+  NADim q_shape;
+  NADim kv_shape;
+  NADim q_blk_offset;
+  NADim q_diff_tiles;
+
+  bool is_fully_block_sparse;
+  bool has_q_padding;
+
+  IterMapType iter_to_tile_map;
+  cutlass::FastDivmod const qhead_per_khead_divmod;
+
+  CUTLASS_DEVICE
+  BwdNAMask(const int thread_idx, const int seqlen_q, const int seqlen_k,
+            cutlass::FastDivmod const& qhead_per_khead_divmod,
+            NADim window_size, NADim window_left, NADim window_right, NADim stride,
+            NADim qkv_shape, NADim q_shape, NADim kv_shape,
+            NADim q_blk_offset, NADim q_diff_tiles, IterMapType iter_to_tile_map,
+            bool is_fully_block_sparse, bool has_q_padding) :
+      thread_idx(thread_idx),
+      seqlen_q(seqlen_q),
+      seqlen_k(seqlen_k),
+      qhead_per_khead_divmod(qhead_per_khead_divmod),
+      window_size(window_size),
+      window_left(window_left),
+      window_right(window_right),
+      stride(stride),
+      qkv_shape(qkv_shape),
+      q_shape(q_shape),
+      kv_shape(kv_shape),
+      q_blk_offset(q_blk_offset),
+      q_diff_tiles(q_diff_tiles),
+      iter_to_tile_map(iter_to_tile_map),
+      is_fully_block_sparse(is_fully_block_sparse),
+      has_q_padding(has_q_padding) {}
+
+  template <typename Engine, typename Layout>
+  CUTLASS_DEVICE
+  void apply_na_mask(Tensor<Engine, Layout>& tSrS, const int m_block, const int n_block) {
+    auto thread_mma = TiledMma{}.get_thread_slice(thread_idx);
+
+    Tensor cS = cute::make_identity_tensor(Shape<Int<!SwapAB ? kBlockM : kBlockN>, Int<!SwapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+
+    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(tScS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+
+    if constexpr (!SwapAB) {
+      tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * m_block * kBlockM + E<0>{} * size(q_blk_offset);
+      tScS_rowcol.data() = tScS_rowcol.data() + E<1>{} * n_block * kBlockN;
+    }
+    else {
+      tScS_rowcol.data() = tScS_rowcol.data() + E<1>{} * m_block * kBlockM + E<1>{} * size(q_blk_offset);
+      tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * n_block * kBlockN;
+    }
+    // tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * m_block * kBlockM + E<0>{} * size(q_blk_offset);
+    // tScS_rowcol.data() = tScS_rowcol.data() + E<1>{} * n_block * kBlockN;
+
+    auto q_tile_shape = QTileShape{};
+    auto kv_tile_shape = KVTileShape{};
+
+    auto stride_group_offset = get_bwd_stride_offset(stride);
+
+    auto kv_tiled = ceil_div(kv_shape, kv_tile_shape);
+
+    auto [q_idx_first, kv_idx_first] = tScS_rowcol(0);
+    if constexpr (SwapAB) {
+      auto tmp = q_idx_first;
+      q_idx_first = kv_idx_first;
+      kv_idx_first = tmp;
+    }
+
+    // KV coord remap
+    int kv_tile_idx = kv_idx_first / size(kv_tile_shape);
+    auto kv_tile_coord = idx2crd(kv_tile_idx, kv_tiled);
+    auto kv_tile_offset = tuple_mul(kv_tile_coord, kv_tile_shape);
+    int kv_idx_first_in_tile = kv_tile_idx * size(kv_tile_shape);
+    auto kv_ctr = make_identity_tensor(kv_tile_shape);
+    auto kv_ctr_offset = domain_offset(kv_tile_offset, kv_ctr);
+
+    // Q coord remap
+    int q_tile_idx = m_block;
+    auto q_tile_coord = idx2crd(q_tile_idx, q_diff_tiles);
+    auto q_tile_offset = tuple_add(q_blk_offset, tuple_mul(q_tile_coord, q_tile_shape));
+    int q_idx_first_in_tile = (q_tile_idx * size(q_tile_shape)) + size(q_blk_offset);
+
+    auto q_ctr = make_identity_tensor(q_tile_shape);
+    auto q_ctr_offset = domain_offset(q_tile_offset, q_ctr);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tSrS_rowcol); i++) {
+      auto [q_idx, kv_idx] = tScS_rowcol(i);
+
+      if constexpr (SwapAB) {
+        auto tmp = q_idx;
+        q_idx = kv_idx;
+        kv_idx = tmp;
+      }
+
+      auto q_coord = q_ctr_offset(q_idx - q_idx_first_in_tile);
+      auto kv_coord = kv_ctr_offset(kv_idx - kv_idx_first_in_tile);
+
+      auto q_start = get_bwd_window_start<Causal>(
+          kv_coord,
+          stride_group_offset,
+          window_left,
+          window_right,
+          window_size,
+          stride,
+          qkv_shape);
+      auto q_end = get_bwd_window_end<Causal>(
+          kv_coord,
+          stride_group_offset,
+          window_left,
+          window_right,
+          window_size,
+          stride,
+          qkv_shape);
+
+      bool is_neigh = is_neighbor(q_coord, q_start, q_end);
+      if (not is_neigh) {
+        tSrS_rowcol(i) = -INFINITY;
+      }
+    }
+  }
+
+  template <typename Engine, typename Layout>
+  CUTLASS_DEVICE
+  void apply_padding(Tensor<Engine, Layout>& tSrS, const int m_block, const int n_block) {
+
+    // Q coord remap
+    auto thread_mma = TiledMma{}.get_thread_slice(thread_idx);
+
+    Tensor cS = cute::make_identity_tensor(Shape<Int<!SwapAB ? kBlockM : kBlockN>, Int<!SwapAB ? kBlockN : kBlockM>>{});
+    Tensor tScS = thread_mma.partition_C(cS);
+
+    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(tScS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+
+    if constexpr (!SwapAB) {
+      tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * m_block * kBlockM + E<0>{} * size(q_blk_offset);
+      tScS_rowcol.data() = tScS_rowcol.data() + E<1>{} * n_block * kBlockN;
+    }
+    else {
+      tScS_rowcol.data() = tScS_rowcol.data() + E<1>{} * m_block * kBlockM + E<1>{} * size(q_blk_offset);
+      tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * n_block * kBlockN;
+    }
+
+    auto q_tile_shape = QTileShape{};
+
+    auto stride_group_offset = get_bwd_stride_offset(stride);
+
+    auto [q_idx_first, kv_idx_first] = tScS_rowcol(0);
+    if constexpr (SwapAB) {
+      auto tmp = q_idx_first;
+      q_idx_first = kv_idx_first;
+      kv_idx_first = tmp;
+    }
+
+    int q_tile_idx = m_block;
+    auto q_tile_coord = idx2crd(q_tile_idx, q_diff_tiles);
+    // auto q_tile_offset = idx2crd(q_tile_res, q_tile_shape);
+    auto q_tile_offset = tuple_add(q_blk_offset, tuple_mul(q_tile_coord, q_tile_shape));
+    int q_idx_first_in_tile = (q_tile_idx * size(q_tile_shape)) + size(q_blk_offset);
+
+    auto q_ctr = make_identity_tensor(q_tile_shape);
+    auto q_ctr_offset = domain_offset(q_tile_offset, q_ctr);
+
+    // int q_tile_idx = q_idx_first / size(q_tile_shape);
+    // int q_tile_res = q_idx_first % size(q_tile_shape);
+
+    // auto q_tile_coord = idx2crd(q_tile_idx, q_diff_tiles);
+    // auto q_tile_offset = idx2crd(q_tile_res, q_tile_shape);
+    // auto q_thread_offset = tuple_add(
+    //     q_tile_offset,
+    //     tuple_add(q_blk_offset, tuple_mul(q_tile_coord, q_tile_shape)));
+
+    // auto q_ctr = make_identity_tensor(q_tile_shape);
+    // auto q_ctr_offset = domain_offset(q_thread_offset, q_ctr);
+
+    CUTLASS_PRAGMA_UNROLL
+    for (int i = 0; i < size(tSrS_rowcol); i++) {
+      auto [q_idx, kv_idx] = tScS_rowcol(i);
+
+      if constexpr (SwapAB) {
+        auto tmp = q_idx;
+        q_idx = kv_idx;
+        kv_idx = tmp;
+      }
+
+      auto q_coord = q_ctr_offset(q_idx - q_idx_first_in_tile);
+
+      if (not is_within_bounds(q_coord, qkv_shape)) {
+        tSrS_rowcol(i) = -INFINITY;
+      }
+    }
+  }
+
+};
+
+} // namespace flash_fna
+} // namespace cuda
+} // namespace natten
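BwdNAMask is the per-element companion to the tile-level bounds above: within each (m_block, n_block) pair it recovers multi-dimensional Q/KV coordinates for every score element and masks out non-neighbors. A 1-D scalar sketch of the predicate it evaluates, under the same simplifying assumptions as before (stride 1, non-causal, no dilation, no boundary recentring); get_bwd_window_start/end generalize this to NADim coordinates:

    // Illustrative scalar analogue of the apply_na_mask predicate.
    bool is_neighbor_1d(int q, int k, int window_left, int window_right, int seqlen) {
      // Forward: query q attends to keys in [q - window_left, q + window_right].
      // Backward flips the relation: key k is touched by queries in
      // [k - window_right, k + window_left], clamped to the sequence.
      int q_start = k - window_right > 0 ? k - window_right : 0;
      int q_end   = k + window_left < seqlen ? k + window_left : seqlen - 1;
      return q >= q_start && q <= q_end;
    }

Scores for (q, k) pairs failing this predicate are set to -INFINITY before the softmax-gradient computation, exactly as tSrS_rowcol(i) = -INFINITY does above.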

csrc/include/natten/cuda/flash_fna/flash_kernel/flash_bwd_kernel_sm80.h

Lines changed: 1 addition & 0 deletions
@@ -5,6 +5,7 @@
 #pragma once

 #include "cute/tensor.hpp"
+#include "cute/util/debug.hpp"

 #include <cutlass/cutlass.h>
 #include <cutlass/array.h>

csrc/include/natten/cuda/flash_fna/flash_kernel/flash_bwd_launch_template.h

Lines changed: 11 additions & 3 deletions
@@ -38,7 +38,7 @@ namespace flash_fna {
 using namespace cute;

 template <int Arch, int kHeadDim, int kBlockM, int kBlockN, typename Element,
-          class NADim,
+          class NADim, class QTileShape, class KVTileShape, class Causal,
           bool Deterministic, bool GQA,
           int Stages_dO=2, int Stages_dS_or_QSm80=2,
           bool SdP_swapAB=true, bool dKV_swapAB=false, bool dQ_swapAB=false,

@@ -104,7 +104,7 @@ void run_flash_bwd(Flash_fna_bwd_params<NADim> &params, cudaStream_t stream) {
 //     SdP_swapAB, dKV_swapAB, dQ_swapAB, NumMmaWarpGroups, AtomLayoutMSdP, AtomLayoutNdKV, AtomLayoutMdQ, V_in_regs>
 // >;
 using CollectiveMainloop = flash_fna::CollectiveMainloopBwdSm80<Stages, Stages_dO, TileShape_MNK, Element, ElementAccum, cutlass::arch::Sm80,
-    Deterministic,
+    Deterministic, NADim, QTileShape, KVTileShape, Causal,
     SdP_swapAB, dKV_swapAB, dQ_swapAB, NumMmaWarpGroups, AtomLayoutMSdP, AtomLayoutNdKV, AtomLayoutMdQ, V_in_regs>;
 using CollectiveEpilogue = std::conditional_t<
     !GQA,
148148
{_1{}, seqlen_q_rounded, params.h * params.seqlen_q_rounded}, // stride_dPsum
149149
params.scale_softmax,
150150
params.b,
151-
params.dq_semaphore
151+
params.dq_semaphore,
152+
// NA Args
153+
params.qkv_shape,
154+
params.q_shape,
155+
params.kv_shape,
156+
params.window_size,
157+
params.stride,
158+
params.dilation,
159+
params.num_heads_actual
152160
};
153161
// The case work with GQA is ugly but idk how to fix it.
154162
typename CollectiveEpilogue::Arguments epilogue_args {
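The mainloop arguments now carry the neighborhood-attention geometry down from the params struct. A sketch of the NA-specific fields Flash_fna_bwd_params<NADim> presumably exposes after this change (the field names match the argument list above; the struct layout itself is an assumption):

    // Hypothetical view of the NA fields, for illustration only.
    template <class NADim>
    struct Flash_fna_bwd_params_na_fields {
      // ... the usual FMHA backward fields (pointers, strides, seqlens) ...
      NADim qkv_shape;    // full spatial extent of the token layout
      NADim q_shape;      // tiled/padded Q extent
      NADim kv_shape;     // tiled/padded KV extent
      NADim window_size;  // neighborhood window per dimension
      NADim stride;       // attention stride per dimension
      NADim dilation;     // dilation per dimension
      int num_heads_actual;
    };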
