Commit 6317ce0

Deduplicated in Flash FNA
1 parent: 42addc8

28 files changed, +145 -4304 lines changed

csrc/include/natten/cuda/flash_fmha/flash_kernel/block.h

Lines changed: 0 additions & 3 deletions
@@ -22,8 +22,6 @@ struct BlockMN {
     int const seqlen_q = seqlen_info.seqlen_q;
     int n_block_max = cute::ceil_div(seqlen_k, kBlockN);
     int n_block_min = 0;
-    // if (threadIdx.x == 128) { printf("Inside, bid.x = %d, bid.y = %d, bid.z = %d, split_idx = %d, n_block_min: %d, n_block_max: %d\n", blockIdx.x, blockIdx.y, blockIdx.z, split_idx, n_block_min, n_block_max); }
-    // if (threadIdx.x == 128) { printf("After split, inside, bid.y = %d, bid.z = %d, split_idx = %d, n_block_min: %d, n_block_max: %d\n", blockIdx.y, blockIdx.z, split_idx, n_block_min, n_block_max); }
     return {n_block_min, n_block_max};
   }

@@ -40,7 +38,6 @@ struct BlockMN {
     int const idx_k_new_max = std::min(n_block_max * kBlockN - seqlen_info.seqlen_k_og, seqlen_info.seqlen_k_new);
     int const n_block_new_min = idx_k_new_min / kBlockN;
     int const n_block_new_max = idx_k_new_max > idx_k_new_min ? cute::ceil_div(idx_k_new_max, kBlockN) : n_block_new_min;
-    // if (threadIdx.x == 128 && m_block == 0) { printf("bidb = %d, seqlen_k_new = %d, seqlen_k_og = %d, n_block_min = %d, n_block_max = %d, idx_k_new_min = %d, idx_k_new_max = %d, n_block_new_min = %d, n_block_new_max = %d\n", bidb, seqlen_k_new, seqlen_k_og, n_block_min, n_block_max, idx_k_new_min, idx_k_new_max, n_block_new_min, n_block_new_max);}
     return {n_block_new_min, n_block_new_max};
   }
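The retained lines in the first hunk also show how the K-block range is derived: n_block_max is the ceiling of seqlen_k over the tile size kBlockN, with n_block_min starting at 0. A standalone restatement of that arithmetic in plain C++ (illustrative only, with made-up sizes; not the kernel code):

#include <cstdio>

// ceil_div as used via cute::ceil_div: smallest number of blocks covering n.
constexpr int ceil_div(int a, int b) { return (a + b - 1) / b; }

int main() {
  int const seqlen_k = 1000;   // hypothetical key sequence length
  int const kBlockN  = 128;    // hypothetical KV tile size
  int const n_block_min = 0;
  int const n_block_max = ceil_div(seqlen_k, kBlockN);  // = 8 blocks cover 1000 keys
  std::printf("K blocks: [%d, %d)\n", n_block_min, n_block_max);
  return 0;
}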

csrc/include/natten/cuda/flash_fmha/flash_kernel/epilogue_bwd.hpp

Lines changed: 0 additions & 9 deletions
@@ -9,14 +9,10 @@
 #include "cute/tensor.hpp"

 #include "cutlass/gemm/collective/builders/sm90_common.inl"
-// #include "copy_sm90_bulk_reduce.hpp"

 #include "seqlen.h"
 #include "named_barrier.hpp"
 #include "utils.h"
-// #include "natten/cuda/flash_fmha/seqlen.h"
-// #include "natten/cuda/flash_fmha/named_barrier.hpp"
-// #include "natten/cuda/flash_fmha/utils.h"

 namespace natten {
 namespace cuda {

@@ -191,7 +187,6 @@ struct CollectiveEpilogueBwd {
     flash::convert_type_out(tdKrdK, tdKrdK_out);
     Tensor taccdKrdK = smem_thr_copy_dKV.retile_S(tdKrdK_out);  // ((Atom,AtomNum), MMA_M, MMA_N)
     Tensor taccdVrdV = smem_thr_copy_dKV.retile_S(tdVrdV_out);  // ((Atom,AtomNum), MMA_M, MMA_N)
-    // if (blockIdx.x == 0 && threadIdx.x == 128) { print(smem_thr_copy_dKV); print(sdK); printf("\n"); print(sdKt); printf("\n"); }
     Tensor taccdKsdK = smem_thr_copy_dKV.partition_D(cute::conditional_return<!dKV_swapAB>(sdK, sdKt));  // ((Atom,AtomNum),PIPE_M,PIPE_N)
     Tensor taccdVsdV = smem_thr_copy_dKV.partition_D(cute::conditional_return<!dKV_swapAB>(sdV, sdVt));  // ((Atom,AtomNum),PIPE_M,PIPE_N)

@@ -461,12 +456,10 @@ struct CollectiveEpilogueBwdGQA {
     int *lock_ptr = !Deterministic ? nullptr : params.dv_semaphore + bidb * num_head_kv + bidh_kv;
     using Barrier = cutlass::GenericBarrier<cutlass::detail::SyncwarpSync>;

-    // if (thread_idx == 0) { printf("blockIdx.x = %d, blockIdx.y = %d, blockIdx.z = %d, bidb = %d, bidh_kv = %d, lock_ptr = %p, dv_semaphore = %p, num_batch = %d, num_head_kv = %d, n_block = %d, bihd_idx_in_group = %d\n", blockIdx.x, blockIdx.y, blockIdx.z, bidb, bidh_kv, lock_ptr, params.dv_semaphore, num_batch, num_head_kv, n_block, bidh_idx_in_group);}

     if constexpr (Deterministic) {
       Barrier::wait_eq(lock_ptr, thread_idx, n_block * num_batch * num_head_kv, bidh_idx_in_group);
     }
-    // if (thread_idx == 0) { printf("After barrier blockIdx.x = %d, blockIdx.y = %d, blockIdx.z = %d, bidb = %d, bidh_kv = %d, lock_ptr = %p, dv_semaphore = %p\n", blockIdx.x, blockIdx.y, blockIdx.z, bidb, bidh_kv, lock_ptr, params.dv_semaphore);}
     // if constexpr (Use_TMA) {
     //   cutlass::arch::fence_view_async_shared();
     //   cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);

@@ -492,12 +485,10 @@
       cute::copy(r2s_tiled_copy_dKVaccum, taccdKVrdK, tdKVsdKVaccum);
     }
     lock_ptr = !Deterministic ? nullptr : params.dk_semaphore + bidb * num_head_kv + bidh_kv;
-    // if (thread_idx == 0) { printf("blockIdx.x = %d, blockIdx.y = %d, blockIdx.z = %d, bidb = %d, bidh_kv = %d, lock_ptr = %p, dk_semaphore = %p, num_batch = %d, num_head_kv = %d, n_block = %d, bihd_idx_in_group = %d\n", blockIdx.x, blockIdx.y, blockIdx.z, bidb, bidh_kv, lock_ptr, params.dk_semaphore, num_batch, num_head_kv, n_block, bidh_idx_in_group);}

     if constexpr (Deterministic) {
       Barrier::wait_eq(lock_ptr, thread_idx, n_block * num_batch * num_head_kv, bidh_idx_in_group);
     }
-    // if (thread_idx == 0) { printf("After barrier blockIdx.x = %d, blockIdx.y = %d, blockIdx.z = %d, bidb = %d, bidh_kv = %d, lock_ptr = %p, dk_semaphore = %p\n", blockIdx.x, blockIdx.y, blockIdx.z, bidb, bidh_kv, lock_ptr, params.dk_semaphore);}
     // if constexpr (Use_TMA) {
     //   cutlass::arch::fence_view_async_shared();
     //   cutlass::arch::NamedBarrier::sync(NumEpilogueThreads, cutlass::arch::ReservedNamedBarriers::EpilogueBarrier);
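The lines kept in the GQA epilogue show the deterministic path: before accumulating its dK or dV partial, each block waits on a per-(batch, KV-head) semaphore so contributions land in a fixed order. A generic sketch of that ordering idea with plain CUDA atomics, not the cutlass::GenericBarrier used above (all names below are illustrative, not from this commit):

#include <cuda_runtime.h>

// Sketch: serialize per-block accumulation so floating-point adds happen in a
// fixed order (deterministic backward). `semaphore` starts at 0; a block may
// only accumulate once the counter reaches its assigned turn `my_turn`.
__device__ void accumulate_in_order(float* dst, float const* partial, int n,
                                    int* semaphore, int my_turn) {
  if (threadIdx.x == 0) {
    while (atomicAdd(semaphore, 0) != my_turn) { /* spin until it is our turn */ }
  }
  __syncthreads();
  for (int i = threadIdx.x; i < n; i += blockDim.x) {
    dst[i] += partial[i];
  }
  __threadfence();            // make the accumulated values visible device-wide
  __syncthreads();
  if (threadIdx.x == 0) {
    atomicAdd(semaphore, 1);  // release the next block in the fixed order
  }
}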

csrc/include/natten/cuda/flash_fmha/flash_kernel/epilogue_fwd.hpp

Lines changed: 0 additions & 2 deletions
@@ -289,7 +289,6 @@ struct CollectiveEpilogueFwd {
     Tensor mLSE = make_tensor(make_gmem_ptr(params.ptr_LSE + offset_o * get<0>(params.stride_LSE)),
                               params.shape_LSE_packed,
                               params.stride_LSE_packed)(_, bidh, bidb, 0);
-    // if (thread_idx == 0) { printf("Before LSE write, m_block: %d, bidh: %d, bidb: %d, split_idx: %d, offset_o: %d, seqlen_o: %d\n", m_block, bidh, bidb, split_idx, offset_o, seqlen_o); print(mLSE); printf("\n"); }
     if (!LargeHeadDimV || warp_group_idx == 0) {
       if constexpr (!PackGQA) {
         #pragma unroll

@@ -327,7 +326,6 @@
     // if (!is_split) {
       Tensor mO = make_tensor(make_gmem_ptr(params.ptr_O + offset_o * get<0>(params.stride_O)), params.shape_O_packed, params.stride_O_packed)(_, _, bidh, bidb, _0{});
       Tensor gO = local_tile(mO, select<0, 1>(TileShape_MNK_PV{}), make_coord(m_block, _0{}));  // (M, K)
-      // if (thread_idx == 0) { printf("Before O write, m_block: %d, bidh: %d, bidb: %d, split_idx: %d, offset_o: %d, seqlen_o: %d, mO_addr = %p, addr diff = %d\n", m_block, bidh, bidb, split_idx, offset_o, seqlen_o, mO.data(), reinterpret_cast<int>(&mO(0)) - reinterpret_cast<int>(params.ptr_O)); }
       GmemTiledCopyO gmem_tiled_copy_O;
       auto gmem_thr_copy_O = gmem_tiled_copy_O.get_thread_slice(thread_idx);
       Tensor tOsO = gmem_thr_copy_O.partition_S(sO);  // ((Atom,AtomNum),ATOM_M,ATOM_N)

csrc/include/natten/cuda/flash_fmha/flash_kernel/flash.h

Lines changed: 0 additions & 50 deletions
@@ -72,17 +72,6 @@ struct Flash_fwd_params : public Qkv_params {

   // The scaling factors for the kernel.
   float scale_softmax;
-  // float softcap;
-
-  // array of length b+1 holding starting offset of each sequence.
-  // int * __restrict__ cu_seqlens_q;
-  // int * __restrict__ cu_seqlens_k;
-  // int * __restrict__ cu_seqlens_knew;
-  // int * __restrict__ leftpad_k;
-
-  // If provided, the actual length of each q/k sequence.
-  // int *__restrict__ seqused_q;
-  // int *__restrict__ seqused_k;

   // The stride between rows of Oaccum.
   index_t oaccum_split_stride;

@@ -95,38 +84,6 @@
   index_t lseaccum_batch_stride;
   index_t lseaccum_head_stride;

-  // The K_new and V_new matrices.
-  // void * __restrict__ knew_ptr;
-  // void * __restrict__ vnew_ptr;
-
-  // The stride between rows of the Q, K and V matrices.
-  // index_t knew_batch_stride;
-  // index_t vnew_batch_stride;
-  // index_t knew_row_stride;
-  // index_t vnew_row_stride;
-  // index_t knew_head_stride;
-  // index_t vnew_head_stride;
-
-  // void *__restrict__ qv_ptr;
-  // index_t qv_batch_stride;
-  // index_t qv_row_stride;
-  // index_t qv_head_stride;
-
-  // The cos and sin matrices for rotary embedding.
-  // void * __restrict__ rotary_cos_ptr;
-  // void * __restrict__ rotary_sin_ptr;
-  // int *__restrict__ seqlens_rotary;
-
-  // The indices to index into the KV cache.
-  // int * __restrict__ kv_batch_idx;
-
-  // Paged KV cache
-  // int * __restrict__ page_table;
-  // index_t page_table_batch_stride;
-  // int page_size;
-  // int num_pages;
-  // bool pagedkv_tma;
-
   // The dropout probability (probability of keeping an activation).
   float p_dropout;
   // uint32_t p_dropout_in_uint;

@@ -221,10 +178,3 @@ struct Flash_bwd_params : public Flash_fwd_params {
 } // namespace natten
 ////////////////////////////////////////////////////////////////////////////////////////////////////

-// template <int Arch, typename T, int kHeadDim, int kHeadDimV, bool Split, bool PagedKVNonTMA, bool Has_softcap, bool PackGQA>
-// void run_mha_fwd_(Flash_fwd_params &params, cudaStream_t stream);
-// void prepare_varlen_num_blocks(Flash_fwd_params &params, cudaStream_t stream, bool packgqa, int blockM, int blockN, bool enable_pdl);
-// template <int Arch, typename T, int kHeadDim, bool Has_softcap>
-// void run_mha_bwd_(Flash_bwd_params &params, cudaStream_t stream);
-// template <typename T, typename Tpartial, int kBlockK>
-// void run_mha_fwd_combine_(Flash_fwd_params &params, cudaStream_t stream, bool enable_pdl);

csrc/include/natten/cuda/flash_fmha/flash_kernel/flash_bwd_launch_template.h

Lines changed: 0 additions & 1 deletion
@@ -221,7 +221,6 @@ void run_flash_bwd(Flash_bwd_params &params, cudaStream_t stream) {
   // int smem_size_v = sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_v));
   // int smem_size_lse = sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_lse));
   // int smem_size_dpsum = sizeof(decltype((typename CollectiveMainloop::TensorStorage{}).smem_dpsum));
-  // printf("smem_size = %d, q = %d, k = %d, v = %d, do = %d, ds = %d, dqacc = %d, lse = %d, dpsum = %d\n", smem_size, smem_size_q, smem_size_k, smem_size_v, smem_size_do, smem_size_ds, smem_size_dqacc, smem_size_lse, smem_size_dpsum);
   void const* kernel = (void const*) cutlass::device_kernel<AttnKernel>;
   if constexpr (size(ClusterShape{}) > 1) {
     if (smem_size >= 48 * 1024) {
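The hunk cuts off right at the shared-memory check; in FlashAttention-style launchers such a check typically leads into opting the kernel into more than 48 KB of dynamic shared memory. A minimal host-side sketch of that standard CUDA call (illustrative, not taken from this commit; kernel and smem_size stand in for the launcher's values):

#include <cuda_runtime.h>

// Kernels requesting more than the default 48 KB of dynamic shared memory
// must opt in explicitly, once per kernel, before launch.
inline cudaError_t raise_smem_limit(void const* kernel, int smem_size) {
  if (smem_size >= 48 * 1024) {
    return cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size);
  }
  return cudaSuccess;
}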

csrc/include/natten/cuda/flash_fna/flash_kernel/bwd_mask.h

Lines changed: 5 additions & 6 deletions
@@ -8,9 +8,8 @@

 #include "cutlass/fast_math.h" // For cutlass::FastDivmod

-#include "utils.h"
 #include "na_utils.h"
-// #include "natten/cuda/flash_fmha/utils.h"
+#include "natten/cuda/flash_fmha/flash_kernel/utils.h"

 namespace natten {
 namespace cuda {

@@ -73,8 +72,8 @@ struct BwdNAMask {
     Tensor cS = cute::make_identity_tensor(Shape<Int<!SwapAB ? kBlockM : kBlockN>, Int<!SwapAB ? kBlockN : kBlockM>>{});
     Tensor tScS = thread_mma.partition_C(cS);

-    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
-    Tensor tScS_rowcol = make_tensor(tScS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(tScS.data(), flash::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));

     if constexpr (!SwapAB) {
       tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * m_block * kBlockM + E<0>{} * size(q_blk_offset);

@@ -165,8 +164,8 @@ struct BwdNAMask {
     Tensor cS = cute::make_identity_tensor(Shape<Int<!SwapAB ? kBlockM : kBlockN>, Int<!SwapAB ? kBlockN : kBlockM>>{});
     Tensor tScS = thread_mma.partition_C(cS);

-    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
-    Tensor tScS_rowcol = make_tensor(tScS.data(), flash_fna::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));
+    Tensor tSrS_rowcol = make_tensor(tSrS.data(), flash::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
+    Tensor tScS_rowcol = make_tensor(tScS.data(), flash::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tScS.layout()));

     if constexpr (!SwapAB) {
       tScS_rowcol.data() = tScS_rowcol.data() + E<0>{} * m_block * kBlockM + E<0>{} * size(q_blk_offset);
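bwd_mask.h is the only file in this excerpt with additions, and it shows the deduplication directly: the FNA backward mask drops its local utils.h copy, includes the shared flash_fmha kernel header, and calls the layout helper through the flash namespace. A compressed sketch of that pattern, mirroring the lines above (the wrapper name acc_to_rowcol and the surrounding tensor definitions are illustrative, not part of the commit):

// After this commit, the FNA backward mask reuses the helper that already
// lives in the flash_fmha kernel headers instead of a duplicated local copy.
#include "natten/cuda/flash_fmha/flash_kernel/utils.h"  // provides flash::convert_layout_acc_rowcol

template <bool SwapAB, typename AccTensor>
__device__ auto acc_to_rowcol(AccTensor const& tSrS) {
  // Reinterpret the MMA accumulator layout as (row, col) so the neighborhood
  // attention mask can be applied element by element, as in BwdNAMask above.
  return cute::make_tensor(tSrS.data(),
                           flash::convert_layout_acc_rowcol</*Transposed=*/SwapAB>(tSrS.layout()));
}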

csrc/include/natten/cuda/flash_fna/flash_kernel/cuda_check.h

Lines changed: 0 additions & 26 deletions
This file was deleted.
