Dao-AILab
diff --git a/flash_attn/cute/block_sparse_utils.py b/flash_attn/cute/block_sparse_utils.py
Lines changed: 38 additions & 38 deletions
diff --git a/flash_attn/cute/block_sparsity.py b/flash_attn/cute/block_sparsity.py
Lines changed: 10 additions & 10 deletions
diff --git a/flash_attn/cute/flash_bwd_sm100.py b/flash_attn/cute/flash_bwd_sm100.py
Lines changed: 10 additions & 10 deletions
@@ -1007,15 +1007,15 @@ def get_total_q_block_count_bwd(
     batch_idx,
     head_idx,
     n_block,
-    subtile_factor: cutlass.Constexpr = 1,
+    q_subtile_factor: cutlass.Constexpr = 1,
     m_block_max: int = 0,
 ):
     """Count total tile iterations for given n_block (KV tile) in backward."""
     q_block_cnt, _, full_block_cnt, _, *_ = blocksparse_tensors
     total = q_block_cnt[batch_idx, head_idx, n_block]
     if const_expr(full_block_cnt is not None):
         total = total + full_block_cnt[batch_idx, head_idx, n_block]
-    return total * subtile_factor
+    return total * q_subtile_factor
 
 
 @cute.jit
@@ -1050,7 +1050,7 @@ def produce_block_sparse_q_loads_bwd_sm100(
     should_load_Q: cutlass.Constexpr,
     should_load_dO: cutlass.Constexpr,
     # Subtiling factor and bounds
-    subtile_factor: cutlass.Constexpr = 1,
+    q_subtile_factor: cutlass.Constexpr = 1,
     m_block_max: int = 0,
 ):
     """SM100 backward block sparse loading with subtiling.
@@ -1065,7 +1065,7 @@ def produce_block_sparse_q_loads_bwd_sm100(
         curr_full_idx,
         loop_count,
     ) = get_block_sparse_iteration_info_bwd(
-        blocksparse_tensors, batch_idx, head_idx, n_block, subtile_factor, m_block_max
+        blocksparse_tensors, batch_idx, head_idx, n_block, q_subtile_factor, m_block_max
     )
 
     for iter_idx in cutlass.range(loop_count, unroll=1):
@@ -1075,7 +1075,7 @@ def produce_block_sparse_q_loads_bwd_sm100(
             curr_q_idx,
             curr_full_cnt,
             curr_full_idx,
-            subtile_factor,
+            q_subtile_factor,
             m_block_max,
         )
         m_block_safe = m_block
@@ -1148,7 +1148,7 @@ def get_block_sparse_iteration_info_bwd(
     batch_idx,
     head_idx,
     n_block,
-    subtile_factor: cutlass.Constexpr = 1,
+    q_subtile_factor: cutlass.Constexpr = 1,
     m_block_max: int = 0,
 ):
     """Extract block-sparse iteration info for backward pass.
@@ -1169,7 +1169,7 @@ def get_block_sparse_iteration_info_bwd(
     sparse_block_count = curr_q_cnt
     if const_expr(full_cnt is not None):
         sparse_block_count = sparse_block_count + curr_full_cnt
-    total_count = sparse_block_count * subtile_factor
+    total_count = sparse_block_count * q_subtile_factor
 
     return curr_q_cnt, curr_q_idx, curr_full_cnt, curr_full_idx, total_count
 
@@ -1181,7 +1181,7 @@ def get_m_block_from_iter_bwd(
     curr_q_idx: cute.Tensor,
     curr_full_cnt,
     curr_full_idx: Optional[cute.Tensor],
-    subtile_factor: cutlass.Constexpr = 1,
+    q_subtile_factor: cutlass.Constexpr = 1,
     m_block_max: int = 0,
 ):
     """Derive m_block index and is_full_block flag from iteration index.
@@ -1190,8 +1190,8 @@ def get_m_block_from_iter_bwd(
         - m_block: The actual Q-tile block index
         - is_full_block: True if this is a full block (no mask_mod needed)
     """
-    sparse_iter_idx = iter_idx // subtile_factor
-    subtile_offset = iter_idx % subtile_factor
+    sparse_iter_idx = iter_idx // q_subtile_factor
+    subtile_offset = iter_idx % q_subtile_factor
 
     sparse_m_block = Int32(0)
     is_full_block = False
@@ -1204,7 +1204,7 @@ def get_m_block_from_iter_bwd(
     else:
         sparse_m_block = curr_q_idx[sparse_iter_idx]
 
-    return sparse_m_block * subtile_factor + subtile_offset, is_full_block
+    return sparse_m_block * q_subtile_factor + subtile_offset, is_full_block
 
 
 @cute.jit
@@ -1269,7 +1269,7 @@ def produce_block_sparse_q_loads_bwd_sm90(
     tma_copy_bytes_K,
     tma_copy_bytes_V,
     Q_stage_eq_dO_stage: cutlass.Constexpr,
-    subtile_factor: cutlass.Constexpr,
+    q_subtile_factor: cutlass.Constexpr,
     m_block_max: int,
 ):
     """SM90 backward block sparse loading with separate partial/full loops.
@@ -1292,10 +1292,10 @@ def produce_block_sparse_q_loads_bwd_sm90(
 
     kv_loaded = False
 
-    for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
-        sparse_idx = iter_idx // subtile_factor
-        subtile_offset = iter_idx % subtile_factor
-        m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
+    for iter_idx in cutlass.range(curr_q_cnt * q_subtile_factor, unroll=1):
+        sparse_idx = iter_idx // q_subtile_factor
+        subtile_offset = iter_idx % q_subtile_factor
+        m_block = curr_q_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
         if m_block < m_block_max:
             producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
@@ -1318,10 +1318,10 @@ def produce_block_sparse_q_loads_bwd_sm90(
             kv_loaded = True
 
     if const_expr(full_cnt is not None):
-        for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
-            sparse_idx = iter_idx // subtile_factor
-            subtile_offset = iter_idx % subtile_factor
-            m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
+        for iter_idx in cutlass.range(curr_full_cnt * q_subtile_factor, unroll=1):
+            sparse_idx = iter_idx // q_subtile_factor
+            subtile_offset = iter_idx % q_subtile_factor
+            m_block = curr_full_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
             if m_block < m_block_max:
                 producer_state_Q, producer_state_dO = _load_q_do_block_sm90(
@@ -1362,7 +1362,7 @@ def consume_block_sparse_mma_bwd_sm90(
     thr_mma_SdP,
     score_mod_fn=None,
     score_mod_bwd_fn=None,
-    subtile_factor: cutlass.Constexpr = 1,
+    q_subtile_factor: cutlass.Constexpr = 1,
     m_block_max: int = 0,
     aux_data: AuxData = AuxData(),
     fastdiv_mods=(None, None),
@@ -1414,10 +1414,10 @@ def consume_block_sparse_mma_bwd_sm90(
         fastdiv_mods=fastdiv_mods,
     )
 
-    for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
-        sparse_idx = iter_idx // subtile_factor
-        subtile_offset = iter_idx % subtile_factor
-        m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
+    for iter_idx in cutlass.range(curr_q_cnt * q_subtile_factor, unroll=1):
+        sparse_idx = iter_idx // q_subtile_factor
+        subtile_offset = iter_idx % q_subtile_factor
+        m_block = curr_q_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
         if m_block < m_block_max:
             consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
@@ -1432,10 +1432,10 @@ def consume_block_sparse_mma_bwd_sm90(
             dKV_accumulate = True
 
     if const_expr(full_cnt is not None):
-        for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
-            sparse_idx = iter_idx // subtile_factor
-            subtile_offset = iter_idx % subtile_factor
-            m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
+        for iter_idx in cutlass.range(curr_full_cnt * q_subtile_factor, unroll=1):
+            sparse_idx = iter_idx // q_subtile_factor
+            subtile_offset = iter_idx % q_subtile_factor
+            m_block = curr_full_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
             if m_block < m_block_max:
                 consumer_state_Q, consumer_state_dO = mma_one_m_block_fn(
@@ -1490,7 +1490,7 @@ def dQaccum_store_block_sparse_bwd_sm90(
     n_block,
     sdQaccum: cute.Tensor,
     gdQaccum: cute.Tensor,
-    subtile_factor: cutlass.Constexpr,
+    q_subtile_factor: cutlass.Constexpr,
     m_block_max: int,
     num_dQ_warp_groups: cutlass.Constexpr,
     num_threads_per_warp_group: cutlass.Constexpr,
@@ -1511,10 +1511,10 @@ def dQaccum_store_block_sparse_bwd_sm90(
         curr_full_cnt = Int32(0)
         curr_full_idx = None
 
-    for iter_idx in cutlass.range(curr_q_cnt * subtile_factor, unroll=1):
-        sparse_idx = iter_idx // subtile_factor
-        subtile_offset = iter_idx % subtile_factor
-        m_block = curr_q_idx[sparse_idx] * subtile_factor + subtile_offset
+    for iter_idx in cutlass.range(curr_q_cnt * q_subtile_factor, unroll=1):
+        sparse_idx = iter_idx // q_subtile_factor
+        subtile_offset = iter_idx % q_subtile_factor
+        m_block = curr_q_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
         if m_block < m_block_max:
             _store_one_dQaccum_sm90(
@@ -1527,10 +1527,10 @@ def dQaccum_store_block_sparse_bwd_sm90(
             )
 
     if const_expr(full_cnt is not None):
-        for iter_idx in cutlass.range(curr_full_cnt * subtile_factor, unroll=1):
-            sparse_idx = iter_idx // subtile_factor
-            subtile_offset = iter_idx % subtile_factor
-            m_block = curr_full_idx[sparse_idx] * subtile_factor + subtile_offset
+        for iter_idx in cutlass.range(curr_full_cnt * q_subtile_factor, unroll=1):
+            sparse_idx = iter_idx // q_subtile_factor
+            subtile_offset = iter_idx % q_subtile_factor
+            m_block = curr_full_idx[sparse_idx] * q_subtile_factor + subtile_offset
 
             if m_block < m_block_max:
                 _store_one_dQaccum_sm90(
 
@@ -391,15 +391,15 @@ def get_block_sparse_expected_shapes_bwd(
     seqlen_k: int,
     m_block_size: int,
     n_block_size: int,
-    subtile_factor: int,
+    q_subtile_factor: int,
 ) -> Tuple[Tuple[int, int, int], Tuple[int, int, int, int]]:
     """Return (expected_count_shape, expected_index_shape) for backward block sparse normalization.
 
     Backward uses Q-direction indexing (transposed from forward), where shapes are
     indexed by N-blocks first, then M-blocks. The sparse_block_size_q is determined
-    by subtile_factor * m_block_size.
+    by q_subtile_factor * m_block_size.
     """
-    sparse_block_size_q = subtile_factor * m_block_size
+    sparse_block_size_q = q_subtile_factor * m_block_size
     expected_m_blocks = ceildiv(seqlen_q, sparse_block_size_q)
     expected_n_blocks = ceildiv(seqlen_k, n_block_size)
     expected_count_shape = (batch_size, num_head, expected_n_blocks)
@@ -590,17 +590,17 @@ def normalize_block_sparse_config_bwd(
     seqlen_q: int,
     seqlen_k: int,
     block_size: tuple[int, int],
-    subtile_factor: int,
+    q_subtile_factor: int,
 ) -> tuple[BlockSparseTensorsTorch, Tuple[Tuple[bool, ...], ...] | None]:
     m_block_size, n_block_size = block_size
     if tensors.block_size is None:
-        sparse_block_size_q, sparse_block_size_kv = subtile_factor * m_block_size, n_block_size
+        sparse_block_size_q, sparse_block_size_kv = q_subtile_factor * m_block_size, n_block_size
     else:
         sparse_block_size_q, sparse_block_size_kv = tensors.block_size
-    if sparse_block_size_q != subtile_factor * m_block_size:
+    if sparse_block_size_q != q_subtile_factor * m_block_size:
         raise ValueError(
-            f"Block sparsity expects sparse_block_size_q={subtile_factor * m_block_size} "
-            f"for subtile_factor={subtile_factor}."
+            f"Block sparsity expects sparse_block_size_q={q_subtile_factor * m_block_size} "
+            f"for q_subtile_factor={q_subtile_factor}."
         )
     if sparse_block_size_kv != n_block_size:
         raise ValueError(
@@ -613,7 +613,7 @@ def normalize_block_sparse_config_bwd(
         seqlen_k,
         m_block_size,
         n_block_size,
-        subtile_factor,
+        q_subtile_factor,
     )
     normalized_tensors = normalize_block_sparse_tensors(
         tensors,
@@ -623,7 +623,7 @@ def normalize_block_sparse_config_bwd(
         hint=lambda: (
             f"Backward expects Q-direction block-sparse tensors (q_mask_cnt/q_mask_idx, "
             f"and optionally full_q_cnt/full_q_idx). Regenerate the backward BlockMask with "
-            f"BLOCK_SIZE=({subtile_factor * m_block_size}, {n_block_size})."
+            f"BLOCK_SIZE=({q_subtile_factor * m_block_size}, {n_block_size})."
         ),
     )
     return normalized_tensors, get_block_sparse_broadcast_pattern(normalized_tensors)
 
@@ -66,7 +66,7 @@ def __init__(
         score_mod_bwd: cutlass.Constexpr | None = None,
         mask_mod: cutlass.Constexpr | None = None,
         has_aux_tensors: cutlass.Constexpr = False,
-        subtile_factor: cutlass.Constexpr[int] = 1,
+        q_subtile_factor: cutlass.Constexpr[int] = 1,
     ):
         # padding head_dim to a multiple of 16 as k_block_size
         hdim_multiple_of = 16
@@ -119,7 +119,7 @@ def __init__(
         self.score_mod_bwd = score_mod_bwd
         self.mask_mod = mask_mod
         self.has_aux_tensors = has_aux_tensors
-        self.subtile_factor = subtile_factor
+        self.q_subtile_factor = q_subtile_factor
         # For score_mod, use vec_size=1 (like forward) to handle per-element indices
         if cutlass.const_expr(has_aux_tensors):
             self.vec_size: cutlass.Constexpr = 1
@@ -1910,7 +1910,7 @@ def load(
                     batch_idx,
                     head_idx,
                     n_block,
-                    subtile_factor=self.subtile_factor,
+                    q_subtile_factor=self.q_subtile_factor,
                     m_block_max=m_block_max,
                 )
                 process_tile = total_m_block_cnt > Int32(0)
@@ -1947,7 +1947,7 @@ def load(
                             self.tma_copy_bytes["V"],
                             should_load_Q=should_load_Q,
                             should_load_dO=should_load_dO,
-                            subtile_factor=self.subtile_factor,
+                            q_subtile_factor=self.q_subtile_factor,
                             m_block_max=m_block_max,
                         )
                     )
@@ -2366,7 +2366,7 @@ def mma(
                     batch_idx,
                     head_idx,
                     n_block,
-                    subtile_factor=self.subtile_factor,
+                    q_subtile_factor=self.q_subtile_factor,
                     m_block_max=m_block_max,
                 )
                 process_tile = block_iter_count > Int32(0)
@@ -3019,7 +3019,7 @@ def compute_loop(
                     batch_idx,
                     head_idx,
                     n_block,
-                    subtile_factor=self.subtile_factor,
+                    q_subtile_factor=self.q_subtile_factor,
                     m_block_max=m_block_max,
                 )
                 process_tile = loop_count > Int32(0)
@@ -3038,7 +3038,7 @@ def compute_loop(
                         curr_q_idx,
                         curr_full_cnt,
                         curr_full_idx,
-                        subtile_factor=self.subtile_factor,
+                        q_subtile_factor=self.q_subtile_factor,
                         m_block_max=m_block_max,
                     )
                     m_block_oob = m_block >= m_block_max
@@ -3445,7 +3445,7 @@ def _dq_semaphore_lock_value(
         if const_expr(self.use_block_sparsity):
             assert blocksparse_tensors is not None
             if const_expr(blocksparse_tensors.dq_write_order is not None):
-                sparse_iter = iter_idx // self.subtile_factor
+                sparse_iter = iter_idx // self.q_subtile_factor
                 if sparse_iter < curr_q_cnt:
                     assert curr_dq_write_order is not None
                     lock_value = curr_dq_write_order[sparse_iter]
@@ -3554,7 +3554,7 @@ def dQacc_reduce(
                     batch_idx,
                     head_idx,
                     n_block,
-                    subtile_factor=self.subtile_factor,
+                    q_subtile_factor=self.q_subtile_factor,
                     m_block_max=m_block_max,
                 )
                 process_tile = loop_count > Int32(0)
@@ -3584,7 +3584,7 @@ def dQacc_reduce(
                         curr_q_idx,
                         curr_full_cnt,
                         curr_full_idx,
-                        subtile_factor=self.subtile_factor,
+                        q_subtile_factor=self.q_subtile_factor,
                         m_block_max=m_block_max,
                     )
                     m_block_oob_upper = m_block >= m_block_max