Commit be36d59

[tt-train] Add causal mask support for SDPA backward (#36661)
### Ticket
#34531

### Problem description
The SDPA backward pass (sdpa_bw_q and sdpa_bw_kv) required materializing a full S × S triangular attention mask for causal attention, which involved:
- Precomputing the mask tensor on the host
- Transferring the mask to DRAM
- Reading unique mask tiles from DRAM for each Q/K position during backward computation

This introduced an O(S²) memory dependency and unnecessary DRAM traffic, becoming a bottleneck for long-sequence training. The forward pass already supports on-the-fly causal mask generation.

### What's changed
This PR extends on-the-fly causal mask generation to the SDPA backward pass:
- The writer kernel generates a triangular mask tile once during initialization using generate_causal_mask_tile()
- The compute kernel reuses the same mask tile for all diagonal positions
- Masked-out regions are skipped entirely:
  - sdpa_bw_q: K/V rows beyond the causal boundary are not processed
  - sdpa_bw_kv: Q rows before the causal boundary are not processed
- Updated apply_mask_on_reg() with a template parameter to control CB pop behavior, enabling mask tile reuse for causal mode

### Impact
- Removes the O(S²) memory dependency for causal masking in the SDPA backward pass
- Reduces DRAM bandwidth usage by eliminating mask tensor reads
- Avoids unnecessary computation for masked regions
- Enables end-to-end causal attention training without mask tensors

Skipping computation in masked regions provides a significant performance improvement (see the tile-count sketch below).

Baseline (composite SDPA, memory_efficient):
- Mean step time: 2640.16 ms (without fused SDPA we can't fit TinyLlama (1B) on a single Wormhole card)

Fused SDPA kernel:
- Arbitrary mask, memory_efficient: 3082.58 ms
- Causal mask, memory_efficient: 2506.47 ms
- Causal mask, default: 1983.56 ms

### Checklist
- [x] [![All post-commit tests](https://github.com/tenstorrent/tt-metal/actions/workflows/all-post-commit-workflows.yaml/badge.svg?branch=vmelnykov%2Fcasual_mask_support_sdpa_bw)](https://github.com/tenstorrent/tt-metal/actions/runs/21550483959)
- [x] New/Existing tests provide coverage for changes
1 parent: 6d954eb

18 files changed (+354, −66 lines)
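The boundary arithmetic behind the "avoids unnecessary computation" claim is simple enough to check on the host. Below is a minimal, self-contained C++ sketch (not part of the diff; `Ht` stands for the sequence length in tiles, and the counts are illustrative) showing why skipping masked regions roughly halves the tile work:

```cpp
#include <cstdint>
#include <iostream>

int main() {
    const uint32_t Ht = 8;  // sequence length in tiles (S / tile height)

    uint32_t dense_tiles = 0, causal_tiles = 0;
    for (uint32_t t = 0; t < Ht; ++t) {
        dense_tiles += Ht;  // dense path: every Q row tile visits every K/V row tile
        // sdpa_bw_q:  Q row tile t reads K/V tiles 0..t      -> t + 1 tiles
        // sdpa_bw_kv: K/V row tile t reads Q tiles t..Ht-1   -> Ht - t tiles
        causal_tiles += t + 1;  // either way, only the causal triangle is touched
    }
    std::cout << "dense: " << dense_tiles << " tile-pairs, causal: " << causal_tiles << '\n';
    // Prints "dense: 64 tile-pairs, causal: 36" - the ratio tends to 1/2 as Ht grows.
}
```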

tt-train/sources/ttml/metal/ops/sdpa_bw/device/kernels/compute/sdpa_bw_compute_utils.hpp

Lines changed: 3 additions & 2 deletions
```diff
@@ -31,6 +31,9 @@ constexpr uint32_t onetile = 1U;
 // masked ones.
 // This way, after applying softmax, masked positions will effectively become zero,
 // and only the unmasked positions will retain meaningful attention weights
+//
+// Note: Does NOT pop the mask tile - caller must pop explicitly when done with the tile.
+// This allows reusing the same mask tile for causal masks.
 void apply_mask_on_reg(
     const uint32_t register_idx,
     const uint32_t cb_attn_mask,
@@ -62,8 +65,6 @@ void apply_mask_on_reg(
     // unmasked positions remain unchanged
     add_binary_tile_init();
     add_binary_tile(register_idx, mask_register, register_idx);
-
-    cb_pop_front(cb_attn_mask, onetile);
 }

 // Recomputes attention weights from pre-softmax scores using stored statistics.
```
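The removed `cb_pop_front` moves pop responsibility to the callers. As a host-side analogy (stub types standing in for the device circular-buffer API — illustration only, not device code), the new ownership contract looks like this:

```cpp
#include <deque>
#include <iostream>

// Stand-in for the device circular buffer, for illustration only.
using CircularBuffer = std::deque<int>;

void apply_mask_on_reg(int& dest_reg, const CircularBuffer& cb_attn_mask) {
    dest_reg += cb_attn_mask.front();  // reads the front tile but no longer pops it
}

int main() {
    CircularBuffer cb_attn_mask = {1};  // one triangular mask tile
    int dest_reg = 0;
    // Causal mode: the same tile is applied on every diagonal position...
    for (int diagonal = 0; diagonal < 3; ++diagonal) {
        apply_mask_on_reg(dest_reg, cb_attn_mask);  // no pop between uses
    }
    cb_attn_mask.pop_front();  // ...and the caller pops exactly once when done
    std::cout << dest_reg << '\n';  // 3: the single tile was reused three times
}
```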

tt-train/sources/ttml/metal/ops/sdpa_bw/device/kernels/compute/sdpa_bw_kv_compute_kernel.cpp

Lines changed: 55 additions & 8 deletions
```diff
@@ -78,7 +78,9 @@ constexpr uint32_t cb_attn_output = tt::CBIndex::c_1;  // Attention outpu
 constexpr uint32_t cb_query = tt::CBIndex::c_2;      // Original query
 constexpr uint32_t cb_key = tt::CBIndex::c_3;        // Original key
 constexpr uint32_t cb_value = tt::CBIndex::c_4;      // Original value
+#if defined(CAUSAL_MASK) || defined(USE_ATTN_MASK)
 constexpr uint32_t cb_attn_mask = tt::CBIndex::c_5;  // Original mask
+#endif
 constexpr uint32_t cb_intermediates = tt::CBIndex::c_6;      // Forward pass intermediates
 constexpr uint32_t cb_mat_mul_reduction = tt::CBIndex::c_7;  // Temporary computations
 constexpr uint32_t cb_grad_value_accum = tt::CBIndex::c_8;   // L1 accumulator for grad_value
@@ -98,21 +100,45 @@ const uint32_t tiles_per_row = qWt;  // assuming qWt == kWt == vWt
 const uint32_t num_of_interm_tiles = 2U;  // number of tiles in intermediates buffer per head

 void MAIN {
+    // Runtime args - needed for causal mask to know global position within sequence
+    const uint32_t start_row = get_arg_val<uint32_t>(0);
+
     init_sfpu(cb_query, cb_key);
     binary_op_init_common(cb_grad_output, cb_query, cb_key);

     cb_wait_front(cb_mat_mul_reduction, onetile);

     mm_init(cb_query, cb_key, cb_attention_weights);

+#ifdef CAUSAL_MASK
+    // Wait for causal mask tile ONCE - it's generated by writer and will be reused for every diagonal
+    cb_wait_front(cb_attn_mask, onetile);
+#endif
+
     for (uint32_t row = 0; row < num_rows_per_core; ++row) {
         cb_wait_front(cb_key, tiles_per_row);
         cb_wait_front(cb_value, tiles_per_row);

+#ifdef CAUSAL_MASK
+        // Calculate global position for this K/V row
+        const uint32_t global_row_idx = start_row + row;
+        const uint32_t k_row_tile = global_row_idx % Ht;  // position within sequence (0 to Ht-1)
+
+        // For causal mask: only process Q rows from k_row_tile to Ht-1
+        // Q rows 0 to k_row_tile-1 have zero attention weights (can't attend to future keys)
+        const uint32_t q_start_tile = k_row_tile;
+        const uint32_t num_q_tiles_to_process = Ht - k_row_tile;
+#else
+        const uint32_t q_start_tile = 0;
+        const uint32_t num_q_tiles_to_process = Ht;
+#endif
+
         for (uint32_t head_idx = 0; head_idx < heads_per_group; ++head_idx) {
             const uint32_t matmul_accum_reg = 0;

-            for (uint32_t h = 0; h < Ht; ++h) {
+            for (uint32_t q_idx = 0; q_idx < num_q_tiles_to_process; ++q_idx) {
+                const uint32_t h = q_start_tile + q_idx;  // actual Q row tile index
+
                 // Wait for Q, dO, O, mask and intermediates for this K/V row
                 cb_wait_front(cb_query, tiles_per_row);
                 cb_wait_front(cb_grad_output, tiles_per_row);
@@ -134,12 +160,27 @@ void MAIN {
                         /* dst_reg_idx*/ matmul_accum_reg);  // accumulate in dest_reg 0
                 }

-                /*
-                 * apply attention mask on dest_reg.
-                 * function assumes that dest_reg is in acquired state via *acquire_dst* call
-                 * function transforms mask from 1/0 to 0/-inf and applies it on dest_reg
-                 */
+#ifdef CAUSAL_MASK
+                // For causal mask: apply triangular mask on diagonal tile (h == k_row_tile)
+                // Writer generates causal mask tile once, reused for every diagonal
+                if (h == k_row_tile) {
+                    apply_mask_on_reg(matmul_accum_reg, cb_attn_mask, scaler_bits, minus_one_bits, custom_inf_bits);
+                    // Don't pop - causal mask tile is reused for all diagonal positions
+                } else {
+                    // Off-diagonal (h > k_row_tile): just scale, no mask needed
+                    binop_with_scalar_tile_init();
+                    mul_unary_tile(matmul_accum_reg, scaler_bits);
+                }
+#elif defined(USE_ATTN_MASK)
+                // Apply attention mask from DRAM
+                // Transforms mask from 1/0 to 0/-inf and applies it on dest_reg
                 apply_mask_on_reg(matmul_accum_reg, cb_attn_mask, scaler_bits, minus_one_bits, custom_inf_bits);
+                cb_pop_front(cb_attn_mask, onetile);  // Pop each unique mask tile after use
+#else
+                // No mask: just scale
+                binop_with_scalar_tile_init();
+                mul_unary_tile(matmul_accum_reg, scaler_bits);
+#endif
                 tile_regs_commit();
                 tile_regs_wait();
                 pack_reconfig_data_format(cb_attention_weights);
@@ -151,14 +192,15 @@ void MAIN {
                 apply_statistics_inplace(cb_attention_weights, cb_intermediates, num_of_interm_tiles);

                 // Step 3: Accumulate grad_V = Attention^T @ grad_output
+                // For causal mask: first iteration is q_idx=0 (h=k_row_tile), head_idx=0
                 update_grad_value(
                     cb_attention_weights,
                     cb_transpose_wh,
                     cb_grad_output,
                     cb_grad_value_accum,
                     tiles_per_row,
                     block_size,
-                    /* do_accumulate */ h > 0 || head_idx > 0);
+                    /* do_accumulate */ q_idx > 0 || head_idx > 0);
                 cb_wait_front(cb_grad_value_accum, tiles_per_row);

                 // Step 4: calculate u_scalar_row = sum(dO * O) per row
@@ -185,7 +227,7 @@ void MAIN {
                     cb_grad_key_accum,
                     tiles_per_row,
                     block_size,
-                    /* do_accumulate */ h > 0 || head_idx > 0);
+                    /* do_accumulate */ q_idx > 0 || head_idx > 0);
                 cb_wait_front(cb_grad_key_accum, tiles_per_row);

                 // Pop intermediate results used for computing dK and dV
@@ -208,6 +250,11 @@ void MAIN {
         cb_pop_front(cb_key, tiles_per_row);
         cb_pop_front(cb_value, tiles_per_row);
     }
+
+#ifdef CAUSAL_MASK
+    // Pop the causal mask tile after all rows are processed (was reused for every diagonal)
+    cb_pop_front(cb_attn_mask, onetile);
+#endif
 }

 }  // namespace NAMESPACE
```
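A self-contained sketch of the bounds introduced above (a hypothetical helper mirroring the kernel's `q_start_tile`/`num_q_tiles_to_process` arithmetic, not code from the diff):

```cpp
#include <cassert>
#include <cstdint>

// For a K/V row at tile position k_row_tile, only Q tiles [k_row_tile, Ht)
// lie at or below the causal diagonal, so only they are processed.
struct QRange {
    uint32_t start;
    uint32_t count;
};

QRange causal_q_range(uint32_t k_row_tile, uint32_t Ht) {
    return {k_row_tile, Ht - k_row_tile};
}

int main() {
    const uint32_t Ht = 4;
    assert(causal_q_range(0, Ht).count == 4);  // first K/V row: every Q row attends to it
    assert(causal_q_range(3, Ht).count == 1);  // last K/V row: only the last Q row does
    // The diagonal tile (h == k_row_tile) is always the first iteration (q_idx == 0),
    // which is why do_accumulate changes from `h > 0` to `q_idx > 0` in this kernel.
    return 0;
}
```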

tt-train/sources/ttml/metal/ops/sdpa_bw/device/kernels/compute/sdpa_bw_q_compute_kernel.cpp

Lines changed: 51 additions & 6 deletions
```diff
@@ -70,7 +70,9 @@ constexpr uint32_t cb_attn_output = tt::CBIndex::c_1;  // Attention outpu
 constexpr uint32_t cb_query = tt::CBIndex::c_2;      // Original query
 constexpr uint32_t cb_key = tt::CBIndex::c_3;        // Original key
 constexpr uint32_t cb_value = tt::CBIndex::c_4;      // Original value
+#if defined(CAUSAL_MASK) || defined(USE_ATTN_MASK)
 constexpr uint32_t cb_attn_mask = tt::CBIndex::c_5;  // Original mask
+#endif
 constexpr uint32_t cb_intermediates = tt::CBIndex::c_6;      // Forward pass intermediates
 constexpr uint32_t cb_mat_mul_reduction = tt::CBIndex::c_7;  // Temporary computations
 constexpr uint32_t cb_grad_query_accum = tt::CBIndex::c_8;   // L1 accumulator for grad_query
@@ -85,12 +87,20 @@ const uint32_t tiles_per_row = qWt;  // number of tiles per row (qWt == kWt
 const uint32_t num_of_interm_tiles = 2U;  // number of tiles in intermediates buffer per head

 void MAIN {
+    // Runtime args - needed for causal mask to know global position within sequence
+    const uint32_t start_row = get_arg_val<uint32_t>(0);
+
     init_sfpu(cb_query, cb_key);
     binary_op_init_common(cb_grad_output, cb_query, cb_key);

     cb_wait_front(cb_mat_mul_reduction, onetile);
     mm_init(cb_query, cb_key, cb_attention_weights);

+#ifdef CAUSAL_MASK
+    // Wait for causal mask tile ONCE - it's generated by writer and will be reused for every diagonal
+    cb_wait_front(cb_attn_mask, onetile);
+#endif
+
     for (uint32_t row = 0; row < num_rows_per_core; ++row) {
         cb_wait_front(cb_attn_output, tiles_per_row);
         cb_wait_front(cb_grad_output, tiles_per_row);
@@ -101,8 +111,20 @@ void MAIN {
         compute_u_scalar_row(
             cb_grad_output, cb_attn_output, cb_u_scalar_row, cb_mat_mul_reduction, tiles_per_row, scaler_bits);

+#ifdef CAUSAL_MASK
+        // Calculate global position within sequence for causal mask
+        const uint32_t global_row_idx = start_row + row;
+        const uint32_t q_row_tile = global_row_idx % Ht;  // position within sequence (0 to Ht-1)
+
+        // For causal mask: only process K/V tiles up to and including the diagonal
+        // q_row_tile determines how many K/V chunks we need (0..q_row_tile inclusive)
+        const uint32_t num_kv_tiles_to_process = q_row_tile + 1;
+#else
+        const uint32_t num_kv_tiles_to_process = Ht;
+#endif
+
         const uint32_t matmul_accum_reg = 0;
-        for (uint32_t h = 0; h < Ht; ++h) {
+        for (uint32_t h = 0; h < num_kv_tiles_to_process; ++h) {
             cb_wait_front(cb_key, tiles_per_row);
             cb_wait_front(cb_value, tiles_per_row);

@@ -120,12 +142,27 @@ void MAIN {
                     /* dst_reg_idx*/ matmul_accum_reg);  // accumulate in dest_reg 0
             }

-            /*
-             * apply attention mask on dest_reg.
-             * function assumes that dest_reg is in acquired state via *acquire_dst* call
-             * function transforms mask from 1/0 to 0/-inf and applies it on dest_reg
-             */
+#ifdef CAUSAL_MASK
+            // For causal mask: apply triangular mask on diagonal tile (h == q_row_tile)
+            // Writer generates causal mask tile once, reused for every diagonal
+            if (h == q_row_tile) {
+                apply_mask_on_reg(matmul_accum_reg, cb_attn_mask, scaler_bits, minus_one_bits, custom_inf_bits);
+                // Don't pop - causal mask tile is reused for all diagonal positions
+            } else {
+                // Off-diagonal: just scale
+                binop_with_scalar_tile_init();
+                mul_unary_tile(matmul_accum_reg, scaler_bits);
+            }
+#elif defined(USE_ATTN_MASK)
+            // Apply attention mask from DRAM
+            // Transforms mask from 1/0 to 0/-inf and applies it on dest_reg
             apply_mask_on_reg(matmul_accum_reg, cb_attn_mask, scaler_bits, minus_one_bits, custom_inf_bits);
+            cb_pop_front(cb_attn_mask, onetile);  // Pop each unique mask tile after use
+#else
+            // No mask: just scale
+            binop_with_scalar_tile_init();
+            mul_unary_tile(matmul_accum_reg, scaler_bits);
+#endif
             tile_regs_commit();
             tile_regs_wait();
             pack_reconfig_data_format(cb_attention_weights);
@@ -161,6 +198,9 @@ void MAIN {
             cb_pop_front(cb_value, tiles_per_row);
             cb_pop_front(cb_attention_weights, onetile);
             cb_pop_front(cb_grad_attn_weights, onetile);
+            // Note: Mask pops are handled explicitly after apply_mask_on_reg:
+            // - USE_ATTN_MASK: pops each unique mask tile after use
+            // - CAUSAL_MASK: doesn't pop (reuses same tile for all diagonals)
             // Note: cb_grad_scores is popped inside update_grad_query
         }

@@ -173,6 +213,11 @@ void MAIN {
         cb_pop_front(cb_attn_output, tiles_per_row);
         cb_pop_front(cb_grad_output, tiles_per_row);
     }
+
+#ifdef CAUSAL_MASK
+    // Pop the causal mask tile after all rows are processed (was reused for every diagonal)
+    cb_pop_front(cb_attn_mask, onetile);
+#endif
 }

 }  // namespace NAMESPACE
```
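The mirror-image bound for the Q kernel, again as a hypothetical host-side check rather than diff code:

```cpp
#include <cassert>
#include <cstdint>

// A Q row at tile position q_row_tile attends to K/V tiles [0, q_row_tile];
// the diagonal tile is included because it still needs the triangular mask.
uint32_t causal_kv_count(uint32_t q_row_tile) {
    return q_row_tile + 1;
}

int main() {
    const uint32_t Ht = 4;
    assert(causal_kv_count(0) == 1);        // first Q row sees one K/V tile (itself)
    assert(causal_kv_count(Ht - 1) == Ht);  // last Q row sees the whole sequence
    // Unlike sdpa_bw_kv, the loop here still starts at h == 0, so no index
    // remapping is needed; the diagonal (h == q_row_tile) is simply the last iteration.
    return 0;
}
```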

tt-train/sources/ttml/metal/ops/sdpa_bw/device/kernels/dataflow/sdpa_bw_kv_reader_kernel.cpp

Lines changed: 27 additions & 3 deletions
```diff
@@ -27,7 +27,9 @@ void kernel_main() {
     constexpr uint32_t cb_query = tt::CBIndex::c_2;
     constexpr uint32_t cb_key = tt::CBIndex::c_3;
     constexpr uint32_t cb_value = tt::CBIndex::c_4;
+#ifdef USE_ATTN_MASK
     constexpr uint32_t cb_attn_mask = tt::CBIndex::c_5;
+#endif
     constexpr uint32_t cb_intermediates = tt::CBIndex::c_6;
     constexpr uint32_t cb_matmul_reduce = tt::CBIndex::c_7;

@@ -58,7 +60,9 @@ void kernel_main() {
     const auto query_address_generator = TensorAccessor(query_args, query_addr, tile_bytes);
     const auto key_address_generator = TensorAccessor(key_args, key_addr, tile_bytes);
     const auto value_address_generator = TensorAccessor(value_args, value_addr, tile_bytes);
+#ifdef USE_ATTN_MASK
+    const auto mask_address_generator = TensorAccessor(mask_args, mask_addr, tile_bytes);
+#endif
     const auto intermediates_address_generator = TensorAccessor(intermediates_args, intermediates_addr, tile_bytes);

     generate_matmul_row_reduce_tile(cb_matmul_reduce);  // generate tile for matmul row reduce (auto-detects data type)
@@ -81,23 +85,43 @@ void kernel_main() {
         const uint32_t first_q_head_idx = group_idx * heads_per_group;
         const uint32_t q_offset = (batch_idx * q_heads + first_q_head_idx) * Ht * qWt;

+        // k_row_tile = position within sequence (0 to Ht-1)
+        const uint32_t k_row_tile = global_row_idx % Ht;
+
+#ifdef CAUSAL_MASK
+        // For causal mask: only read Q rows from k_row_tile to Ht-1
+        // Q rows 0 to k_row_tile-1 have zero attention weights (can't attend to future keys)
+        const uint32_t q_start_tile = k_row_tile;
+        const uint32_t num_q_tiles_to_read = Ht - k_row_tile;
+#else
+        const uint32_t q_start_tile = 0;
+        const uint32_t num_q_tiles_to_read = Ht;
+#endif
+
+#ifdef USE_ATTN_MASK
         // Mask is (1, 1, S, S) - same mask for all batches/heads, indexed by sequence position only
-        // For KV kernel, we read column (global_row_idx % Ht) from each row h of the mask
-        const uint32_t mask_offset = (global_row_idx % Ht);
+        // For KV kernel, we read column k_row_tile from each row h of the mask
+        const uint32_t mask_offset = k_row_tile;
+#endif

         // add change here: multiply by num_of_interm_tiles because we need to read 2 tiles per head row
         uint32_t intermediates_offset = (batch_idx * q_heads + first_q_head_idx) * Ht * num_of_interm_tiles;

         // TODO: add calculation for dO, O indexes because in forward pass they are stored with shape (B, 1, S,
         // qNH*qEmbd)
         for (uint32_t q_head_idx = 0; q_head_idx < heads_per_group; ++q_head_idx) {
-            for (uint32_t h = 0; h < Ht; ++h) {
+            for (uint32_t q_idx = 0; q_idx < num_q_tiles_to_read; ++q_idx) {
+                const uint32_t h = q_start_tile + q_idx;  // actual Q row tile index
+
                 const uint32_t q_start_idx = q_offset + (q_head_idx * Ht + h) * qWt;
                 read_tiles_by_row(cb_query, query_address_generator, q_start_idx, qWt, tile_bytes, qWt);

+#ifdef USE_ATTN_MASK
                 // read one tile of attn_mask for current row of K and V
                 // row of K define the column in (QK^T) matrix, so it define the column of attn_mask
                 read_one_tile(cb_attn_mask, mask_address_generator, mask_offset + h * Ht);
+#endif
+                // Note: For CAUSAL_MASK, the mask tile is generated once by writer and reused by compute

                 // Read intermediates - one tile per row (contains 1/sum_exp values from forward pass)
                 // TODO[improve](vmelnykov): Now we share two intermediates values per head row: row-wise max value and
```
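The reader's mask indexing can be sanity-checked in isolation. A sketch under the assumption stated in the comments above — the (1, 1, S, S) mask is a row-major Ht × Ht grid of tiles:

```cpp
#include <cassert>
#include <cstdint>

// Q row tile h pairs with K/V column k_row_tile, so the reader fetches mask
// tile (row h, column k_row_tile) from the row-major tile grid:
uint32_t mask_tile_index(uint32_t h, uint32_t k_row_tile, uint32_t Ht) {
    return h * Ht + k_row_tile;  // == mask_offset + h * Ht, with mask_offset = k_row_tile
}

int main() {
    const uint32_t Ht = 4;
    assert(mask_tile_index(0, 2, Ht) == 2);   // first Q row, third mask column
    assert(mask_tile_index(3, 2, Ht) == 14);  // last Q row, same column
    return 0;
}
```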

tt-train/sources/ttml/metal/ops/sdpa_bw/device/kernels/dataflow/sdpa_bw_kv_writer_kernel.cpp

Lines changed: 6 additions & 0 deletions
```diff
@@ -24,6 +24,12 @@ void kernel_main() {
     constexpr uint32_t q_heads = get_compile_time_arg_val(2);          // number of query heads
     constexpr uint32_t heads_per_group = get_compile_time_arg_val(3);  // heads per group

+#ifdef CAUSAL_MASK
+    // Generate causal mask tile ONCE - will be reused for every diagonal
+    constexpr uint32_t cb_attn_mask = tt::CBIndex::c_5;
+    generate_causal_mask_tile(cb_attn_mask);
+#endif
+
     const uint32_t tile_bytes = get_tile_size(cb_grad_key);

     // TensorAccessor definitions with chained offsets
```
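For intuition, the tile that `generate_causal_mask_tile()` places in `cb_attn_mask` is a lower-triangular 1/0 pattern. A host-side sketch of the equivalent values (assuming a plain 32 × 32 row-major layout and ignoring the hardware tile's internal face ordering):

```cpp
#include <array>
#include <cstddef>

constexpr std::size_t TILE_DIM = 32;  // assumed tile height/width

// Ones at and below the diagonal (a position may attend to itself and the past),
// zeros above it; apply_mask_on_reg later maps this 1/0 pattern to 0/-inf.
std::array<float, TILE_DIM * TILE_DIM> make_causal_tile() {
    std::array<float, TILE_DIM * TILE_DIM> tile{};  // zero-initialized
    for (std::size_t r = 0; r < TILE_DIM; ++r) {
        for (std::size_t c = 0; c <= r; ++c) {
            tile[r * TILE_DIM + c] = 1.0f;
        }
    }
    return tile;
}

int main() {
    const auto tile = make_causal_tile();
    return (tile[0] == 1.0f && tile[1] == 0.0f) ? 0 : 1;  // [0][0] kept, [0][1] masked
}
```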
