tenstorrent
diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/compute/sdpa_flash_decode.cpp
Lines changed: 93 additions & 35 deletions
diff --git a/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp b/ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/dataflow_common.hpp
Lines changed: 106 additions & 0 deletions
@@ -61,6 +61,7 @@ void kernel_main() {
     constexpr bool use_half_tile = get_compile_time_arg_val(27);
     constexpr uint32_t scale_fp32 = get_compile_time_arg_val(28);
     constexpr uint32_t sliding_window_size = get_compile_time_arg_val(29);
+    constexpr uint32_t num_tree_reduction_rounds = get_compile_time_arg_val(30);
 
     constexpr uint32_t q_chunk_tiles = Sq_chunk_t * DHt;
     constexpr uint32_t out_chunk_tiles = Sq_chunk_t * vDHt;
@@ -109,6 +110,19 @@ void kernel_main() {
     const uint32_t core_num_in_output = get_arg_val<uint32_t>(arg_idx++);
     const uint32_t cur_pos_arg = get_arg_val<uint32_t>(arg_idx++);
 
+    // Tree reduction runtime arguments
+    const bool is_tree_root = get_arg_val<uint32_t>(arg_idx++) == 1;
+    const uint32_t parent_core_in_group = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t send_at_round = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t num_children = get_arg_val<uint32_t>(arg_idx++);
+    const uint32_t my_active_rounds = get_arg_val<uint32_t>(arg_idx++);
+
+    // Read children_per_round array
+    uint32_t children_per_round[MAX_TREE_REDUCTION_ROUNDS];
+    for (uint32_t r = 0; r < MAX_TREE_REDUCTION_ROUNDS; ++r) {
+        children_per_round[r] = get_arg_val<uint32_t>(arg_idx++);
+    }
+
     // Idle core
     // get_arg_val<uint32_t>(0) can go from 0-63 for the core_num; for active cores 65 is out of range so 65 indicates
     // an idle_core
@@ -280,6 +294,7 @@ void kernel_main() {
                 bool add_mask_fusion = false;
                 bool add_sliding_window_mask_fusion = false;
 #endif
+                DPRINT << "doing fa main loop" << ENDL();
 
                 /* QK = Q_CHUNK @ K_CHUNK */
                 // Determine which mask buffer to use for fusion
@@ -399,7 +414,7 @@ void kernel_main() {
                     cb_out_mm = cb_out_im;
                 } else {
                     // When there is more than 1 chunk, we perform Lazy Softmax
-
+                    DPRINT << "doing local softmax" << ENDL();
                     // Reconfig register DF
                     reconfig_data_format(cb_prev_max, cb_cur_max);
                     pack_reconfig_data_format(cb_exp_max_diff);
@@ -428,38 +443,63 @@ void kernel_main() {
                     add_block_inplace<true>(cb_out_accumulate_im, cb_out_im, out_chunk_tiles);
                 }
 
-                if (k_chunk < k_chunk_end - 1 || do_reduce) {
-                    // Move intermediate sum and max values to appropriate ping pong buffers
-                    reconfig_data_format(cb_cur_max, cb_cur_max);
-                    pack_reconfig_data_format(cb_prev_max);
-
-                    // PREV_MAX <- CUR_MAX
-                    move_block<true>(cb_cur_max, cb_prev_max, Sq_chunk_t);
-
-                    // PREV_SUM <- CUR_SUM
-                    move_block<true>(cb_cur_sum, cb_prev_sum, Sq_chunk_t);
-                } else {
-                    // Write results OUT_ACC, CUR_MAX, CUR_SUM to designated
-                    // Write o, m, l into cb_out
-                    move_block<true>(cb_out_accumulate_im, cb_out_o, out_chunk_tiles);
-                    move_block<true>(cb_cur_max, cb_out_m, Sq_chunk_t);
-                    move_block<true>(cb_cur_sum, cb_out_l, Sq_chunk_t);
-                }
+                // Move intermediate sum and max values to appropriate ping pong buffers
+                reconfig_data_format(cb_cur_max, cb_cur_max);
+                pack_reconfig_data_format(cb_prev_max);
+
+                // PREV_MAX <- CUR_MAX
+                move_block<true>(cb_cur_max, cb_prev_max, Sq_chunk_t);
+
+                // PREV_SUM <- CUR_SUM
+                move_block<true>(cb_cur_sum, cb_prev_sum, Sq_chunk_t);
+
+                // After this point:
+                // cb_out_accumulate_im contains o_1
+                // cb_prev_max contains m_1
+                // cb_prev_sum contains l_1
+
+                // else {
+                //     DPRINT << "local move for tree reduction root" << ENDL();
+                //     // Write results OUT_ACC, CUR_MAX, CUR_SUM to designated
+                //     // Write o, m, l into cb_out
+                //     move_block<true>(cb_out_accumulate_im, cb_out_o, out_chunk_tiles);
+                //     move_block<true>(cb_cur_max, cb_out_m, Sq_chunk_t);
+                //     move_block<true>(cb_cur_sum, cb_out_l, Sq_chunk_t);
+                // }
             }
         }
         /* END OF FLASH ATTENTION LOOP */
-        // Perform reduction across intermediates from other cores if this is the reduction core
-        if (do_reduce) {
-            // cb_out_accumulate_im should contain o_1 (output from FA of itself's core)
-            // cb_prev_max and cb_prev_sum should contain m_1 and l_1 (max and sum of logits of itself's core)
-
-            if (k_chunk_end - k_chunk_start < k_num_chunks) {
-                // This indicates that there are computes done by other workers.
-                // We need to wait for them and send to reducer's compute
-                // Iterate through each worker
-                for (uint32_t i = 0; i < num_cores_to_wait; i++) {
-                    move_block<true>(cb_l_in, cb_prev_sum_2, Sq_chunk_t);
 
+        /******************************************************************************
+         *                      TREE REDUCTION LOGIC                                  *
+         ******************************************************************************/
+        /**
+         * Tree reduction replaces the flat worker->reducer pattern with O(log n) rounds.
+         *
+         * For each round r (0 to my_active_rounds-1):
+         *   - If children_per_round[r] != UINT32_MAX, receive from that child
+         *   - Combine received data with local accumulator using softmax correction
+         *
+         * After all receives:
+         *   - If is_tree_root: finalize (1/sum normalization) and output
+         *   - Else: output intermediate results for writer to send to parent
+         */
+        DPRINT << "doing tree reduction" << ENDL();
+
+        // Tree reduction: receive from children and combine
+        if (num_children > 0 && k_chunk_end - k_chunk_start < k_num_chunks) {
+            // cb_out_accumulate_im should contain o_1 (output from FA of this core)
+            // cb_prev_max and cb_prev_sum should contain m_1 and l_1 (max and sum of logits of this core)
+
+            // Iterate through each round and receive from child if one exists
+            for (uint32_t round = 0; round < my_active_rounds; ++round) {
+                DPRINT << "doing tree reduction round " << round << ENDL();
+                uint32_t child_id = children_per_round[round];
+                if (child_id != UINT32_MAX) {
+                    DPRINT << "doing tree reduction child " << child_id << ENDL();
+                    // Writer kernel handles the wait and data transfer to cb_m_in, cb_l_in, cb_out_o
+                    move_block<true>(cb_l_in, cb_prev_sum_2, Sq_chunk_t);
+                    DPRINT << "moved child sum to prev_sum_2" << ENDL();
                     // Fused Softmax Correction
                     // * Fused Correction is a fused operation that performs the following steps:
                     // * 1. CUR_MAX = max(PREV_MAX, WORKER_MAX)
@@ -468,10 +508,9 @@ void kernel_main() {
                     // * 4. EXP_MAX_DIFF = exp((PREV_MAX - CUR_MAX)*scale)
                     // * 5. PREV_SUM *= EXP_MAX_DIFF
                     // * 6. CUR_SUM = PREV_SUM_2 + PREV_SUM
-                    // */
                     correction_block<scale_fp32, vector_mode>(
-                        cb_m_in,        // cb worker max
-                        cb_prev_sum_2,  // cb worker sum
+                        cb_m_in,        // cb child max
+                        cb_prev_sum_2,  // cb child sum
                         cb_cur_max,
                         cb_prev_max,
                         cb_cur_sum,
@@ -480,11 +519,12 @@ void kernel_main() {
                         cb_exp_max_diff_2,
                         Sq_chunk_t);
 
-                    // OUT_ACC_2 <- WORKER_OUT
+                    DPRINT << "done correction" << ENDL();
+                    // OUT_ACC_2 <- CHILD_OUT
                     move_block<true>(cb_out_o, cb_out_accumulate_im_2, out_chunk_tiles);
 
-                    // OUT_ACC_2 *= EXP_MAX_DIFF
-                    // OUT_ACC *= EXP_MAX_DIFF_2
+                    // OUT_ACC *= EXP_MAX_DIFF (scale local accumulator)
+                    // OUT_ACC_2 *= EXP_MAX_DIFF_2 (scale child's accumulator)
                     mul_block_bcast_cols_inplace<Sq_chunk_t, vDHt>(cb_out_accumulate_im, cb_exp_max_diff);
                     mul_block_bcast_cols_inplace<Sq_chunk_t, vDHt>(cb_out_accumulate_im_2, cb_exp_max_diff_2);
 
@@ -497,9 +537,15 @@ void kernel_main() {
                     cb_pop_front(cb_m_in, Sq_chunk_t);
                     move_block<true>(cb_cur_max, cb_prev_max, Sq_chunk_t);
                     move_block<true>(cb_cur_sum, cb_prev_sum, Sq_chunk_t);
+                    DPRINT << "moved cur_max and cur_sum to prev_max and prev_sum" << ENDL();
                 }
             }
+        }
 
+        // Finalize output based on tree role
+        if (is_tree_root) {
+            // Root node: perform final normalization and output
+            DPRINT << "doing tree reduction root" << ENDL();
             /* CUR_SUM = 1.0 / CUR_SUM */
             cb_push_back(cb_cur_sum, Sq_chunk_t);
             reconfig_data_format(cb_cur_sum, cb_cur_sum);
@@ -567,6 +613,18 @@ void kernel_main() {
             // Free up cb_prev_max after K chunks
             cb_pop_front(cb_prev_max, Sq_chunk_t);
             cb_pop_front(cb_prev_sum, Sq_chunk_t);
+            DPRINT << "root done math" << ENDL();
+        } else if (parent_core_in_group != UINT32_MAX) {
+            // Non-root node: output intermediate results for writer to send to parent
+            // Writer will read from cb_out_worker (cb_out_o), cb_out_m, cb_out_l
+            DPRINT << "doing tree reduction non-root" << ENDL();
+            move_block<true>(cb_out_accumulate_im, cb_out_o, out_chunk_tiles);
+            DPRINT << "moved out im to out_o" << ENDL();
+            move_block<true>(cb_prev_max, cb_out_m, Sq_chunk_t);
+            DPRINT << "moved prev_max to out_m" << ENDL();
+            move_block<true>(cb_prev_sum, cb_out_l, Sq_chunk_t);
+            DPRINT << "moved prev_sum to out_l" << ENDL();
+            DPRINT << "non-root done math" << ENDL();
         }
     }
 
 
@@ -402,6 +402,112 @@ void worker_compute(
     cb_pop_front(cb_out_l, PNHt);
 }
 
+/******************************************************************************
+ *                   Tree Reduction Worker Functions                          *
+ ******************************************************************************/
+
+/**
+ * Forwards this core's partial results (m, l, o) to its parent in the reduction tree.
+ *
+ * Sequence:
+ *   1. Block until cb_out, cb_out_m and cb_out_l each hold a full chunk.
+ *   2. Write m, l and o (in that order, back to back) to the parent core, starting at
+ *      the address returned by get_write_ptr(cb_intermed_out) on this core plus a
+ *      per-round block offset.
+ *   3. Barrier on the writes, then increment the semaphore at `semaphore_addr` on the parent.
+ *   4. Pop the three local CBs.
+ *
+ * One block is (out_chunk_tiles + 2 * PNHt) tiles; the block for round `r` starts
+ * `r` blocks past the base address.
+ *
+ * NOTE(review): every size and offset here is derived from get_tile_size(cb_out), while
+ * tree_reduction_receive_from_child derives them from get_tile_size(cb_intermed_out).
+ * The two presumably share a data format -- confirm, otherwise the offsets disagree.
+ *
+ * @tparam out_chunk_tiles Number of tiles in the output (o) chunk
+ * @tparam cb_out          CB holding the output chunk to send
+ * @tparam cb_out_m        CB holding the max statistics to send
+ * @tparam cb_out_l        CB holding the sum statistics to send
+ * @tparam cb_intermed_out CB whose write pointer is used as the destination base address
+ * @tparam PNHt            Number of tiles in each of the m and l chunks
+ *
+ * @param parent_noc_x Physical X coordinate of parent core
+ * @param parent_noc_y Physical Y coordinate of parent core
+ * @param semaphore_addr Semaphore address; paired with the parent's coordinates to form
+ *                       the address that gets incremented
+ * @param round The current round in tree reduction (selects the destination block)
+ */
+template <
+    uint32_t out_chunk_tiles,
+    uint32_t cb_out,
+    uint32_t cb_out_m,
+    uint32_t cb_out_l,
+    uint32_t cb_intermed_out,
+    uint32_t PNHt>
+void tree_reduction_send_to_parent(
+    uint32_t parent_noc_x, uint32_t parent_noc_y, uint32_t semaphore_addr, uint32_t round) {
+    // All byte counts are multiples of cb_out's tile size.
+    constexpr uint32_t bytes_per_tile = get_tile_size(cb_out);
+    constexpr uint32_t stats_bytes = PNHt * bytes_per_tile;
+    constexpr uint32_t out_bytes = out_chunk_tiles * bytes_per_tile;
+    constexpr uint32_t tiles_per_block = out_chunk_tiles + 2 * PNHt;
+
+    DPRINT << "waiting for compute to deliver out and send" << ENDL();
+
+    // Nothing can be sent until compute has filled all three CBs.
+    cb_wait_front(cb_out, out_chunk_tiles);
+    cb_wait_front(cb_out_m, PNHt);
+    cb_wait_front(cb_out_l, PNHt);
+    DPRINT << "compute produced data, gonna send" << ENDL();
+
+    // Destination addresses on the parent: [ m | l | o ] inside this round's block.
+    const uint32_t round_offset = round * tiles_per_block * bytes_per_tile;
+    const uint64_t m_dst =
+        get_noc_addr(parent_noc_x, parent_noc_y, get_write_ptr(cb_intermed_out)) + round_offset;
+    const uint64_t l_dst = m_dst + stats_bytes;
+    const uint64_t o_dst = l_dst + stats_bytes;
+
+    noc_async_write(get_read_ptr(cb_out_m), m_dst, stats_bytes);
+    noc_async_write(get_read_ptr(cb_out_l), l_dst, stats_bytes);
+    noc_async_write(get_read_ptr(cb_out), o_dst, out_bytes);
+
+    // Barrier on the outstanding writes first, then signal the parent.
+    noc_async_write_barrier();
+    noc_semaphore_inc(get_noc_addr(parent_noc_x, parent_noc_y, semaphore_addr), 1);
+    DPRINT << "incremented parent sem" << ENDL();
+
+    // The payload has been handed off; release the local CBs.
+    cb_pop_front(cb_out, out_chunk_tiles);
+    cb_pop_front(cb_out_m, PNHt);
+    cb_pop_front(cb_out_l, PNHt);
+    DPRINT << "sent to parent" << ENDL();
+}
+
+/**
+ * Tree reduction receive function: receives intermediate results from a child core.
+ *
+ * Copies one child's (m, l, o) block out of this core's cb_intermed_out region and
+ * pushes it into cb_m_in, cb_l_in and cb_out_o. The block for round `r` starts `r`
+ * blocks past get_read_ptr(cb_intermed_out), where one block is
+ * (out_chunk_tiles + 2 * PNHt) tiles laid out as [ m | l | o ].
+ *
+ * This function contains no semaphore wait; any synchronization with the child has to
+ * happen in the caller before this is invoked.
+ *
+ * Each destination address is taken with get_write_ptr() after cb_reserve_back(), i.e. the
+ * slot that was just reserved. get_read_ptr() matches it only when the CB holds exactly
+ * one block, because this kernel never pops these CBs.
+ *
+ * @tparam out_chunk_tiles Number of tiles in the output (o) chunk
+ * @tparam cb_out_o        CB that receives the child's output chunk
+ * @tparam cb_m_in         CB that receives the child's max statistics
+ * @tparam cb_l_in         CB that receives the child's sum statistics
+ * @tparam cb_intermed_out CB whose read pointer is the base of the intermediate buffer
+ * @tparam PNHt            Number of tiles in each of the m and l chunks
+ *
+ * @param round The round from which the child is sending (determines read offset)
+ */
+template <
+    uint32_t out_chunk_tiles,
+    uint32_t cb_out_o,
+    uint32_t cb_m_in,
+    uint32_t cb_l_in,
+    uint32_t cb_intermed_out,
+    uint32_t PNHt>
+void tree_reduction_receive_from_child(uint32_t round) {
+    constexpr uint32_t tile_bytes_intermed = get_tile_size(cb_intermed_out);
+    constexpr uint32_t o_read_size = out_chunk_tiles * tile_bytes_intermed;
+    constexpr uint32_t ml_read_size = PNHt * tile_bytes_intermed;
+
+    // Calculate offset based on round
+    uint32_t block_offset = round * (out_chunk_tiles + 2 * PNHt) * tile_bytes_intermed;
+    uint64_t intermed_l1_read_addr = get_noc_addr(get_read_ptr(cb_intermed_out)) + block_offset;
+
+    // Reserve and read m, l, o (same order as send)
+    DPRINT << "reserving mlo1 for round " << round << ENDL();
+    cb_reserve_back(cb_m_in, PNHt);
+    DPRINT << "reserving mlo2 for round " << round << ENDL();
+    cb_reserve_back(cb_l_in, PNHt);
+    DPRINT << "reserving mlo3 for round " << round << ENDL();
+    cb_reserve_back(cb_out_o, out_chunk_tiles);
+    DPRINT << "reserved mlo for round " << round << ENDL();
+
+    // m: copy into the reserved slot, barrier on the read, then publish to compute.
+    uint32_t m_write_ptr = get_write_ptr(cb_m_in);
+    noc_async_read(intermed_l1_read_addr, m_write_ptr, ml_read_size);
+    intermed_l1_read_addr += ml_read_size;
+    noc_async_read_barrier();
+    cb_push_back(cb_m_in, PNHt);
+
+    // l: immediately follows m in the intermediate block.
+    uint32_t l_write_ptr = get_write_ptr(cb_l_in);
+    noc_async_read(intermed_l1_read_addr, l_write_ptr, ml_read_size);
+    intermed_l1_read_addr += ml_read_size;
+    noc_async_read_barrier();
+    cb_push_back(cb_l_in, PNHt);
+
+    // o: the remaining out_chunk_tiles tiles of the block.
+    uint32_t o_write_ptr = get_write_ptr(cb_out_o);
+    noc_async_read(intermed_l1_read_addr, o_write_ptr, o_read_size);
+    noc_async_read_barrier();
+    cb_push_back(cb_out_o, out_chunk_tiles);
+    DPRINT << "data is ready for another reduction" << round << ENDL();
+}
+
 template <uint32_t cb_out, uint32_t out_chunk_tiles, uint32_t barrier_threshold, typename WriterType>
 uint32_t write_tiles_to_memory(uint32_t& out_tile_id, const WriterType& out_writer, uint32_t& barrier_count) {
     constexpr uint32_t tile_bytes = get_tile_size(cb_out);