Skip to content

Commit f72cada

Browse files
committed
more clean up
1 parent 5020d74 commit f72cada

File tree

1 file changed

+1
-3
lines changed

1 file changed

+1
-3
lines changed

ttnn/cpp/ttnn/operations/transformer/sdpa_decode/device/kernels/dataflow/writer_decode_all.cpp

Lines changed: 1 addition & 3 deletions
Original file line number | Diff line number | Diff line change
@@ -232,8 +232,7 @@ void kernel_main() {
232 232
cur_head < cur_head_group * num_heads_per_core + num_heads_per_core;
233 233
++cur_head) {
234 234
// Tree reduction: receive from children at each round
235-
// IMPORTANT: We process round-by-round to synchronize with compute kernel
236-
// Each round, we wait for one child (if any), read its data, and push to CBs
235+
// Each round, we wait for one child (if any), read remote_sum, remote_max, remote_output, and push to CBs
237 236
// The compute kernel processes each child's data before we move to the next round
238 237
// Only receive from children that actually have data
239 238
if (actual_num_children > 0) {
@@ -262,7 +261,6 @@ void kernel_main() {
262 261
constexpr uint32_t ml_read_size = PNHt * tile_bytes_intermed;
263 262

264 263
// Calculate offset based on round (child writes at round offset)
265-
// Note: Use get_write_ptr to match what the sender uses when writing to this buffer
266 264
uint32_t block_offset = round * (out_chunk_tiles + 2 * PNHt) * tile_bytes_intermed;
267 265
uint64_t intermed_l1_read_addr = get_noc_addr(get_read_ptr(cb_intermed_out)) + block_offset;
268 266

0 commit comments

Comments
 (0)