Commit 387359e

Ubuntu authored and committed
Enable get_last_token for batch>1 prefill, single-call paged_fill_cache
- model.py: When batch_size>1, extract each user's 32-token last-token tile via ttnn.slice and concat to [1,1,B*32,H] before norm+lm_head. Removes the batch>1 get_last_token=-1 override in ttnn_prefill_forward.
- attention/prefill.py: Replace per-user paged_fill_cache loop with single-call reshape approach (flatten batch into seq dim, heads into last dim, flatten page table). Matches llama_70b_galaxy pattern.
- text_demo.py: Remove batch>1 get_last_token override, use seq_len_per_user=32 when get_last_token is active.

batch128 users_per_row_per_iter=2 TTFT: 218ms -> 99ms (get_last_token fix)
Compile time: 5.24s -> 3.16s (single-call paged_fill_cache)
batch128 users_per_row_per_iter=1: unchanged at 91ms
1 parent 2c88901 commit 387359e
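
For context on the 32-token tile constraint the message refers to: the captured prefill trace can only bake in a fixed get_last_token offset if every user's last prompt token lands in the same 32-token tile; otherwise the demo falls back to get_last_token=-1 and keeps the full sequence output. A minimal sketch of that rule (plain Python; the helper name and example values are illustrative, not code from this commit):

# Hypothetical helper illustrating the tile-alignment rule; not a function in the repo.
def pick_get_last_token(last_token_idxs, tile=32):
    min_tile = (min(last_token_idxs) // tile) * tile
    max_tile = (max(last_token_idxs) // tile) * tile
    # All users in one tile -> slice only that tile; otherwise return -1 (full output).
    return min_tile if min_tile == max_tile else -1

assert pick_get_last_token([37, 45]) == 32   # both last tokens fall in tile [32, 64)
assert pick_get_last_token([30, 45]) == -1   # spans tiles [0, 32) and [32, 64)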

3 files changed: +45, -27 lines

models/demos/gpt_oss/demo/text_demo.py

Lines changed: 17 additions & 15 deletions
@@ -599,17 +599,14 @@ def test_gpt_oss_demo(
 
     # Compute fixed get_last_token for trace (all users must be in same 32-token tile)
     all_last_idxs = [int(decoding_pos[uid]) - 1 for uid in range(global_batch_size)]
-    if users_per_row_per_iter > 1:
-        fixed_get_last_token = -1  # Can't use get_last_token with batch>1
-    else:
-        fixed_get_last_token = (min(all_last_idxs) // 32) * 32
-        max_tile_start = (max(all_last_idxs) // 32) * 32
-        if fixed_get_last_token != max_tile_start:
-            logger.warning(
-                f"Users span multiple 32-token tiles ({fixed_get_last_token} vs {max_tile_start}), "
-                f"using get_last_token=-1 (slower)"
-            )
-            fixed_get_last_token = -1
+    fixed_get_last_token = (min(all_last_idxs) // 32) * 32
+    max_tile_start = (max(all_last_idxs) // 32) * 32
+    if fixed_get_last_token != max_tile_start:
+        logger.warning(
+            f"Users span multiple 32-token tiles ({fixed_get_last_token} vs {max_tile_start}), "
+            f"using get_last_token=-1 (slower)"
+        )
+        fixed_get_last_token = -1
 
     def _prepare_batch_host(user_indices):
         """Prepare host-side tokens + page_table for a batch of users."""
@@ -669,7 +666,7 @@ def _prepare_batch_host(user_indices):
             tt_logits,
             [idx % 32 for idx in last_w],
             users_per_row=users_per_row_per_iter,
-            seq_len_per_user=max_padded_len,
+            seq_len_per_user=32,
         )
         for row, uid in enumerate(warmup_indices):
             prefilled_token[uid] = torch.argmax(warmup_results[row].view(-1)).item()
@@ -753,7 +750,7 @@ def _prepare_batch_host(user_indices):
             tt_out_trace,
             [idx % 32 for idx in last_i],
             users_per_row=users_per_row_per_iter,
-            seq_len_per_user=max_padded_len,
+            seq_len_per_user=32,
         )
         for row, uid in enumerate(user_indices):
             prefilled_token[uid] = torch.argmax(row_results[row].view(-1)).item()
@@ -804,7 +801,10 @@ def _run_batched_prefill_iter(iter_idx, user_indices):
             batched_prefill=True,
         )
 
-        get_last_token_val = (max(batch_last_token_idxs) // 32) * 32 if users_per_row_per_iter == 1 else -1
+        # Use get_last_token if all users' last tokens fall in the same 32-token tile
+        min_tile = (min(batch_last_token_idxs) // 32) * 32
+        max_tile = (max(batch_last_token_idxs) // 32) * 32
+        get_last_token_val = min_tile if min_tile == max_tile else -1
         tt_logits = model[model_id].ttnn_prefill_forward(
             tokens_embd,
             rot_mats_global=rot_mats_global,
@@ -818,13 +818,15 @@ def _run_batched_prefill_iter(iter_idx, user_indices):
 
         if get_last_token_val == -1:
             adjusted_last_idxs = batch_last_token_idxs
+            seq_len_for_output = padded_len
         else:
             adjusted_last_idxs = [idx % 32 for idx in batch_last_token_idxs]
+            seq_len_for_output = 32
         row_results = model[model_id].process_output_prefill_batched(
             tt_logits,
             adjusted_last_idxs,
             users_per_row=users_per_row_per_iter,
-            seq_len_per_user=padded_len,
+            seq_len_per_user=seq_len_for_output,
         )
         return row_results
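
Why the demo now passes idx % 32 together with seq_len_per_user=32: once get_last_token is active, each user's portion of the prefill output is just its 32-token tile, so the wanted logit sits at row*32 + (last_idx % 32). A torch-only sketch of that indexing (the host-tensor layout and the loop are assumptions about what process_output_prefill_batched does with these arguments, not its actual code):

import torch

# Assumed layout after the per-user tile extraction: users stacked along the
# sequence dim as 32-token tiles, i.e. [1, 1, B*32, vocab] on the host.
batch, vocab = 2, 8
flat_logits = torch.randn(1, 1, batch * 32, vocab)
last_token_idxs = [37, 45]                    # absolute positions, same tile [32, 64)

for row, last_idx in enumerate(last_token_idxs):
    offset = last_idx % 32                    # position inside the user's 32-token tile
    user_logits = flat_logits[0, 0, row * 32 + offset]
    next_token = torch.argmax(user_logits).item()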

models/demos/gpt_oss/tt/attention/prefill.py

Lines changed: 9 additions & 7 deletions
@@ -99,13 +99,15 @@ def prefill_forward(
     block_size = k_cache.shape[2]
     page_len = page_table.shape[-1] * block_size
     if batch_size > 1:
-        for b in range(batch_size):
-            k_b = ttnn.slice(tt_k, (b, 0, 0, 0), (b + 1, tt_k.shape[1], min(page_len, seq_len), tt_k.shape[3]))
-            v_b = ttnn.slice(tt_v, (b, 0, 0, 0), (b + 1, tt_v.shape[1], min(page_len, seq_len), tt_v.shape[3]))
-            ttnn.experimental.paged_fill_cache(k_cache, k_b, page_table, batch_idx=b)
-            ttnn.experimental.paged_fill_cache(v_cache, v_b, page_table, batch_idx=b)
-            k_b.deallocate(True)
-            v_b.deallocate(True)
+        # Flatten batch into seq dim, heads into last dim — single fill call, no per-user loop.
+        # Paged cache just maps sequence positions to physical pages.
+        k_fill = ttnn.reshape(tt_k, [1, 1, total_seq_len, -1])
+        v_fill = ttnn.reshape(tt_v, [1, 1, total_seq_len, -1])
+        page_table_flat = ttnn.reshape(page_table, [1, -1])
+        ttnn.experimental.paged_fill_cache(k_cache, k_fill, page_table_flat, batch_idx=0)
+        ttnn.experimental.paged_fill_cache(v_cache, v_fill, page_table_flat, batch_idx=0)
+        k_fill.deallocate(True)
+        v_fill.deallocate(True)
     else:
         tt_k_sliced = tt_k[:, :, :page_len, :] if page_len < tt_k.shape[2] else tt_k
         tt_v_sliced = tt_v[:, :, :page_len, :] if page_len < tt_v.shape[2] else tt_v
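
The single-call fill works because a paged KV cache only needs the mapping from flattened sequence positions to physical pages; concatenating the per-user page tables into one row keeps user b's positions pointing at user b's pages. A toy torch emulation of that idea (the fill loop mimics assumed paged_fill_cache semantics with made-up shapes; it is not the ttnn op):

import torch

block_size, blocks_per_user, batch = 32, 4, 2
seq_per_user = block_size * blocks_per_user           # 128 tokens per user
hidden = 8

page_table = torch.randperm(batch * blocks_per_user).reshape(batch, blocks_per_user)
kv = torch.randn(1, 1, batch * seq_per_user, hidden)  # users concatenated along the seq dim
cache = torch.zeros(batch * blocks_per_user, block_size, hidden)

page_table_flat = page_table.reshape(1, -1)
for p in range(kv.shape[2]):                          # one pass over the flattened sequence
    page = page_table_flat[0, p // block_size]        # physical page for this position
    cache[page, p % block_size] = kv[0, 0, p]

# User 1's token 5 lands in user 1's first page, exactly where a per-user fill would put it.
assert torch.equal(cache[page_table[1, 0], 5], kv[0, 0, seq_per_user + 5])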

models/demos/gpt_oss/tt/model.py

Lines changed: 19 additions & 5 deletions
@@ -278,12 +278,26 @@ def _forward_layers_and_head(
     logits = hidden_states
 
     if get_last_token != -1:
-        # The logits come from the shared method, slice them
         if len(logits.shape) == 3:
             logits = ttnn.unsqueeze(logits, dim=1)
-        logits_sliced = ttnn.slice(logits, (0, 0, get_last_token, 0), (1, 1, get_last_token + 32, logits.shape[-1]))
-        logits.deallocate(True)
-        logits = logits_sliced
+        if batch_size > 1:
+            # Batch>1: tokens are concatenated [1,1,B*S,H]. Extract each user's 32-token tile.
+            per_user_seq = logits.shape[2] // batch_size
+            tiles = []
+            for b in range(batch_size):
+                start = b * per_user_seq + get_last_token
+                tile = ttnn.slice(logits, (0, 0, start, 0), (1, 1, start + 32, logits.shape[-1]))
+                tiles.append(tile)
+            logits.deallocate(True)
+            logits = ttnn.concat(tiles, dim=2)  # [1, 1, B*32, H]
+            for t in tiles:
+                t.deallocate(True)
+        else:
+            logits_sliced = ttnn.slice(
+                logits, (0, 0, get_last_token, 0), (1, 1, get_last_token + 32, logits.shape[-1])
+            )
+            logits.deallocate(True)
+            logits = logits_sliced
     hidden_states = logits
 
     # Final norm and lm_head
@@ -366,7 +380,7 @@ def ttnn_prefill_forward(
         current_pos=None,  # No current_pos for prefill
         page_table=page_table,
         kv_cache=kv_cache,
-        get_last_token=get_last_token if batch_size == 1 else -1,  # Disable get_last_token for batch>1
+        get_last_token=get_last_token,
         is_decode=False,
         user_id=user_id,
         batch_size=batch_size,
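
A torch-only illustration of the batch>1 slice-and-concat above, assuming the prefill logits hold all users concatenated along the sequence dim as [1, 1, B*S, H]; the shapes and values below are made up for the example and this is not the ttnn code path itself:

import torch

batch_size, per_user_seq, hidden, get_last_token = 2, 128, 16, 32
logits = torch.randn(1, 1, batch_size * per_user_seq, hidden)   # [1, 1, B*S, H]

tiles = []
for b in range(batch_size):
    start = b * per_user_seq + get_last_token        # user b's 32-token tile starts here
    tiles.append(logits[:, :, start:start + 32, :])
out = torch.cat(tiles, dim=2)                        # [1, 1, B*32, H]

# Row b*32 + r of the result is user b's hidden state for position get_last_token + r,
# so norm + lm_head only run on B*32 rows instead of the full padded sequence.
assert torch.equal(out[0, 0, 1 * 32 + 3], logits[0, 0, 1 * per_user_seq + get_last_token + 3])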
