 from loguru import logger
 from typing import List
 from collections import defaultdict
+from dataclasses import fields, replace

 from llama_models.llama3.api.datatypes import (
     InterleavedTextMedia,
@@ -64,6 +65,18 @@ def __init__(self, model, model_args, mesh_device, tokenizer=None, formatter=Non
         self.trace_id_prefill = defaultdict(lambda: None)
         self.trace_inputs_prefill = defaultdict(lambda: None)
         self.trace_output_prefill = defaultdict(lambda: None)
+        # Create persistent buffer for accumulated logits (used for on-device sampling)
+        self.tt_logits_accumulated = [
+            ttnn.from_torch(
+                torch.zeros(1, 1, 1, self.model.args.padded_vocab_size // self.model_args.cluster_shape[0]),
+                mesh_mapper=ttnn.ReplicateTensorToMesh(self.mesh_device),
+                dtype=ttnn.bfloat8_b,
+                device=self.mesh_device,
+                layout=ttnn.TILE_LAYOUT,
+            )
+            for _ in range(self.model_args.max_batch_size)
+        ]
+        self.tt_logits_accumulated_batched = []  # Temporary list for batched prefill
         self.prev_page_table = None
         self.prefill_traces_warmup = False
         self.trace_ids_decode = defaultdict(lambda: None)  # {return_logits: {device_id: trace_id}}
@@ -146,7 +159,7 @@ def prefill_forward_text(
                 kv_cache,
                 prompt_lens,
                 enable_trace,
-                sampling_params,
+                None,
                 empty_slots,
                 tt_out_logits_all_users,
             )
@@ -176,9 +189,7 @@ def prefill_forward_text(
         if (
             batch >= 16
             and len(set(prefill_seq_lens)) == 1
-            and prefill_seq_lens[0] < 4 * 1024
-            and tt_out_logits_all_users is None
-            and not return_logits
+            and prefill_seq_lens[0] == 128
         ):
             use_batched_prefill = True
@@ -192,7 +203,6 @@ def prefill_forward_text(
         do_device_sampling = (not return_logits) and (not save_logits_to_host)

         # Accumulate sharded logits (same format as decode, before all-gather) for on-device sampling.
-        tt_logits_accumulated = [] if do_device_sampling else None

         all_users = [0] if use_batched_prefill else empty_slots

@@ -255,6 +265,10 @@ def prefill_forward_text(
                 prefill_kwargs["tt_out_logits_saved"] = tt_out_logits_saved

             if enable_trace:
+                # For batched prefill, reset to empty list since we use extend()
+                # For non-batched prefill with device sampling, use persistent buffer from __init__
+                if use_batched_prefill and do_device_sampling:
+                    self.tt_logits_accumulated_batched = []
                 tt_tok = self._easy_trace_prefill(**prefill_kwargs, prefill_seq_len=prefill_seq_len)
             else:
                 tt_tok = self.prefill_forward_single_user_text(**prefill_kwargs)
@@ -278,49 +292,64 @@ def prefill_forward_text(
                 tt_logits_list = self.model.process_output_prefill_logits(tt_tok, last_token_idx=last_token_idx)
                 if use_batched_prefill:
                     # Batched prefill: logits list has 32 entries ordered by slot position
-                    tt_logits_accumulated.extend(tt_logits_list)
+                    self.tt_logits_accumulated_batched.extend(tt_logits_list)
                 else:
-                    # Single user: logits list has 1 entry
-                    tt_logits_accumulated.append(ttnn.clone(tt_logits_list[0]))
-
+                    # Single user: logits list has 1 entry, copy into persistent buffer
+                    ttnn.copy(input_a=tt_logits_list[0], input_b=self.tt_logits_accumulated[user_id])
         # On-device sampling for prefill
-        if do_device_sampling and tt_logits_accumulated:
+        if do_device_sampling:
             padded_batch = 32

-            # lm_head output is a list [logits_tensor], extract the tensor
-            logits_tensors = [logits[0] if isinstance(logits, list) else logits for logits in tt_logits_accumulated]
-
-            if use_batched_prefill:
-                # Batched prefill: logits already have 32 entries (one per slot), ordered by slot.
-                tt_logits_batch = ttnn.concat(logits_tensors, dim=2)
-            else:
-                # Non-batched prefill: we have `batch` logits, need to pad to 32.
-                # Logits are in batch order (same as tokens and sampling_params).
-                if len(logits_tensors) > 1:
-                    tt_logits_batch = ttnn.concat(logits_tensors, dim=2)
-                else:
-                    tt_logits_batch = logits_tensors[0]
-
-                # Pad to 32 users for sampling
-                num_users = len(logits_tensors)
-                if num_users < padded_batch:
-                    padding_needed = padded_batch - num_users
-                    padding_tensors = [logits_tensors[-1]] * padding_needed
-                    tt_logits_batch = ttnn.concat([tt_logits_batch] + padding_tensors, dim=2)
+            # Use batched list for batched prefill, persistent buffer for non-batched
+            logits_source = self.tt_logits_accumulated_batched if use_batched_prefill else self.tt_logits_accumulated

+            # Concatenate along slot dimension -> [1, 1, 1[32], vocab_shard]
+            tt_logits_batch = ttnn.concat(logits_source, dim=2)
             # Sample using the sampling module
             # Logits are in sharded format (before all-gather), same as decode
             # sampling_params are already padded to 32 by format_sampling_params
             self.model.switch_mode("decode")

             # Setting sampling module up after switch to decode mode
             sampling_params = format_sampling_params(sampling_params, self.model_args.max_batch_size)
+
+            # Reorder sampling params so values sit in their slot positions (except seed).
+            def _scatter_params_to_slots(params, slots):
+                max_batch = self.model_args.max_batch_size
+
+                def _scatter_list(values):
+                    if not isinstance(values, list):
+                        return values
+                    values = list(values)
+                    # Broadcast single-entry lists to match user count
+                    if len(values) == 1 and len(slots) > 1:
+                        values = values * len(slots)
+                    user_vals = values[: len(slots)]
+                    filler = values[len(slots)] if len(values) > len(slots) else values[-1]
+                    scattered = [filler for _ in range(max_batch)]
+                    for val, slot_idx in zip(user_vals, slots):
+                        scattered[slot_idx] = val
+                    return scattered
+
+                updates = {}
+                for f in fields(SamplingParams):
+                    if f.name == "seed":
+                        # Seeds stay in original order; no reordering to slot indices.
+                        updates[f.name] = getattr(params, f.name)
+                        continue
+                    updates[f.name] = _scatter_list(getattr(params, f.name))
+                return replace(params, **updates)
+
+            sampling_params = _scatter_params_to_slots(sampling_params, empty_slots)
+            # print("sampling_params_scattered", sampling_params, "empty_slots", empty_slots)
             sampling_module = self.model.sampling
+
             sampling_module.reset_sampling_params(sampling_params)
             # if prompt_tokens is not None:  # Guard for warmup
             sampling_module.reset_prompt_tokens(prefill_ids)
             sampling_module.reset_output_state()
-            sampling_module.reset_seed(sampling_params.seed)
+            sampling_module.seed_manager.reset_seed(sampling_params.seed, empty_slots)
+            sampling_module.seed_manager.get_new_values(empty_slots)
             tt_sampled, tt_log_probs = sampling_module.sample(
                 tt_logits_batch,
                 tt_out_tok=None,
@@ -333,14 +362,9 @@ def prefill_forward_text(

             sampled_tokens = ttnn.to_torch(ttnn.get_device_tensors(tt_sampled)[0])

-            if use_batched_prefill:
-                # Batched prefill: sampled_tokens has 32 entries ordered by slot.
-                sampled_tensor = sampled_tokens[0, 0, 0, :]  # Shape: [32]
-                output_toks = sampled_tensor[empty_slots].reshape(batch, 1, 1)
-            else:
-                # Non-batched prefill: first `batch` entries are our results in batch order.
-                for i in range(batch):
-                    output_toks[i] = sampled_tokens[0, 0, 0, i].item()
+            # sampled_tokens has 32 entries ordered by slot.
+            sampled_tensor = sampled_tokens[0, 0, 0, :]  # Shape: [32]
+            output_toks = sampled_tensor[empty_slots].reshape(batch, 1, 1)

         if return_logits:
             # TODO: the current solution runs the argmax even if we are returning logits
@@ -523,6 +547,7 @@ def decode_forward_text(
             "is_cur_pos_sharded": is_cur_pos_sharded,
             "is_page_table_sharded": is_page_table_sharded,
         }
+        self.model.sampling.seed_manager.get_new_values()
         if reset_inputs and sampling_params is not None:
             # If we have new inputs, we need to set up the sampling module again
             sampling_params = format_sampling_params(sampling_params, self.model_args.max_batch_size)
@@ -532,7 +557,6 @@ def decode_forward_text(
             if reset_batch:
                 sampling_module.reset_prompt_tokens(prompt_tokens)
                 sampling_module.reset_output_state(output_tokens)
-                sampling_module.reset_seed(sampling_params.seed)

         if tt_out_logits_saved is not None:
             decode_kwargs["tt_out_logits_saved"] = tt_out_logits_saved
@@ -834,18 +858,16 @@ def warmup_model_prefill(self, kv_cache, enable_trace, sampling_params) -> None:
         # page_table gets padded properly in prefill_forward_text
         # be sure to pad correctly for non traced sequences in future warmup calls
         page_table = torch.zeros(1, 1, dtype=torch.int32)
-        # in case of multiple sampling parameters, we need to warmup for each one
-        for s in sampling_params:
-            self.warmup_prefill_traces(
-                tokens=None,
-                page_table=page_table,
-                kv_cache=kv_cache,
-                prompt_lens=None,
-                enable_trace=enable_trace,
-                sampling_params=s,
-                empty_slots=None,
-                tt_out_logits_all_users=None,
-            )
+        self.warmup_prefill_traces(
+            tokens=None,
+            page_table=page_table,
+            kv_cache=kv_cache,
+            prompt_lens=None,
+            enable_trace=enable_trace,
+            sampling_params=None,
+            empty_slots=None,
+            tt_out_logits_all_users=None,
+        )

     ## Destructor (used to delete ttnn trace if exists)

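For reference, the slot-scattering behavior added to prefill_forward_text can be checked host-side without any device code. Below is a minimal, self-contained sketch of the same idea, assuming a simplified SamplingParams stand-in (the field names here are illustrative; the repo's dataclass has more fields):

    from dataclasses import dataclass, fields, replace
    from typing import List, Union

    @dataclass
    class SamplingParams:
        # Simplified stand-in for illustration only.
        temperature: Union[float, List[float]]
        top_p: Union[float, List[float]]
        seed: Union[int, List[int]]

    def scatter_params_to_slots(params: SamplingParams, slots: List[int], max_batch: int) -> SamplingParams:
        # Place each user's value at its slot index; seeds keep their original order.
        def scatter_list(values):
            if not isinstance(values, list):
                return values  # scalar applies uniformly, nothing to scatter
            values = list(values)
            if len(values) == 1 and len(slots) > 1:
                values = values * len(slots)  # broadcast a single entry to all users
            user_vals = values[: len(slots)]
            filler = values[len(slots)] if len(values) > len(slots) else values[-1]
            scattered = [filler] * max_batch
            for val, slot_idx in zip(user_vals, slots):
                scattered[slot_idx] = val
            return scattered

        updates = {
            f.name: getattr(params, f.name) if f.name == "seed" else scatter_list(getattr(params, f.name))
            for f in fields(SamplingParams)
        }
        return replace(params, **updates)

    # Two users occupy slots 5 and 9 of a 32-wide batch.
    params = SamplingParams(temperature=[0.7, 1.0], top_p=0.9, seed=[1, 2])
    scattered = scatter_params_to_slots(params, slots=[5, 9], max_batch=32)
    assert scattered.temperature[5] == 0.7 and scattered.temperature[9] == 1.0
    assert scattered.top_p == 0.9   # scalars pass through untouched
    assert scattered.seed == [1, 2]  # seeds are not reordered

Sampled tokens come back from the device ordered by slot, so per-user results are recovered with the same slot list, mirroring sampled_tensor[empty_slots] in the diff above.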