Skip to content

Commit 6de9ea4

Browse files
committed
fix
1 parent 7e9a700 commit 6de9ea4

File tree

2 files changed

+18
-25
lines changed

2 files changed

+18
-25
lines changed

models/demos/gpt_oss/tests/fused_op_unit_tests/test_gpt_oss_experts_mlp.py

Lines changed: 12 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -153,13 +153,13 @@ def gpt_oss_experts_mlp_reference(
153153
Returns:
154154
Expert output tensor [num_experts_per_device, B, S, H]
155155
"""
156-
num_tokens = batch_size * seq_len
156+
total_tokens = batch_size * seq_len
157157
num_experts = w1.shape[0]
158158
hidden_size = config.hidden_size
159159
intermediate_size = config.intermediate_size
160160

161161
# Reshape input: [1, 1, B*S, H] -> [B*S, H]
162-
x = post_dispatch.reshape(num_tokens, hidden_size)
162+
x = post_dispatch.reshape(total_tokens, hidden_size)
163163

164164
# Expand for all experts: [B*S, H] -> [num_experts, B*S, H]
165165
x_expanded = x.unsqueeze(0).expand(num_experts, -1, -1)
@@ -196,8 +196,7 @@ def gpt_oss_experts_mlp_ttnn(
196196
config: ThroughputExpertConfig,
197197
program_config: ThroughputProgramConfig,
198198
memory_config: ttnn.MemoryConfig,
199-
batch_size: int,
200-
seq_len: int,
199+
total_tokens: int,
201200
mesh_device=None,
202201
save_intermediate: bool = False,
203202
) -> ttnn.Tensor:
@@ -227,8 +226,7 @@ def gpt_oss_experts_mlp_ttnn(
227226
config=config,
228227
program_config=program_config,
229228
memory_config=memory_config,
230-
batch_size=batch_size,
231-
seq_len=seq_len,
229+
total_tokens=total_tokens,
232230
mesh_device=mesh_device,
233231
save_intermediate=save_intermediate,
234232
)
@@ -482,13 +480,13 @@ def _run_experts_mlp_test(
482480

483481
# Create input tensor (post_dispatch output)
484482
# Shape: [1, 1, B*S, H]
485-
num_tokens = batch_size * seq_len
486-
post_dispatch_torch = torch.randn(1, 1, num_tokens, hidden_size, dtype=torch.bfloat16)
483+
total_tokens = batch_size * seq_len
484+
post_dispatch_torch = torch.randn(1, 1, total_tokens, hidden_size, dtype=torch.bfloat16)
487485

488486
# Create sparsity tensor - for reference we'll compute dense
489487
# In practice sparsity indicates which (token_block, expert) pairs are active
490488
# For this test, we'll assume all tokens are active for all experts (dense case)
491-
num_sparse_blocks = num_tokens // throughput_config.sparsity_block_size
489+
num_sparse_blocks = total_tokens // throughput_config.sparsity_block_size
492490
num_experts_per_device = throughput_config.num_experts_per_device
493491

494492
# Create full sparsity tensor (all ones = all active)
@@ -536,8 +534,7 @@ def _run_experts_mlp_test(
536534
config=throughput_config,
537535
program_config=program_config,
538536
memory_config=memory_config,
539-
batch_size=batch_size,
540-
seq_len=seq_len,
537+
total_tokens=total_tokens,
541538
mesh_device=mesh_device,
542539
)
543540

@@ -572,8 +569,7 @@ def op_fn():
572569
config=throughput_config,
573570
program_config=program_config,
574571
memory_config=memory_config,
575-
batch_size=batch_size,
576-
seq_len=seq_len,
572+
total_tokens=total_tokens,
577573
mesh_device=mesh_device,
578574
)
579575

@@ -819,11 +815,11 @@ def test_gpt_oss_experts_mlp_single_device(
819815
w2_ref = state_dict["down_proj"]
820816

821817
# Create input tensor
822-
num_tokens = batch_size_per_device * seq_len
823-
post_dispatch_torch = torch.randn(1, 1, num_tokens, hidden_size, dtype=torch.bfloat16)
818+
total_tokens = batch_size_per_device * seq_len
819+
post_dispatch_torch = torch.randn(1, 1, total_tokens, hidden_size, dtype=torch.bfloat16)
824820

825821
# Create sparsity tensor
826-
num_sparse_blocks = num_tokens // throughput_config.sparsity_block_size
822+
num_sparse_blocks = total_tokens // throughput_config.sparsity_block_size
827823
num_experts_per_device = throughput_config.num_experts_per_device
828824
sparsity_torch = torch.ones(num_sparse_blocks, 1, 1, num_experts_per_device, dtype=torch.bfloat16)
829825

models/demos/gpt_oss/tt/experts_throughput/decode.py

Lines changed: 6 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -108,8 +108,7 @@ def expert_mlp_forward(
108108
config: ThroughputExpertConfig,
109109
program_config: ThroughputProgramConfig,
110110
memory_config: ttnn.MemoryConfig,
111-
batch_size: int,
112-
seq_len: int,
111+
total_tokens: int,
113112
mesh_device=None,
114113
save_intermediate: bool = False,
115114
) -> ttnn.Tensor:
@@ -139,8 +138,7 @@ def expert_mlp_forward(
139138
"""
140139
# Reshape to sparse block format for matmul
141140
# Note: reshape returns a view - don't deallocate post_dispatch separately
142-
num_tokens = batch_size * seq_len
143-
num_sparse_blocks = num_tokens // config.sparsity_block_size
141+
num_sparse_blocks = total_tokens // config.sparsity_block_size
144142
reshaped_expert_input = ttnn.reshape(
145143
experts_input,
146144
shape=(1, num_sparse_blocks, config.sparsity_block_size, config.hidden_size),
@@ -174,7 +172,7 @@ def expert_mlp_forward(
174172

175173
# Up projection (w3): same shape as gate
176174
w3_out = ttnn.sparse_matmul(
177-
expert_input,
175+
reshaped_expert_input,
178176
weights.w3,
179177
sparsity=sparsity,
180178
memory_config=memory_config,
@@ -183,7 +181,7 @@ def expert_mlp_forward(
183181
is_input_b_sparse=True,
184182
output_tile=ttnn.Tile([config.sparsity_block_size, ttnn.TILE_SIZE]),
185183
)
186-
ttnn.deallocate(expert_input)
184+
ttnn.deallocate(reshaped_expert_input)
187185

188186
# Add up bias
189187
# w3_out shape: [1, num_sparse_blocks, 1, num_experts_per_device, block_size, intermediate]
@@ -306,7 +304,7 @@ def decode_forward(
306304
topk_expert_indices = ttnn.typecast(topk_expert_indices, dtype=ttnn.uint32)
307305
topk_expert_indices = ttnn.reshape(topk_expert_indices, (-1, 1, 1, config.num_experts_per_tok))
308306
topk_expert_indices = ttnn.typecast(topk_expert_indices, dtype=ttnn.uint16)
309-
307+
310308
topk_expert_weights = ttnn.reshape(topk_expert_weights, (-1, 1, 1, config.num_experts_per_tok))
311309

312310
num_dispatch_devices = (
@@ -410,8 +408,7 @@ def decode_forward(
410408
config=config,
411409
program_config=program_config,
412410
memory_config=dispatch_config.memory_config,
413-
batch_size=batch_size,
414-
seq_len=seq_len,
411+
total_tokens=total_tokens,
415412
mesh_device=mesh_device,
416413
save_intermediate=False,
417414
)

0 commit comments

Comments (0)