
Commit 1e16021

rm reshape
1 parent 18de958 commit 1e16021

4 files changed, +43 -11 lines changed

models/demos/gpt_oss/tt/attention/decode.py

Lines changed: 3 additions & 1 deletion

@@ -2,6 +2,7 @@
 # SPDX-License-Identifier: Apache-2.0

 import ttnn
+from models.demos.gpt_oss.tt.common import row_major_reshape

 from .config import AttentionConfig, ProgramConfig
 from .operations import apply_allreduce, apply_rope
@@ -160,7 +161,8 @@ def decode_forward(
     tt_sdpa_out.deallocate(True)
     tt_out = ttnn.add(tt_out, weights.o_proj_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
     tt_out = ttnn.typecast(tt_out, ttnn.bfloat8_b)
-    tt_out = ttnn.reshape(
+    # tt_out = ttnn.reshape(
+    tt_out = row_major_reshape(
         tt_out,
         (1, 1, batch_size, hidden_size),
         (1, 1, 32, hidden_size),

models/demos/gpt_oss/tt/common.py

Lines changed: 18 additions & 0 deletions

@@ -10,6 +10,24 @@
 from models.tt_transformers.tt.common import PagedAttentionConfig


+def row_major_reshape(tensor: ttnn.Tensor, shape: ttnn.Shape) -> ttnn.Tensor:
+    """Reshape a tensor to row major layout.
+
+    Args:
+        tensor: Input tensor
+        shape: New shape
+
+    Returns:
+    """
+    tensor_is_tile = tensor.layout == ttnn.TILE_LAYOUT
+    if tensor_is_tile:
+        tensor = ttnn.to_layout(tensor, ttnn.ROW_MAJOR_LAYOUT)
+    out = ttnn.reshape(tensor, shape)
+    if tensor_is_tile:
+        out = ttnn.to_layout(out, ttnn.TILE_LAYOUT)
+    return out
+
+
 def create_tt_model(
     mesh_device,
     max_batch_size,
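For readers unfamiliar with the new helper, a minimal usage sketch follows. It is not part of the commit: it assumes a single local Tenstorrent device, uses standard ttnn entry points (ttnn.open_device, ttnn.from_torch, ttnn.close_device), and the shapes are purely illustrative.

import torch
import ttnn

from models.demos.gpt_oss.tt.common import row_major_reshape

# Assumption: one local device is available; device id 0 is illustrative.
device = ttnn.open_device(device_id=0)

# A small tile-layout tensor standing in for the activations reshaped in the diffs below.
x = ttnn.from_torch(
    torch.randn(1, 1, 64, 128),
    dtype=ttnn.bfloat16,
    layout=ttnn.TILE_LAYOUT,
    device=device,
)

# The helper drops to ROW_MAJOR_LAYOUT, reshapes, then restores TILE_LAYOUT,
# so call sites can keep passing tile-layout tensors exactly as before.
y = row_major_reshape(x, (1, 1, 32, 256))
print(y.shape, y.layout)  # expected: (1, 1, 32, 256), TILE_LAYOUT

ttnn.close_device(device)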

models/demos/gpt_oss/tt/experts_throughput/decode.py

Lines changed: 19 additions & 9 deletions

@@ -17,6 +17,7 @@
 from math import prod

 import ttnn
+from models.demos.gpt_oss.tt.common import row_major_reshape

 from .config import AllToAllCombineConfig, AllToAllDispatchConfig, ThroughputExpertConfig, ThroughputProgramConfig
 from .weights import ThroughputExpertWeights
@@ -124,19 +125,22 @@ def decode_forward(
     tokens_per_device = input_shape[0] * input_shape[2]  # B * S

     # Reshape hidden states: put all tokens on dim -2
-    hidden_states = ttnn.reshape(hidden_states, (1, 1, tokens_per_device, config.hidden_size))
+    # hidden_states = ttnn.reshape(hidden_states, (1, 1, tokens_per_device, config.hidden_size))
+    hidden_states = row_major_reshape(hidden_states, (1, 1, tokens_per_device, config.hidden_size))

     # typecast creates new tensors - safe to deallocate originals
     topk_expert_indices_orig = topk_expert_indices
     topk_expert_indices = ttnn.typecast(topk_expert_indices, dtype=ttnn.uint32)
     ttnn.deallocate(topk_expert_indices_orig)

     # Reshape indices: put all tokens on dim -2
-    topk_expert_indices = ttnn.reshape(topk_expert_indices, (1, 1, tokens_per_device, config.num_experts_per_tok))
+    # topk_expert_indices = ttnn.reshape(topk_expert_indices, (1, 1, tokens_per_device, config.num_experts_per_tok))
+    topk_expert_indices = row_major_reshape(topk_expert_indices, (1, 1, tokens_per_device, config.num_experts_per_tok))
     topk_expert_indices_u32 = topk_expert_indices
     topk_expert_indices = ttnn.typecast(topk_expert_indices, dtype=ttnn.uint16)
     ttnn.deallocate(topk_expert_indices_u32)
-    topk_expert_weights = ttnn.reshape(topk_expert_weights, (1, 1, tokens_per_device, config.num_experts_per_tok))
+    # topk_expert_weights = ttnn.reshape(topk_expert_weights, (1, 1, tokens_per_device, config.num_experts_per_tok))
+    topk_expert_weights = row_major_reshape(topk_expert_weights, (1, 1, tokens_per_device, config.num_experts_per_tok))

     num_dispatch_devices = (
         mesh_device.shape[dispatch_config.cluster_axis]
@@ -154,12 +158,14 @@
     hidden_rm = ttnn.to_layout(hidden_states, ttnn.ROW_MAJOR_LAYOUT)
     ttnn.deallocate(hidden_states)
     # Shape is already [1, 1, tokens_per_device, H], just ensure it's correct
-    hidden_rm = ttnn.reshape(hidden_rm, shape=(1, 1, tokens_per_device, config.hidden_size))
+    # hidden_rm = ttnn.reshape(hidden_rm, shape=(1, 1, tokens_per_device, config.hidden_size))
+    hidden_rm = row_major_reshape(hidden_rm, (1, 1, tokens_per_device, config.hidden_size))

     # Expert indices: [1, 1, tokens_per_device, K]
     topk_indices_rm = ttnn.to_layout(topk_expert_indices, ttnn.ROW_MAJOR_LAYOUT)
     ttnn.deallocate(topk_expert_indices)
-    topk_indices_rm = ttnn.reshape(topk_indices_rm, shape=(1, 1, tokens_per_device, config.num_experts_per_tok))
+    # topk_indices_rm = ttnn.reshape(topk_indices_rm, shape=(1, 1, tokens_per_device, config.num_experts_per_tok))
+    topk_indices_rm = row_major_reshape(topk_indices_rm, (1, 1, tokens_per_device, config.num_experts_per_tok))

     # ==========================================================================
     # STEP 2: ALL_TO_ALL_DISPATCH - Route tokens to expert devices
@@ -197,7 +203,8 @@
     # -> repeat to [1, dispatch_rows, tokens_per_device, num_experts]
     # -> reshape to [1, 1, total_tokens, num_experts] to match dispatch_metadata batch/seq dims
     remap_mask = ttnn.repeat(remap_topk_mask, ttnn.Shape((1, 1, tokens_per_device, 1)))
-    remap_mask = ttnn.reshape(remap_mask, (1, 1, total_tokens, config.num_experts))
+    # remap_mask = ttnn.reshape(remap_mask, (1, 1, total_tokens, config.num_experts))
+    remap_mask = row_major_reshape(remap_mask, (1, 1, total_tokens, config.num_experts))
     # moe_expert_token_remap returns:
     # - mapping: [D, tokens, 1, experts_per_device] - local expert activation weights
     # - sparsity: [D, 1, tokens/reduction_size, experts_per_device] - which blocks are active
@@ -222,15 +229,17 @@
     # The sparse matmul operates on blocks of tokens, with sparsity indicating
     # which (token_block, expert) pairs need computation.
     # Note: reshape returns view, but to_layout creates new tensor
-    post_dispatch = ttnn.reshape(dispatch_output, shape=(1, 1, total_tokens, config.hidden_size))
+    # post_dispatch = ttnn.reshape(dispatch_output, shape=(1, 1, total_tokens, config.hidden_size))
+    post_dispatch = row_major_reshape(dispatch_output, (1, 1, total_tokens, config.hidden_size))
     post_dispatch_rm = post_dispatch
     post_dispatch = ttnn.to_layout(post_dispatch, ttnn.TILE_LAYOUT)
     ttnn.deallocate(post_dispatch_rm)  # This deallocates dispatch_output via the view

     # Reshape to sparse block format for matmul
     # Note: reshape returns a view - don't deallocate post_dispatch separately
     num_sparse_blocks = total_tokens // config.sparsity_block_size
-    expert_input = ttnn.reshape(
+    # expert_input = ttnn.reshape(
+    expert_input = row_major_reshape(
         post_dispatch,
         shape=(1, num_sparse_blocks, config.sparsity_block_size, config.hidden_size),
     )
@@ -328,7 +337,8 @@
     ttnn.deallocate(expert_output_sparse)
     # Note: reshape returns a view, to_layout creates new tensor
     # With tokens on dim -2: [experts_per_device, 1, total_tokens, H]
-    expert_output = ttnn.reshape(
+    # expert_output = ttnn.reshape(
+    expert_output = row_major_reshape(
         expert_output,
         shape=(config.num_experts_per_device, 1, total_tokens, config.hidden_size),
     )
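The sparse-block reshape in this file is easiest to follow with concrete numbers. The sketch below uses illustrative sizes only (the real values come from the dispatch config and ThroughputExpertConfig), and it assumes total_tokens is tokens_per_device gathered across all dispatch devices, as the remap_mask reshape suggests.

# Illustrative sizes; not taken from the actual model config.
tokens_per_device = 32                  # B * S handled by one device
num_dispatch_devices = 4                # devices along the dispatch cluster axis
total_tokens = tokens_per_device * num_dispatch_devices   # 128 tokens after dispatch
sparsity_block_size = 32
hidden_size = 2880                      # placeholder hidden dim

num_sparse_blocks = total_tokens // sparsity_block_size   # 4

# post_dispatch: (1, 1, total_tokens, hidden_size)                        -> (1, 1, 128, 2880)
# expert_input:  (1, num_sparse_blocks, sparsity_block_size, hidden_size) -> (1, 4, 32, 2880)
print((1, num_sparse_blocks, sparsity_block_size, hidden_size))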

models/demos/gpt_oss/tt/topk.py

Lines changed: 3 additions & 1 deletion

@@ -11,6 +11,7 @@
 """

 import ttnn
+from models.demos.gpt_oss.tt.common import row_major_reshape
 from models.demos.gpt_oss.utils.general_utils import get_cache_file_name


@@ -146,7 +147,8 @@ def __call__(self, hidden_states, use_throughput_experts):
         # )
         mem_config = ttnn.DRAM_MEMORY_CONFIG

-        hidden_states = ttnn.reshape(hidden_states, (-1, self.hidden_dim))
+        # hidden_states = ttnn.reshape(hidden_states, (-1, self.hidden_dim))
+        hidden_states = row_major_reshape(hidden_states, (-1, self.hidden_dim))
         router_logits = ttnn.linear(
             hidden_states,
             self.weight,
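As a quick shape check for the router reshape above: flattening to (-1, hidden_dim) gives the linear layer one row per token. A torch-only sketch with illustrative sizes (the real hidden_dim and expert count come from the model config, and router_weight here is just a stand-in for self.weight):

import torch

tokens, hidden_dim, num_experts = 32, 2880, 8    # illustrative only
hidden_states = torch.randn(1, 1, tokens, hidden_dim)

# (-1, hidden_dim) collapses the leading dims so each row is one token.
flat = hidden_states.reshape(-1, hidden_dim)              # (32, 2880)
router_weight = torch.randn(hidden_dim, num_experts)      # stand-in for self.weight
router_logits = flat @ router_weight                      # (32, 8)
print(flat.shape, router_logits.shape)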
