models/demos/gpt_oss/tt/attention/decode.py: 9 additions & 2 deletions

@@ -160,10 +160,17 @@ def decode_forward(
     tt_sdpa_out.deallocate(True)
     tt_out = ttnn.add(tt_out, weights.o_proj_bias, memory_config=ttnn.L1_MEMORY_CONFIG)
     tt_out = ttnn.typecast(tt_out, ttnn.bfloat8_b)
+
+    # Calculate padded hidden size for tile-aligned CCL operations.
+    # o_proj weights may be padded so local_hidden becomes tile-aligned.
+    local_hidden = hidden_size // mesh_config.tp
+    padded_local_hidden = ((local_hidden + 31) // 32) * 32
+    padded_hidden = padded_local_hidden * mesh_config.tp if mesh_config.tp > 1 else hidden_size
+
     tt_out = ttnn.reshape(
         tt_out,
-        (1, 1, batch_size, hidden_size),
-        (1, 1, 32, hidden_size),
+        (1, 1, batch_size, padded_hidden),
+        (1, 1, 32, padded_hidden),
     )
     # tt_out = ttnn.unsqueeze(tt_out, 0)
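For intuition on the numbers involved, here is a minimal standalone sketch of the tile-alignment arithmetic above, assuming GPT-OSS's hidden_size of 2880 at TP=8 (the example cited in this PR's comments) and ttnn's 32-element tile edge:

    TILE = 32
    hidden_size, tp = 2880, 8

    local_hidden = hidden_size // tp                                   # 360, not tile-aligned
    padded_local_hidden = ((local_hidden + TILE - 1) // TILE) * TILE   # 384
    padded_hidden = padded_local_hidden * tp                           # 3072

    assert local_hidden % TILE != 0        # 360 % 32 == 8, which would force CCL to re-tile
    assert padded_local_hidden % TILE == 0
    print(local_hidden, padded_local_hidden, padded_hidden)            # 360 384 3072

With TP == 1 the code falls back to the original hidden_size, so the reshape matches the old behavior exactly.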
models/demos/gpt_oss/tt/attention/operations.py: 15 additions & 2 deletions

@@ -132,9 +132,22 @@ def apply_allreduce(tensor, mesh_config, ccl_manager, batch_size: int, seq_len:
         Tensor after allreduce (if TP > 1) or original tensor
     """
     if mesh_config.tp > 1:
-        # tensor = ttnn.unsqueeze(tensor, 0)
         tensor = mesh_config.allreduce(tensor, ccl_manager, pad_size=0, axis=mesh_config.tp_axis)
-        # tensor = ttnn.reshape(tensor, (batch_size, seq_len, hidden_size))
+
+        # Remove padding added in weights.py for tile-aligned CCL operations.
+        # If local_hidden was padded (e.g., 360 -> 384), we need to slice back to original hidden_size.
+        local_hidden = hidden_size // mesh_config.tp
+        padded_local_hidden = ((local_hidden + 31) // 32) * 32
+        if padded_local_hidden != local_hidden:
+            # Slice from padded_hidden back to hidden_size on the last dimension.
+            # Works for both decode [1, 1, batch, padded_hidden] and prefill [1, batch, seq_len, padded_hidden].
+            shape = tensor.shape
+            tensor = ttnn.slice(
+                tensor,
+                starts=[0, 0, 0, 0],
+                ends=[shape[0], shape[1], shape[2], hidden_size],
+                steps=[1, 1, 1, 1],
+            )
     return tensor


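To see what the slice removes, a torch-only sketch of the same last-dimension trim (a hypothetical stand-in for the ttnn.slice call above, using the same 2880/TP=8 numbers; the real code operates on a ttnn tensor):

    import torch

    hidden_size, tp = 2880, 8
    padded_hidden = ((hidden_size // tp + 31) // 32) * 32 * tp   # 3072

    tensor = torch.randn(1, 1, 32, padded_hidden)   # decode-shaped allreduce output
    tensor = tensor[..., :hidden_size]              # drop the 192 trailing padding columns
    assert tensor.shape == (1, 1, 32, hidden_size)

Trimming only the last dimension is what lets one code path serve both the decode and prefill layouts noted in the comment.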
models/demos/gpt_oss/tt/attention/weights.py: 20 additions & 2 deletions

@@ -132,16 +132,34 @@ def load_attention_weights(
     decode_sinks /= config.scaling

     # Output projection
+    # Pad o_proj output dimension for tile alignment in CCL operations.
+    # Without padding, local_hidden = hidden_size / TP may not be tile-aligned (e.g., 2880/8 = 360),
+    # causing CCL to do expensive Untilize->Pad->Tilize cycles internally.
+    hidden_size = config.hidden_size
+    local_hidden = hidden_size // mesh_config.tp
+    padded_local_hidden = ((local_hidden + 31) // 32) * 32  # Round up to tile boundary
+    o_proj_pad_size = padded_local_hidden - local_hidden
+
+    if o_proj_pad_size > 0 and mesh_config.tp > 1:
+        # Pad the output dimension of o_proj weight: [input_dim, hidden_size] -> [input_dim, padded_hidden]
+        # Each TP device's output goes from local_hidden to padded_local_hidden
+        padded_hidden = padded_local_hidden * mesh_config.tp
+        o_proj = torch.nn.functional.pad(o_proj, (0, padded_hidden - hidden_size), "constant", value=0.0)
+        # Pad bias similarly
+        o_proj_bias = torch.nn.functional.pad(o_proj_bias, (0, padded_hidden - hidden_size), "constant", value=0.0)
+
     if mesh_config.tp > 1:
         o_proj_bias = torch.cat([o_proj_bias] + [torch.zeros_like(o_proj_bias)] * (mesh_config.tp - 1), dim=-1)

+    # Use unique cache key when padding is applied
+    o_proj_cache_suffix = f"_padded{padded_local_hidden}" if o_proj_pad_size > 0 and mesh_config.tp > 1 else ""
     o_proj_tt = ttnn.as_tensor(
         o_proj,
         device=mesh_device,
         layout=ttnn.TILE_LAYOUT,
         dtype=weight_dtype,
         mesh_mapper=row_mesh_mapper,
-        cache_file_name=get_cache_file_name(tensor_cache_path, "o_proj"),
+        cache_file_name=get_cache_file_name(tensor_cache_path, f"o_proj{o_proj_cache_suffix}"),
         memory_config=ttnn.DRAM_MEMORY_CONFIG,
     )

@@ -151,7 +169,7 @@ def load_attention_weights(
         layout=ttnn.TILE_LAYOUT,
         dtype=bias_dtype,
         mesh_mapper=col_mesh_mapper,
-        cache_file_name=get_cache_file_name(tensor_cache_path, "o_proj_bias"),
+        cache_file_name=get_cache_file_name(tensor_cache_path, f"o_proj_bias{o_proj_cache_suffix}"),
         memory_config=ttnn.DRAM_MEMORY_CONFIG,
     )

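Finally, a minimal host-side sketch of the weight-padding step itself, under the same assumptions (hidden_size=2880, TP=8; input_dim here is an arbitrary placeholder, not the model's real dimension):

    import torch
    import torch.nn.functional as F

    hidden_size, tp, input_dim = 2880, 8, 64
    padded_local_hidden = ((hidden_size // tp + 31) // 32) * 32   # 384
    padded_hidden = padded_local_hidden * tp                      # 3072

    o_proj = torch.randn(input_dim, hidden_size)
    o_proj = F.pad(o_proj, (0, padded_hidden - hidden_size), "constant", value=0.0)   # zero-pad last dim
    assert o_proj.shape == (input_dim, padded_hidden)

Because the padding columns (and the matching bias entries) are zero, the extra lanes contribute nothing to the projection output, and apply_allreduce slices them away after the CCL, so the unpadded features are numerically unchanged.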