Fusing on by default + Multiply commute pattern rewrite (#3946)

mtopalovicTT · web-flow · commit 83be68784772 · 2025-06-30T15:10:58.000+03:00
Extends the multiply commute pattern to support commuting a multiply when the scale does not come directly from a block argument. Previously we only supported this pattern:

```
constant_argument          conv2d
        |                    |
        |                    |
        +-------multiply-----+
```

In mobilenet there are a bunch of layers which have this pattern:

```
constant_argument
        |
        |
        |
    transpose
        |
        |
        |
    transpose
        |
        |
        |
    broadcast              conv2d
        |                    |
        |                    |
        +-------multiply-----+
```

This PR adds a small extension to `Conv2dWithMultiply` which can match a scale coming directly from a block argument, or a scale coming from a broadcast where the subgraph which is the input into the broadcast is const eval. For example, the graph above can be commuted since the input into the graph is constant, but something like the graphs below can't:

```
constant_argument          input
        |                    |
        |                    |
        +--------add---------+
                  |
                  |
                  |
              broadcast              conv2d
                  |                    |
                  |                    |
                  +-------multiply-----+
```

```
constant_argument
        |
        |
        |
    transpose
        |
        |
        |
    transpose              conv2d
        |                    |
        |                    |
        +-------multiply-----+
```

To check if the subgraph is fusable we start from the `scale` argument in `isCommutable`, construct its [UD chain](https://en.wikipedia.org/wiki/Use-define_chain), and use it to check if the inputs into this subgraph are constants. When we determine that the subgraph is const eval, we commute the whole subgraph before conv2d and apply a reshape, like we did before, to align the channel dim with the weight. So the resulting graph after the commute would become:

```
constant_argument
        |
        |
        |
    transpose
        |
        |
        |
    transpose
        |
        |
        |
     reshape
        |
        |
        |
    broadcast     weight
        |           |
        |           |
        |           |
    multiply--------+
        |
        |
        |
     conv2d
```

Or in the no-broadcast case:

```
constant_argument
        |
        |
        |
     reshape      weight
        |           |
        |           |
        |           |
    multiply--------+
        |
        |
        |
     conv2d
```

In addition, this PR tags clamp scalar with the eltwise unary trait, which would enable TM to commute through it.
diff --git a/include/ttmlir/Dialect/StableHLO/Transforms/ShardyUtils.h b/include/ttmlir/Dialect/StableHLO/Transforms/ShardyUtils.h
@@ -7,7 +7,6 @@
 
 #include "ttmlir/Dialect/StableHLO/Transforms/ShardyCCLToStableHLOCCL.h"
 
-#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Dialect/Func/IR/FuncOps.h"
 #include "mlir/Dialect/Tensor/IR/Tensor.h"
 #include "mlir/IR/Builders.h"
diff --git a/include/ttmlir/Dialect/TTIR/IR/TTIROps.td b/include/ttmlir/Dialect/TTIR/IR/TTIROps.td
@@ -4075,7 +4075,7 @@ def TTIR_UnsqueezeOp : TTIR_NamedOp<"unsqueeze"> {
     let hasVerifier = 1;
 }
 
-def TTIR_ClampScalarOp : TTIR_NamedOp<"clamp_scalar"> {
+def TTIR_ClampScalarOp : TTIR_NamedOp<"clamp_scalar", [TTIR_ElementwiseUnary]> {
     let summary = "Scalar value clamping operation.";
     let description = [{
       The `clamp_scalar` operation constrains all elements of a tensor to be within a specified range.
diff --git a/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h b/include/ttmlir/Dialect/TTNN/Pipelines/TTNNPipelines.h
@@ -222,7 +222,7 @@ struct TTIRToTTNNBackendPipelineOptions
 
   Option<bool> enableFusing{*this, "enable-fusing-pass",
                             llvm::cl::desc("Enable fusing pass."),
-                            llvm::cl::init(false)};
+                            llvm::cl::init(true)};
 
   Option<ttcore::TTArgumentTypeMap, ttcore::ArgumentTypeMapParser>
       argumentTypeMap{
diff --git a/include/ttmlir/Utils.h b/include/ttmlir/Utils.h
@@ -486,6 +486,63 @@ OpType getOutermostLoopNest(mlir::ValueRange values) {
 
 } // namespace loop
 
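+// Given a starting mlir::Value, return the set of all values in its use-def
+// chain: the starting value itself plus, transitively, every operand of the
+// operations that define those values. Values without a defining operation
+// (block arguments) end the traversal. The chain is not topologically sorted,
+// so no particular order of values in the result should be relied upon. If
+// you want to topologically sort the chain use topologicalSort.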
+inline llvm::SetVector<mlir::Value> getUseDefChain(mlir::Value start) {
+  llvm::SetVector<mlir::Value> useDefChain;
+  llvm::SmallVector<mlir::Value> worklist{start};
+  llvm::SmallPtrSet<mlir::Value, 4> visited;
+
+  while (!worklist.empty()) {
+    mlir::Value value = worklist.pop_back_val();
+    useDefChain.insert(value);
+
+    mlir::Operation *defOp = value.getDefiningOp();
+    if (!defOp) {
+      continue;
+    }
+
+    for (mlir::OpOperand &operand : defOp->getOpOperands()) {
+      mlir::Value operandValue = operand.get();
+      if (visited.contains(operandValue)) {
+        continue;
+      }
+      visited.insert(operandValue);
+      worklist.push_back(operandValue);
+    }
+  }
+
+  return useDefChain;
+}
+
+// Given a list of mlir::Value, return the ones that are block arguments.
+// Values that are not block arguments are ignored.
+inline llvm::SetVector<mlir::BlockArgument>
+filterBlockArguments(llvm::ArrayRef<mlir::Value> values) {
+  llvm::SetVector<mlir::BlockArgument> blockArgs;
+  for (mlir::Value value : values) {
+    if (auto blockArg = llvm::dyn_cast<mlir::BlockArgument>(value)) {
+      blockArgs.insert(blockArg);
+    }
+  }
+
+  return blockArgs;
+}
+
+// Given a list of mlir::Value, return the operations that define them.
+// Values without a defining operation (e.g. block arguments) are ignored.
+inline llvm::SetVector<mlir::Operation *>
+filterOperations(llvm::ArrayRef<mlir::Value> values) {
+  llvm::SetVector<mlir::Operation *> ops;
+  for (mlir::Value value : values) {
+    if (auto *op = value.getDefiningOp()) {
+      ops.insert(op);
+    }
+  }
+
+  return ops;
+}
 } // namespace ttmlir::utils
 
 #endif // TTMLIR_UTILS_H
diff --git a/lib/Dialect/TTIR/Transforms/ExplicateTMs.cpp b/lib/Dialect/TTIR/Transforms/ExplicateTMs.cpp
@@ -102,7 +102,8 @@ class ExplicateBroadcastsRewriter
       llvm::ArrayRef<int64_t> operandShape = operandType.getShape();
 
       llvm::SmallVector<int64_t> broadcastDimensions =
-          getBroadcastDimensions(operandShape, broadcastedShape);
+          ttmlir::utils::getBroadcastDimensions<int64_t>(operandShape,
+                                                         broadcastedShape);
       if (llvm::all_of(broadcastDimensions, [](int64_t i) { return i == 1; })) {
         continue;
       }
@@ -157,19 +158,6 @@ class ExplicateBroadcastsRewriter
 
     return broadcastedShape;
   }
-
-  llvm::SmallVector<int64_t>
-  getBroadcastDimensions(llvm::ArrayRef<int64_t> operandShape,
-                         llvm::ArrayRef<int64_t> targetShape) const {
-    llvm::SmallVector<int64_t> broadcastDimensions(operandShape.size(), 1);
-    for (size_t dim = 0; dim < operandShape.size(); dim++) {
-      if (operandShape[dim] < targetShape[dim]) {
-        broadcastDimensions[dim] = targetShape[dim];
-      }
-    }
-
-    return broadcastDimensions;
-  }
 };
 } // namespace
 
diff --git a/lib/Dialect/TTIR/Transforms/TTIRFusing.cpp b/lib/Dialect/TTIR/Transforms/TTIRFusing.cpp
@@ -6,7 +6,7 @@
 #include "ttmlir/Dialect/TTIR/Utils/Utils.h"
 #include "ttmlir/Utils.h"
 
-#include "mlir/IR/Value.h"
+#include "mlir/Analysis/TopologicalSortUtils.h"
 #include "mlir/Transforms/GreedyPatternRewriteDriver.h"
 
 namespace mlir::tt::ttir {
@@ -251,16 +251,30 @@ class Conv2dWithMultiply : public mlir::OpRewritePattern<MultiplyOp> {
     Conv2dOp conv2dOp = components->first;
     Value scaleValue = components->second;
 
-    // Insert before conv2d op.
-    rewriter.setInsertionPoint(conv2dOp);
-
     // Reshape scale to match weight dimensions and pre-multiply weights.
     Value reshapedScale =
-        createReshapedScale(rewriter, conv2dOp.getLoc(), scaleValue);
+        createReshapedScale(rewriter, conv2dOp.getLoc(), scaleValue,
+                            conv2dOp.getWeight().getType());
+
+    // Get UD chain starting from the reshaped scale. This chain will be
+    // moved before the conv2dOp to ensure that weight scale can be
+    // const-evaled.
+    SetVector<Value> udChain = ttmlir::utils::getUseDefChain(reshapedScale);
+    SetVector<Operation *> udChainOps =
+        ttmlir::utils::filterOperations(udChain.getArrayRef());
+    SetVector<Operation *> udChainSorted = topologicalSort(udChainOps);
+    for (auto *op : udChainSorted) {
+      op->moveBefore(conv2dOp);
+    }
+
+    rewriter.setInsertionPoint(conv2dOp);
+
+    // Create scaled weights by multiplying the original weights with the
+    // reshaped scale.
     Value scaledWeights = createScaledWeights(
         rewriter, conv2dOp.getLoc(), conv2dOp.getWeight(), reshapedScale);
 
-    // Update conv2d to use scaled weights and replace multiply operation
+    // Update conv2d to use scaled weights and replace multiply operation.
     rewriter.modifyOpInPlace(
         conv2dOp, [&]() { conv2dOp.getWeightMutable().assign(scaledWeights); });
     rewriter.replaceAllOpUsesWith(multiplyOp, conv2dOp);
@@ -295,22 +309,50 @@ class Conv2dWithMultiply : public mlir::OpRewritePattern<MultiplyOp> {
     mlir::func::FuncOp funcOp = conv2dOp->getParentOfType<mlir::func::FuncOp>();
     llvm::SmallPtrSet<BlockArgument, 4> constParams =
         mlir::tt::ttcore::getConstsAndParams(funcOp);
-    auto isConstant = [&constParams, conv2dOp](mlir::Value value) {
+    auto isConstant = [&constParams](mlir::Value value) {
       if (auto blockArg = mlir::dyn_cast<BlockArgument>(value)) {
         return constParams.contains(blockArg);
       }
 
-      Operation *op = value.getDefiningOp();
-      return op->hasTrait<mlir::tt::ttcore::Trait::TTCoreCreationOpTrait>() &&
-             op->isBeforeInBlock(conv2dOp);
+      Operation *defOp = value.getDefiningOp();
+      return defOp->hasTrait<mlir::tt::ttcore::Trait::TTCoreCreationOpTrait>();
     };
 
-    // Both scale and weight must be constant.
-    if (!isConstant(scale) || !isConstant(conv2dOp.getWeight())) {
+    // If weight is not constant, we cannot commute.
+    if (!isConstant(conv2dOp.getWeight())) {
+      return false;
+    }
+
+    RankedTensorType scaleType = scale.getType();
+    // If scale is coming from a broadcast then we want to use the type of the
+    // broadcast's input to check the shape.
+    if (auto bcastOp =
+            mlir::dyn_cast_if_present<BroadcastOp>(scale.getDefiningOp())) {
+      scaleType = bcastOp.getInput().getType();
+    }
+
+    // Check if scale shape is compatible with the conv2d output channels.
+    if (!hasValidScaleShape(conv2dOp, scaleType)) {
+      return false;
+    }
+
+    // Now we want to check if operations which produce scale are
+    // const-evalable. We do this by getting UD chain of the scale and then
+    // checking if all inputs into this chain are constants.
+    SetVector<Value> useDefChain = ttmlir::utils::getUseDefChain(scale);
+    SetVector<BlockArgument> useDefChainBlockArgs =
+        ttmlir::utils::filterBlockArguments(useDefChain.getArrayRef());
+    if (!all_of(useDefChainBlockArgs, isConstant)) {
       return false;
     }
 
-    return hasValidScaleShape(conv2dOp, scale.getType());
+    // Since we want to move the scale chain before conv2dOp we want to make
+    // sure that the scale chain does not contain conv2dOp.
+    if (useDefChain.contains(conv2dOp)) {
+      return false;
+    }
+
+    return true;
   }
 
   // Scale must have rank 4 and shape (1, 1, 1, out_channels).
@@ -320,11 +362,31 @@ class Conv2dWithMultiply : public mlir::OpRewritePattern<MultiplyOp> {
            scaleType.getDimSize(3) == convOp.getOutputChannelSize();
   }
 
+  // There are two cases we want to handle here:
+  // 1. Input scale is a constant tensor that only needs reshaping
+  // 2. Input scale is a broadcast operation that needs reshaping
+  //
+  // In case of 1 we just add a reshape operation to the scale tensor such that
+  // it has shape (out_channels, 1, 1, 1).
+  //
+  // In case of 2 we need to add a reshape operation to the input of the bcast
+  // and then we create a new broadcast operation with the new reshaped scale
+  // which broadcasts the reshaped scale to the shape of the weight tensor.
   static Value createReshapedScale(mlir::PatternRewriter &rewriter,
-                                   Location loc, Value scaleValue) {
+                                   Location loc, Value scaleValue,
+                                   RankedTensorType weightType) {
+    // If scaleValue is broadcast operation we want to reshape its input.
+    // Otherwise we reshape the scaleValue itself.
+    Value reshapeInput = scaleValue;
+    if (auto bcastOp = mlir::dyn_cast_if_present<BroadcastOp>(
+            scaleValue.getDefiningOp())) {
+      rewriter.setInsertionPoint(bcastOp);
+      reshapeInput = bcastOp.getInput();
+    }
+
     // Get the scale's type.
     RankedTensorType scaleType =
-        mlir::cast<RankedTensorType>(scaleValue.getType());
+        mlir::cast<RankedTensorType>(reshapeInput.getType());
 
     // Create a new shape (out_channels, 1, 1, 1) from (1, 1, 1, out_channels).
     llvm::SmallVector<int64_t> newShape(scaleType.getShape());
@@ -335,11 +397,25 @@ class Conv2dWithMultiply : public mlir::OpRewritePattern<MultiplyOp> {
     // Convert to int32 for the reshape operation.
     llvm::SmallVector<int32_t> newShapeI32(newShape.begin(), newShape.end());
 
-    // Create and return the reshape operation.
-    return ttir::utils::createDPSOp<ttir::ReshapeOp>(
+    // Create the reshape operation.
+    auto reshapedScale = ttir::utils::createDPSOp<ttir::ReshapeOp>(
         rewriter, ttmlir::utils::appendLocationSuffix(loc, "_reshape"),
         newShape, scaleType.getElementType(), scaleType.getEncoding(),
-        scaleValue, rewriter.getI32ArrayAttr(newShapeI32));
+        reshapeInput, rewriter.getI32ArrayAttr(newShapeI32));
+
+    // If scale value is not a broadcast operation we can return reshapedScale.
+    if (!isa_and_present<ttir::BroadcastOp>(scaleValue.getDefiningOp())) {
+      return reshapedScale;
+    }
+
+    // Otherwise we need to create a new broadcast operation that will take
+    // reshaped scale and broadcast it to the shape of the weight tensor.
+    SmallVector<int64_t> broadcastDims =
+        ttmlir::utils::getBroadcastDimensions<int64_t>(
+            reshapedScale.getType().getShape(), weightType.getShape());
+    return ttir::utils::createDPSOp<ttir::BroadcastOp>(
+        rewriter, scaleValue.getLoc(), weightType, reshapedScale,
+        broadcastDims);
   }
 
   /// Create pre-multiplied weights.
diff --git a/test/ttmlir/Dialect/TTIR/fusing/conv2d_multiply_commute.mlir b/test/ttmlir/Dialect/TTIR/fusing/conv2d_multiply_commute.mlir
@@ -134,9 +134,11 @@ module {
   }
 
   // Check that we can commute even though %scale is defined after %conv in
   // block, since the ops producing %scale are moved before %conv.
-  // CHECK-LABEL: func.func @conv2d_creation_op_non_commutable
-  func.func @conv2d_creation_op_non_commutable(%input: tensor<1x32x32x64xbf16>) -> tensor<1x30x30x64xbf16> {
-    // CHECK-NOT: "ttir.reshape"
+  // CHECK-LABEL: func.func @conv2d_creation_op_commutable
+  func.func @conv2d_creation_op_commutable(%input: tensor<1x32x32x64xbf16>) -> tensor<1x30x30x64xbf16> {
+    // CHECK: "ttir.ones"
+    // CHECK: "ttir.reshape"
+    // CHECK: "ttir.multiply"
     // CHECK: "ttir.conv2d"
     %0 = ttir.empty() : tensor<1x30x30x64xbf16>
     %weight = "ttir.zeros"() <{shape = array<i32: 64, 64, 3, 3>}> : () -> tensor<64x64x3x3xbf16>
@@ -147,11 +149,72 @@ module {
               dilation = 1: i32,
               groups = 1: i32
             }> : (tensor<1x32x32x64xbf16>, tensor<64x64x3x3xbf16>, tensor<1x30x30x64xbf16>) -> tensor<1x30x30x64xbf16>
-    // CHECK: "ttir.multiply"
     %scale = "ttir.ones"() <{shape = array<i32: 1, 1, 1, 64>}> : () -> tensor<1x1x1x64xbf16>
     %1 = ttir.empty() : tensor<1x30x30x64xbf16>
     %2 = "ttir.multiply"(%conv, %scale, %1) : (tensor<1x30x30x64xbf16>, tensor<1x1x1x64xbf16>, tensor<1x30x30x64xbf16>) -> tensor<1x30x30x64xbf16>
 
     return %2: tensor<1x30x30x64xbf16>
   }
+
+  // Check that we can commute const-eval subgraph which generates scale for conv2d output.
+  // CHECK-LABEL: func.func @conv2d_subgraph_commute
+  func.func @conv2d_subgraph_commute(%arg0: tensor<1x3x224x224xbf16> {ttcore.argument_type = #ttcore.argument_type<input>}, %arg1: tensor<1x32x1x1xbf16> {ttcore.argument_type = #ttcore.argument_type<constant>}, %arg2: tensor<1x32x1x1xbf16> {ttcore.argument_type = #ttcore.argument_type<constant>}, %arg3: tensor<32x3x3x3xbf16> {ttcore.argument_type = #ttcore.argument_type<parameter>, ttir.conv2d_weight}, %arg4: tensor<32x1x3x3xbf16> {ttcore.argument_type = #ttcore.argument_type<parameter>, ttir.conv2d_weight}) -> tensor<1x112x112x32xbf16> {
+    // Ignore first reshape which is for conv input.
+    // CHECK: "ttir.reshape"
+    // CHECK: %[[RESHAPE:.*]] = "ttir.reshape"
+    // CHECK: %[[BCAST:.*]] = "ttir.broadcast"
+    // CHECK-SAME: (%[[RESHAPE]]
+    // CHECK: %[[MUL:.*]] = "ttir.multiply"
+    // CHECK-SAME: (%arg3, %[[BCAST]]
+    // CHECK: "ttir.conv2d"
+    // CHECK-SAME: ([[X:.*]], %[[MUL]]
+    %0 = ttir.empty() : tensor<1x224x3x224xbf16>
+    %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 1 : si32, dim1 = 2 : si32}> : (tensor<1x3x224x224xbf16>, tensor<1x224x3x224xbf16>) -> tensor<1x224x3x224xbf16>
+    %2 = ttir.empty() : tensor<1x224x224x3xbf16>
+    %3 = "ttir.transpose"(%1, %2) <{dim0 = 2 : si32, dim1 = 3 : si32}> : (tensor<1x224x3x224xbf16>, tensor<1x224x224x3xbf16>) -> tensor<1x224x224x3xbf16>
+    %4 = ttir.empty() : tensor<1x1x50176x3xbf16>
+    %5 = "ttir.reshape"(%3, %4) <{shape = [1 : i32, 1 : i32, 50176 : i32, 3 : i32]}> : (tensor<1x224x224x3xbf16>, tensor<1x1x50176x3xbf16>) -> tensor<1x1x50176x3xbf16>
+    %6 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %7 = "ttir.conv2d"(%5, %arg3, %6) <{dilation = array<i32: 1, 1>, flattened_compat_info = #ttir<flattened_compat batch_size = 1, input_height = 224, input_width = 224>, groups = 1 : i32, padding = array<i32: 1, 1, 1, 1>, stride = array<i32: 2, 2>}> : (tensor<1x1x50176x3xbf16>, tensor<32x3x3x3xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %8 = ttir.empty() : tensor<1x1x32x1xbf16>
+    %9 = "ttir.transpose"(%arg1, %8) <{dim0 = 1 : si32, dim1 = 2 : si32}> : (tensor<1x32x1x1xbf16>, tensor<1x1x32x1xbf16>) -> tensor<1x1x32x1xbf16>
+    %10 = ttir.empty() : tensor<1x1x1x32xbf16>
+    %11 = "ttir.transpose"(%9, %10) <{dim0 = 2 : si32, dim1 = 3 : si32}> : (tensor<1x1x32x1xbf16>, tensor<1x1x1x32xbf16>) -> tensor<1x1x1x32xbf16>
+    %12 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %13 = "ttir.broadcast"(%11, %12) <{broadcast_dimensions = array<i64: 1, 1, 12544, 1>}> : (tensor<1x1x1x32xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %14 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %15 = "ttir.multiply"(%7, %13, %14) : (tensor<1x1x12544x32xbf16>, tensor<1x1x12544x32xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %16 = ttir.empty() : tensor<1x112x112x32xbf16>
+    %17 = "ttir.reshape"(%15, %16) <{shape = [1 : i32, 112 : i32, 112 : i32, 32 : i32]}> : (tensor<1x1x12544x32xbf16>, tensor<1x112x112x32xbf16>) -> tensor<1x112x112x32xbf16>
+    return %17 : tensor<1x112x112x32xbf16>
+  }
+
+  // Check that we can't commute const-eval since arg1 is not constant.
+  // CHECK-LABEL: func.func @conv2d_subgraph_not_commuteable
+  func.func @conv2d_subgraph_not_commuteable(%arg0: tensor<1x3x224x224xbf16> {ttcore.argument_type = #ttcore.argument_type<input>}, %arg1: tensor<1x32x1x1xbf16> {ttcore.argument_type = #ttcore.argument_type<input>}, %arg2: tensor<1x32x1x1xbf16> {ttcore.argument_type = #ttcore.argument_type<constant>}, %arg3: tensor<32x3x3x3xbf16> {ttcore.argument_type = #ttcore.argument_type<parameter>, ttir.conv2d_weight}, %arg4: tensor<32x1x3x3xbf16> {ttcore.argument_type = #ttcore.argument_type<parameter>, ttir.conv2d_weight}) -> tensor<1x112x112x32xbf16> {
+    // Ignore first reshape which is for conv input.
+    // CHECK: "ttir.reshape"
+    // CHECK: "ttir.conv2d"
+    // CHECK: "ttir.broadcast"
+    // CHECK: "ttir.multiply"
+    %0 = ttir.empty() : tensor<1x224x3x224xbf16>
+    %1 = "ttir.transpose"(%arg0, %0) <{dim0 = 1 : si32, dim1 = 2 : si32}> : (tensor<1x3x224x224xbf16>, tensor<1x224x3x224xbf16>) -> tensor<1x224x3x224xbf16>
+    %2 = ttir.empty() : tensor<1x224x224x3xbf16>
+    %3 = "ttir.transpose"(%1, %2) <{dim0 = 2 : si32, dim1 = 3 : si32}> : (tensor<1x224x3x224xbf16>, tensor<1x224x224x3xbf16>) -> tensor<1x224x224x3xbf16>
+    %4 = ttir.empty() : tensor<1x1x50176x3xbf16>
+    %5 = "ttir.reshape"(%3, %4) <{shape = [1 : i32, 1 : i32, 50176 : i32, 3 : i32]}> : (tensor<1x224x224x3xbf16>, tensor<1x1x50176x3xbf16>) -> tensor<1x1x50176x3xbf16>
+    %6 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %7 = "ttir.conv2d"(%5, %arg3, %6) <{dilation = array<i32: 1, 1>, flattened_compat_info = #ttir<flattened_compat batch_size = 1, input_height = 224, input_width = 224>, groups = 1 : i32, padding = array<i32: 1, 1, 1, 1>, stride = array<i32: 2, 2>}> : (tensor<1x1x50176x3xbf16>, tensor<32x3x3x3xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %8 = ttir.empty() : tensor<1x1x32x1xbf16>
+    %9 = "ttir.transpose"(%arg1, %8) <{dim0 = 1 : si32, dim1 = 2 : si32}> : (tensor<1x32x1x1xbf16>, tensor<1x1x32x1xbf16>) -> tensor<1x1x32x1xbf16>
+    %10 = ttir.empty() : tensor<1x1x1x32xbf16>
+    %11 = "ttir.transpose"(%9, %10) <{dim0 = 2 : si32, dim1 = 3 : si32}> : (tensor<1x1x32x1xbf16>, tensor<1x1x1x32xbf16>) -> tensor<1x1x1x32xbf16>
+    %12 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %13 = "ttir.broadcast"(%11, %12) <{broadcast_dimensions = array<i64: 1, 1, 12544, 1>}> : (tensor<1x1x1x32xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %14 = ttir.empty() : tensor<1x1x12544x32xbf16>
+    %15 = "ttir.multiply"(%7, %13, %14) : (tensor<1x1x12544x32xbf16>, tensor<1x1x12544x32xbf16>, tensor<1x1x12544x32xbf16>) -> tensor<1x1x12544x32xbf16>
+    %16 = ttir.empty() : tensor<1x112x112x32xbf16>
+    %17 = "ttir.reshape"(%15, %16) <{shape = [1 : i32, 112 : i32, 112 : i32, 32 : i32]}> : (tensor<1x1x12544x32xbf16>, tensor<1x112x112x32xbf16>) -> tensor<1x112x112x32xbf16>
+    return %17 : tensor<1x112x112x32xbf16>
+  }
 }
diff --git a/test/ttmlir/Dialect/TTNN/fusing/resnet_pattern_fusing.mlir b/test/ttmlir/Dialect/TTNN/fusing/resnet_pattern_fusing.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-fusing-pass=true" %s | FileCheck %s
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s | FileCheck %s
 
 // This is a common pattern throughout Resnet. We have conv2d with constant weight, followed by multiply with constant input. This will be commuted through conv2d.
 // Then we fuse add into conv2d with bias and lastly we fuse conv2d and relu into conv2d with activation.
diff --git a/test/ttmlir/Silicon/TTNN/n150/fusing/resnet_pattern_fusing.mlir b/test/ttmlir/Silicon/TTNN/n150/fusing/resnet_pattern_fusing.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-fusing-pass=true" %s > %t.mlir
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
 
diff --git a/test/ttmlir/Silicon/TTNN/n150/fusing/softmax_fusing.mlir b/test/ttmlir/Silicon/TTNN/n150/fusing/softmax_fusing.mlir
@@ -1,4 +1,4 @@
-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path% enable-fusing-pass=true" %s > %t.mlir
+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="system-desc-path=%system_desc_path%" %s > %t.mlir
 // RUN: FileCheck %s --input-file=%t.mlir
 // RUN: ttmlir-translate --ttnn-to-flatbuffer %t.mlir > %t.ttnn
 

Original file line number	Diff line number	Diff line change
`@@ -4075,7 +4075,7 @@ def TTIR_UnsqueezeOp : TTIR_NamedOp<"unsqueeze"> {`
`4075`	`4075`	`let hasVerifier = 1;`
`4076`	`4076`	`}`
`4077`	`4077`
`4078`		`-def TTIR_ClampScalarOp : TTIR_NamedOp<"clamp_scalar"> {`
	`4078`	`+def TTIR_ClampScalarOp : TTIR_NamedOp<"clamp_scalar", [TTIR_ElementwiseUnary]> {`
`4079`	`4079`	`let summary = "Scalar value clamping operation.";`
`4080`	`4080`	`let description = [{`
`4081`	`4081`	The `clamp_scalar` operation constrains all elements of a tensor to be within a specified range.
Original file line number	Diff line number	Diff line change
`@@ -1,4 +1,4 @@`
`1`		`-// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline="enable-fusing-pass=true" %s \| FileCheck %s`
	`1`	`+// RUN: ttmlir-opt --ttir-to-ttnn-backend-pipeline %s \| FileCheck %s`
`2`	`2`
`3`	`3`	`// This is common pattern throught Resnet. We have conv2d with constant weight, followed by multiply with constant input. This will be commuted through conv2d.`
`4`	`4`	`// Then we fuse add into conv2d with bias and lastly we fuse conv2d and relu into conv2d with activation.`