27 changes: 27 additions & 0 deletions include/ttlang/Dialect/TTL/IR/TTLOps.td
@@ -73,6 +73,33 @@ def TTL_AttachCBOp : TTL_Op<"attach_cb", []> {
let hasVerifier = 1;
}

def TTL_TensorStoreOp : TTL_Op<"tensor_store", []> {
let summary = "Store a tensor to a circular buffer";
let description = [{
Stores a tensor value to a circular buffer. Unlike `attach_cb` which is
purely metadata (associating a tensor with a CB), `tensor_store` represents
actual data movement: the tensor's tiles will be copied through DST
registers to the output CB.

This op is used when explicitly storing data from one CB to another,
such as in passthrough kernels where input is directly copied to output
without compute operations.

Example:
```mlir
%result = ttl.tensor_store %input, %out_cb
: (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
-> tensor<1x1x!ttcore.tile<32x32, bf16>>
```
}];
let arguments = (ins
AnyRankedTensor:$tensor,
TTL_CircularBuffer:$cb
);
let results = (outs AnyRankedTensor:$result);
let assemblyFormat = "$tensor `,` $cb attr-dict `:` functional-type(operands, results)";
}

def TTL_TensorSliceOp : TTL_Op<"tensor_slice", [Pure]> {
let summary = "Create a view into a tensor at specific tile indices";
let description = [{
84 changes: 76 additions & 8 deletions lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp
@@ -36,17 +36,20 @@ static Value buildInitTensor(OpBuilder &b, Location loc, RankedTensorType type,
dynDims);
}

/// Find the CB that this operation's result will be attached to.
/// Looks for an attach_cb op that uses this operation's result.
/// Find the CB that this operation's result will be stored to.
/// Looks for attach_cb or tensor_store ops that use this operation's result.
static Value findOutputCB(Operation *op) {
if (op->getNumResults() == 0) {
return nullptr;
}
Value result = op->getResult(0);
for (OpOperand &use : result.getUses()) {
Value opResult = op->getResult(0);
for (OpOperand &use : opResult.getUses()) {
if (auto attachOp = dyn_cast<AttachCBOp>(use.getOwner())) {
return attachOp.getCb();
}
if (auto storeOp = dyn_cast<TensorStoreOp>(use.getOwner())) {
return storeOp.getCb();
}
}
return nullptr;
}
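
For orientation, a hedged sketch of the IR shape this helper scans for: `ttl.tensor_store` follows the assembly format defined in TTLOps.td above, while `ttl.add` is written in MLIR's generic form since its custom syntax is not part of this diff, and the shapes are illustrative:

```mlir
// findOutputCB on the defining op of %sum returns %out_cb, because %sum
// is consumed by a ttl.tensor_store (an attach_cb user would match too).
%sum = "ttl.add"(%lhs, %rhs)
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>,
       tensor<1x1x!ttcore.tile<32x32, bf16>>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
%stored = ttl.tensor_store %sum, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```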
@@ -259,8 +262,8 @@ static LogicalResult buildBinaryCompute(Operation *op,
return failure();
}

// Find the output CB. First check if there's an attach_cb that uses this
// result, and use that CB. Otherwise, find an unused bind_cb.
// Find the output CB. First check if there's an attach_cb or tensor_store
// that uses this result. Otherwise, find an unused bind_cb.
Value outCb = findOutputCB(op);
if (!outCb) {
auto unusedCBs = findUnusedBindCBs(op);
@@ -344,8 +347,8 @@ static LogicalResult buildUnaryCompute(Operation *op, PatternRewriter &rewriter,
return failure();
}

// Find the output CB. First check if there's an attach_cb that uses this
// result, and use that CB. Otherwise, find an unused bind_cb.
// Find the output CB. First check if there's an attach_cb or tensor_store
// that uses this result. Otherwise, find an unused bind_cb.
Value outCb = findOutputCB(op);
if (!outCb) {
auto unusedCBs = findUnusedBindCBs(op);
@@ -579,6 +582,70 @@ struct LowerBcastToCompute : OpRewritePattern<BcastOp> {
using Lower##TTL_OP = LowerUnaryToCompute<TTL_OP##Op, TILE_OP>;
#include "ttlang/Dialect/TTL/TTLElementwiseOps.def"

//===----------------------------------------------------------------------===//
// TensorStore Lowering (passthrough case only)
//===----------------------------------------------------------------------===//

/// Pattern for tensor_store with CB-attached input (passthrough case).
/// Creates a ComputeOp that copies input tiles to output CB.
/// For elementwise ops, tensor_store's input is from ComputeOp (not
/// CB-attached), so this pattern won't match. Those tensor_stores are erased in
/// TTKernel lowering.
struct LowerTensorStoreToCompute : OpRewritePattern<TensorStoreOp> {
using OpRewritePattern<TensorStoreOp>::OpRewritePattern;

LogicalResult matchAndRewrite(TensorStoreOp op,
PatternRewriter &rewriter) const override {
Value input = op.getTensor();
Value outputCb = op.getCb();

// Only handle passthrough case where input is CB-attached.
// Elementwise case: input is from ComputeOp, not CB-attached - skip.
if (!getAttachedCB(input)) {
return failure();
}

auto inputType = getTensorType(input);
if (!inputType) {
return failure();
}

Location loc = op.getLoc();
MLIRContext *ctx = rewriter.getContext();

// Build identity maps for input and output
AffineMap identityMap =
AffineMap::getMultiDimIdentityMap(inputType.getRank(), ctx);
SmallVector<Attribute> maps = {AffineMapAttr::get(identityMap),
AffineMapAttr::get(identityMap)};
SmallVector<Attribute> iterTypes(inputType.getRank(),
rewriter.getStringAttr("parallel"));

// Create output init tensor attached to output CB
Value init = buildInitTensor(rewriter, loc, inputType, input);
Value initAttached =
rewriter.create<AttachCBOp>(loc, init.getType(), init, outputCb);

// Create compute op with passthrough body
auto computeOp = rewriter.create<ComputeOp>(
loc, TypeRange{inputType}, ValueRange{input}, ValueRange{initAttached},
rewriter.getArrayAttr(maps), rewriter.getArrayAttr(iterTypes));

// Build the body: just yield the input tile (passthrough)
Block *body = rewriter.createBlock(&computeOp.getBody());
Type scalarType = inputType.getElementType();
Type tileType = ttcore::TileType::get(scalarType);
body->addArgument(tileType, loc); // Input tile
body->addArgument(tileType, loc); // Output tile (unused)

rewriter.setInsertionPointToEnd(body);
rewriter.create<YieldOp>(loc, body->getArgument(0));

rewriter.replaceOp(op, computeOp.getResult(0));
return success();
}
};
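
Sketched as IR, the rewrite replaces a CB-attached passthrough store with an identity compute; `ttl.compute`, `ttl.attach_cb`, and `ttl.yield` are shown in MLIR's generic form (their custom assembly is not part of this diff), `tensor.empty` stands in for whatever buildInitTensor produces, and the shapes are illustrative:

```mlir
// Before: %in is CB-attached, so the pattern matches.
%r = ttl.tensor_store %in, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>

// After: an identity-mapped compute whose body yields the input tile as-is.
%init = tensor.empty() : tensor<1x1x!ttcore.tile<32x32, bf16>>
%init_cb = "ttl.attach_cb"(%init, %out_cb)
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
%r = "ttl.compute"(%in, %init_cb) ({
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  "ttl.yield"(%in_tile) : (!ttcore.tile<32x32, bf16>) -> ()
}) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                     affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]}
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>,
       tensor<1x1x!ttcore.tile<32x32, bf16>>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```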

//===----------------------------------------------------------------------===//
// Pass Implementations
//===----------------------------------------------------------------------===//
@@ -621,6 +688,7 @@ void populateTTLToComputePatterns(RewritePatternSet &patterns) {
#include "ttlang/Dialect/TTL/TTLElementwiseOps.def"

patterns.add<LowerBcastToCompute>(ctx);
patterns.add<LowerTensorStoreToCompute>(ctx);
}

} // namespace mlir::tt::ttl
50 changes: 42 additions & 8 deletions lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
@@ -360,7 +360,7 @@ struct StoreLowering : OpConversionPattern<StoreOp> {
utils::computeCBTileIndexFromLoops(op, rewriter, /*cbShapeRank=*/2);

// Determine DST index based on the source operation type:
// - DST-to-DST ops (binary ops): have dst_idx attribute
// - DST-to-DST ops (binary ops, copy_tile): have dst_idx attribute
// - CB-reading ops (bcast, reduce): no dst_idx attribute, use loop index
Value dstIndex;
auto tileValue = adaptor.getTile();
@@ -369,10 +369,15 @@
defOp->getAttrOfType<IntegerAttr>(kDstIdxAttrName)) {
dstIndex =
rewriter.create<arith::ConstantIndexOp>(loc, dstIdxAttr.getInt());
} else if (auto copyTile = dyn_cast<CopyTileOp>(defOp)) {
// Fallback: get dst_index directly from copy_tile operand
dstIndex = copyTile.getDstIndex();
} else {
return op.emitError("ttl.store source op lacks dst_idx attribute: ")
<< defOp->getName();
}
}

if (!dstIndex) {
} else {
// Block argument (e.g., from bcast/reduce) - use CB tile index
dstIndex = cbTileIndex;
}

@@ -879,6 +884,34 @@ struct CoreYLowering : OpConversionPattern<CoreYOp> {
}
};

/// Lowering for tensor_store: handles cleanup after elementwise lowering.
/// For elementwise ops, the ComputeOp already writes to the output CB, so
/// tensor_store becomes a no-op. For passthrough (CB-attached input), we
/// would need to emit copy_tile + pack_tile, but that case should be handled
/// by LowerTensorStoreToCompute creating a passthrough ComputeOp.
struct TensorStoreLowering : OpConversionPattern<TensorStoreOp> {
using OpConversionPattern::OpConversionPattern;

LogicalResult
matchAndRewrite(TensorStoreOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value input = op.getTensor();

// If input is CB-attached, this is a passthrough case that should have
// been handled by LowerTensorStoreToCompute. Emit error.
if (getAttachedCB(input)) {
return op.emitError(
"passthrough tensor_store should be lowered to ComputeOp first; "
"ensure convert-ttl-to-compute runs before this pass");
}
Comment on lines +902 to +906 (Contributor): negative test for this error?
// For elementwise case: the ComputeOp already wrote to the output CB.
// tensor_store is now a no-op - just erase it.
rewriter.eraseOp(op);
return success();
}
};
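
A hedged sketch of the negative test the inline comment asks for, using MLIR's standard `-verify-diagnostics` mechanism; the tool name `ttlang-opt`, the pass-flag spelling, and the surrounding func boilerplate are assumptions, and only the diagnostic text is taken from the source:

```mlir
// RUN: ttlang-opt --convert-ttl-to-ttkernel -verify-diagnostics %s
// Input deliberately skips convert-ttl-to-compute, so the tensor_store
// still has a CB-attached operand when this pass runs.
func.func @passthrough_not_lowered(
    %t: tensor<1x1x!ttcore.tile<32x32, bf16>>,
    %in_cb: !ttl.cb<[1, 1], bf16, 2>, %out_cb: !ttl.cb<[1, 1], bf16, 2>) {
  %attached = "ttl.attach_cb"(%t, %in_cb)
      : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
      -> tensor<1x1x!ttcore.tile<32x32, bf16>>
  // expected-error @below {{passthrough tensor_store should be lowered to ComputeOp first; ensure convert-ttl-to-compute runs before this pass}}
  %r = ttl.tensor_store %attached, %out_cb
      : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
      -> tensor<1x1x!ttcore.tile<32x32, bf16>>
  return
}
```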

struct FuncKernelFinalize : OpRewritePattern<FuncOp> {
using OpRewritePattern::OpRewritePattern;

@@ -979,10 +1012,11 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx,
});

RewritePatternSet patterns(&ctx);
patterns.add<BindCBLowering, TensorSliceLowering, CopyLowering, WaitLowering,
CBReserveLowering, CBPushLowering, CBWaitLowering, CBPopLowering,
StoreLowering, CoreXLowering, CoreYLowering>(typeConverter,
&ctx);
patterns
.add<BindCBLowering, TensorSliceLowering, CopyLowering, WaitLowering,
CBReserveLowering, CBPushLowering, CBWaitLowering, CBPopLowering,
StoreLowering, CoreXLowering, CoreYLowering, TensorStoreLowering>(
typeConverter, &ctx);
populateFunctionOpInterfaceTypeConversionPattern(
func::FuncOp::getOperationName(), patterns, typeConverter);

69 changes: 67 additions & 2 deletions lib/Dialect/TTL/Transforms/TTLAssignDST.cpp
@@ -837,9 +837,74 @@ struct TTLAssignDSTPass : public impl::TTLAssignDSTBase<TTLAssignDSTPass> {
}
}

// Set dst_idx attributes on tile compute ops.
// Handle passthrough case: block args directly yielded without compute.
// These args are not consumed by any tile compute op, so the loop above
// doesn't insert copy_tile for them. We need to copy them to DST so that
// the subsequent ttl.store (inserted by ttl-insert-tile-regs-sync) can
// pack from DST to the output CB.
for (Value yielded : yieldOp.getValues()) {
auto arg = dyn_cast<BlockArgument>(yielded);
if (!arg || !isTileValue(arg)) {
continue;
}
Comment on lines +845 to +849 (Contributor): Could consider marking ttl.compute (e.g., computeOp->setDiscardableAttr("ttl.passthrough", rewriter.getUnitAttr())) when creating passthrough computes and checking that attribute instead of inspecting yielded values.
// Skip if already copied (was used by some compute op)
if (dstIndexForValue.count(arg)) {
continue;
}

// Allocate DST slot
std::uint32_t assignedDstIndex = 0;
auto it = dstAssignment.find(arg);
if (it != dstAssignment.end()) {
assignedDstIndex = it->second;
} else {
int freeReg = inUse.find_first_unset();
if (freeReg < 0) {
computeOp.emitOpError("no free DST register for passthrough");
signalPassFailure();
return;
}
assignedDstIndex = static_cast<std::uint32_t>(freeReg);
}
inUse.set(assignedDstIndex);

// Insert copy_tile just before yield
builder.setInsertionPoint(yieldOp);
Location loc = yieldOp.getLoc();

auto indexMapAttr = computeIndexMapAttr(arg, computeOp, builder);
if (failed(indexMapAttr)) {
yieldOp.emitOpError("passthrough block argument not found in inputs");
signalPassFailure();
return;
}
Value srcIndex = builder.create<LinearizedIndexOp>(loc, *indexMapAttr);
Value dstIndex =
builder.create<arith::ConstantIndexOp>(loc, assignedDstIndex);
auto copy = builder.create<CopyTileOp>(
loc,
TypeRange{DSTRegisterType::get(arg.getContext()), arg.getType()},
ValueRange{arg, srcIndex, dstIndex});
dstIndexForValue[copy.getDstTile()] = assignedDstIndex;
dstIndexForValue[arg] = assignedDstIndex;

// Replace the yielded arg with the copy result
yieldOp->replaceUsesOfWith(arg, copy.getDstTile());

LLVM_DEBUG({
llvm::dbgs() << "Passthrough: Inserted copy_tile for yielded block "
<< "arg " << arg << " -> DST[" << assignedDstIndex
<< "]\n";
});
}
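
The net effect on a passthrough compute body, sketched in generic form; the printed names of `ttl.linearized_index`, `ttl.copy_tile`, the DST register type, and the index-map attribute are assumptions based on the C++ class names, and DST slot 0 is simply the first free register in this illustration:

```mlir
// Before: the block argument is yielded directly, so nothing lands in DST.
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  "ttl.yield"(%in_tile) : (!ttcore.tile<32x32, bf16>) -> ()

// After: copy_tile moves the tile into DST[0] and the yield uses its result,
// so the later ttl.store can pack from DST (dst_idx set by the loop below).
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  %src = "ttl.linearized_index"()
      {map = affine_map<(d0, d1) -> (d0, d1)>} : () -> index
  %dst = arith.constant 0 : index
  %reg, %dst_tile = "ttl.copy_tile"(%in_tile, %src, %dst) {dst_idx = 0 : i32}
      : (!ttcore.tile<32x32, bf16>, index, index)
      -> (!ttl.dst, !ttcore.tile<32x32, bf16>)
  "ttl.yield"(%dst_tile) : (!ttcore.tile<32x32, bf16>) -> ()
```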

// Set dst_idx attributes on tile compute ops, copy_tile ops, and
// copy_dst. CopyTileOp needs dst_idx so StoreLowering can determine the
// DST slot to pack from.
for (Operation &op : *body) {
if (!isTileComputeOp(&op) && !isa<CopyDstOp>(&op)) {
if (!isTileComputeOp(&op) && !isa<CopyDstOp>(&op) &&
!isa<CopyTileOp>(&op)) {
continue;
}

11 changes: 8 additions & 3 deletions python/ttl/operators.py
@@ -86,11 +86,16 @@ def __matmul__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
raise NotImplementedError("Matrix multiplication not yet supported in TTL mode")

def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
"""Store result tensor to CB by propagating CB association from output view."""
"""Store result tensor to CB.

This generates ttl.tensor_store which represents actual data movement
from the input tensor to the output CB. The tensor will be copied
through DST registers to the output circular buffer.
"""
# ast_self is the result of attach_cb(tensor, cb) from reserve()
# Extract the CB operand and attach it to the result tensor
# Extract the CB operand and store rhs to it
cb = ast_self.owner.operands[1]
return ttl.attach_cb(rhs.type, rhs, cb)
return ttl.tensor_store(rhs.type, rhs, cb)
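
For reference, the op this helper emits, written in the assembly format defined in TTLOps.td (SSA names and shapes illustrative):

```mlir
%result = ttl.tensor_store %rhs, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```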


@syntax("!ttl.transfer_handle")
12 changes: 0 additions & 12 deletions test/pytest.ini

This file was deleted.

4 changes: 2 additions & 2 deletions test/python/simple_add.py
@@ -92,8 +92,8 @@ def dm_write():
# Add operation (from l + r dunder method)
# CHECK: ttl.add

# Attach result to output CB
# CHECK: ttl.attach_cb %{{.+}}, %[[CB2]]
# Store result to output CB
# CHECK: ttl.tensor_store %{{.+}}, %[[CB2]]

# Finalize: pop inputs, push output
# CHECK: ttl.cb_pop %[[CB0]]
4 changes: 2 additions & 2 deletions test/python/simple_add_with_stmt.py
@@ -91,8 +91,8 @@ def dm_write():
# Add operation
# CHECK: ttl.add

# store() attaches result to output CB
# CHECK: ttl.attach_cb %{{.+}}, %[[CB2]]
# store() emits tensor_store to output CB
# CHECK: ttl.tensor_store %{{.+}}, %[[CB2]]

# 'with' exit: push output, pop inputs (reverse order)
# CHECK: ttl.cb_push %[[CB2]]