27 changes: 27 additions & 0 deletions include/ttlang/Dialect/TTL/IR/TTLOps.td
@@ -73,6 +73,33 @@ def TTL_AttachCBOp : TTL_Op<"attach_cb", []> {
let hasVerifier = 1;
}

def TTL_TensorStoreOp : TTL_Op<"tensor_store", []> {
let summary = "Store a tensor to a circular buffer";
let description = [{
Stores a tensor value to a circular buffer. Unlike `attach_cb` which is
purely metadata (associating a tensor with a CB), `tensor_store` represents
actual data movement: the tensor's tiles will be copied through DST
registers to the output CB.

This op is used when explicitly storing data from one CB to another,
such as in passthrough kernels where input is directly copied to output
without compute operations.

Example:
```mlir
%result = ttl.tensor_store %input, %out_cb
: (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
-> tensor<1x1x!ttcore.tile<32x32, bf16>>
```
}];
let arguments = (ins
AnyRankedTensor:$tensor,
TTL_CircularBuffer:$cb
);
let results = (outs AnyRankedTensor:$result);
let assemblyFormat = "$tensor `,` $cb attr-dict `:` functional-type(operands, results)";
}

def TTL_TensorSliceOp : TTL_Op<"tensor_slice", [Pure]> {
let summary = "Create a view into a tensor at specific tile indices";
let description = [{
84 changes: 76 additions & 8 deletions lib/Dialect/TTL/Transforms/ConvertTTLToCompute.cpp
@@ -36,17 +36,20 @@ static Value buildInitTensor(OpBuilder &b, Location loc, RankedTensorType type,
dynDims);
}

/// Find the CB that this operation's result will be attached to.
/// Looks for an attach_cb op that uses this operation's result.
/// Find the CB that this operation's result will be stored to.
/// Looks for attach_cb or tensor_store ops that use this operation's result.
static Value findOutputCB(Operation *op) {
if (op->getNumResults() == 0) {
return nullptr;
}
Value result = op->getResult(0);
for (OpOperand &use : result.getUses()) {
Value opResult = op->getResult(0);
for (OpOperand &use : opResult.getUses()) {
if (auto attachOp = dyn_cast<AttachCBOp>(use.getOwner())) {
return attachOp.getCb();
}
if (auto storeOp = dyn_cast<TensorStoreOp>(use.getOwner())) {
return storeOp.getCb();
}
}
return nullptr;
}
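
For orientation, a hedged sketch of the IR shape this helper scans for: `ttl.tensor_store` follows the assembly format defined in TTLOps.td above, while `ttl.add` is written in MLIR's generic form since its custom syntax is not part of this diff, and the shapes are illustrative:

```mlir
// findOutputCB on the defining op of %sum returns %out_cb, because %sum
// is consumed by a ttl.tensor_store (an attach_cb user would match too).
%sum = "ttl.add"(%lhs, %rhs)
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>,
       tensor<1x1x!ttcore.tile<32x32, bf16>>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
%stored = ttl.tensor_store %sum, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```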
@@ -259,8 +262,8 @@ static LogicalResult buildBinaryCompute(Operation *op,
return failure();
}

// Find the output CB. First check if there's an attach_cb that uses this
// result, and use that CB. Otherwise, find an unused bind_cb.
// Find the output CB. First check if there's an attach_cb or tensor_store
// that uses this result. Otherwise, find an unused bind_cb.
Value outCb = findOutputCB(op);
if (!outCb) {
auto unusedCBs = findUnusedBindCBs(op);
@@ -344,8 +347,8 @@ static LogicalResult buildUnaryCompute(Operation *op, PatternRewriter &rewriter,
return failure();
}

// Find the output CB. First check if there's an attach_cb that uses this
// result, and use that CB. Otherwise, find an unused bind_cb.
// Find the output CB. First check if there's an attach_cb or tensor_store
// that uses this result. Otherwise, find an unused bind_cb.
Value outCb = findOutputCB(op);
if (!outCb) {
auto unusedCBs = findUnusedBindCBs(op);
@@ -579,6 +582,70 @@ struct LowerBcastToCompute : OpRewritePattern<BcastOp> {
using Lower##TTL_OP = LowerUnaryToCompute<TTL_OP##Op, TILE_OP>;
#include "ttlang/Dialect/TTL/TTLElementwiseOps.def"

//===----------------------------------------------------------------------===//
// TensorStore Lowering (passthrough case only)
//===----------------------------------------------------------------------===//

/// Pattern for tensor_store with CB-attached input (passthrough case).
/// Creates a ComputeOp that copies input tiles to output CB.
/// For elementwise ops, tensor_store's input is from ComputeOp (not
/// CB-attached), so this pattern won't match. Those tensor_stores are erased in
/// TTKernel lowering.
struct LowerTensorStoreToCompute : OpRewritePattern<TensorStoreOp> {
using OpRewritePattern<TensorStoreOp>::OpRewritePattern;

LogicalResult matchAndRewrite(TensorStoreOp op,
PatternRewriter &rewriter) const override {
Value input = op.getTensor();
Value outputCb = op.getCb();

// Only handle passthrough case where input is CB-attached.
// Elementwise case: input is from ComputeOp, not CB-attached - skip.
if (!getAttachedCB(input)) {
return failure();
}

auto inputType = getTensorType(input);
if (!inputType) {
return failure();
}

Location loc = op.getLoc();
MLIRContext *ctx = rewriter.getContext();

// Build identity maps for input and output
AffineMap identityMap =
AffineMap::getMultiDimIdentityMap(inputType.getRank(), ctx);
SmallVector<Attribute> maps = {AffineMapAttr::get(identityMap),
AffineMapAttr::get(identityMap)};
SmallVector<Attribute> iterTypes(inputType.getRank(),
rewriter.getStringAttr("parallel"));

// Create output init tensor attached to output CB
Value init = buildInitTensor(rewriter, loc, inputType, input);
Value initAttached =
rewriter.create<AttachCBOp>(loc, init.getType(), init, outputCb);

// Create compute op with passthrough body
auto computeOp = rewriter.create<ComputeOp>(
loc, TypeRange{inputType}, ValueRange{input}, ValueRange{initAttached},
rewriter.getArrayAttr(maps), rewriter.getArrayAttr(iterTypes));

// Build the body: just yield the input tile (passthrough)
Block *body = rewriter.createBlock(&computeOp.getBody());
Type scalarType = inputType.getElementType();
Type tileType = ttcore::TileType::get(scalarType);
body->addArgument(tileType, loc); // Input tile
body->addArgument(tileType, loc); // Output tile (unused)

rewriter.setInsertionPointToEnd(body);
rewriter.create<YieldOp>(loc, body->getArgument(0));

rewriter.replaceOp(op, computeOp.getResult(0));
return success();
}
};
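
Sketched as IR, the rewrite replaces a CB-attached passthrough store with an identity compute; `ttl.compute`, `ttl.attach_cb`, and `ttl.yield` are shown in MLIR's generic form (their custom assembly is not part of this diff), `tensor.empty` stands in for whatever buildInitTensor produces, and the shapes are illustrative:

```mlir
// Before: %in is CB-attached, so the pattern matches.
%r = ttl.tensor_store %in, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>

// After: an identity-mapped compute whose body yields the input tile as-is.
%init = tensor.empty() : tensor<1x1x!ttcore.tile<32x32, bf16>>
%init_cb = "ttl.attach_cb"(%init, %out_cb)
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
%r = "ttl.compute"(%in, %init_cb) ({
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  "ttl.yield"(%in_tile) : (!ttcore.tile<32x32, bf16>) -> ()
}) {indexing_maps = [affine_map<(d0, d1) -> (d0, d1)>,
                     affine_map<(d0, d1) -> (d0, d1)>],
    iterator_types = ["parallel", "parallel"]}
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>,
       tensor<1x1x!ttcore.tile<32x32, bf16>>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```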

//===----------------------------------------------------------------------===//
// Pass Implementations
//===----------------------------------------------------------------------===//
@@ -621,6 +688,7 @@ void populateTTLToComputePatterns(RewritePatternSet &patterns) {
#include "ttlang/Dialect/TTL/TTLElementwiseOps.def"

patterns.add<LowerBcastToCompute>(ctx);
patterns.add<LowerTensorStoreToCompute>(ctx);
}

} // namespace mlir::tt::ttl
50 changes: 42 additions & 8 deletions lib/Dialect/TTL/Transforms/ConvertTTLToTTKernel.cpp
@@ -360,7 +360,7 @@ struct StoreLowering : OpConversionPattern<StoreOp> {
utils::computeCBTileIndexFromLoops(op, rewriter, /*cbShapeRank=*/2);

// Determine DST index based on the source operation type:
// - DST-to-DST ops (binary ops): have dst_idx attribute
// - DST-to-DST ops (binary ops, copy_tile): have dst_idx attribute
// - CB-reading ops (bcast, reduce): no dst_idx attribute, use loop index
Value dstIndex;
auto tileValue = adaptor.getTile();
@@ -369,10 +369,15 @@
defOp->getAttrOfType<IntegerAttr>(kDstIdxAttrName)) {
dstIndex =
rewriter.create<arith::ConstantIndexOp>(loc, dstIdxAttr.getInt());
} else if (auto copyTile = dyn_cast<CopyTileOp>(defOp)) {
// Fallback: get dst_index directly from copy_tile operand
dstIndex = copyTile.getDstIndex();
} else {
return op.emitError("ttl.store source op lacks dst_idx attribute: ")
<< defOp->getName();
}
}

if (!dstIndex) {
} else {
// Block argument (e.g., from bcast/reduce) - use CB tile index
dstIndex = cbTileIndex;
}

@@ -879,6 +884,34 @@ struct CoreYLowering : OpConversionPattern<CoreYOp> {
}
};

/// Lowering for tensor_store: handles cleanup after elementwise lowering.
/// For elementwise ops, the ComputeOp already writes to the output CB, so
/// tensor_store becomes a no-op. For passthrough (CB-attached input), we
/// would need to emit copy_tile + pack_tile, but that case should be handled
/// by LowerTensorStoreToCompute creating a passthrough ComputeOp.
struct TensorStoreLowering : OpConversionPattern<TensorStoreOp> {
using OpConversionPattern::OpConversionPattern;

LogicalResult
matchAndRewrite(TensorStoreOp op, OpAdaptor adaptor,
ConversionPatternRewriter &rewriter) const override {
Value input = op.getTensor();

// If input is CB-attached, this is a passthrough case that should have
// been handled by LowerTensorStoreToCompute. Emit error.
if (getAttachedCB(input)) {
return op.emitError(
"passthrough tensor_store should be lowered to ComputeOp first; "
"ensure convert-ttl-to-compute runs before this pass");
}
Comment on lines +902 to +906 (Contributor): negative test for this error?
// For elementwise case: the ComputeOp already wrote to the output CB.
// tensor_store is now a no-op - just erase it.
rewriter.eraseOp(op);
return success();
}
};
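
A hedged sketch of the negative test the inline comment asks for, using MLIR's standard `-verify-diagnostics` mechanism; the tool name `ttlang-opt`, the pass-flag spelling, and the surrounding func boilerplate are assumptions, and only the diagnostic text is taken from the source:

```mlir
// RUN: ttlang-opt --convert-ttl-to-ttkernel -verify-diagnostics %s
// Input deliberately skips convert-ttl-to-compute, so the tensor_store
// still has a CB-attached operand when this pass runs.
func.func @passthrough_not_lowered(
    %t: tensor<1x1x!ttcore.tile<32x32, bf16>>,
    %in_cb: !ttl.cb<[1, 1], bf16, 2>, %out_cb: !ttl.cb<[1, 1], bf16, 2>) {
  %attached = "ttl.attach_cb"(%t, %in_cb)
      : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
      -> tensor<1x1x!ttcore.tile<32x32, bf16>>
  // expected-error @below {{passthrough tensor_store should be lowered to ComputeOp first; ensure convert-ttl-to-compute runs before this pass}}
  %r = ttl.tensor_store %attached, %out_cb
      : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
      -> tensor<1x1x!ttcore.tile<32x32, bf16>>
  return
}
```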

struct FuncKernelFinalize : OpRewritePattern<FuncOp> {
using OpRewritePattern::OpRewritePattern;

@@ -979,10 +1012,11 @@ lowerTTLOpsToTTKernel(ModuleOp mod, MLIRContext &ctx,
});

RewritePatternSet patterns(&ctx);
patterns.add<BindCBLowering, TensorSliceLowering, CopyLowering, WaitLowering,
CBReserveLowering, CBPushLowering, CBWaitLowering, CBPopLowering,
StoreLowering, CoreXLowering, CoreYLowering>(typeConverter,
&ctx);
patterns
.add<BindCBLowering, TensorSliceLowering, CopyLowering, WaitLowering,
CBReserveLowering, CBPushLowering, CBWaitLowering, CBPopLowering,
StoreLowering, CoreXLowering, CoreYLowering, TensorStoreLowering>(
typeConverter, &ctx);
populateFunctionOpInterfaceTypeConversionPattern(
func::FuncOp::getOperationName(), patterns, typeConverter);

69 changes: 67 additions & 2 deletions lib/Dialect/TTL/Transforms/TTLAssignDST.cpp
@@ -837,9 +837,74 @@ struct TTLAssignDSTPass : public impl::TTLAssignDSTBase<TTLAssignDSTPass> {
}
}

// Set dst_idx attributes on tile compute ops.
// Handle passthrough case: block args directly yielded without compute.
// These args are not consumed by any tile compute op, so the loop above
// doesn't insert copy_tile for them. We need to copy them to DST so that
// the subsequent ttl.store (inserted by ttl-insert-tile-regs-sync) can
// pack from DST to the output CB.
for (Value yielded : yieldOp.getValues()) {
auto arg = dyn_cast<BlockArgument>(yielded);
if (!arg || !isTileValue(arg)) {
continue;
}
Comment on lines +845 to +849 (Contributor): Could consider marking ttl.compute (e.g., computeOp->setDiscardableAttr("ttl.passthrough", rewriter.getUnitAttr())) when creating passthrough computes and checking that attribute instead of inspecting yielded values.
// Skip if already copied (was used by some compute op)
if (dstIndexForValue.count(arg)) {
continue;
}

// Allocate DST slot
std::uint32_t assignedDstIndex = 0;
auto it = dstAssignment.find(arg);
if (it != dstAssignment.end()) {
assignedDstIndex = it->second;
} else {
int freeReg = inUse.find_first_unset();
if (freeReg < 0) {
computeOp.emitOpError("no free DST register for passthrough");
signalPassFailure();
return;
}
assignedDstIndex = static_cast<std::uint32_t>(freeReg);
}
inUse.set(assignedDstIndex);

// Insert copy_tile just before yield
builder.setInsertionPoint(yieldOp);
Location loc = yieldOp.getLoc();

auto indexMapAttr = computeIndexMapAttr(arg, computeOp, builder);
if (failed(indexMapAttr)) {
yieldOp.emitOpError("passthrough block argument not found in inputs");
signalPassFailure();
return;
}
Value srcIndex = builder.create<LinearizedIndexOp>(loc, *indexMapAttr);
Value dstIndex =
builder.create<arith::ConstantIndexOp>(loc, assignedDstIndex);
auto copy = builder.create<CopyTileOp>(
loc,
TypeRange{DSTRegisterType::get(arg.getContext()), arg.getType()},
ValueRange{arg, srcIndex, dstIndex});
dstIndexForValue[copy.getDstTile()] = assignedDstIndex;
dstIndexForValue[arg] = assignedDstIndex;

// Replace the yielded arg with the copy result
yieldOp->replaceUsesOfWith(arg, copy.getDstTile());

LLVM_DEBUG({
llvm::dbgs() << "Passthrough: Inserted copy_tile for yielded block "
<< "arg " << arg << " -> DST[" << assignedDstIndex
<< "]\n";
});
}
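
The net effect on a passthrough compute body, sketched in generic form; the printed names of `ttl.linearized_index`, `ttl.copy_tile`, the DST register type, and the index-map attribute are assumptions based on the C++ class names, and DST slot 0 is simply the first free register in this illustration:

```mlir
// Before: the block argument is yielded directly, so nothing lands in DST.
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  "ttl.yield"(%in_tile) : (!ttcore.tile<32x32, bf16>) -> ()

// After: copy_tile moves the tile into DST[0] and the yield uses its result,
// so the later ttl.store can pack from DST (dst_idx set by the loop below).
^bb0(%in_tile: !ttcore.tile<32x32, bf16>, %out_tile: !ttcore.tile<32x32, bf16>):
  %src = "ttl.linearized_index"()
      {map = affine_map<(d0, d1) -> (d0, d1)>} : () -> index
  %dst = arith.constant 0 : index
  %reg, %dst_tile = "ttl.copy_tile"(%in_tile, %src, %dst) {dst_idx = 0 : i32}
      : (!ttcore.tile<32x32, bf16>, index, index)
      -> (!ttl.dst, !ttcore.tile<32x32, bf16>)
  "ttl.yield"(%dst_tile) : (!ttcore.tile<32x32, bf16>) -> ()
```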

// Set dst_idx attributes on tile compute ops, copy_tile ops, and
// copy_dst. CopyTileOp needs dst_idx so StoreLowering can determine the
// DST slot to pack from.
for (Operation &op : *body) {
if (!isTileComputeOp(&op) && !isa<CopyDstOp>(&op)) {
if (!isTileComputeOp(&op) && !isa<CopyDstOp>(&op) &&
!isa<CopyTileOp>(&op)) {
continue;
}

11 changes: 8 additions & 3 deletions python/ttl/operators.py
@@ -86,11 +86,16 @@ def __matmul__(ast_self: TensorBlock, rhs: TensorBlock) -> TensorBlock:
raise NotImplementedError("Matrix multiplication not yet supported in TTL mode")

def store(ast_self: TensorBlock, rhs: TensorBlock) -> None:
"""Store result tensor to CB by propagating CB association from output view."""
"""Store result tensor to CB.

This generates ttl.tensor_store which represents actual data movement
from the input tensor to the output CB. The tensor will be copied
through DST registers to the output circular buffer.
"""
# ast_self is the result of attach_cb(tensor, cb) from reserve()
# Extract the CB operand and attach it to the result tensor
# Extract the CB operand and store rhs to it
cb = ast_self.owner.operands[1]
return ttl.attach_cb(rhs.type, rhs, cb)
return ttl.tensor_store(rhs.type, rhs, cb)
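
For reference, the op this helper emits, written in the assembly format defined in TTLOps.td (SSA names and shapes illustrative):

```mlir
%result = ttl.tensor_store %rhs, %out_cb
    : (tensor<1x1x!ttcore.tile<32x32, bf16>>, !ttl.cb<[1, 1], bf16, 2>)
    -> tensor<1x1x!ttcore.tile<32x32, bf16>>
```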


@syntax("!ttl.transfer_handle")
12 changes: 0 additions & 12 deletions test/pytest.ini

This file was deleted.

4 changes: 2 additions & 2 deletions test/python/simple_add.py
@@ -92,8 +92,8 @@ def dm_write():
# Add operation (from l + r dunder method)
# CHECK: ttl.add

# Attach result to output CB
# CHECK: ttl.attach_cb %{{.+}}, %[[CB2]]
# Store result to output CB
# CHECK: ttl.tensor_store %{{.+}}, %[[CB2]]

# Finalize: pop inputs, push output
# CHECK: ttl.cb_pop %[[CB0]]
4 changes: 2 additions & 2 deletions test/python/simple_add_with_stmt.py
@@ -91,8 +91,8 @@ def dm_write():
# Add operation
# CHECK: ttl.add

# store() attaches result to output CB
# CHECK: ttl.attach_cb %{{.+}}, %[[CB2]]
# store() emits tensor_store to output CB
# CHECK: ttl.tensor_store %{{.+}}, %[[CB2]]

# 'with' exit: push output, pop inputs (reverse order)
# CHECK: ttl.cb_push %[[CB2]]