Changes from all commits
29 commits
391424b
adding sqrt and leaky relu to perf model
AlexandreEichenberger Feb 2, 2026
a4ebc46
added logic to gate onnx to zhigh rules to take friendlyness to NNPA
AlexandreEichenberger Feb 3, 2026
dc5c85f
added more patterns
AlexandreEichenberger Feb 3, 2026
6a9b3f0
fliped default from QualifyingOps (no model) to FasterOps (NNPA model…
AlexandreEichenberger Feb 4, 2026
aba42d4
fix
AlexandreEichenberger Feb 4, 2026
97f7c96
tweaked handling of unknown dims for matmul perf model
AlexandreEichenberger Feb 4, 2026
b289daf
format
AlexandreEichenberger Feb 4, 2026
df128a7
fixed lit tests
AlexandreEichenberger Feb 4, 2026
c712d1d
update
AlexandreEichenberger Feb 4, 2026
74f9341
tuned heuristic
AlexandreEichenberger Feb 5, 2026
5261283
update
AlexandreEichenberger Feb 5, 2026
a0f82e5
fix some litests
AlexandreEichenberger Feb 5, 2026
ad5df35
fix litest and code to support config file
AlexandreEichenberger Feb 5, 2026
f27997b
fix littests
AlexandreEichenberger Feb 6, 2026
97ce5a7
update
AlexandreEichenberger Feb 6, 2026
ff8dd4f
fixes
AlexandreEichenberger Feb 6, 2026
7b70066
fixes
AlexandreEichenberger Feb 6, 2026
d5e9364
fix
AlexandreEichenberger Feb 6, 2026
ca2be53
test
AlexandreEichenberger Feb 6, 2026
13422fe
fix
AlexandreEichenberger Feb 6, 2026
03858a9
fixes
AlexandreEichenberger Feb 9, 2026
ae5feec
added default QualifyingOp heuristic for backend tests
AlexandreEichenberger Feb 10, 2026
75424b1
all ops to NNPA in numerical tests too
AlexandreEichenberger Feb 10, 2026
28e8f8c
update
AlexandreEichenberger Feb 10, 2026
8ccb7b1
Merge branch 'main' into missing-model-z17
AlexandreEichenberger Feb 10, 2026
1e43122
Merge branch 'main' into missing-model-z17
AlexandreEichenberger Feb 11, 2026
fbcbf88
Merge branch 'main' into missing-model-z17
AlexandreEichenberger Feb 12, 2026
1767ee7
update
AlexandreEichenberger Feb 13, 2026
4f0de7e
Merge branch 'missing-model-z17' of https://github.com/AlexandreEiche…
AlexandreEichenberger Feb 13, 2026
6 changes: 3 additions & 3 deletions src/Accelerators/NNPA/Compiler/NNPACompilerOptions.cpp
@@ -82,12 +82,12 @@ llvm::cl::opt<NNPAPlacementHeuristic> nnpaPlacementHeuristic{
"[Optional] Choose NNPA-related heuristic to place operations "
"on NNPA device:"),
llvm::cl::values(
clEnumVal(QualifyingOps, "Place all qualifying ops on NNPA (default)."),
clEnumVal(QualifyingOps, "Place all qualifying ops on NNPA."),
clEnumVal(FasterOps, "Place qualifying ops that are faster on NNPA."),
clEnumVal(FasterOpsWSU, "FasterOps with stick/unstick cost."),
clEnumVal(FasterOpsWSU, "FasterOps with stick/unstick cost (default)."),
clEnumVal(MuchFasterOpsWSU,
"Much/Significantly FasterOps with stick/unstick cost.")),
llvm::cl::init(QualifyingOps), llvm::cl::cat(OnnxMlirOptions)};
llvm::cl::init(FasterOpsWSU), llvm::cl::cat(OnnxMlirOptions)};

llvm::cl::opt<bool> nnpaDisableSaturation("nnpa-disable-saturation",
llvm::cl::desc("Disable saturating f32 values before stickify them."
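For context, here is a minimal sketch of the four heuristic levels the option above selects between, ordered from most permissive to most conservative; the enumerator names come from the clEnumVal entries, while the declaration shape and comments are assumptions rather than a copy of the onnx-mlir header. This change flips the default from QualifyingOps to FasterOpsWSU.

// Sketch only: enumerator names are real, comments summarize the descriptions above.
enum NNPAPlacementHeuristic {
  QualifyingOps,    // place every op that qualifies for NNPA
  FasterOps,        // place qualifying ops the performance model rates faster on NNPA
  FasterOpsWSU,     // as FasterOps, but charge stick/unstick costs (new default)
  MuchFasterOpsWSU  // require a much larger NNPA win before placing an op
};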
28 changes: 21 additions & 7 deletions src/Accelerators/NNPA/Conversion/ONNXToZHigh/DevicePlacement.cpp
@@ -91,14 +91,20 @@ struct DevicePlacementPass
Option<bool> useFasterOps{*this, "use-faster",
llvm::cl::desc("Enable FasterOps NNPAPlacementHeuristic"),
llvm::cl::init(false)};
// Method to override placement using useXXX flags
Option<bool> useQualifyingOps{*this, "use-qualifying",
llvm::cl::desc("Enable QualifyingOps NNPAPlacementHeuristic"),
llvm::cl::init(false)};

// Method to override placement using use-XXX flags
void initPlacementHeuristic() {
if (useMuchFasterWithStickOps)
placementHeuristic = MuchFasterOpsWSU;
else if (useFasterWithStickOps)
placementHeuristic = FasterOpsWSU;
else if (useFasterOps)
placementHeuristic = FasterOps;
else if (useQualifyingOps)
placementHeuristic = QualifyingOps;
}

void runOnOperation() final;
@@ -157,6 +163,11 @@ void DevicePlacementPass::runOnOperation() {
op->setAttr(
DEVICE_ATTRIBUTE, StringAttr::get(module.getContext(), device));
});
LLVM_DEBUG({
llvm::dbgs() << "\n\nOps after reading config file\n";
module.dump();
llvm::dbgs() << "\n\n";
});
}

// Run patterns that converts ONNX to ZHigh with analysis mode to collect
Expand Down Expand Up @@ -196,17 +207,20 @@ void DevicePlacementPass::runOnOperation() {
legalizedOps1, llvm::set_intersection(legalizedOps2, legalizedOps3));

initPlacementHeuristic();
if (placementHeuristic == QualifyingOps)

if (placementHeuristic == QualifyingOps) {
PlaceAllLegalOpsOnNNPA(context, ops, cpuOps);
else if (placementHeuristic == FasterOps)
} else if (placementHeuristic == FasterOps) {
PlaceBeneficialOpsOnNNPA(context, ops, &dimAnalysis, cpuOps);
else if (placementHeuristic == FasterOpsWSU)
PlaceBeneficialOpsOnNNPAWithStickUnstick(
context, module, ops, &dimAnalysis, cpuOps);
else if (placementHeuristic == MuchFasterOpsWSU)
} else if (placementHeuristic == FasterOpsWSU) {
PlaceBeneficialOpsOnNNPAWithStickUnstick(context, module, ops, &dimAnalysis,
cpuOps, /*min factor*/ 1.1, /*significant CPU Factor*/ 2.0,
/*significant NNPA Factor*/ 2.0);
} else if (placementHeuristic == MuchFasterOpsWSU) {
PlaceBeneficialOpsOnNNPAWithStickUnstick(context, module, ops, &dimAnalysis,
cpuOps, /*min factor*/ 3.0, /*significant CPU Factor*/ 2.0,
/*significant NNPA Factor*/ 8.0);
}
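To make the factor triples above concrete, an illustrative calculation with made-up times: under FasterOpsWSU (min factor 1.1), an op whose estimated NNPA time plus stick/unstick overheads is 9 units is placed on NNPA only when its CPU estimate reaches at least 9.9, per the nnpaTimeWithOverheads * minFactor <= cpuTime check in isOpFasterOnNNPA later in this diff; under MuchFasterOpsWSU (min factor 3.0), the CPU estimate would have to reach 27. The significant CPU/NNPA factors only feed the significant flag, which that routine computes from nnpaTime without the stick/unstick overheads.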

// Create a JSON configuration file if required.
if (!saveConfigFile.empty()) {
@@ -35,38 +35,6 @@ using namespace onnx_mlir;

namespace {

//===----------------------------------------------------------------------===//
// Support to classify ops.

bool isMappedToDevice(Operation *op) {
StringAttr device = op->getAttrOfType<mlir::StringAttr>(DEVICE_ATTRIBUTE);
return device && !device.getValue().empty();
}

bool isMappedToCPU(Operation *op) {
StringAttr device = op->getAttrOfType<mlir::StringAttr>(DEVICE_ATTRIBUTE);
return device && device.getValue().equals_insensitive(CPU_DEVICE);
}

bool isMappedToNNPA(Operation *op) {
StringAttr device = op->getAttrOfType<mlir::StringAttr>(DEVICE_ATTRIBUTE);
return device && device.getValue().equals_insensitive(NNPA_DEVICE);
}

// Determine if op is unsuitable because its not an ONNX op of interest, or it
// is already mapped to the CPU device.
bool isNNPAFriendlyOp(Operation *op) {
if (op->getDialect()->getNamespace() != ONNXDialect::getDialectNamespace())
return false;
// These ops are NNPA unfriendly. Constants are friendly.
if (isa<ONNXEntryPointOp, ONNXReturnOp>(op))
return false;
// If `device` is already set to CPU, it is NNPA unfriendly
if (isMappedToCPU(op))
return false;
return true;
}

//===----------------------------------------------------------------------===//
// Support functions op assignment.

@@ -77,8 +45,7 @@ inline bool fasterOnNNPA(Operation *op, bool significant = false) {
llvm::dbgs() << " Significantly faster ";
else
llvm::dbgs() << " Faster ";
llvm::dbgs() << "on NNPA model for op:";
op->dump();
llvm::dbgs() << "on NNPA model\n";
});
return true;
}
@@ -90,25 +57,18 @@ inline bool fasterOnCPU(Operation *op, bool significant = false) {
llvm::dbgs() << " Significantly faster ";
else
llvm::dbgs() << " Faster ";
llvm::dbgs() << "on CPU model for op:";
op->dump();
llvm::dbgs() << "on CPU model\n";
});
return false;
}

inline void assignToNNPA(Operation *op, MLIRContext *context) {
LLVM_DEBUG({
llvm::dbgs() << "Assign to NNPA:";
op->dump();
});
LLVM_DEBUG(llvm::dbgs() << "Assign to NNPA\n");
op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, NNPA_DEVICE));
}

inline void assignToCPU(Operation *op, MLIRContext *context) {
LLVM_DEBUG({
llvm::dbgs() << "Assign to CPU:";
op->dump();
});
LLVM_DEBUG(llvm::dbgs() << "Assign to CPU\n");
op->setAttr(DEVICE_ATTRIBUTE, StringAttr::get(context, CPU_DEVICE));
}

@@ -162,41 +122,48 @@ struct DevicePlacementWithStickUnstickCost {
nnpaCandidateOps.clear();
nnpaNeutralOps.clear();
module.walk([&](Operation *op) -> WalkResult {
// Skip ops that are NNPA unfriendly such as ops already assigned to CPU.
// Skip ops that are NNPA unfriendly (i.e. not ONNX ops, or ONNX ops already
// mapped to CPU).
if (!isNNPAFriendlyOp(op))
return WalkResult::advance();
// Ops that cannot/may not go on NNPA but can operate on NNPA data "for
// free" are included here in NNPA neutral ops.
// I assume here (not really true) that transpose and reshape can carry
// the stickified data.
// free" are included here in NNPA neutral ops. I assume here (not really
// true) that transpose and reshape can carry the stickified data. Note
// that an op can be part of cpuOps (because it must run on CPU) and also
// be part of neutralOps (because it does not force stick/unstick).
if (isa<ONNXConstantOp, ONNXTransposeOp, ONNXReshapeOp>(op)) {
nnpaNeutralOps.insert(op);
return WalkResult::advance();
}
// Skip ops that the compiler determined are not suitable for NNPA.
if (cpuOps.contains(op))
return WalkResult::advance();
// Skip ops already marked for NNPA.
if (isMappedToNNPA(op))
return WalkResult::advance();

// Remaining ops can be mapped to NNPA.
nnpaCandidateOps.insert(op);
return WalkResult::advance();
});
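As a concrete reading of the overlap noted above: an ONNXReshapeOp that the compiler also placed in cpuOps is still recorded as NNPA-neutral here, because it runs on CPU but is assumed (optimistically, per the comment) to pass stickified data through without forcing a stick/unstick.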
#if DEBUG >= 2
LLVM_DEBUG({
llvm::dbgs() << "\nCPU Ops:\n";
llvm::dbgs() << "\nCPU Ops (must run on CPU):\n";
for (auto op : cpuOps) {
if (isa<ONNXConstantOp, func::FuncOp>(op))
continue;
llvm::dbgs() << "cpu ";
op->dump();
}
llvm::dbgs() << "\nNNPA Neutral Ops:\n";
llvm::dbgs() << "\nNNPA Neutral Ops (does not force stick/unstick):\n";
for (auto op : nnpaNeutralOps) {
if (isa<ONNXConstantOp, func::FuncOp>(op))
continue;
llvm::dbgs() << "neutral ";
op->dump();
}
llvm::dbgs() << "\nNNPA Candidate Ops:\n";
llvm::dbgs() << "\nNNPA Candidate Ops (may run on zAIU):\n";
for (auto op : nnpaCandidateOps) {
llvm::dbgs() << "candidate ";
op->dump();
@@ -267,17 +234,17 @@ struct DevicePlacementWithStickUnstickCost {
TODO: If migrate X to NNPA, could attribute some benefits for having
users that are NNPA.
*/
double costOfUnstickOp = estimateTimeForUnstickOp(resVal);
double costOfStickOp = estimateTimeForStickOp(resVal);
if (cpuOpCount > 0) {
// Moving this op to NNPA will cost one unstick as there are one or
// more ops that must execute on CPU.
double costOfUnstickOp = estimateTimeForUnstickOp(resVal);
LLVM_DEBUG(
llvm::dbgs() << " +1 unstick: " << costOfUnstickOp << "\n");
totalCostBenefit += costOfUnstickOp;
}
if (nnpaOpCount > 0) {
// Moving this op to NNPA will remove the need to stick this result
double costOfStickOp = estimateTimeForStickOp(resVal);
LLVM_DEBUG(
llvm::dbgs() << " -1 stick: " << -costOfStickOp << "\n");
totalCostBenefit -= costOfStickOp;
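As a worked example with illustrative numbers: if the result of op X feeds one op that must stay on CPU and one op already on NNPA, and the estimated unstick and stick times for that value are 5 and 4 units, the output-side adjustment is +5 - 4 = +1, a small net cost added onto the NNPA-side estimate for X.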
@@ -312,7 +279,8 @@ struct DevicePlacementWithStickUnstickCost {
classifyValueUsage(inputVal, /*skip op X that we are analyzing*/ opX,
cpuOpCount, nnpaOpCount, nnpaCandidateOpCount, nnpaNeutralOpCount);
/*
Case study:
Case study for OP X:
Case 3 & 5: def is on CPU | Case 4 & 6: def is on NNPA
3) Op X remains on CPU | 4) Op X remains on CPU
def.CPU ----. | def.NNPA -----.
/ | \ \ | / | \ \
@@ -331,24 +299,28 @@ struct DevicePlacementWithStickUnstickCost {

placing X on NNPA: |
cost: +1 stick if first NNPA |
benefit: | -1 stick
benefit: | -1 unstick if (last) no CPU
*/
double costOfStickOp = estimateTimeForStickOp(inputVal);
if (isMappedToCPU(definingOp) ||
!(nnpaCandidateOps.contains(definingOp) ||
nnpaNeutralOps.contains(definingOp))) {
// Case 5.
if (nnpaOpCount == 0) {
double costOfStickOp = estimateTimeForStickOp(inputVal);
LLVM_DEBUG(llvm::dbgs() << " def-op on cpu (case 5), +1 stick "
<< costOfStickOp << ".\n");
totalCostBenefit += costOfStickOp;
}
}
if (isMappedToNNPA(definingOp)) {
// Case 6.
LLVM_DEBUG(llvm::dbgs() << " def-op on NNPA (case 6), -1 stick "
<< -costOfStickOp << ".\n");
totalCostBenefit -= costOfStickOp;
if (cpuOpCount == 0) {
double costOfUnstickOp = estimateTimeForUnstickOp(inputVal);
LLVM_DEBUG(llvm::dbgs()
<< " def-op on NNPA (case 6), -1 unstick "
<< -costOfUnstickOp << ".\n");
totalCostBenefit -= costOfUnstickOp;
}
}
}
return totalCostBenefit;
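Continuing the illustration on the input side: if an input of X is produced by an op pinned to CPU and no other user of that value already runs on NNPA (case 5), moving X to NNPA adds that value's stick cost, say +4 units; if instead the producer is already on NNPA and no user needs the value on CPU (case 6), moving X removes the unstick that would otherwise be needed, say -5 units.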
@@ -367,7 +339,7 @@ struct DevicePlacementWithStickUnstickCost {
bool &significant) {
LLVM_DEBUG({
llvm::dbgs()
<< "\nTest cost-benefit with stick/unstick of CPU/NNPA for op\n";
<< "\nTest cost-benefit with stick/unstick of CPU/NNPA for op\n ";
op->dump();
});
// Estimate time
@@ -378,23 +350,26 @@ struct DevicePlacementWithStickUnstickCost {
double inputCostBenefit = costBenefitIncurredForInputs(op);
nnpaTimeWithOverheads = nnpaTime + useCostBenefit + inputCostBenefit;
LLVM_DEBUG(llvm::dbgs()
<< " New estimated nnpa time with stick/unstick:"
<< nnpaTimeWithOverheads << " vs cpu " << cpuTime << ".\n");
<< " New estimated nnpa time (" << nnpaTime
<< ") with stick/unstick:" << nnpaTimeWithOverheads
<< " vs cpu " << cpuTime << ".\n");
} else {
// No performance model for this operation, assume faster on NNPA;
cpuTime = 10;
nnpaTime = nnpaTimeWithOverheads = 1;
LLVM_DEBUG(llvm::dbgs() << " no time estimate, assume NNPA better\n.");
LLVM_DEBUG(llvm::dbgs() << " no time estimate, assume NNPA better\n");
}
if (nnpaTimeWithOverheads * minFactor <= cpuTime) {
// For significant, don't take overheads into account as it may change
// depending on mapping.
// depending on mapping, namely use nnpaTime instead of
// nnpaTimeWithOverheads.
significant =
significantlyFaster(nnpaTime, cpuTime, significantNNPAFactor);
return fasterOnNNPA(op, significant);
}
// For significant, don't take overheads into account as it may change
// depending on mapping.
// depending on mapping, namely use nnpaTime instead of
// nnpaTimeWithOverheads.
significant = significantlyFaster(cpuTime, nnpaTime, significantCPUFactor);
return fasterOnCPU(op, significant);
}
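A condensed sketch of the decision just shown; the helper name and parameters are hypothetical, and only the comparisons mirror the real code:

// Hypothetical condensation of isOpFasterOnNNPA above (not the actual onnx-mlir code).
bool preferNNPA(bool hasPerfEstimate, double nnpaTime, double overheads,
    double cpuTime, double minFactor) {
  if (!hasPerfEstimate)
    return true; // no model for this op: assume NNPA wins (cpuTime 10 vs nnpaTime 1 above)
  // Overheads count toward placement, but the 'significant' flag uses nnpaTime alone.
  return (nnpaTime + overheads) * minFactor <= cpuTime;
}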
@@ -456,8 +431,18 @@ void PlaceBeneficialOpsOnNNPAWithStickUnstick(MLIRContext *context,
bool last = (i == ub - 1);
LLVM_DEBUG(llvm::dbgs() << "\n\n\nPlacement Iteration " << i << "\n\n");
for (Operation *op : ops) {
if (isMappedToDevice(op))
if (isMappedToDevice(op)) {
LLVM_DEBUG({
if (isMappedToCPU(op))
llvm::dbgs() << "\nSkip as operations is mapped to cpu\n ";
else if (isMappedToNNPA(op))
llvm::dbgs() << "\nSkip as operations is mapped to NNPA\n ";
else
llvm_unreachable("expected CPU or NNPA only here");
op->dump();
});
continue;
}
// Op that cannot go on NNPA.
if (cpuOps.contains(op))
continue;
@@ -81,9 +81,8 @@ void PlaceBeneficialOpsOnNNPA(mlir::MLIRContext *context,
*/
void PlaceBeneficialOpsOnNNPAWithStickUnstick(mlir::MLIRContext *context,
mlir::ModuleOp module, const llvm::SmallVector<mlir::Operation *, 32> &ops,
const DimAnalysis *dimAnalysis, const OpSetType &cpuOps,
double minFactor = 1.1, double significantCPUFactor = 2.0,
double significantNNPAFactor = 3.0);
const DimAnalysis *dimAnalysis, const OpSetType &cpuOps, double minFactor,
double significantCPUFactor, double significantNNPAFactor);
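Note that with the default arguments removed from this declaration, callers must now pass the factor triple explicitly, as the heuristic dispatch in DevicePlacement.cpp above does for FasterOpsWSU (1.1, 2.0, 2.0) and MuchFasterOpsWSU (3.0, 2.0, 8.0).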

} // namespace onnx_mlir
#endif