@@ -31,7 +31,22 @@ using namespace vpux;

namespace {

static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
struct GapCandidate {
uint64_t lookaheadGap = 0;
int64_t insertionPointTaskIndex = -1;

// used for sort
bool operator>(const GapCandidate& other) const {
return lookaheadGap > other.lookaheadGap;
}
};

static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {
"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm",
"activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};

static const SmallVector<StringRef> SW_DUMMY_KERNELS_WITHOUT_ARGS = {
"convert", "eltwise_mul", "activation_cos", "activation_sin", "eltwise_equal", "eltwise_select", "rms_norm"};

//
// AddSwKernelInstructionPrefetch
@@ -66,19 +81,26 @@ class AddSwKernelInstructionPrefetch final :
size_t clusterIdx, std::string& kernelName,
mlir::SymbolRefAttr functionSymbol);

VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
size_t clusterIdx, std::string& kernelName);
VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
mlir::ValueRange updateBarrier, size_t clusterIdx,
std::string& kernelName);
mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);

using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
VPURT::TaskConfigVec& allTasks);
std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
VPURT::InferenceExecutionSimulator& infSim, VPURT::TaskConfigVec& allTasks, size_t firstShvTaskIndex);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
mlir::Operation* firstShaveTaskInIR,
mlir::Value bestUpdateBarrier);
std::optional<GapCandidate> findBestInsertionGapDuringExec(const std::string& kernelName,
uint64_t targetKernelGroupStartTime,
VPURT::TaskConfigVec& allTasks, size_t numClusters);
std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
VPURT::TaskConfigVec& allTasks);

bool hasVPUSWModule(mlir::Operation* funcOp);
size_t getOffsetReservedMem(const mlir::ModuleOp module);
@@ -94,6 +116,13 @@ class AddSwKernelInstructionPrefetch final :
bool _minFreeCyclesHasValue = false;
size_t _minimumFreeCyclesForPrefetch = 250000;
bool _useDummyKernelForInstructionPrefetch = false;
size_t _dynamicPrefetchTileCounter = 0;
// Using Tile 1 as the target for insertion to enable prefetching only when the available tile count is larger
// than 1.
int64_t _targetInsertTileDuringExec = 1;

Review comment:

Could you elaborate in comments why we pick a specific tile _targetInsertTileDuringExec here?

Author reply:

This variable is used solely as a reference for gap finding, not for the actual insertion (which, as mentioned, follows a round-robin strategy). I agree the name is slightly misleading, so I plan to rename it.

We chose a specific tile (Tile 1) for two reasons:

  1. When multiple kernels with the same operator execute concurrently (e.g., across Tiles 0-3), the schedule is symmetric. We don't need to calculate the gap for every tile; checking one representative tile is sufficient.

  2. We selected Index 1 (instead of 0) to ensure instruction prefetching is enabled only when the kernel spans at least two tiles, which provides more insertion slots and yields better performance gains.
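
To make the intent concrete, here is a minimal sketch of that gating idea (the function name and signature are illustrative only, not part of this PR):

```cpp
#include <cstdint>
#include <llvm/ADT/ArrayRef.h>
#include <llvm/ADT/STLExtras.h>

// Illustrative sketch: the gap search only inspects tasks on one representative tile, and picking
// tile 1 implicitly restricts prefetch-during-exec to kernel groups that span at least two tiles.
// Because the per-tile schedules of such groups are symmetric, one tile is representative of all.
bool hasRepresentativeTileTask(llvm::ArrayRef<int64_t> kernelGroupTileIndices,
                               int64_t representativeTile = 1) {
    return llvm::is_contained(kernelGroupTileIndices, representativeTile);
}
```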

// The threshold of 50,000 cycles is empirically chosen to ensure there is a sufficient gap
// to perform instruction prefetching without causing stalls.
uint64_t _prefetchGapThresholdDuringExec = 50000;
};

bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
@@ -186,21 +215,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer
}

// For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel
VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
mlir::Value updateBarrier,
size_t clusterIdx,
std::string& kernelName) {
VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(
mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier, size_t clusterIdx, std::string& kernelName) {
mlir::OpBuilder builder(firstSwTask);
auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
auto kernelOp = kernelNameToOps[kernelName];
auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
auto reservedMemOffset = getOffsetReservedMem(moduleOp);
auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
auto kernelOp = kernelNameToOps[kernelName];
auto tileIndexAttr = kernelOp.getTileIndexAttr();
VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
const int64_t tileIndex = static_cast<int64_t>(clusterIdx);

auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
auto newType = mlir::cast<NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
auto origType = mlir::cast<NDTypeInterface>(io.getType());
auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(),
stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
bufOp.getSectionAttr(), newSectionIndexAttr,
offsetAttr, bufOp.getSwizzlingKeyAttr());
buffers.push_back(newBuff);
return true;
@@ -230,14 +264,16 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst

auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
// The dummy kernels here are generated after ActShaveProfilingPass,
// so we need to add skipProfiling as an attribute to avoid capturing their metadata
cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));

auto args =
(kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
auto args = llvm::is_contained(SW_DUMMY_KERNELS_WITHOUT_ARGS, kernelName)
? mlir::ArrayAttr::get(moduleOp->getContext(), {})
: kernelNameToArgs[kernelName];

vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
_log.nest(), /*swKernelRunOp=*/nullptr);

@@ -316,7 +352,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
}

if (!cache.isLoaded(kernelName)) {
kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
}
cache.loadKernel(kernelName, kernelSize);

@@ -355,7 +391,8 @@ AddSwKernelInstructionPrefetch::getFirstSwTaskInIRAndBestUpdateBarrier(VPURT::In
_log.trace("First SW kernel start time {0}, best barrier release time {1}", firstKernelTask.cycleStart,
bestReleaseCycle);
if (bestReleaseCycle < _minimumFreeCyclesForPrefetch) {
_log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, skipping prefetching",
_log.info("bestReleaseCycle: {0} is smaller than _minimumFreeCyclesForPrefetch {1}, try prefetching during "
"execution",
bestReleaseCycle, _minimumFreeCyclesForPrefetch);
return std::make_tuple(nullptr, nullptr, 0);
}
@@ -394,7 +431,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
shaveIdx++) {
auto clusterIdx = shaveIdx / noOfShavesPerCluster;
auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
_log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
auto newPrefetchKernel =
_useDummyKernelForInstructionPrefetch
@@ -410,6 +447,169 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
return prefetchedKernels;
}

size_t getSwKernelCountAtTime(uint64_t startTime, VPURT::TaskConfigVec& allTasks) {
size_t count = 0;
for (auto& taskConfig : allTasks) {
if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
count++;
}
}
if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
break;
}
}
return count;
}

uint64_t findNextSaturationStart(size_t startIndex, vpux::VPURT::TaskConfigVec& allTasks, size_t numClusters,
std::map<uint64_t, size_t>& swKernelCountsCache) {
// Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
const size_t saturationThreshold = numClusters * 2;

// Iterate through tasks strictly AFTER the startIndex
for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);

if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
swKernelCountsCache[currentStartTime] = getSwKernelCountAtTime(currentStartTime, allTasks);
}

if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
return currentStartTime;
}
}

return std::numeric_limits<uint64_t>::max();
}

std::optional<GapCandidate> AddSwKernelInstructionPrefetch::findBestInsertionGapDuringExec(
const std::string& kernelName, uint64_t targetKernelGroupStartTime, VPURT::TaskConfigVec& allTasks,
size_t numClusters) {
const size_t saturationThreshold = numClusters * 2;

// <LookaheadGapSize, GapCandidate>
std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
std::map<uint64_t, size_t> swKernelCountsCache; // local cache

int64_t prevTargetTileTaskIndex = -1;
uint64_t prevTargetTileTaskStartTime = 0;

// find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
for (size_t i = 0; i < allTasks.size(); ++i) {

Review comment:
I think this code assumes there is no eviction event between the start of schedule and kernel start time. This might be true for existing platforms but might not be true for future. If we get good results we can accept it as is but for now let's at least put a comment here that we make such an assumption.
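
A possible wording for such a comment (a suggestion only, not text from this PR):

```cpp
// NOTE: this scan assumes no kernel-cache eviction occurs between the start of the schedule and
// targetKernelGroupStartTime. This holds for current platforms; revisit if future platforms can
// evict cached kernel instructions mid-schedule.
```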

auto& currentTaskConfig = allTasks[i];
uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
if (currentTaskStartTime > targetKernelGroupStartTime) {
break;
}

bool isTargetTileTask = false;
if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp()); swOp != nullptr) {
isTargetTileTask = (swOp.getTileIndexAttr().getInt() == _targetInsertTileDuringExec);
}

if (prevTargetTileTaskIndex != -1 && isTargetTileTask) {
auto& insertionPointTask = allTasks[prevTargetTileTaskIndex];
auto insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);

size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);

if (simultaneousSwKernels < saturationThreshold) {
uint64_t nextSaturationStart =
findNextSaturationStart(prevTargetTileTaskIndex, allTasks, numClusters, swKernelCountsCache);
uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
uint64_t lookaheadGap = 0;
if (gapEnd > prevTargetTileTaskStartTime) {
lookaheadGap = gapEnd - prevTargetTileTaskStartTime;
}

if (lookaheadGap >= _prefetchGapThresholdDuringExec) {
GapCandidate gap;
gap.lookaheadGap = lookaheadGap;
gap.insertionPointTaskIndex = prevTargetTileTaskIndex;
validGaps[lookaheadGap] = gap;
}
}
}

if (isTargetTileTask) {
prevTargetTileTaskIndex = static_cast<int64_t>(i);
prevTargetTileTaskStartTime = currentTaskStartTime;
}
}

if (validGaps.empty()) {
_log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
return std::nullopt;
}

return validGaps.begin()->second;
}
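
For illustration, a small self-contained example of the gap arithmetic above (the cycle counts are hypothetical; only the min/subtract/threshold logic mirrors the pass):

```cpp
#include <algorithm>
#include <cstdint>
#include <iostream>

int main() {
    const uint64_t anchorStart = 100000;          // non-saturated task on the representative tile
    const uint64_t nextSaturationStart = 180000;  // first later point with >= 2 * numClusters SW kernels
    const uint64_t targetKernelStart = 250000;    // start of the kernel group to prefetch for
    const uint64_t threshold = 50000;             // _prefetchGapThresholdDuringExec

    const uint64_t gapEnd = std::min(nextSaturationStart, targetKernelStart);
    const uint64_t lookaheadGap = gapEnd > anchorStart ? gapEnd - anchorStart : 0;

    // Prints "lookaheadGap = 80000 -> valid candidate"
    std::cout << "lookaheadGap = " << lookaheadGap
              << (lookaheadGap >= threshold ? " -> valid candidate" : " -> rejected") << std::endl;
    return 0;
}
```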

std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
VPURT::TaskConfigVec& allTasks) {
auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
const auto numClusters = getNumTiles(moduleOp);
VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");

std::vector<VPUIP::SwKernelOp> prefetchedKernels{};

for (auto& kernelInfo : kernelsToPrefetch) {
std::string kernelName = std::get<0>(kernelInfo);
size_t firstAppearanceIndex = std::get<2>(kernelInfo);

if (firstAppearanceIndex >= allTasks.size()) {
_log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
continue;
}
if (kernelNameToOps.count(kernelName) == 0) {
_log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
continue;
}

auto targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);

// Finds the best insertion point for prefetch by identifying non-saturated execution windows.
// Scans for tasks on the target tile to serve as prefetch anchors. A valid "Gap" is the
// duration from an anchor task to the next saturation event or the target kernel start.
//
// Logic:
// 1. Find a candidate task on the target tile.
// 2. Ensure NPU is not saturated at that time.
// 3. Calculate Gap = (Next Saturation or Target Start) - Insertion Time.
// 4. Return the candidate with the largest Gap >= _prefetchGapThreshold.
auto bestGapOpt = findBestInsertionGapDuringExec(kernelName, targetKernelGroupStartTime, allTasks, numClusters);

if (!bestGapOpt.has_value()) {
_log.trace("Kernel '{0}': No valid gap found.", kernelName);
continue;
}

GapCandidate bestGap = bestGapOpt.value();
_log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.", kernelName,
bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);

if (bestGap.insertionPointTaskIndex < 0 ||

Review comment:
Can this ever happen?

static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
_log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.", kernelName,
bestGap.insertionPointTaskIndex);
continue;
}

auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
_dynamicPrefetchTileCounter++;

auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(insertBeforeOp, mlir::ValueRange(),

Review comment:
newPrefetchKernel gets empty update barriers in this case, and insertDummyKernelOpBeforeFirstKernelTask leaves the wait barriers empty too. 🤔
I am not sure they will be scheduled into the slots we expect. I suppose tasks without wait barriers will be executed at the very beginning? @DariaMityagina Can we have someone confirm this?
If so, the insert function will need a proper wait barrier (and maybe also an update barrier) instead of empty ones for this use case.

Author reply:
You are correct that the update and wait barriers are empty. However, according to my observations from hardware SHAVE profiling, the dummy inst prefetch op is executed in the inserted non-saturated position. If the dummy op were inserted in a saturated position, the original SHAVE task would be postponed or even scheduled to another SHAVE unit. But if you confirm any scheduling behavior is not as expected, adding the barriers is also fine with me.

Review comment:
We should add wait barriers. I'm not sure why the shave task executes at the position you want, but later passes would be free to reorder that prefetch to the beginning of the schedule.

dynamicExecTile, kernelName);

prefetchedKernels.push_back(newPrefetchKernel);
}

return prefetchedKernels;
}
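
Following up on the barrier discussion above, a minimal sketch of how the prefetch task could inherit the anchor task's wait barriers so later passes cannot hoist it. This assumes insertDummyKernelOpBeforeFirstKernelTask gains a waitBarrier parameter forwarded to wrapIntoTaskOp's first ValueRange operand, and that VPURT::TaskOp exposes a getWaitBarriers() accessor; neither is verified against this revision:

```cpp
// Sketch only (not part of this PR): give the dummy prefetch task the same wait barriers as the
// anchor task it is inserted before, so later passes cannot legally reorder it to the start of
// the schedule. The hypothetical waitBarrier parameter would be passed through to the
// VPURT::TaskOp created by wrapIntoTaskOp inside insertDummyKernelOpBeforeFirstKernelTask.
auto anchorTaskOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(
        anchorTaskOp, /*waitBarrier=*/anchorTaskOp.getWaitBarriers(),
        /*updateBarrier=*/mlir::ValueRange(), dynamicExecTile, kernelName);
```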

void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto funcOp = getOperation();
if (!hasVPUSWModule(funcOp)) {
@@ -444,18 +644,24 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
return;
}
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);

if (_useDummyKernelForInstructionPrefetch) {
auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
auto dummyKernelResMem = config::getDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr);
VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
"Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
}
auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
if (kernelsToPrefetch.empty()) {
return;
}

std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
if (firstShaveTaskInIR) {
_log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
} else {
newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);

Review comment:
It would be great if we could make the logic common between prefetching at the start and prefetching in the middle of the schedule, since the former is just a special case of the latter (at least conceptually). Though I understand this is probably done to limit the impact on other platforms. If results are good we can follow up on this internally.

}

// Update dependencies for cache handling operations to meet requirements of control graph split.
auto& barrierInfo = getAnalysis<BarrierInfo>();