Enable prefetching of SW kernel instructions after the first SW task #199
base: develop
Changes from 1 commit
Commits: 2ecf4c2, 681035a, 8f39a27, 65e4a87, 75421b6, bf5cab7, ff57812, 38a8e24, 48fdf84, 0ccf9bf, 8110a44, 51c2ac3, e96f0f8, 1b364a5, a7cc88d, bacb483, 0e46bcd, 7680564
```diff
@@ -31,7 +31,8 @@ using namespace vpux;

 namespace {

-static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
+// static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"convert", "softmax"};
+static const SmallVector<StringLiteral> SW_DUMMY_KERNELS_PREFETCH_SUPPORTED = {"activation_swish", "eltwise_mul", "softmax", "convert", "rms_norm", "activation_swish", "activation_sin", "eltwise_equal", "activation_cos", "eltwise_select"};

 //
 // AddSwKernelInstructionPrefetch
```
```diff
@@ -66,19 +67,21 @@ class AddSwKernelInstructionPrefetch final :
                                           size_t clusterIdx, std::string& kernelName,
                                           mlir::SymbolRefAttr functionSymbol);

-    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::Value updateBarrier,
+    VPUIP::SwKernelOp insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask, mlir::ValueRange updateBarrier,
                                                                size_t clusterIdx, std::string& kernelName);
     mlir::Operation* getFirstSwTaskInIRWaitingForBarrier(mlir::Value waitBarrier);
     std::pair<std::string, size_t> getKernelNameAndSize(VPUIP::SwKernelOp swKernelOp);

-    using SwKernelPrefetchVec = std::vector<std::pair<std::string, size_t>>;
+    using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;
     std::pair<SwKernelPrefetchVec, size_t> getPrefetchCandidatesAndFirstSwTask(mlir::Operation* funcOp,
                                                                                VPURT::TaskConfigVec& allTasks);
     std::tuple<mlir::Operation*, mlir::Value, size_t> getFirstSwTaskInIRAndBestUpdateBarrier(
             VPURT::InferenceExecutionSimulator& infSim, VPURT::TaskConfigVec& allTasks, size_t firstShvTaskIndex);
     std::vector<VPUIP::SwKernelOp> insertPrefetchTasks(mlir::Operation* funcOp, SwKernelPrefetchVec& kernelsToPrefetch,
                                                        mlir::Operation* firstShaveTaskInIR,
                                                        mlir::Value bestUpdateBarrier);
+    std::vector<VPUIP::SwKernelOp> insertPrefetchTasksDuringExec(mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+                                                                 VPURT::TaskConfigVec& allTasks);

     bool hasVPUSWModule(mlir::Operation* funcOp);
     size_t getOffsetReservedMem(const mlir::ModuleOp module);
```
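The third tuple element added to `SwKernelPrefetchVec` records the index of the task at which each kernel first appears in the simulated schedule; the new mid-schedule insertion path uses it to find the cycle by which the kernel must already be prefetched, while the head-of-schedule path ignores it. A minimal standalone sketch of the new element type (the entries and values are illustrative, not taken from the pass):

```cpp
#include <iostream>
#include <string>
#include <tuple>
#include <vector>

// Mirrors the new alias: (kernel name, kernel size, index of the first
// task in the simulated schedule that uses this kernel).
using SwKernelPrefetchVec = std::vector<std::tuple<std::string, size_t, size_t>>;

int main() {
    SwKernelPrefetchVec kernelsToPrefetch;
    // Hypothetical entries: "softmax" first appears as task 42, etc.
    kernelsToPrefetch.emplace_back("softmax", 2048, 42);
    kernelsToPrefetch.emplace_back("convert", 1024, 57);

    for (const auto& [kernelName, kernelSize, firstTaskIndex] : kernelsToPrefetch) {
        std::cout << kernelName << ": " << kernelSize << " bytes, first used by task "
                  << firstTaskIndex << "\n";
    }
    return 0;
}
```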
```diff
@@ -94,6 +97,7 @@ class AddSwKernelInstructionPrefetch final :
     bool _minFreeCyclesHasValue = false;
     size_t _minimumFreeCyclesForPrefetch = 250000;
     bool _useDummyKernelForInstructionPrefetch = false;
+    size_t _dynamicPrefetchTileCounter = 0;
 };

 bool AddSwKernelInstructionPrefetch::hasVPUSWModule(mlir::Operation* funcOp) {
```
```diff
@@ -187,20 +191,26 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertPrefetchOpBeforeFirstKer

 // For LNL, Shave kernel instruction prefetch needs to insert a dummy kernel instead of prefetch kernel
 VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirstKernelTask(mlir::Operation* firstSwTask,
-                                                                                           mlir::Value updateBarrier,
+                                                                                           mlir::ValueRange updateBarrier,
                                                                                            size_t clusterIdx,
                                                                                            std::string& kernelName) {
     mlir::OpBuilder builder(firstSwTask);
-    auto moduleOp = firstSwTask->getParentOfType<mlir::ModuleOp>();
+    auto kernelOp = kernelNameToOps[kernelName];
+    auto moduleOp = kernelOp->getParentOfType<mlir::ModuleOp>();
     auto reservedMemOffset = getOffsetReservedMem(moduleOp);
     auto offsetAttr = getIntAttr(moduleOp->getContext(), reservedMemOffset);
-    auto kernelOp = kernelNameToOps[kernelName];
     auto tileIndexAttr = kernelOp.getTileIndexAttr();
     VPUX_THROW_UNLESS(tileIndexAttr, "SwKernelOp '{0}' does not have a tileIndex attribute", kernelOp->getLoc());
+    const int64_t tileIndex = static_cast<int64_t>(clusterIdx);

     auto createBuffer = [&](mlir::Value io, StringRef suffix, mlir::SmallVector<mlir::Value>& buffers) {
         if (auto bufOp = io.getDefiningOp<VPURT::DeclareBufferOp>()) {
-            auto newType = mlir::cast<NDTypeInterface>(io.getType()).changeShape({1, 1, 1, 1});
+            auto origType = mlir::cast<NDTypeInterface>(io.getType());
+            auto newMemSpaceAttr = vpux::IndexedSymbolAttr::get(moduleOp->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN), tileIndex);
+            auto newSectionIndexAttr = builder.getI64ArrayAttr({tileIndex});
+            auto newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(newMemSpaceAttr);
             auto newBuff = builder.create<VPURT::DeclareBufferOp>(appendLoc(bufOp->getLoc(), suffix), newType,
-                                                                  bufOp.getSectionAttr(), bufOp.getSectionIndexAttr(),
+                                                                  bufOp.getSectionAttr(), newSectionIndexAttr,
                                                                   offsetAttr, bufOp.getSwizzlingKeyAttr());
             buffers.push_back(newBuff);
             return true;
```
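Before this change the placeholder 1x1x1x1 buffers of the dummy kernel inherited the template buffer's memory space and section index; the patch pins both to the tile the prefetch kernel is dispatched to. A toy, self-contained sketch of the same fluent retyping pattern (`BufferType` is a hypothetical stand-in for MLIR's `NDTypeInterface`; the real code chains `changeShape`/`changeMemSpace` on MLIR types):

```cpp
#include <array>
#include <cstdint>
#include <iostream>

// Hypothetical stand-in for NDTypeInterface: an immutable descriptor with
// fluent "change" operations that each return a modified copy.
struct BufferType {
    std::array<int64_t, 4> shape;
    int64_t tileIndex;  // stands in for the CMX_NN memory-space index

    BufferType changeShape(std::array<int64_t, 4> s) const { return {s, tileIndex}; }
    BufferType changeMemSpace(int64_t tile) const { return {shape, tile}; }
};

int main() {
    BufferType origType{{1, 16, 64, 64}, /*tileIndex=*/0};
    // Shrink to a 1x1x1x1 placeholder and pin it to the prefetch tile,
    // mirroring origType.changeShape({1, 1, 1, 1}).changeMemSpace(...).
    const int64_t tileIndex = 3;
    BufferType newType = origType.changeShape({1, 1, 1, 1}).changeMemSpace(tileIndex);
    std::cout << "placeholder buffer lives on tile " << newType.tileIndex << "\n";  // tile 3
    return 0;
}
```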
```diff
@@ -230,14 +240,17 @@ VPUIP::SwKernelOp AddSwKernelInstructionPrefetch::insertDummyKernelOpBeforeFirst

     auto cachePrefetchSwKernel = vpux::VPURT::wrapIntoTaskOp<VPUIP::SwKernelOp>(
             builder, mlir::ValueRange(), updateBarrier, newLoc, mlir::ValueRange(srcBuffers),
-            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], kernelOp.getTileIndexAttr(),
+            mlir::ValueRange(dstBuffers), nullptr, kernelNameToSymbol[kernelName], builder.getI64IntegerAttr(tileIndex),
             kernelOp.getInputStridesAttr(), kernelOp.getOutputStridesAttr());
     // The dummy kernels here are generated after ActShaveProfilingPass,
     // so we need to add skipProfiling as attribute to avoid capturing their metadata
     cachePrefetchSwKernel->setAttr("skipProfiling", mlir::UnitAttr::get(firstSwTask->getContext()));

     auto args =
-            (kernelName == "convert") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];
+            (kernelName == "convert" || kernelName == "eltwise_mul" || kernelName == "activation_cos"
+             || kernelName == "activation_sin" || kernelName == "eltwise_equal"
+             || kernelName == "eltwise_select" || kernelName == "rms_norm") ? mlir::ArrayAttr::get(moduleOp->getContext(), {}) : kernelNameToArgs[kernelName];

     vpux::VPUIP::initSwKernel(cachePrefetchSwKernel, mlir::ValueRange(srcBuffers), mlir::ValueRange(dstBuffers), args,
                               _log.nest(), /*swKernelRunOp=*/nullptr);
```
```diff
@@ -316,7 +329,7 @@ AddSwKernelInstructionPrefetch::getPrefetchCandidatesAndFirstSwTask(mlir::Operat
     }

     if (!cache.isLoaded(kernelName)) {
-        kernelsToPrefetch.push_back(std::move(kernelNameAndSize));
+        kernelsToPrefetch.push_back(std::make_tuple(kernelName, kernelSize, shvTaskIndex));
     }
     cache.loadKernel(kernelName, kernelSize);
```
```diff
@@ -394,7 +407,7 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     for (size_t shaveIdx = 0; (shaveIdx < numClusters * noOfShavesPerCluster) && (shaveIdx < kernelsToPrefetch.size());
          shaveIdx++) {
         auto clusterIdx = shaveIdx / noOfShavesPerCluster;
-        auto [kernelName, kernelSize] = kernelsToPrefetch[shaveIdx];
+        auto [kernelName, kernelSize, shvTaskIndex] = kernelsToPrefetch[shaveIdx];
         _log.trace("Prefetching kernel {0} on cluster {1}", kernelName, clusterIdx);
         auto newPrefetchKernel =
                 _useDummyKernelForInstructionPrefetch
```
```diff
@@ -410,6 +423,200 @@ std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTas
     return prefetchedKernels;
 }

+uint64_t findNextSaturationStart(size_t startIndex,
+                                 vpux::VPURT::TaskConfigVec& allTasks,
+                                 size_t numClusters,
+                                 std::map<uint64_t, size_t>& swKernelCountsCache) {
+    // Saturation is defined as 2x the number of clusters (e.g., 4 clusters -> 8 SW kernels)
+    const size_t saturationThreshold = numClusters * 2;
+
+    // Iterate through tasks strictly AFTER the startIndex
+    for (size_t i = startIndex + 1; i < allTasks.size(); ++i) {
+        uint64_t currentStartTime = static_cast<uint64_t>(allTasks[i].cycleStart);
+
+        if (swKernelCountsCache.find(currentStartTime) == swKernelCountsCache.end()) {
+            size_t swKernelCount = 0;
+            // Count all SW Kernels that start at this specific time
+            for (auto& task : allTasks) {
+                if (static_cast<uint64_t>(task.cycleStart) == currentStartTime) {
+                    if (mlir::isa<VPUIP::SwKernelOp>(task.taskOp.getInnerTaskOp())) {
+                        swKernelCount++;
+                    }
+                }
+                if (static_cast<uint64_t>(task.cycleStart) > currentStartTime) {
+                    break;
+                }
+            }
+            swKernelCountsCache[currentStartTime] = swKernelCount;
+        }
+
+        if (swKernelCountsCache[currentStartTime] >= saturationThreshold) {
+            return currentStartTime;
+        }
+    }
+
+    return std::numeric_limits<uint64_t>::max();
+}
```
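For intuition, here is a minimal self-contained sketch of the same saturation scan over a toy schedule, assuming tasks are sorted by start cycle; the `Task` struct, the `isSwKernel` flag, and the sample data are illustrative stand-ins for `VPURT::TaskConfig` and the `mlir::isa<VPUIP::SwKernelOp>` check:

```cpp
#include <cstdint>
#include <iostream>
#include <limits>
#include <map>
#include <vector>

struct Task {
    uint64_t cycleStart;
    bool isSwKernel;  // stands in for mlir::isa<VPUIP::SwKernelOp>(...)
};

// Returns the first start cycle after tasks[startIndex] at which the number of
// SW kernels starting simultaneously reaches 2x the cluster count.
uint64_t findNextSaturationStart(size_t startIndex, const std::vector<Task>& tasks,
                                 size_t numClusters) {
    const size_t saturationThreshold = numClusters * 2;
    std::map<uint64_t, size_t> counts;  // start cycle -> SW kernels starting then
    for (const Task& t : tasks) {
        if (t.isSwKernel) {
            counts[t.cycleStart]++;
        }
    }
    for (size_t i = startIndex + 1; i < tasks.size(); ++i) {
        if (counts[tasks[i].cycleStart] >= saturationThreshold) {
            return tasks[i].cycleStart;
        }
    }
    return std::numeric_limits<uint64_t>::max();
}

int main() {
    // 2 clusters -> saturation at 4 simultaneous SW kernels.
    std::vector<Task> tasks = {{0, true},   {0, true},   {100, true}, {100, true},
                               {200, true}, {200, true}, {200, true}, {200, true}};
    std::cout << findNextSaturationStart(0, tasks, 2) << "\n";  // prints 200
    return 0;
}
```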
```diff
+struct GapCandidate {
+    uint64_t lookaheadGap = 0;
+    int64_t insertionPointTaskIndex = -1;
+
+    // used for sort
+    bool operator>(const GapCandidate& other) const {
+        return lookaheadGap > other.lookaheadGap;
+    }
+};
+
+size_t getSwKernelCountAtTime(uint64_t startTime,
+                              VPURT::TaskConfigVec& allTasks) {
+    size_t count = 0;
+    for (auto& taskConfig : allTasks) {
+        if (static_cast<uint64_t>(taskConfig.cycleStart) == startTime) {
+            if (mlir::isa<VPUIP::SwKernelOp>(taskConfig.taskOp.getInnerTaskOp())) {
+                count++;
+            }
+        }
+        if (static_cast<uint64_t>(taskConfig.cycleStart) > startTime) {
+            break;
+        }
+    }
+    return count;
+}
```
```diff
+std::optional<GapCandidate> findBestInsertionGap(
+        const std::string& kernelName,
+        uint64_t targetKernelGroupStartTime,
+        VPURT::TaskConfigVec& allTasks,
+        size_t numClusters,
+        Logger& log) {
+    const int64_t targetInsertTile = 3;
+    const uint64_t GAP_THRESHOLD = 50000;
+    const size_t saturationThreshold = numClusters * 2;
+
+    // <LookaheadGapSize, GapCandidate>
+    std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
+    std::map<uint64_t, size_t> swKernelCountsCache;  // local cache
+
+    int64_t previousT3TaskIndex = -1;
+    uint64_t previousT3TaskEndTime = 0;
+
+    // find the largest gap between a non-saturated SW task and a saturated SW task / the kernel to be prefetched
+    for (size_t i = 0; i < allTasks.size(); ++i) {
```

> **Review comment:** I think this code assumes there is no eviction event between the start of the schedule and the kernel start time. This might be true for existing platforms but might not be true for future ones. If we get good results we can accept it as is, but for now let's at least put a comment here noting that we make such an assumption.

```diff
+        auto& currentTaskConfig = allTasks[i];
+        uint64_t currentTaskStartTime = static_cast<uint64_t>(currentTaskConfig.cycleStart);
+        if (currentTaskStartTime > targetKernelGroupStartTime) {
+            break;
+        }
+
+        bool isT3Task = false;
+        if (auto swOp = mlir::dyn_cast<VPUIP::SwKernelOp>(currentTaskConfig.taskOp.getInnerTaskOp())) {
+            isT3Task = (swOp.getTileIndexAttr().getInt() == targetInsertTile);
+        }
+
+        if (previousT3TaskIndex != -1 && isT3Task) {
+            auto& insertionPointTask = allTasks[previousT3TaskIndex];
+            uint64_t insertionPointStartTime = static_cast<uint64_t>(insertionPointTask.cycleStart);
+
+            size_t simultaneousSwKernels = getSwKernelCountAtTime(insertionPointStartTime, allTasks);
+
+            if (simultaneousSwKernels < saturationThreshold) {
+                uint64_t nextSaturationStart = findNextSaturationStart(previousT3TaskIndex, allTasks, numClusters, swKernelCountsCache);
+                uint64_t gapEnd = std::min(nextSaturationStart, targetKernelGroupStartTime);
+                uint64_t lookaheadGap = 0;
+                if (gapEnd > previousT3TaskEndTime) {
+                    lookaheadGap = gapEnd - previousT3TaskEndTime;
+                }
+
+                if (lookaheadGap >= GAP_THRESHOLD) {
+                    GapCandidate gap;
+                    gap.lookaheadGap = lookaheadGap;
+                    gap.insertionPointTaskIndex = previousT3TaskIndex;
+                    validGaps[lookaheadGap] = gap;
+                }
+            }
+        }
+
+        if (isT3Task) {
+            previousT3TaskIndex = static_cast<int64_t>(i);
+            previousT3TaskEndTime = currentTaskStartTime + static_cast<uint64_t>(allTasks[i].cycleCost);
+        }
+    }
+
+    if (validGaps.empty()) {
+        log.trace("Kernel '{0}': No suitable insertion point found.", kernelName);
+        return std::nullopt;
+    }
+
+    return validGaps.begin()->second;
+}
```
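Ordering `validGaps` with `std::greater<uint64_t>` is what makes "take the largest gap" a one-liner: with a descending comparator, `begin()` points at the largest key. A compact sketch of that selection idiom (gap sizes and task indices are made up):

```cpp
#include <cstdint>
#include <functional>
#include <iostream>
#include <map>
#include <optional>

struct GapCandidate {
    uint64_t lookaheadGap = 0;
    int64_t insertionPointTaskIndex = -1;
};

int main() {
    // Keys are sorted descending, so begin() is the largest lookahead gap.
    std::map<uint64_t, GapCandidate, std::greater<uint64_t>> validGaps;
    validGaps[60000] = {60000, 12};
    validGaps[90000] = {90000, 7};
    validGaps[75000] = {75000, 20};

    std::optional<GapCandidate> best;
    if (!validGaps.empty()) {
        best = validGaps.begin()->second;
    }
    if (best.has_value()) {
        std::cout << "best gap " << best->lookaheadGap << " cycles at task index "
                  << best->insertionPointTaskIndex << "\n";  // 90000 cycles, task 7
    }
    return 0;
}
```

One consequence of keying by gap size is that two candidates with identical `lookaheadGap` values collapse into a single map entry, with the later write winning; note that `GapCandidate::operator>` is not what orders the map, only the `std::greater<uint64_t>` key comparator is.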
```diff
+std::vector<VPUIP::SwKernelOp> AddSwKernelInstructionPrefetch::insertPrefetchTasksDuringExec(
+        mlir::Operation* funcOp, AddSwKernelInstructionPrefetch::SwKernelPrefetchVec& kernelsToPrefetch,
+        VPURT::TaskConfigVec& allTasks) {
+    auto moduleOp = funcOp->getParentOfType<mlir::ModuleOp>();
+    const auto numClusters = getNumTiles(moduleOp);
+    VPUX_THROW_WHEN(numClusters == 0, "Number of tiles is zero.");
+
+    std::vector<VPUIP::SwKernelOp> prefetchedKernels{};
+
+    for (auto& kernelInfo : kernelsToPrefetch) {
+        std::string kernelName = std::get<0>(kernelInfo);
+        size_t firstAppearanceIndex = std::get<2>(kernelInfo);
+
+        if (firstAppearanceIndex >= allTasks.size()) {
+            _log.trace("Skipping kernel '{0}': Invalid firstAppearanceIndex {1}", kernelName, firstAppearanceIndex);
+            continue;
+        }
+        if (kernelNameToOps.count(kernelName) == 0) {
+            _log.trace("Skipping kernel '{0}': Missing dependencies (kernelNameToOps)", kernelName);
+            continue;
+        }
+
+        uint64_t targetKernelGroupStartTime = static_cast<uint64_t>(allTasks[firstAppearanceIndex].cycleStart);
+
+        auto bestGapOpt = findBestInsertionGap(kernelName, targetKernelGroupStartTime,
+                                               allTasks, numClusters, _log);
+
+        if (!bestGapOpt.has_value()) {
+            _log.trace("Kernel '{0}': No valid gap found.", kernelName);
+            continue;
+        }
+
+        GapCandidate bestGap = bestGapOpt.value();
+        _log.trace("Kernel '{0}': Found best gap of {1} cycles. Inserting relative to task {2}.",
+                   kernelName, bestGap.lookaheadGap, bestGap.insertionPointTaskIndex);
+        std::cout << "[Prefetch DEBUG] Kernel: " << kernelName
+                  << " Found best gap of " << bestGap.lookaheadGap
+                  << " cycles. Inserting relative to task " << bestGap.insertionPointTaskIndex << std::endl;
+
+        if (bestGap.insertionPointTaskIndex < 0 || static_cast<size_t>(bestGap.insertionPointTaskIndex) >= allTasks.size()) {
+            _log.error("Kernel '{0}': Invalid insertionPointTaskIndex {1}. Skipping insertion.",
+                       kernelName, bestGap.insertionPointTaskIndex);
+            continue;
+        }
+
+        auto insertBeforeOp = allTasks[bestGap.insertionPointTaskIndex].taskOp;
+        size_t dynamicExecTile = _dynamicPrefetchTileCounter % numClusters;
+        _dynamicPrefetchTileCounter++;
+
+        auto newPrefetchKernel = insertDummyKernelOpBeforeFirstKernelTask(
+                insertBeforeOp,
+                mlir::ValueRange(),
+                dynamicExecTile,
+                kernelName);
+
+        prefetchedKernels.push_back(newPrefetchKernel);
+    }
+
+    return prefetchedKernels;
+}
```
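Mid-schedule prefetch kernels are spread across tiles round-robin via `_dynamicPrefetchTileCounter % numClusters`, rather than being pinned to one cluster. A minimal sketch of the resulting distribution, assuming four clusters:

```cpp
#include <cstddef>
#include <iostream>

int main() {
    const size_t numClusters = 4;
    size_t dynamicPrefetchTileCounter = 0;  // mirrors the _dynamicPrefetchTileCounter member

    // Each successive prefetch kernel lands on the next tile: 0, 1, 2, 3, 0, 1, ...
    for (int kernel = 0; kernel < 6; ++kernel) {
        const size_t dynamicExecTile = dynamicPrefetchTileCounter % numClusters;
        dynamicPrefetchTileCounter++;
        std::cout << "prefetch kernel " << kernel << " -> tile " << dynamicExecTile << "\n";
    }
    return 0;
}
```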
```diff
 void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto funcOp = getOperation();
     if (!hasVPUSWModule(funcOp)) {

@@ -444,18 +651,24 @@ void AddSwKernelInstructionPrefetch::safeRunOnFunc() {
     auto [kernelsToPrefetch, firstShvTaskIndex] = getPrefetchCandidatesAndFirstSwTask(funcOp, allTasks);
     auto [firstShaveTaskInIR, bestUpdateBarrier, bestReleaseCycle] =
             getFirstSwTaskInIRAndBestUpdateBarrier(infSim, allTasks, firstShvTaskIndex);
-    if (firstShaveTaskInIR == nullptr || kernelsToPrefetch.empty()) {
-        return;
-    }
-    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);

     if (_useDummyKernelForInstructionPrefetch) {
         auto memSpaceAttr = mlir::SymbolRefAttr::get(module->getContext(), stringifyEnum(VPU::MemoryKind::CMX_NN));
         auto dummyKernelResMem = config::getDummySwKernelsForInstructionPrefetchReservedMemory(module, memSpaceAttr);
         VPUX_THROW_WHEN(dummyKernelResMem == nullptr,
                         "Cannot find DummySWKernelsForInstructionPrefetchReservedMemory!");
     }
-    auto newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    if (kernelsToPrefetch.empty()) {
+        return;
+    }
+    _log.trace("insertPoint: {0}, bestReleaseCycle: {1}", *firstShaveTaskInIR, bestReleaseCycle);
+
+    std::vector<VPUIP::SwKernelOp> newPrefetchKernels;
+    if (firstShaveTaskInIR == nullptr) {
+        newPrefetchKernels = insertPrefetchTasksDuringExec(funcOp, kernelsToPrefetch, allTasks);
```

> **Review comment:** It would be great if we could make the logic common between prefetching at the start and prefetching in the middle of the schedule, since the former is just a special case of the latter (at least conceptually). Though I understand this is probably done to limit the impact on other platforms. If results are good we can follow up on this internally.

```diff
+    } else {
+        newPrefetchKernels = insertPrefetchTasks(funcOp, kernelsToPrefetch, firstShaveTaskInIR, bestUpdateBarrier);
+    }

     // Update dependencies for cache handling operations to meet requirements of control graph split.
     auto& barrierInfo = getAnalysis<BarrierInfo>();
```