Skip to content

Commit 0f2d602

Browse files
rakesroymangupta
authored and committed
SWDEV-521011 - Allow max stack size as per ISA.
Change-Id: Ie8ce7b6ca6bedbaa127c82b643c13002c43f5537
1 parent f378843 commit 0f2d602

4 files changed

Lines changed: 39 additions & 9 deletions

File tree

rocclr/device/device.cpp

Lines changed: 14 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -748,6 +748,18 @@ bool Device::ValidateHsail() {
748748
return true;
749749
}
750750

751+
size_t GetMaxStackSize(const std::string& procName) {
752+
if (procName.find("gfx9") != std::string::npos || procName.find("gfx8")
753+
!= std::string::npos) {
754+
return kMaxStackSize9X;
755+
} else if (procName.find("gfx11") != std::string::npos || procName.find("gfx10")
756+
!= std::string::npos) {
757+
return kMaxStackSize11X;
758+
} else {
759+
return kMaxStackSize12X;
760+
}
761+
}
762+
751763
bool Device::create(const Isa &isa) {
752764
assert(!vaCacheAccess_ && !vaCacheMap_);
753765
isa_ = &isa;
@@ -764,6 +776,7 @@ bool Device::create(const Isa &isa) {
764776
if (!amd::IS_HIP) {
765777
stack_size_ = 16 * Ki;
766778
}
779+
maxStackSize_ = GetMaxStackSize(isa_->processorName());
767780
return true;
768781
}
769782

@@ -939,7 +952,7 @@ bool Device::disableP2P(amd::Device* ptrDev) {
939952
}
940953

941954
bool Device::UpdateStackSize(uint64_t stackSize) {
942-
if (stackSize > kMaxStackSize) {
955+
if (stackSize > maxStackSize_) {
943956
return false;
944957
}
945958
stack_size_ = amd::alignUp(stackSize, 16);

rocclr/device/device.hpp

Lines changed: 16 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,16 @@ enum MemRangeAttribute : uint32_t {
126126
constexpr int CpuDeviceId = static_cast<int>(-1);
127127
constexpr int InvalidDeviceId = static_cast<int>(-2);
128128

129+
// Max scratch size is device dependent.
// Wave sizes used as the divisor when deriving the per-device stack limit.
constexpr size_t kWave32 = 32;
constexpr size_t kWave64 = 64;
// Width, in bits, of the scratch size field used for each limit.
constexpr size_t kScratchBits12X = 18;
constexpr size_t kScratchBits9X = 15;
// Amount subtracted from every limit.
// NOTE(review): presumably space reserved for the compiler -- confirm.
constexpr size_t kCompilerRequired = 64;

// Computes the stack limit for a scratch field of `scratchBits` bits and a
// wave of `waveSize` lanes: ((2^scratchBits - 1) * 256 / waveSize) - kCompilerRequired.
// The whole expression is evaluated in size_t so that the shift and the
// multiplication cannot overflow `int` if a wider field is added later.
// NOTE(review): 256 is presumably the granularity of the scratch size field --
// confirm against the ISA documentation.
constexpr size_t MaxStackSizeFor(size_t scratchBits, size_t waveSize) {
  return (((size_t{1} << scratchBits) - 1) * 256 / waveSize) - kCompilerRequired;
}

constexpr size_t kMaxStackSize12X = MaxStackSizeFor(kScratchBits12X, kWave32);
// The 11X limit deliberately pairs the 9X field width with the wave32 divisor.
constexpr size_t kMaxStackSize11X = MaxStackSizeFor(kScratchBits9X, kWave32);
constexpr size_t kMaxStackSize9X = MaxStackSizeFor(kScratchBits9X, kWave64);

// The 9X limit is relied upon as the smallest of the three.
static_assert(kMaxStackSize9X < kMaxStackSize11X && kMaxStackSize11X < kMaxStackSize12X,
              "stack size limits are expected to grow from 9X to 11X to 12X");
138+
129139
enum class ExternalSemaphoreHandleType : uint32_t {
130140
OpaqueFd = 1, // Handle is an opaque file descriptor
131141
OpaqueWin32 = 2, // Handle is an opaque shared NT handle
@@ -1653,11 +1663,9 @@ class Device : public RuntimeObject {
16531663
static constexpr size_t kMGInfoSizePerDevice = kMGSyncDataSize + sizeof(MGSyncInfo);
16541664
static constexpr size_t kSGInfoSize = kMGSyncDataSize;
16551665

1656-
// Amount of space used by each wave is in units of 256 dwords.
1657-
// As per COMPUTE_TMPRING_SIZE.WAVE_SIZE 24:12
1658-
// The field size supports a range of 0->(2M-256) dwords per wave64.
1659-
// Per lane this works out to 131056 bytes or 128K - 16
1660-
static constexpr size_t kMaxStackSize = ((128 * Ki) - 16);
1666+
// Max Scratch size is based on ISA and thus per device.
1667+
// The default value is the GFX9 limit, which is the smallest among the supported devices.
1668+
size_t maxStackSize_ = kMaxStackSize9X;
16611669

16621670
typedef std::list<CommandQueue*> CommandQueues;
16631671

@@ -2132,6 +2140,9 @@ class Device : public RuntimeObject {
21322140
return nullptr;
21332141
}
21342142

2143+
//! Returns the maximum stack size allowed for the device
2144+
size_t MaxStackSize() const { return maxStackSize_; }
2145+
21352146
#if defined(__clang__)
21362147
#if __has_feature(address_sanitizer)
21372148
virtual device::UriLocator* createUriLocator() const = 0;

rocclr/device/pal/palvirtual.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2751,6 +2751,13 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
27512751
if ((hsaKernel.workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
27522752
privateMemSize = std::max<uint32_t>(static_cast<uint32_t>(device().StackSize()),
27532753
hsaKernel.workGroupInfo()->scratchRegs_ * sizeof(uint32_t)) ;
2754+
// Validate that privateMemSize does not exceed the max allowed for the device.
2755+
size_t maxStackSize = device().MaxStackSize();
2756+
if (privateMemSize > maxStackSize) {
2757+
LogError("Scratch size (%zu) exceeds max allowed (%zu) for kernel : %s", privateMemSize,
2758+
maxStackSize, hsaKernel.name().c_str());
2759+
return false;
2760+
}
27542761
}
27552762

27562763
// Set up the dispatch information

rocclr/device/rocm/rocvirtual.cpp

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -3472,9 +3472,8 @@ bool VirtualGPU::submitKernelInternal(const amd::NDRangeContainer& sizes,
34723472
dispatchPacket.private_segment_size = devKernel->workGroupInfo()->privateMemSize_;
34733473

34743474
if ((devKernel->workGroupInfo()->usedStackSize_ & 0x1) == 0x1) {
3475-
dispatchPacket.private_segment_size = std::min<uint64_t>(
3476-
std::max<uint64_t>(dev().StackSize(), dispatchPacket.private_segment_size),
3477-
Device::kMaxStackSize);
3475+
dispatchPacket.private_segment_size = std::max<uint64_t>(dev().StackSize(),
3476+
dispatchPacket.private_segment_size);
34783477
}
34793478

34803479
// Pass the header accordingly

0 commit comments

Comments
 (0)