Skip to content

Commit f15904e

Browse files
authored
[offload][OpenMP] Add strict flag for blocks and threads in kernel arguments (llvm#199483)
Until now, strict behavior in the number of threads and blocks has been applied only when the kernel is in bare mode. When this mode is enabled, the values passed in UserNumBlocks and UserThreadLimit are not adjusted and are the definitive values used to launch the kernel. This commit detaches the strictness from the kernel mode. This is going to be used by the kernel replay tool. Additionally, it starts clearing the path for the upcoming OpenMP dims modifier, used to configure multidimensional teams and leagues, which will include strictness choices for teams and threads. All the bare kernels must indicate strict behavior. Asserts are added to check this condition.
1 parent e3f80fe commit f15904e

14 files changed

Lines changed: 301 additions & 73 deletions

File tree

clang/lib/CodeGen/CGOpenMPRuntime.cpp

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -10982,7 +10982,8 @@ static void emitTargetCallKernelLaunch(
1098210982

1098310983
llvm::OpenMPIRBuilder::TargetKernelArgs Args(
1098410984
NumTargetItems, RTArgs, NumIterations, NumTeams, NumThreads,
10985-
DynCGroupMem, HasNoWait, DynCGroupMemFallback);
10985+
DynCGroupMem, HasNoWait, /*StrictBlocksAndThreads=*/IsBare,
10986+
DynCGroupMemFallback);
1098610987

1098710988
llvm::OpenMPIRBuilder::InsertPointTy AfterIP =
1098810989
cantFail(OMPRuntime->getOMPBuilder().emitKernelLaunch(

clang/test/OpenMP/target_teams_codegen.cpp

Lines changed: 42 additions & 42 deletions
Large diffs are not rendered by default.

llvm/include/llvm/Frontend/OpenMP/OMPIRBuilder.h

Lines changed: 6 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2771,6 +2771,9 @@ class OpenMPIRBuilder {
27712771
Value *DynCGroupMem = nullptr;
27722772
/// True if the kernel has 'no wait' clause.
27732773
bool HasNoWait = false;
2774+
/// True if the kernel strictly requires the number of blocks and threads
2775+
/// above to run.
2776+
bool StrictBlocksAndThreads = false;
27742777
/// The fallback mechanism for the shared memory.
27752778
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback =
27762779
omp::OMPDynGroupprivateFallbackType::Abort;
@@ -2780,12 +2783,13 @@ class OpenMPIRBuilder {
27802783
TargetKernelArgs(unsigned NumTargetItems, TargetDataRTArgs RTArgs,
27812784
Value *NumIterations, ArrayRef<Value *> NumTeams,
27822785
ArrayRef<Value *> NumThreads, Value *DynCGroupMem,
2783-
bool HasNoWait,
2786+
bool HasNoWait, bool StrictBlocksAndThreads,
27842787
omp::OMPDynGroupprivateFallbackType DynCGroupMemFallback)
27852788
: NumTargetItems(NumTargetItems), RTArgs(RTArgs),
27862789
NumIterations(NumIterations), NumTeams(NumTeams),
27872790
NumThreads(NumThreads), DynCGroupMem(DynCGroupMem),
2788-
HasNoWait(HasNoWait), DynCGroupMemFallback(DynCGroupMemFallback) {}
2791+
HasNoWait(HasNoWait), StrictBlocksAndThreads(StrictBlocksAndThreads),
2792+
DynCGroupMemFallback(DynCGroupMemFallback) {}
27892793
};
27902794

27912795
/// Create the kernel args vector used by emitTargetKernel. This function

llvm/lib/Frontend/OpenMP/OMPIRBuilder.cpp

Lines changed: 6 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -647,7 +647,12 @@ void OpenMPIRBuilder::getKernelArgsVector(TargetKernelArgs &KernelArgs,
647647
Value *DynCGroupMemFallbackFlag =
648648
Builder.getInt64(static_cast<uint64_t>(KernelArgs.DynCGroupMemFallback));
649649
DynCGroupMemFallbackFlag = Builder.CreateShl(DynCGroupMemFallbackFlag, 2);
650+
651+
Value *StrictFlag = Builder.getInt64(KernelArgs.StrictBlocksAndThreads);
652+
StrictFlag = Builder.CreateShl(StrictFlag, 6);
653+
650654
Value *Flags = Builder.CreateOr(HasNoWaitFlag, DynCGroupMemFallbackFlag);
655+
Flags = Builder.CreateOr(Flags, StrictFlag);
651656

652657
assert(!KernelArgs.NumTeams.empty() && !KernelArgs.NumThreads.empty());
653658

@@ -9783,7 +9788,7 @@ static void emitTargetCall(
97839788

97849789
KArgs = OpenMPIRBuilder::TargetKernelArgs(
97859790
NumTargetItems, RTArgs, TripCount, NumTeamsC, NumThreadsC, DynCGroupMem,
9786-
HasNoWait, DynCGroupMemFallback);
9791+
HasNoWait, /*StrictBlocksAndThreads=*/false, DynCGroupMemFallback);
97879792

97889793
// Assume no error was returned because TaskBodyCB and
97899794
// EmitTargetCallFallbackCB don't produce any.

offload/include/Shared/APITypes.h

Lines changed: 4 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -106,8 +106,10 @@ struct KernelArgsTy {
106106
uint64_t DynCGroupMemFallback : 2; // The fallback for dynamic cgroup mem.
107107
uint64_t Cooperative : 1; // Was this kernel spawned as cooperative.
108108
uint64_t IsPtrArgs : 1; // Arguments are laid out as an array of pointers.
109-
uint64_t Unused : 58;
110-
} Flags = {0, 0, 0, 0, 0, 0};
109+
uint64_t StrictBlocksAndThreads
110+
: 1; // The user-requested number of blocks and threads are strict.
111+
uint64_t Unused : 57;
112+
} Flags = {0, 0, 0, 0, 0, 0, 0};
111113
// User-requested number of blocks (for x,y,z dimension).
112114
uint32_t UserNumBlocks[3] = {0, 0, 0};
113115
// User-requested number of threads (for x,y,z dimension).

offload/liboffload/src/OffloadImpl.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -1124,6 +1124,7 @@ Error olLaunchKernel_impl(ol_queue_handle_t Queue, ol_device_handle_t Device,
11241124
LaunchArgs.UserThreadLimit[1] = LaunchSizeArgs->GroupSize.y;
11251125
LaunchArgs.UserThreadLimit[2] = LaunchSizeArgs->GroupSize.z;
11261126
LaunchArgs.DynCGroupMem = LaunchSizeArgs->DynSharedMemory;
1127+
LaunchArgs.Flags.StrictBlocksAndThreads = true;
11271128

11281129
while (Properties && Properties->type != OL_KERNEL_LAUNCH_PROP_TYPE_NONE) {
11291130
switch (Properties->type) {

offload/libomptarget/KernelLanguage/API.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -68,6 +68,7 @@ unsigned llvmLaunchKernel(const void *func, dim3 gridDim, dim3 blockDim,
6868
Args.UserThreadLimit[2] = blockDim.z;
6969
Args.ArgPtrs = reinterpret_cast<void **>(args);
7070
Args.Flags.IsCUDA = true;
71+
Args.Flags.StrictBlocksAndThreads = true;
7172
return __tgt_target_kernel(nullptr, 0, gridDim.x, blockDim.x, func, &Args);
7273
}
7374
}

offload/libomptarget/omptarget.cpp

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -2481,6 +2481,7 @@ int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
24812481
KernelArgs.UserThreadLimit[1] = 1;
24822482
KernelArgs.UserThreadLimit[2] = 1;
24832483
KernelArgs.DynCGroupMem = SharedMemorySize;
2484+
KernelArgs.Flags.StrictBlocksAndThreads = true;
24842485

24852486
KernelExtraArgsTy KernelExtraArgs{};
24862487
KernelExtraArgs.ReplayOutcome = ReplayOutcome;

offload/plugins-nextgen/common/include/PluginInterface.h

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -529,16 +529,15 @@ struct GenericKernelTy {
529529
/// Get the effective number of threads for the kernel based on the
530530
/// user-defined number of threads.
531531
uint32_t getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
532-
uint32_t UserThreadLimit[3]) const;
532+
uint32_t UserThreadLimit) const;
533533

534534
/// Get the effective number of blocks for the kernel based on the
535535
/// user-defined number of blocks and the loop trip count.
536536
/// The number of threads \p NumThreads can be adjusted by this method.
537537
/// \p IsNumThreadsFromUser is true is \p NumThreads is defined by user via
538538
/// thread_limit clause.
539539
uint32_t getEffectiveNumBlocks(GenericDeviceTy &GenericDevice,
540-
uint32_t UserNumBlocks[3],
541-
uint64_t LoopTripCount,
540+
uint32_t UserNumBlocks, uint64_t LoopTripCount,
542541
uint32_t &EffectiveNumThreads,
543542
bool IsNumThreadsFromUser) const;
544543

offload/plugins-nextgen/common/src/PluginInterface.cpp

Lines changed: 27 additions & 22 deletions
Original file line numberDiff line numberDiff line change
@@ -249,15 +249,27 @@ Error GenericKernelTy::launch(GenericDeviceTy &GenericDevice, void **ArgPtrs,
249249
uint32_t EffectiveNumBlocks[3] = {KernelArgs.UserNumBlocks[0],
250250
KernelArgs.UserNumBlocks[1],
251251
KernelArgs.UserNumBlocks[2]};
252-
if (!isBareMode()) {
253-
assert(
254-
EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
255-
EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
256-
"Non-bare mode should only use the first thread and block dimensions");
252+
253+
// Multidimensional is only supported with bare mode for now.
254+
assert(isBareMode() ||
255+
EffectiveNumThreads[1] == 1 && EffectiveNumThreads[2] == 1 &&
256+
EffectiveNumBlocks[1] == 1 && EffectiveNumBlocks[2] == 1 &&
257+
"Non-bare mode should only use the first thread and block "
258+
"dimensions");
259+
260+
assert(!KernelArgs.Flags.StrictBlocksAndThreads ||
261+
EffectiveNumThreads[0] > 0 && EffectiveNumThreads[1] > 0 &&
262+
EffectiveNumThreads[2] > 0 && EffectiveNumBlocks[0] > 0 &&
263+
EffectiveNumBlocks[1] > 0 && EffectiveNumBlocks[2] > 0 &&
264+
"Strict requires number of blocks and threads greater than zero");
265+
266+
// Calculate or adjust the effective number of threads and blocks if needed.
267+
if (!KernelArgs.Flags.StrictBlocksAndThreads) {
257268
EffectiveNumThreads[0] =
258-
getEffectiveNumThreads(GenericDevice, EffectiveNumThreads);
269+
getEffectiveNumThreads(GenericDevice, EffectiveNumThreads[0]);
270+
259271
EffectiveNumBlocks[0] = getEffectiveNumBlocks(
260-
GenericDevice, EffectiveNumBlocks, KernelArgs.Tripcount,
272+
GenericDevice, EffectiveNumBlocks[0], KernelArgs.Tripcount,
261273
EffectiveNumThreads[0], KernelArgs.UserThreadLimit[0] > 0);
262274
}
263275

@@ -364,34 +376,27 @@ GenericKernelTy::prepareArgs(GenericDeviceTy &GenericDevice, void **ArgPtrs,
364376

365377
uint32_t
366378
GenericKernelTy::getEffectiveNumThreads(GenericDeviceTy &GenericDevice,
367-
uint32_t UserThreadLimit[3]) const {
379+
uint32_t UserThreadLimit) const {
368380
assert(!isBareMode() && "bare kernel should not call this function");
369381

370-
assert(UserThreadLimit[1] == 1 && UserThreadLimit[2] == 1 &&
371-
"Multi dimensional launch not supported yet.");
382+
if (UserThreadLimit > 0 && isGenericMode())
383+
UserThreadLimit += GenericDevice.getWarpSize();
372384

373-
if (UserThreadLimit[0] > 0 && isGenericMode())
374-
UserThreadLimit[0] += GenericDevice.getWarpSize();
375-
376-
return std::min(MaxNumThreads, (UserThreadLimit[0] > 0)
377-
? UserThreadLimit[0]
378-
: PreferredNumThreads);
385+
return std::min(MaxNumThreads, (UserThreadLimit > 0) ? UserThreadLimit
386+
: PreferredNumThreads);
379387
}
380388

381389
uint32_t GenericKernelTy::getEffectiveNumBlocks(
382-
GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks[3],
390+
GenericDeviceTy &GenericDevice, uint32_t UserNumBlocks,
383391
uint64_t LoopTripCount, uint32_t &EffectiveNumThreads,
384392
bool IsNumThreadsFromUser) const {
385393
assert(!isBareMode() && "bare kernel should not call this function");
386394

387-
assert(UserNumBlocks[1] == 1 && UserNumBlocks[2] == 1 &&
388-
"Multi dimensional launch not supported yet.");
389-
390-
if (UserNumBlocks[0] > 0) {
395+
if (UserNumBlocks > 0) {
391396
// TODO: We need to honor any value and consequently allow more than the
392397
// block limit. For this we might need to start multiple kernels or let the
393398
// blocks start again until the requested number has been started.
394-
return std::min(UserNumBlocks[0], GenericDevice.getBlockLimit());
399+
return std::min(UserNumBlocks, GenericDevice.getBlockLimit());
395400
}
396401

397402
// Return the number of blocks required to cover the loop iterations.

0 commit comments

Comments
 (0)