Skip to content

Commit 12a4adf

Browse files
authored
[offload] Fix kernel record/replay and add extensible mechanism (llvm#190588)
This commit fixes kernel record/replay on both AMD and CUDA devices. It also reorganizes the record/replay code, moving it into separate files, and makes it extensible so that other record formats can be supported in the future. The environment variables that control recording have also been changed.
1 parent 4a24c68 commit 12a4adf

File tree

21 files changed

+1284
-644
lines changed

21 files changed

+1284
-644
lines changed

offload/cmake/OpenMPTesting.cmake

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -12,6 +12,7 @@ endif()
1212
set(OFFLOAD_NOT_EXECUTABLE ${LLVM_TOOLS_BINARY_DIR}/not)
1313
set(OFFLOAD_DEVICE_INFO_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-offload-device-info)
1414
set(OFFLOAD_TBLGEN_EXECUTABLE ${LLVM_RUNTIME_OUTPUT_INTDIR}/offload-tblgen)
15+
set(OMP_KERNEL_REPLAY ${LLVM_RUNTIME_OUTPUT_INTDIR}/llvm-omp-kernel-replay)
1516

1617
# Set the information that we know.
1718
set(OPENMP_TEST_COMPILER_ID "Clang")

offload/include/omptarget.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -428,17 +428,20 @@ void __tgt_target_nowait_query(void **AsyncHandle);
428428
/// device memory.
429429
int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId, void *HostPtr,
430430
void *DeviceMemory, int64_t DeviceMemorySize,
431-
void **TgtArgs, ptrdiff_t *TgtOffsets,
432-
int32_t NumArgs, int32_t NumTeams,
433-
int32_t ThreadLimit, uint64_t LoopTripCount);
431+
const llvm::offloading::EntryTy *Globals,
432+
int32_t NumGlobals, void **TgtArgs,
433+
ptrdiff_t *TgtOffsets, int32_t NumArgs,
434+
int32_t NumTeams, int32_t ThreadLimit,
435+
uint32_t SharedMemorySize,
436+
uint64_t LoopTripCount);
434437

435438
void __tgt_set_info_flag(uint32_t);
436439

437440
int __tgt_print_device_info(int64_t DeviceId);
438441

439442
int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
440443
void *VAddr, bool IsRecord, bool SaveOutput,
441-
uint64_t &ReqPtrArgOffset);
444+
bool EmitReport, const char *OutputDirPath);
442445

443446
// Registers a callback for the RPC server. Expects this function type.
444447
// unsigned callback(rpc::Server::Port *Port, unsigned NumLanes). See the RPC

offload/libomptarget/device.cpp

Lines changed: 17 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -89,12 +89,23 @@ llvm::Error DeviceTy::init() {
8989
// Enables recording kernels if set.
9090
BoolEnvar OMPX_RecordKernel("LIBOMPTARGET_RECORD", false);
9191
if (OMPX_RecordKernel) {
92-
// Enables saving the device memory kernel output post execution if set.
93-
BoolEnvar OMPX_ReplaySaveOutput("LIBOMPTARGET_RR_SAVE_OUTPUT", false);
94-
95-
uint64_t ReqPtrArgOffset;
96-
RTL->initialize_record_replay(RTLDeviceID, 0, nullptr, true,
97-
OMPX_ReplaySaveOutput, ReqPtrArgOffset);
92+
BoolEnvar OMPX_RecordOutput("LIBOMPTARGET_RECORD_OUTPUT", true);
93+
Int64Envar OMPX_RecordMemSize("LIBOMPTARGET_RECORD_MEMSIZE",
94+
8 * 1024 * 1024 * 1024ULL);
95+
Int32Envar OMPX_RecordDevice("LIBOMPTARGET_RECORD_DEVICE", 0);
96+
StringEnvar OMPX_RecordOutputDir("LIBOMPTARGET_RECORD_DIR", "");
97+
BoolEnvar OMPX_EmitRecordReport("LIBOMPTARGET_RECORD_REPORT", false);
98+
if (OMPX_RecordDevice != RTLDeviceID)
99+
return llvm::Error::success();
100+
101+
Ret = RTL->initialize_record_replay(
102+
RTLDeviceID, OMPX_RecordMemSize, nullptr,
103+
/*IsRecord=*/true, /*IsNative=*/true, OMPX_RecordOutput,
104+
OMPX_EmitRecordReport, OMPX_RecordOutputDir.get().c_str());
105+
if (Ret != OFFLOAD_SUCCESS)
106+
return error::createOffloadError(error::ErrorCode::BACKEND_FAILURE,
107+
"failed to initialize RR in device %d\n",
108+
DeviceID);
98109
}
99110

100111
return llvm::Error::success();

offload/libomptarget/interface.cpp

Lines changed: 32 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -473,25 +473,32 @@ EXTERN int __tgt_target_kernel(ident_t *Loc, int64_t DeviceId, int32_t NumTeams,
473473
/// Activates the record replay mechanism.
474474
/// \param DeviceId The device identifier to execute the target region.
475475
/// \param MemorySize The number of bytes to be (pre-)allocated
476-
/// by the bump allocator
476+
/// by the record replay allocator.
477477
/// \param IsRecord Activates the record replay mechanism in
478-
/// 'record' mode or 'replay' mode.
478+
/// 'record' or 'replay' mode.
479479
/// \param SaveOutput Store the device memory after kernel
480-
/// execution on persistent storage
480+
/// execution on persistent storage.
481+
/// \param EmitReport Emit a summary report after the recording.
482+
/// \param OutputDirPath The output directory where the record replay files
483+
/// should be stored. An empty string or nullptr indicates the current working
484+
/// directory should be used.
481485
EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
482486
void *VAddr, bool IsRecord,
483-
bool SaveOutput,
484-
uint64_t &ReqPtrArgOffset) {
487+
bool SaveOutput, bool EmitReport,
488+
const char *OutputDirPath) {
485489
assert(PM && "Runtime not initialized");
486490
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
487491
auto DeviceOrErr = PM->getDevice(DeviceId);
488492
if (!DeviceOrErr)
489493
FATAL_MESSAGE(DeviceId, "%s", toString(DeviceOrErr.takeError()).c_str());
490494

491-
[[maybe_unused]] int Rc = target_activate_rr(
492-
*DeviceOrErr, MemorySize, VAddr, IsRecord, SaveOutput, ReqPtrArgOffset);
493-
assert(Rc == OFFLOAD_SUCCESS &&
494-
"__tgt_activate_record_replay unexpected failure!");
495+
int Rc = target_activate_rr(*DeviceOrErr, MemorySize, VAddr, IsRecord,
496+
SaveOutput, EmitReport, OutputDirPath);
497+
if (Rc != OFFLOAD_SUCCESS) {
498+
ODBG(ODT_Interface) << "Record replay failed to activate in device "
499+
<< DeviceId;
500+
return OMP_TGT_FAIL;
501+
}
495502
return OMP_TGT_SUCCESS;
496503
}
497504

@@ -512,12 +519,12 @@ EXTERN int __tgt_activate_record_replay(int64_t DeviceId, uint64_t MemorySize,
512519
/// execution.
513520
/// \param LoopTripCount The pre-recorded value of the loop tripcount, if any.
514521
/// \return OMP_TGT_SUCCESS on success, OMP_TGT_FAIL on failure.
515-
EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
516-
void *HostPtr, void *DeviceMemory,
517-
int64_t DeviceMemorySize, void **TgtArgs,
518-
ptrdiff_t *TgtOffsets, int32_t NumArgs,
519-
int32_t NumTeams, int32_t ThreadLimit,
520-
uint64_t LoopTripCount) {
522+
EXTERN int __tgt_target_kernel_replay(
523+
ident_t *Loc, int64_t DeviceId, void *HostPtr, void *DeviceMemory,
524+
int64_t DeviceMemorySize, const llvm::offloading::EntryTy *Globals,
525+
int32_t NumGlobals, void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
526+
int32_t NumTeams, int32_t ThreadLimit, uint32_t SharedMemorySize,
527+
uint64_t LoopTripCount) {
521528
assert(PM && "Runtime not initialized");
522529
OMPT_IF_BUILT(ReturnAddressSetterRAII RA(__builtin_return_address(0)));
523530
if (checkDevice(DeviceId, Loc)) {
@@ -534,14 +541,18 @@ EXTERN int __tgt_target_kernel_replay(ident_t *Loc, int64_t DeviceId,
534541
/*CodePtr=*/OMPT_GET_RETURN_ADDRESS);)
535542

536543
AsyncInfoTy AsyncInfo(*DeviceOrErr);
537-
int Rc = target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory,
538-
DeviceMemorySize, TgtArgs, TgtOffsets, NumArgs,
539-
NumTeams, ThreadLimit, LoopTripCount, AsyncInfo);
544+
int Rc =
545+
target_replay(Loc, *DeviceOrErr, HostPtr, DeviceMemory, DeviceMemorySize,
546+
Globals, NumGlobals, TgtArgs, TgtOffsets, NumArgs, NumTeams,
547+
ThreadLimit, SharedMemorySize, LoopTripCount, AsyncInfo);
548+
540549
if (Rc == OFFLOAD_SUCCESS)
541550
Rc = AsyncInfo.synchronize();
542-
handleTargetOutcome(Rc == OFFLOAD_SUCCESS, Loc);
543-
assert(Rc == OFFLOAD_SUCCESS &&
544-
"__tgt_target_kernel_replay unexpected failure!");
551+
552+
if (Rc != OFFLOAD_SUCCESS) {
553+
ODBG(ODT_Interface) << "Kernel replay failed in device " << DeviceId;
554+
return OMP_TGT_FAIL;
555+
}
545556
return OMP_TGT_SUCCESS;
546557
}
547558

offload/libomptarget/omptarget.cpp

Lines changed: 70 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -2379,67 +2379,105 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
23792379
/// and informing the record-replayer of whether to store the output
23802380
/// in some file.
23812381
int target_activate_rr(DeviceTy &Device, uint64_t MemorySize, void *VAddr,
2382-
bool IsRecord, bool SaveOutput,
2383-
uint64_t &ReqPtrArgOffset) {
2384-
return Device.RTL->initialize_record_replay(Device.DeviceID, MemorySize,
2385-
VAddr, IsRecord, SaveOutput,
2386-
ReqPtrArgOffset);
2382+
bool IsRecord, bool SaveOutput, bool EmitReport,
2383+
const char *OutputDirPath) {
2384+
return Device.RTL->initialize_record_replay(
2385+
Device.DeviceID, MemorySize, VAddr, IsRecord,
2386+
/*IsNative=*/true, SaveOutput, EmitReport, OutputDirPath);
23872387
}
23882388

23892389
/// Executes a kernel using pre-recorded information for loading to
23902390
/// device memory to launch the target kernel with the pre-recorded
23912391
/// configuration.
23922392
int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
2393-
void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
2394-
ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
2395-
int32_t ThreadLimit, uint64_t LoopTripCount,
2393+
void *DeviceMemory, int64_t DeviceMemorySize,
2394+
const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
2395+
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
2396+
int32_t NumTeams, int32_t ThreadLimit,
2397+
uint32_t SharedMemorySize, uint64_t LoopTripCount,
23962398
AsyncInfoTy &AsyncInfo) {
23972399
int32_t DeviceId = Device.DeviceID;
2398-
TableMap *TM = getTableMap(HostPtr);
2399-
// Fail if the table map fails to find the target kernel pointer for the
2400-
// provided host pointer.
2401-
if (!TM) {
2402-
REPORT() << "Host ptr " << HostPtr
2403-
<< " does not have a matching target pointer.";
2404-
return OFFLOAD_FAIL;
2400+
int32_t NumSymbols = NumGlobals + 1;
2401+
2402+
struct SymbolDataTy {
2403+
void *DevPtr = nullptr;
2404+
TableMap *TM = nullptr;
2405+
__tgt_target_table *TargetTable = nullptr;
2406+
};
2407+
SmallVector<SymbolDataTy> Symbols(NumSymbols);
2408+
2409+
for (int32_t I = 0; I < NumSymbols; ++I) {
2410+
// The first symbol is the kernel entry.
2411+
void *SymbolHostPtr = (I == 0) ? HostPtr : Globals[I - 1].Address;
2412+
2413+
// Get the table map for each symbol.
2414+
Symbols[I].TM = getTableMap(SymbolHostPtr);
2415+
if (!Symbols[I].TM) {
2416+
REPORT() << "Host pointer " << SymbolHostPtr
2417+
<< " does not have a matching target pointer.";
2418+
return OFFLOAD_FAIL;
2419+
}
24052420
}
24062421

2407-
// Retrieve the target table of offloading entries.
2408-
__tgt_target_table *TargetTable = nullptr;
2422+
// Retrieve the target table for each symbol.
24092423
{
24102424
std::lock_guard<std::mutex> TrlTblLock(PM->TrlTblMtx);
24112425
assert(TM->Table->TargetsTable.size() > (size_t)DeviceId &&
24122426
"Not expecting a device ID outside the table's bounds!");
2413-
TargetTable = TM->Table->TargetsTable[DeviceId];
2427+
for (auto &S : Symbols) {
2428+
S.TargetTable = S.TM->Table->TargetsTable[DeviceId];
2429+
assert(S.TargetTable && "Global data has not been mapped\n");
2430+
}
24142431
}
2415-
assert(TargetTable && "Global data has not been mapped\n");
24162432

2417-
// Retrieve the target kernel pointer, allocate and store the recorded device
2418-
// memory data, and launch device execution.
2419-
void *TgtEntryPtr = TargetTable->EntriesBegin[TM->Index].Address;
2420-
ODBG(ODT_Kernel) << "Launching target execution "
2421-
<< TargetTable->EntriesBegin[TM->Index].SymbolName
2422-
<< " with pointer " << TgtEntryPtr << " (index=" << TM->Index
2423-
<< ").";
2433+
// Retrieve the device pointers for each symbol.
2434+
for (auto &S : Symbols)
2435+
S.DevPtr = S.TargetTable->EntriesBegin[S.TM->Index].Address;
2436+
2437+
// Initialize the device memory of each global.
2438+
for (int32_t I = 0; I < NumGlobals; ++I) {
2439+
assert(Globals[I].AuxAddr && "Global has no AuxAddr.");
2440+
2441+
// Initialize the value of the global in the device.
2442+
int Ret = Device.submitData(Symbols[I + 1].DevPtr, Globals[I].AuxAddr,
2443+
Globals[I].Size, AsyncInfo);
2444+
if (Ret != OFFLOAD_SUCCESS) {
2445+
REPORT() << "Failed to submit data to a global.";
2446+
return OFFLOAD_FAIL;
2447+
}
2448+
}
24242449

24252450
void *TgtPtr = Device.allocData(DeviceMemorySize, /*HstPtr=*/nullptr,
24262451
TARGET_ALLOC_DEFAULT);
2427-
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
2452+
if (!TgtPtr) {
2453+
REPORT() << "Failed to allocate device memory.";
2454+
return OFFLOAD_FAIL;
2455+
}
2456+
2457+
int Ret =
2458+
Device.submitData(TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
2459+
if (Ret != OFFLOAD_SUCCESS) {
2460+
REPORT() << "Failed to submit data to a global.";
2461+
return OFFLOAD_FAIL;
2462+
}
24282463

24292464
KernelArgsTy KernelArgs{};
24302465
KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
24312466
KernelArgs.NumArgs = NumArgs;
24322467
KernelArgs.Tripcount = LoopTripCount;
24332468
KernelArgs.NumTeams[0] = NumTeams;
2469+
KernelArgs.NumTeams[1] = 1;
2470+
KernelArgs.NumTeams[2] = 1;
24342471
KernelArgs.ThreadLimit[0] = ThreadLimit;
2472+
KernelArgs.ThreadLimit[1] = 1;
2473+
KernelArgs.ThreadLimit[2] = 1;
2474+
KernelArgs.DynCGroupMem = SharedMemorySize;
24352475

2436-
int Ret = Device.launchKernel(TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
2437-
AsyncInfo);
2438-
2476+
Ret = Device.launchKernel(Symbols[0].DevPtr, TgtArgs, TgtOffsets, KernelArgs,
2477+
AsyncInfo);
24392478
if (Ret != OFFLOAD_SUCCESS) {
2440-
REPORT() << "Executing target region abort target.";
2479+
REPORT() << "Failed to launch kernel replay.";
24412480
return OFFLOAD_FAIL;
24422481
}
2443-
24442482
return OFFLOAD_SUCCESS;
24452483
}

offload/libomptarget/private.h

Lines changed: 7 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -27,14 +27,17 @@ extern int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
2727
KernelArgsTy &KernelArgs, AsyncInfoTy &AsyncInfo);
2828

2929
extern int target_activate_rr(DeviceTy &Device, uint64_t MemorySize,
30-
void *ReqAddr, bool isRecord, bool SaveOutput,
31-
uint64_t &ReqPtrArgOffset);
30+
void *ReqAddr, bool IsRecord, bool SaveOutput,
31+
bool EmitReport, const char *OutputDirPath);
3232

3333
extern int target_replay(ident_t *Loc, DeviceTy &Device, void *HostPtr,
3434
void *DeviceMemory, int64_t DeviceMemorySize,
35-
void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
35+
const llvm::offloading::EntryTy *Globals,
36+
int32_t NumGlobals, void **TgtArgs,
37+
ptrdiff_t *TgtOffsets, int32_t NumArgs,
3638
int32_t NumTeams, int32_t ThreadLimit,
37-
uint64_t LoopTripCount, AsyncInfoTy &AsyncInfo);
39+
uint32_t SharedMemorySize, uint64_t LoopTripCount,
40+
AsyncInfoTy &AsyncInfo);
3841

3942
extern void handleTargetOutcome(bool Success, ident_t *Loc);
4043

offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.cpp

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -75,6 +75,13 @@ DLWRAP(hsa_amd_profiling_set_profiler_enabled, 2)
7575
DLWRAP(hsa_code_object_reader_create_from_memory, 3)
7676
DLWRAP(hsa_code_object_reader_destroy, 1)
7777
DLWRAP(hsa_executable_load_agent_code_object, 5)
78+
DLWRAP(hsa_amd_vmem_address_reserve, 4)
79+
DLWRAP(hsa_amd_vmem_address_free, 2)
80+
DLWRAP(hsa_amd_vmem_handle_create, 5)
81+
DLWRAP(hsa_amd_vmem_handle_release, 1)
82+
DLWRAP(hsa_amd_vmem_map, 5)
83+
DLWRAP(hsa_amd_vmem_unmap, 2)
84+
DLWRAP(hsa_amd_vmem_set_access, 4)
7885

7986
DLWRAP_FINALIZE()
8087

offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa.h

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -116,6 +116,13 @@ typedef struct hsa_isa_s {
116116
uint64_t handle;
117117
} hsa_isa_t;
118118

119+
typedef enum {
120+
HSA_ACCESS_PERMISSION_NONE = 0,
121+
HSA_ACCESS_PERMISSION_RO = 1,
122+
HSA_ACCESS_PERMISSION_WO = 2,
123+
HSA_ACCESS_PERMISSION_RW = 3
124+
} hsa_access_permission_t;
125+
119126
hsa_status_t hsa_system_get_info(hsa_system_info_t attribute, void *value);
120127

121128
hsa_status_t hsa_agent_get_info(hsa_agent_t agent, hsa_agent_info_t attribute,

offload/plugins-nextgen/amdgpu/dynamic_hsa/hsa_ext_amd.h

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -163,6 +163,20 @@ typedef struct hsa_amd_pointer_info_s {
163163
size_t sizeInBytes;
164164
} hsa_amd_pointer_info_t;
165165

166+
typedef enum {
167+
MEMORY_TYPE_NONE,
168+
MEMORY_TYPE_PINNED,
169+
} hsa_amd_memory_type_t;
170+
171+
typedef struct hsa_amd_vmem_alloc_handle_s {
172+
uint64_t handle;
173+
} hsa_amd_vmem_alloc_handle_t;
174+
175+
typedef struct hsa_amd_memory_access_desc_s {
176+
hsa_access_permission_t permissions;
177+
hsa_agent_t agent_handle;
178+
} hsa_amd_memory_access_desc_t;
179+
166180
hsa_status_t hsa_amd_pointer_info(const void* ptr,
167181
hsa_amd_pointer_info_t* info,
168182
void* (*alloc)(size_t),
@@ -181,6 +195,29 @@ hsa_amd_profiling_get_dispatch_time(hsa_agent_t agent, hsa_signal_t signal,
181195
hsa_status_t hsa_amd_profiling_set_profiler_enabled(hsa_queue_t *queue,
182196
int enable);
183197

198+
hsa_status_t hsa_amd_vmem_address_reserve(void **va, size_t size,
199+
uint64_t address, uint64_t flags);
200+
201+
hsa_status_t hsa_amd_vmem_address_free(void *va, size_t size);
202+
203+
hsa_status_t
204+
hsa_amd_vmem_handle_create(hsa_amd_memory_pool_t pool, size_t size,
205+
hsa_amd_memory_type_t type, uint64_t flags,
206+
hsa_amd_vmem_alloc_handle_t *memory_handle);
207+
208+
hsa_status_t
209+
hsa_amd_vmem_handle_release(hsa_amd_vmem_alloc_handle_t memory_handle);
210+
211+
hsa_status_t hsa_amd_vmem_map(void *va, size_t size, size_t in_offset,
212+
hsa_amd_vmem_alloc_handle_t memory_handle,
213+
uint64_t flags);
214+
215+
hsa_status_t hsa_amd_vmem_unmap(void *va, size_t size);
216+
217+
hsa_status_t hsa_amd_vmem_set_access(void *va, size_t size,
218+
const hsa_amd_memory_access_desc_t *desc,
219+
size_t desc_cnt);
220+
184221
#ifdef __cplusplus
185222
}
186223
#endif

0 commit comments

Comments
 (0)