@@ -2379,67 +2379,105 @@ int target(ident_t *Loc, DeviceTy &Device, void *HostPtr,
/// and informing the record-replayer of whether to store the output
/// in some file.
23812381int target_activate_rr (DeviceTy &Device, uint64_t MemorySize, void *VAddr,
2382- bool IsRecord, bool SaveOutput,
2383- uint64_t &ReqPtrArgOffset ) {
2384- return Device.RTL ->initialize_record_replay (Device. DeviceID , MemorySize,
2385- VAddr, IsRecord, SaveOutput ,
2386- ReqPtrArgOffset );
2382+ bool IsRecord, bool SaveOutput, bool EmitReport,
2383+ const char *OutputDirPath ) {
2384+ return Device.RTL ->initialize_record_replay (
2385+ Device. DeviceID , MemorySize, VAddr, IsRecord,
2386+ /* IsNative= */ true , SaveOutput, EmitReport, OutputDirPath );
23872387}
23882388
23892389// / Executes a kernel using pre-recorded information for loading to
23902390// / device memory to launch the target kernel with the pre-recorded
23912391// / configuration.
23922392int target_replay (ident_t *Loc, DeviceTy &Device, void *HostPtr,
2393- void *DeviceMemory, int64_t DeviceMemorySize, void **TgtArgs,
2394- ptrdiff_t *TgtOffsets, int32_t NumArgs, int32_t NumTeams,
2395- int32_t ThreadLimit, uint64_t LoopTripCount,
2393+ void *DeviceMemory, int64_t DeviceMemorySize,
2394+ const llvm::offloading::EntryTy *Globals, int32_t NumGlobals,
2395+ void **TgtArgs, ptrdiff_t *TgtOffsets, int32_t NumArgs,
2396+ int32_t NumTeams, int32_t ThreadLimit,
2397+ uint32_t SharedMemorySize, uint64_t LoopTripCount,
23962398 AsyncInfoTy &AsyncInfo) {
23972399 int32_t DeviceId = Device.DeviceID ;
2398- TableMap *TM = getTableMap (HostPtr);
2399- // Fail if the table map fails to find the target kernel pointer for the
2400- // provided host pointer.
2401- if (!TM) {
2402- REPORT () << " Host ptr " << HostPtr
2403- << " does not have a matching target pointer." ;
2404- return OFFLOAD_FAIL;
2400+ int32_t NumSymbols = NumGlobals + 1 ;
2401+
2402+ struct SymbolDataTy {
2403+ void *DevPtr = nullptr ;
2404+ TableMap *TM = nullptr ;
2405+ __tgt_target_table *TargetTable = nullptr ;
2406+ };
2407+ SmallVector<SymbolDataTy> Symbols (NumSymbols);
2408+
2409+ for (int32_t I = 0 ; I < NumSymbols; ++I) {
2410+ // The first symbol is the kernel entry.
2411+ void *SymbolHostPtr = (I == 0 ) ? HostPtr : Globals[I - 1 ].Address ;
2412+
2413+ // Get the table map for each symbol.
2414+ Symbols[I].TM = getTableMap (SymbolHostPtr);
2415+ if (!Symbols[I].TM ) {
2416+ REPORT () << " Host pointer " << SymbolHostPtr
2417+ << " does not have a matching target pointer." ;
2418+ return OFFLOAD_FAIL;
2419+ }
24052420 }
24062421
2407- // Retrieve the target table of offloading entries.
2408- __tgt_target_table *TargetTable = nullptr ;
2422+ // Retrieve the target table for each symbol.
24092423 {
24102424 std::lock_guard<std::mutex> TrlTblLock (PM->TrlTblMtx );
24112425 assert (TM->Table ->TargetsTable .size () > (size_t )DeviceId &&
24122426 " Not expecting a device ID outside the table's bounds!" );
2413- TargetTable = TM->Table ->TargetsTable [DeviceId];
2427+ for (auto &S : Symbols) {
2428+ S.TargetTable = S.TM ->Table ->TargetsTable [DeviceId];
2429+ assert (S.TargetTable && " Global data has not been mapped\n " );
2430+ }
24142431 }
2415- assert (TargetTable && " Global data has not been mapped\n " );
24162432
2417- // Retrieve the target kernel pointer, allocate and store the recorded device
2418- // memory data, and launch device execution.
2419- void *TgtEntryPtr = TargetTable->EntriesBegin [TM->Index ].Address ;
2420- ODBG (ODT_Kernel) << " Launching target execution "
2421- << TargetTable->EntriesBegin [TM->Index ].SymbolName
2422- << " with pointer " << TgtEntryPtr << " (index=" << TM->Index
2423- << " )." ;
2433+ // Retrieve the device pointers for each symbol.
2434+ for (auto &S : Symbols)
2435+ S.DevPtr = S.TargetTable ->EntriesBegin [S.TM ->Index ].Address ;
2436+
2437+ // Initialize the device memory of each global.
2438+ for (int32_t I = 0 ; I < NumGlobals; ++I) {
2439+ assert (Globals[I].AuxAddr && " Global has no AuxAddr." );
2440+
2441+ // Initialize the value of the global in the device.
2442+ int Ret = Device.submitData (Symbols[I + 1 ].DevPtr , Globals[I].AuxAddr ,
2443+ Globals[I].Size , AsyncInfo);
2444+ if (Ret != OFFLOAD_SUCCESS) {
2445+ REPORT () << " Failed to submit data to a global." ;
2446+ return OFFLOAD_FAIL;
2447+ }
2448+ }
24242449
24252450 void *TgtPtr = Device.allocData (DeviceMemorySize, /* HstPtr=*/ nullptr ,
24262451 TARGET_ALLOC_DEFAULT);
2427- Device.submitData (TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
2452+ if (!TgtPtr) {
2453+ REPORT () << " Failed to allocate device memory." ;
2454+ return OFFLOAD_FAIL;
2455+ }
2456+
2457+ int Ret =
2458+ Device.submitData (TgtPtr, DeviceMemory, DeviceMemorySize, AsyncInfo);
2459+ if (Ret != OFFLOAD_SUCCESS) {
2460+ REPORT () << " Failed to submit data to a global." ;
2461+ return OFFLOAD_FAIL;
2462+ }
24282463
24292464 KernelArgsTy KernelArgs{};
24302465 KernelArgs.Version = OMP_KERNEL_ARG_VERSION;
24312466 KernelArgs.NumArgs = NumArgs;
24322467 KernelArgs.Tripcount = LoopTripCount;
24332468 KernelArgs.NumTeams [0 ] = NumTeams;
2469+ KernelArgs.NumTeams [1 ] = 1 ;
2470+ KernelArgs.NumTeams [2 ] = 1 ;
24342471 KernelArgs.ThreadLimit [0 ] = ThreadLimit;
2472+ KernelArgs.ThreadLimit [1 ] = 1 ;
2473+ KernelArgs.ThreadLimit [2 ] = 1 ;
2474+ KernelArgs.DynCGroupMem = SharedMemorySize;
24352475
2436- int Ret = Device.launchKernel (TgtEntryPtr, TgtArgs, TgtOffsets, KernelArgs,
2437- AsyncInfo);
2438-
2476+ Ret = Device.launchKernel (Symbols[0 ].DevPtr , TgtArgs, TgtOffsets, KernelArgs,
2477+ AsyncInfo);
24392478 if (Ret != OFFLOAD_SUCCESS) {
2440- REPORT () << " Executing target region abort target ." ;
2479+ REPORT () << " Failed to launch kernel replay ." ;
24412480 return OFFLOAD_FAIL;
24422481 }
2443-
24442482 return OFFLOAD_SUCCESS;
24452483}
0 commit comments