Skip to content

Commit 8fcc7e9

Browse files
committed
Remove two unsafe amrex::launch functions
1 parent 16dce9c commit 8fcc7e9

11 files changed

+27
-89
lines changed

Src/AmrCore/AMReX_TagBox.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -447,8 +447,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
447447
const int ncells = fai.fabbox().numPts();
448448
const char* tags = (*this)[fai].dataPtr();
449449
#ifdef AMREX_USE_SYCL
450-
amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
451-
Gpu::Device::gpuStream(),
450+
amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
451+
Gpu::Device::gpuStream(),
452452
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
453453
{
454454
int bid = h.item->get_group_linear_id();
@@ -467,7 +467,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
467467
}
468468
});
469469
#else
470-
amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
470+
amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
471471
[=] AMREX_GPU_DEVICE () noexcept
472472
{
473473
int bid = blockIdx.x;
@@ -525,7 +525,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
525525
const int ncells = bx.numPts();
526526
const char* tags = (*this)[fai].dataPtr();
527527
#ifdef AMREX_USE_SYCL
528-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
528+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
529529
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
530530
{
531531
int bid = h.item->get_group(0);
@@ -553,7 +553,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
553553
}
554554
});
555555
#else
556-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
556+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
557557
[=] AMREX_GPU_DEVICE () noexcept
558558
{
559559
int bid = blockIdx.x;

Src/Base/AMReX_BaseFabUtility.H

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
3838
if (Gpu::inLaunchRegion()) {
3939
BoxIndexer indexer(box);
4040
const auto ntotcells = std::uint64_t(box.numPts());
41-
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
41+
constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
4242
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
4343
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
4444
auto nblocks = int(nblocks_long);
4545
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
4646
T* p = (T*)aos_fab.dataPtr();
4747
#ifdef AMREX_USE_SYCL
48-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
48+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
4949
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
5050
{
5151
auto const icell = std::uint64_t(handler.globalIdx());
@@ -66,7 +66,7 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
6666
}
6767
});
6868
#else
69-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
69+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
7070
[=] AMREX_GPU_DEVICE () noexcept
7171
{
7272
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;

Src/Base/AMReX_BlockMutex.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {
99
amrex::ignore_unused(state,N);
1010
amrex::Abort("xxxxx SYCL todo");
1111
#else
12-
amrex::launch((N+255)/256, 256, Gpu::gpuStream(),
12+
amrex::launch<256>((N+255)/256, Gpu::gpuStream(),
1313
[=] AMREX_GPU_DEVICE () noexcept
1414
{
1515
int i = threadIdx.x + blockIdx.x*blockDim.x;

Src/Base/AMReX_GpuContainers.H

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -433,11 +433,11 @@ namespace amrex::Gpu {
433433
unsigned long long, unsigned int>;
434434
constexpr Long nU = sizeof(T) / sizeof(U);
435435
auto pu = reinterpret_cast<U*>(p);
436-
int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
436+
constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
437437
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
438438
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
439439
#ifdef AMREX_USE_SYCL
440-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
440+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
441441
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
442442
{
443443
Long i = handler.globalIdx();
@@ -458,7 +458,7 @@ namespace amrex::Gpu {
458458
}
459459
});
460460
#else
461-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
461+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
462462
[=] AMREX_GPU_DEVICE () noexcept
463463
{
464464
Long blockDimx = blockDim.x;

Src/Base/AMReX_GpuLaunchFunctsG.H

Lines changed: 0 additions & 61 deletions
Original file line numberDiff line numberDiff line change
@@ -159,51 +159,6 @@ void single_task (gpuStream_t stream, L const& f) noexcept
159159
}
160160
}
161161

162-
template<typename L>
163-
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
164-
gpuStream_t stream, L const& f) noexcept
165-
{
166-
const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
167-
const std::size_t shared_mem_numull = (shared_mem_bytes+sizeof(unsigned long long)-1)
168-
/ sizeof(unsigned long long);
169-
auto& q = *(stream.queue);
170-
try {
171-
q.submit([&] (sycl::handler& h) {
172-
sycl::local_accessor<unsigned long long>
173-
shared_data(sycl::range<1>(shared_mem_numull), h);
174-
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
175-
sycl::range<1>(nthreads_per_block)),
176-
[=] (sycl::nd_item<1> item)
177-
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
178-
{
179-
f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
180-
});
181-
});
182-
} catch (sycl::exception const& ex) {
183-
amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
184-
}
185-
}
186-
187-
template<typename L>
188-
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L const& f) noexcept
189-
{
190-
const auto nthreads_total = std::size_t(nthreads_per_block) * nblocks;
191-
auto& q = *(stream.queue);
192-
try {
193-
q.submit([&] (sycl::handler& h) {
194-
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
195-
sycl::range<1>(nthreads_per_block)),
196-
[=] (sycl::nd_item<1> item)
197-
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
198-
{
199-
f(item);
200-
});
201-
});
202-
} catch (sycl::exception const& ex) {
203-
amrex::Abort(std::string("launch: ")+ex.what()+"!!!!!");
204-
}
205-
}
206-
207162
template <int MT, typename L>
208163
void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
209164
L const& f) noexcept
@@ -731,22 +686,6 @@ void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
731686
AMREX_GPU_ERROR_CHECK();
732687
}
733688

734-
template<typename L>
735-
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736-
gpuStream_t stream, L const& f) noexcept
737-
{
738-
AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
739-
AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
740-
stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
741-
AMREX_GPU_ERROR_CHECK();
742-
}
743-
744-
template<typename L>
745-
void launch (int nblocks, int nthreads_per_block, gpuStream_t stream, L&& f) noexcept
746-
{
747-
launch(nblocks, nthreads_per_block, 0, stream, std::forward<L>(f));
748-
}
749-
750689
template<int MT, typename T, typename L, std::enable_if_t<std::is_integral_v<T>,int> FOO = 0>
751690
void launch (T const& n, L const& f) noexcept
752691
{

Src/Base/AMReX_MultiFabUtil.cpp

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -846,10 +846,10 @@ namespace amrex
846846
int nblocks = n2dblocks * b.length(direction);
847847
#ifdef AMREX_USE_SYCL
848848
std::size_t shared_mem_byte = sizeof(Real)*Gpu::Device::warp_size;
849-
amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, shared_mem_byte, Gpu::gpuStream(),
849+
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, shared_mem_byte, Gpu::gpuStream(),
850850
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
851851
#else
852-
amrex::launch(nblocks, AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
852+
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks, Gpu::gpuStream(),
853853
[=] AMREX_GPU_DEVICE () noexcept
854854
#endif
855855
{

Src/Base/AMReX_Scan.H

Lines changed: 7 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -209,7 +209,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
209209
T* blocksum_p = (T*)(dp + nbytes_blockresult);
210210
T* totalsum_p = (T*)(dp + nbytes_blockresult + nbytes_blocksum);
211211

212-
amrex::launch(nblocks, nthreads, sm, stream,
212+
amrex::launch<nthreads>(nblocks, sm, stream,
213213
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
214214
{
215215
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -289,7 +289,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
289289
}
290290
});
291291

292-
amrex::launch(1, nthreads, sm, stream,
292+
amrex::launch<nthreads>(1, sm, stream,
293293
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
294294
{
295295
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -355,7 +355,7 @@ T PrefixSum_mp (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum)
355355
}
356356
});
357357

358-
amrex::launch(nblocks, nthreads, 0, stream,
358+
amrex::launch<nthreads>(nblocks, 0, stream,
359359
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
360360
{
361361
int threadIdxx = gh.item->get_local_id(0);
@@ -429,7 +429,7 @@ T PrefixSum (N n, FIN && fin, FOUT && fout, TYPE type, RetSum a_ret_sum = retSum
429429
}
430430
});
431431

432-
amrex::launch(nblocks, nthreads, sm, stream,
432+
amrex::launch<nthreads>(nblocks, sm, stream,
433433
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
434434
{
435435
sycl::sub_group const& sg = gh.item->get_sub_group();
@@ -672,7 +672,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
672672
(reinterpret_cast<OrderedBlockId::id_type*>(dp + nbytes_tile_state));
673673

674674
// Init ScanTileState on device
675-
amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
675+
amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
676676
{
677677
auto& scan_tile_state = const_cast<ScanTileState&>(tile_state);
678678
auto& scan_bid = const_cast<OrderedBlockId&>(ordered_block_id);
@@ -813,7 +813,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
813813

814814
if (nblocks > 1) {
815815
// Init ScanTileState on device
816-
amrex::launch((nblocks+nthreads-1)/nthreads, nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
816+
amrex::launch<nthreads>((nblocks+nthreads-1)/nthreads, 0, stream, [=] AMREX_GPU_DEVICE ()
817817
{
818818
const_cast<ScanTileState&>(tile_state).InitializeStatus(nblocks);
819819
});
@@ -957,7 +957,7 @@ T PrefixSum (N n, FIN const& fin, FOUT const& fout, TYPE, RetSum a_ret_sum = ret
957957
}
958958
});
959959

960-
amrex::launch(nblocks, nthreads, sm, stream,
960+
amrex::launch<nthreads>(nblocks, sm, stream,
961961
[=] AMREX_GPU_DEVICE () noexcept
962962
{
963963
int lane = threadIdx.x % Gpu::Device::warp_size;

Src/LinearSolvers/MLMG/AMReX_MLEBTensorOp_bc.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -85,7 +85,7 @@ MLEBTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel,
8585

8686
#ifdef AMREX_USE_GPU
8787
if (Gpu::inLaunchRegion()) {
88-
amrex::launch(12, 64, Gpu::gpuStream(),
88+
amrex::launch<64>(12, Gpu::gpuStream(),
8989
#ifdef AMREX_USE_SYCL
9090
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
9191
{

Src/LinearSolvers/MLMG/AMReX_MLTensorOp.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -408,7 +408,7 @@ MLTensorOp::applyBCTensor (int amrlev, int mglev, MultiFab& vel, // NOLINT(reada
408408
// only edge vals used in 3D stencil
409409
#ifdef AMREX_USE_GPU
410410
if (Gpu::inLaunchRegion()) {
411-
amrex::launch(12, 64, Gpu::gpuStream(),
411+
amrex::launch<64>(12, Gpu::gpuStream(),
412412
#ifdef AMREX_USE_SYCL
413413
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item)
414414
{

Src/LinearSolvers/OpenBC/AMReX_OpenBC.H

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -116,7 +116,6 @@ private:
116116
Gpu::DeviceVector<openbc::MomTag> m_momtags_d;
117117
Gpu::PinnedVector<int> m_ngpublocks_h;
118118
Gpu::DeviceVector<int> m_ngpublocks_d;
119-
int m_nthreads_momtag;
120119
#endif
121120

122121
int m_nblocks_local = 0;

Src/LinearSolvers/OpenBC/AMReX_OpenBC.cpp

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -94,7 +94,6 @@ void OpenBCSolver::define (const Vector<Geometry>& a_geom,
9494
m_momtags_d.resize(ntags);
9595
Gpu::copyAsync(Gpu::hostToDevice, m_momtags_h.begin(), m_momtags_h.end(), m_momtags_d.begin());
9696

97-
m_nthreads_momtag = (m_coarsen_ratio == 8) ? 64 : 128;
9897
int ntotgpublocks = 0;
9998
m_ngpublocks_h.reserve(ntags+1);
10099
for (auto const& tag : m_momtags_h) {
@@ -396,14 +395,15 @@ void OpenBCSolver::compute_moments (Gpu::DeviceVector<openbc::Moments>& moments)
396395
openbc::Moments* pm = moments.data();
397396
openbc::MomTag const* ptag = m_momtags_d.data();
398397
int const* pnblks = m_ngpublocks_d.data();
399-
std::size_t shared_mem_bytes = m_nthreads_momtag * sizeof(openbc::Moments::array_type);
398+
constexpr int nthreads = 128;
399+
constexpr std::size_t shared_mem_bytes = nthreads * sizeof(openbc::Moments::array_type);
400400

401401
#ifdef AMREX_USE_SYCL
402402
amrex::ignore_unused(problo,probhi,dx,crse_ratio,ntags,pm,ptag,pnblks,
403403
shared_mem_bytes);
404404
amrex::Abort("xxxx SYCL todo: openbc compute_moments");
405405
#else
406-
amrex::launch(m_ngpublocks_h.back(), m_nthreads_momtag, shared_mem_bytes, Gpu::gpuStream(),
406+
amrex::launch<nthreads>(m_ngpublocks_h.back(), shared_mem_bytes, Gpu::gpuStream(),
407407
[=] AMREX_GPU_DEVICE () noexcept
408408
{
409409
Gpu::SharedMemory<openbc::Moments::array_type> gsm;
@@ -745,7 +745,7 @@ void OpenBCSolver::compute_potential (Gpu::DeviceVector<openbc::Moments> const&
745745
lenxy,lenx);
746746
amrex::Abort("xxxxx SYCL todo: openbc compute_potential");
747747
#else
748-
amrex::launch(b.numPts(), AMREX_GPU_MAX_THREADS, Gpu::gpuStream(),
748+
amrex::launch<AMREX_GPU_MAX_THREADS>(b.numPts(), Gpu::gpuStream(),
749749
[=] AMREX_GPU_DEVICE () noexcept
750750
{
751751
int icell = blockIdx.x;

0 commit comments

Comments
 (0)