Skip to content

Commit 463bdf4

Browse files
authored
Match the dimensions of reqd_work_group_size to submitted nd_range (#4002)
## Summary According to sycl spec, the number of arguments of reqd_work_group_size must match the dimensions of the work-group used to invoke the kernel. ## Additional background https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:kernel.attributes ## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate
1 parent 27b399a commit 463bdf4

File tree

3 files changed

+19
-19
lines changed

3 files changed

+19
-19
lines changed

Src/Base/AMReX_GpuLaunchFunctsG.H

Lines changed: 16 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -79,7 +79,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
7979
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
8080
sycl::range<1>(MT)),
8181
[=] (sycl::nd_item<1> item)
82-
[[sycl::reqd_work_group_size(1,1,MT)]]
82+
[[sycl::reqd_work_group_size(MT)]]
8383
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
8484
{
8585
f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
@@ -100,7 +100,7 @@ void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
100100
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
101101
sycl::range<1>(MT)),
102102
[=] (sycl::nd_item<1> item)
103-
[[sycl::reqd_work_group_size(1,1,MT)]]
103+
[[sycl::reqd_work_group_size(MT)]]
104104
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
105105
{
106106
f(item);
@@ -124,7 +124,7 @@ void launch (T const& n, L const& f) noexcept
124124
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
125125
sycl::range<1>(nthreads_per_block)),
126126
[=] (sycl::nd_item<1> item)
127-
[[sycl::reqd_work_group_size(1,1,MT)]]
127+
[[sycl::reqd_work_group_size(MT)]]
128128
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
129129
{
130130
for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
@@ -203,7 +203,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
203203
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
204204
sycl::range<1>(nthreads_per_block)),
205205
[=] (sycl::nd_item<1> item)
206-
[[sycl::reqd_work_group_size(1,1,MT)]]
206+
[[sycl::reqd_work_group_size(MT)]]
207207
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
208208
{
209209
for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -220,7 +220,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
220220
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
221221
sycl::range<1>(nthreads_per_block)),
222222
[=] (sycl::nd_item<1> item)
223-
[[sycl::reqd_work_group_size(1,1,MT)]]
223+
[[sycl::reqd_work_group_size(MT)]]
224224
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
225225
{
226226
for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -252,7 +252,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
252252
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
253253
sycl::range<1>(nthreads_per_block)),
254254
[=] (sycl::nd_item<1> item)
255-
[[sycl::reqd_work_group_size(1,1,MT)]]
255+
[[sycl::reqd_work_group_size(MT)]]
256256
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
257257
{
258258
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -270,7 +270,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
270270
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
271271
sycl::range<1>(nthreads_per_block)),
272272
[=] (sycl::nd_item<1> item)
273-
[[sycl::reqd_work_group_size(1,1,MT)]]
273+
[[sycl::reqd_work_group_size(MT)]]
274274
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
275275
{
276276
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -303,7 +303,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
303303
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
304304
sycl::range<1>(nthreads_per_block)),
305305
[=] (sycl::nd_item<1> item)
306-
[[sycl::reqd_work_group_size(1,1,MT)]]
306+
[[sycl::reqd_work_group_size(MT)]]
307307
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
308308
{
309309
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -322,7 +322,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
322322
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
323323
sycl::range<1>(nthreads_per_block)),
324324
[=] (sycl::nd_item<1> item)
325-
[[sycl::reqd_work_group_size(1,1,MT)]]
325+
[[sycl::reqd_work_group_size(MT)]]
326326
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
327327
{
328328
for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -353,7 +353,7 @@ void ParallelForRNG (T n, L const& f) noexcept
353353
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
354354
sycl::range<1>(nthreads_per_block)),
355355
[=] (sycl::nd_item<1> item)
356-
[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
356+
[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
357357
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
358358
{
359359
auto const tid = item.get_global_id(0);
@@ -387,7 +387,7 @@ void ParallelForRNG (Box const& box, L const& f) noexcept
387387
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
388388
sycl::range<1>(nthreads_per_block)),
389389
[=] (sycl::nd_item<1> item)
390-
[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
390+
[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
391391
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
392392
{
393393
auto const tid = item.get_global_id(0);
@@ -423,7 +423,7 @@ void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept
423423
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
424424
sycl::range<1>(nthreads_per_block)),
425425
[=] (sycl::nd_item<1> item)
426-
[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
426+
[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
427427
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
428428
{
429429
auto const tid = item.get_global_id(0);
@@ -460,7 +460,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
460460
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
461461
sycl::range<1>(nthreads_per_block)),
462462
[=] (sycl::nd_item<1> item)
463-
[[sycl::reqd_work_group_size(1,1,MT)]]
463+
[[sycl::reqd_work_group_size(MT)]]
464464
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
465465
{
466466
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -500,7 +500,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
500500
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
501501
sycl::range<1>(nthreads_per_block)),
502502
[=] (sycl::nd_item<1> item)
503-
[[sycl::reqd_work_group_size(1,1,MT)]]
503+
[[sycl::reqd_work_group_size(MT)]]
504504
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
505505
{
506506
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
@@ -545,7 +545,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
545545
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
546546
sycl::range<1>(nthreads_per_block)),
547547
[=] (sycl::nd_item<1> item)
548-
[[sycl::reqd_work_group_size(1,1,MT)]]
548+
[[sycl::reqd_work_group_size(MT)]]
549549
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
550550
{
551551
auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -593,7 +593,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
593593
h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
594594
sycl::range<1>(nthreads_per_block)),
595595
[=] (sycl::nd_item<1> item)
596-
[[sycl::reqd_work_group_size(1,1,MT)]]
596+
[[sycl::reqd_work_group_size(MT)]]
597597
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
598598
{
599599
auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});

Src/Base/AMReX_GpuLaunchMacrosG.nolint.H

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,7 @@
1616
amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
1717
sycl::range<1>(amrex_i_nthreads_per_block)), \
1818
[=] (sycl::nd_item<1> amrex_i_item) \
19-
[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
19+
[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
2020
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
2121
{ \
2222
for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
@@ -266,7 +266,7 @@
266266
amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
267267
sycl::range<1>(amrex_i_nthreads_per_block)), \
268268
[=] (sycl::nd_item<1> amrex_i_item) \
269-
[[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
269+
[[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
270270
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
271271
{ \
272272
for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \

Src/Base/AMReX_TagParallelFor.H

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -211,7 +211,7 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f)
211211
amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
212212
#ifdef AMREX_USE_SYCL
213213
[=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
214-
[[sycl::reqd_work_group_size(1,1,nthreads)]]
214+
[[sycl::reqd_work_group_size(nthreads)]]
215215
[[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
216216
#else
217217
[=] AMREX_GPU_DEVICE () noexcept

0 commit comments

Comments
 (0)