Match the dimensions of reqd_work_group_size to submitted nd_range (#4002)

zhaomaosu · web-flow · commit 463bdf493cd7 · 2024-06-25T06:39:16.000Z
## Summary According to the SYCL specification, the number of arguments to `reqd_work_group_size` must match the dimensionality of the work-group used to invoke the kernel. ## Additional background https://registry.khronos.org/SYCL/specs/sycl-2020/html/sycl-2020.html#sec:kernel.attributes ## Checklist The proposed changes: - [x] fix a bug or incorrect behavior in AMReX - [ ] add new capabilities to AMReX - [ ] changes answers in the test suite to more than roundoff level - [ ] are likely to significantly affect the results of downstream AMReX users - [ ] include documentation in the code and/or rst files, if appropriate
diff --git a/Src/Base/AMReX_GpuLaunchFunctsG.H b/Src/Base/AMReX_GpuLaunchFunctsG.H
@@ -79,7 +79,7 @@ void launch (int nblocks, std::size_t shared_mem_bytes, gpuStream_t stream,
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(MT)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 f(Gpu::Handler{&item,shared_data.get_multi_ptr<sycl::access::decorated::yes>().get()});
@@ -100,7 +100,7 @@ void launch (int nblocks, gpuStream_t stream, L const& f) noexcept
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(MT)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 f(item);
@@ -124,7 +124,7 @@ void launch (T const& n, L const& f) noexcept
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 for (auto const i : Gpu::Range(n,item.get_global_id(0),item.get_global_range(0))) {
@@ -203,7 +203,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -220,7 +220,7 @@ void ParallelFor (Gpu::KernelInfo const& info, T n, L const& f) noexcept
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::size_t i = item.get_global_id(0), stride = item.get_global_range(0);
@@ -252,7 +252,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -270,7 +270,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, L const& f) noexc
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -303,7 +303,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -322,7 +322,7 @@ void ParallelFor (Gpu::KernelInfo const& info, Box const& box, T ncomp, L const&
                 h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                                  sycl::range<1>(nthreads_per_block)),
                 [=] (sycl::nd_item<1> item)
-                [[sycl::reqd_work_group_size(1,1,MT)]]
+                [[sycl::reqd_work_group_size(MT)]]
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
                 {
                     for (std::uint64_t icell = item.get_global_id(0), stride = item.get_global_range(0);
@@ -353,7 +353,7 @@ void ParallelForRNG (T n, L const& f) noexcept
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const tid = item.get_global_id(0);
@@ -387,7 +387,7 @@ void ParallelForRNG (Box const& box, L const& f) noexcept
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const tid = item.get_global_id(0);
@@ -423,7 +423,7 @@ void ParallelForRNG (Box const& box, T ncomp, L const& f) noexcept
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]]
+            [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const tid = item.get_global_id(0);
@@ -460,7 +460,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/, Box const& box1, Box const& b
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -500,7 +500,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
@@ -545,7 +545,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const ncells = std::max(indexer1.numPts(), indexer2.numPts());
@@ -593,7 +593,7 @@ void ParallelFor (Gpu::KernelInfo const& /*info*/,
             h.parallel_for(sycl::nd_range<1>(sycl::range<1>(nthreads_total),
                                              sycl::range<1>(nthreads_per_block)),
             [=] (sycl::nd_item<1> item)
-            [[sycl::reqd_work_group_size(1,1,MT)]]
+            [[sycl::reqd_work_group_size(MT)]]
             [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
             {
                 auto const ncells = std::max({indexer1.numPts(), indexer2.numPts(), indexer3.numPts()});
diff --git a/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H b/Src/Base/AMReX_GpuLaunchMacrosG.nolint.H
@@ -16,7 +16,7 @@
                 amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                          sycl::range<1>(amrex_i_nthreads_per_block)), \
                 [=] (sycl::nd_item<1> amrex_i_item) \
-                [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
+                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                 { \
                     for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
@@ -266,7 +266,7 @@
                 amrex_i_h.parallel_for(sycl::nd_range<1>(sycl::range<1>(amrex_i_nthreads_total), \
                                                          sycl::range<1>(amrex_i_nthreads_per_block)), \
                 [=] (sycl::nd_item<1> amrex_i_item) \
-                [[sycl::reqd_work_group_size(1,1,AMREX_GPU_MAX_THREADS)]] \
+                [[sycl::reqd_work_group_size(AMREX_GPU_MAX_THREADS)]] \
                 [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]] \
                 { \
                     for (auto const TI : amrex::Gpu::Range(amrex_i_tn,amrex_i_item.get_global_id(0),amrex_i_item.get_global_range(0))) { \
diff --git a/Src/Base/AMReX_TagParallelFor.H b/Src/Base/AMReX_TagParallelFor.H
@@ -211,7 +211,7 @@ ParallelFor_doit (Vector<TagType> const& tags, F && f)
     amrex::launch(nblocks, nthreads, Gpu::gpuStream(),
 #ifdef AMREX_USE_SYCL
     [=] AMREX_GPU_DEVICE (sycl::nd_item<1> const& item) noexcept
-    [[sycl::reqd_work_group_size(1,1,nthreads)]]
+    [[sycl::reqd_work_group_size(nthreads)]]
     [[sycl::reqd_sub_group_size(Gpu::Device::warp_size)]]
 #else
     [=] AMREX_GPU_DEVICE () noexcept