AMReX-Codes
diff --git a/‎Docs/sphinx_documentation/source/GPU.rst
Lines changed: 4 additions & 4 deletions b/‎Docs/sphinx_documentation/source/GPU.rst
Lines changed: 4 additions & 4 deletions
diff --git a/‎Src/AmrCore/AMReX_TagBox.cpp
Lines changed: 5 additions & 5 deletions b/‎Src/AmrCore/AMReX_TagBox.cpp
Lines changed: 5 additions & 5 deletions
diff --git a/‎Src/Base/AMReX_BaseFabUtility.H
Lines changed: 3 additions & 3 deletions b/‎Src/Base/AMReX_BaseFabUtility.H
Lines changed: 3 additions & 3 deletions
diff --git a/‎Src/Base/AMReX_BlockMutex.cpp
Lines changed: 1 addition & 1 deletion b/‎Src/Base/AMReX_BlockMutex.cpp
Lines changed: 1 addition & 1 deletion
diff --git a/‎Src/Base/AMReX_FabArrayBase.H
Lines changed: 6 additions & 6 deletions b/‎Src/Base/AMReX_FabArrayBase.H
Lines changed: 6 additions & 6 deletions
diff --git a/‎Src/Base/AMReX_FabArrayBase.cpp
Lines changed: 9 additions & 13 deletions b/‎Src/Base/AMReX_FabArrayBase.cpp
Lines changed: 9 additions & 13 deletions
diff --git a/‎Src/Base/AMReX_GpuContainers.H
Lines changed: 3 additions & 3 deletions b/‎Src/Base/AMReX_GpuContainers.H
Lines changed: 3 additions & 3 deletions
diff --git a/‎Src/Base/AMReX_GpuLaunch.H
Lines changed: 4 additions & 0 deletions b/‎Src/Base/AMReX_GpuLaunch.H
Lines changed: 4 additions & 0 deletions
diff --git a/‎Src/Base/AMReX_GpuLaunchFunctsG.H
Lines changed: 2 additions & 3 deletions b/‎Src/Base/AMReX_GpuLaunchFunctsG.H
Lines changed: 2 additions & 3 deletions
diff --git a/‎Src/Base/AMReX_MFParallelFor.H
Lines changed: 5 additions & 5 deletions b/‎Src/Base/AMReX_MFParallelFor.H
Lines changed: 5 additions & 5 deletions
@@ -229,9 +229,9 @@ Building with CMake
 
 To build AMReX with GPU support in CMake, add
 ``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
-HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
-block/group in most situations. This can be changed with
-``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
+HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
+in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
+with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is, for example, 128 or 256.
 
 Enabling CUDA support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
 
 By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
 per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
-with a default value of 256.  The users can also explicitly specify the
+with a default value of 128 for CUDA and 256 for HIP and SYCL.  Users can also explicitly specify the
 number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
 ``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128).  This allows
 the users to do performance tuning for individual kernels.
 
@@ -447,8 +447,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
         const int ncells = fai.fabbox().numPts();
         const char* tags = (*this)[fai].dataPtr();
 #ifdef AMREX_USE_SYCL
-        amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
-                      Gpu::Device::gpuStream(),
+        amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
+                                  Gpu::Device::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
         {
             int bid = h.item->get_group_linear_id();
@@ -467,7 +467,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             }
         });
 #else
-        amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
+        amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
             int bid = blockIdx.x;
@@ -525,7 +525,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
             const int ncells = bx.numPts();
             const char* tags = (*this)[fai].dataPtr();
 #ifdef AMREX_USE_SYCL
-            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
             {
                 int bid = h.item->get_group(0);
@@ -553,7 +553,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
                 }
             });
 #else
-            amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
+            amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
             [=] AMREX_GPU_DEVICE () noexcept
             {
                 int bid = blockIdx.x;
 
@@ -38,14 +38,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
     if (Gpu::inLaunchRegion()) {
         BoxIndexer indexer(box);
         const auto ntotcells = std::uint64_t(box.numPts());
-        int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
+        constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
         std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
         AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
         auto nblocks = int(nblocks_long);
         std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
         T* p = (T*)aos_fab.dataPtr();
 #ifdef AMREX_USE_SYCL
-        amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
         {
             auto const icell = std::uint64_t(handler.globalIdx());
@@ -66,7 +66,7 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
             }
         });
 #else
-        amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+        amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
         [=] AMREX_GPU_DEVICE () noexcept
         {
             std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;
 
@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {
     amrex::ignore_unused(state,N);
     amrex::Abort("xxxxx SYCL todo");
 #else
-    amrex::launch((N+255)/256, 256, Gpu::gpuStream(),
+    amrex::launch<256>((N+255)/256, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept
     {
         int i = threadIdx.x + blockIdx.x*blockDim.x;
 
@@ -650,10 +650,12 @@ public:
     //! For ParallelFor(FabArray)
     struct ParForInfo
     {
-        ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads);
+        ParForInfo (const FabArrayBase& fa, const IntVect& nghost);
         ~ParForInfo ();
 
-        std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
+        int getNBlocksPerBox (int nthreads) const {
+            return int((m_ncellsmax+nthreads-1)/nthreads);
+        }
         BoxIndexer const* getBoxes () const { return m_boxes; }
 
         ParForInfo () = delete;
@@ -664,14 +666,12 @@ public:
 
         BATransformer m_bat;
         IntVect m_ng;
-        int m_nthreads;
-        std::pair<int*,int*> m_nblocks_x;
+        Long m_ncellsmax = 0;
         BoxIndexer* m_boxes = nullptr;
         char* m_hp = nullptr;
-        char* m_dp = nullptr;
     };
 
-    ParForInfo const& getParForInfo (const IntVect& nghost, int nthreads) const;
+    ParForInfo const& getParForInfo (const IntVect& nghost) const;
 
     static std::multimap<BDKey,ParForInfo*> m_TheParForCache;
 
 
@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-
 
 #ifdef AMREX_USE_GPU
 
-FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)
+FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)
     : m_bat(fa.boxArray().transformer()),
-      m_ng(nghost),
-      m_nthreads(nthreads),
-      m_nblocks_x({nullptr,nullptr})
+      m_ng(nghost)
 {
     Vector<Box> boxes;
-    Vector<Long> ncells;
-    ncells.reserve(fa.indexArray.size());
+    m_ncellsmax = 0;
     for (int K : fa.indexArray) {
         Long N = 0;
         Box b = fa.box(K);
@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh
             N = b.numPts();
         }
         boxes.push_back(b);
-        ncells.push_back(N);
+        m_ncellsmax = std::max(m_ncellsmax, N);
     }
-    detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);
+    detail::build_par_for_boxes(m_hp, m_boxes, boxes);
 }
 
 FabArrayBase::ParForInfo::~ParForInfo ()
 {
-    detail::destroy_par_for_nblocks(m_hp, m_dp);
+    detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);
 }
 
 FabArrayBase::ParForInfo const&
-FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const
+FabArrayBase::getParForInfo (const IntVect& nghost) const
 {
     AMREX_ASSERT(getBDKey() == m_bdkey);
     auto er_it = m_TheParForCache.equal_range(m_bdkey);
     for (auto it = er_it.first; it != er_it.second; ++it) {
         if (it->second->m_bat        == boxArray().transformer() &&
-            it->second->m_ng         == nghost                 &&
-            it->second->m_nthreads   == nthreads)
+            it->second->m_ng         == nghost)
         {
             return *(it->second);
         }
     }
 
-    ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);
+    ParForInfo* new_pfi = new ParForInfo(*this, nghost);
     m_TheParForCache.insert(er_it.second,
                             std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));
     return *new_pfi;
 
@@ -433,11 +433,11 @@ namespace amrex::Gpu {
                                          unsigned long long, unsigned int>;
             constexpr Long nU = sizeof(T) / sizeof(U);
             auto pu = reinterpret_cast<U*>(p);
-            int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
+            constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
             int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
             std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
 #ifdef AMREX_USE_SYCL
-            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
             {
                 Long i = handler.globalIdx();
@@ -458,7 +458,7 @@ namespace amrex::Gpu {
                 }
             });
 #else
-            amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
+            amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
                           [=] AMREX_GPU_DEVICE () noexcept
             {
                 Long blockDimx = blockDim.x;
 
@@ -34,9 +34,13 @@
 #ifdef AMREX_USE_CUDA
 #  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
         amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
+#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
+        amrex::launch_global    <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
 #elif defined(AMREX_USE_HIP)
 #  define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
         hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
+#  define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
+        hipLaunchKernelGGL(launch_global    , blocks, threads, sharedMem, stream, __VA_ARGS__)
 #endif
 
 
 
@@ -735,9 +735,8 @@ template<typename L>
 void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
              gpuStream_t stream, L const& f) noexcept
 {
-    AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
-                        stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
+    AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
+                                stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
     AMREX_GPU_ERROR_CHECK();
 }
 
 
@@ -68,7 +68,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor<MT>(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
+    detail::ParallelFor<MT>(mf, IntVect(0), 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #endif
@@ -119,7 +119,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& ng, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor<MT>(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
+    detail::ParallelFor<MT>(mf, ng, 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
 #endif
@@ -225,7 +225,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, TileSize const& ts, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor<MT>(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
+    detail::ParallelFor<MT>(mf, IntVect(0), 1, ts.tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
 #endif
@@ -280,7 +280,7 @@ std::enable_if_t<IsFabArray<MF>::value>
 ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor<MT>(mf, ng, ts.tile_size, false, std::forward<F>(f));
+    detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, false, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
 #endif
@@ -423,7 +423,7 @@ ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts,
              DynamicTiling dt, F&& f)
 {
 #ifdef AMREX_USE_GPU
-    detail::ParallelFor<MT>(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
+    detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, dt.dynamic, std::forward<F>(f));
 #else
     detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
 #endif
Original file line number	Diff line number	Diff line change
`@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {`
`9`	`9`	`amrex::ignore_unused(state,N);`
`10`	`10`	`amrex::Abort("xxxxx SYCL todo");`
`11`	`11`	`#else`
`12`		`- amrex::launch((N+255)/256, 256, Gpu::gpuStream(),`
	`12`	`+ amrex::launch<256>((N+255)/256, Gpu::gpuStream(),`
`13`	`13`	`[=] AMREX_GPU_DEVICE () noexcept`
`14`	`14`	`{`
`15`	`15`	`int i = threadIdx.x + blockIdx.x*blockDim.x;`
Original file line number	Diff line number	Diff line change
`@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-`
`2635`	`2635`
`2636`	`2636`	`#ifdef AMREX_USE_GPU`
`2637`	`2637`
`2638`		`-FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)`
	`2638`	`+FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)`
`2639`	`2639`	`: m_bat(fa.boxArray().transformer()),`
`2640`		`- m_ng(nghost),`
`2641`		`- m_nthreads(nthreads),`
`2642`		`- m_nblocks_x({nullptr,nullptr})`
	`2640`	`+ m_ng(nghost)`
`2643`	`2641`	`{`
`2644`	`2642`	`Vector<Box> boxes;`
`2645`		`- Vector<Long> ncells;`
`2646`		`- ncells.reserve(fa.indexArray.size());`
	`2643`	`+ m_ncellsmax = 0;`
`2647`	`2644`	`for (int K : fa.indexArray) {`
`2648`	`2645`	`Long N = 0;`
`2649`	`2646`	`Box b = fa.box(K);`
`@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh`
`2652`	`2649`	`N = b.numPts();`
`2653`	`2650`	`}`
`2654`	`2651`	`boxes.push_back(b);`
`2655`		`- ncells.push_back(N);`
	`2652`	`+ m_ncellsmax = std::max(m_ncellsmax, N);`
`2656`	`2653`	`}`
`2657`		`- detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);`
	`2654`	`+ detail::build_par_for_boxes(m_hp, m_boxes, boxes);`
`2658`	`2655`	`}`
`2659`	`2656`
`2660`	`2657`	`FabArrayBase::ParForInfo::~ParForInfo ()`
`2661`	`2658`	`{`
`2662`		`- detail::destroy_par_for_nblocks(m_hp, m_dp);`
	`2659`	`+ detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);`
`2663`	`2660`	`}`
`2664`	`2661`
`2665`	`2662`	`FabArrayBase::ParForInfo const&`
`2666`		`-FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const`
	`2663`	`+FabArrayBase::getParForInfo (const IntVect& nghost) const`
`2667`	`2664`	`{`
`2668`	`2665`	`AMREX_ASSERT(getBDKey() == m_bdkey);`
`2669`	`2666`	`auto er_it = m_TheParForCache.equal_range(m_bdkey);`
`2670`	`2667`	`for (auto it = er_it.first; it != er_it.second; ++it) {`
`2671`	`2668`	`if (it->second->m_bat == boxArray().transformer() &&`
`2672`		`- it->second->m_ng == nghost &&`
`2673`		`- it->second->m_nthreads == nthreads)`
	`2669`	`+ it->second->m_ng == nghost)`
`2674`	`2670`	`{`
`2675`	`2671`	`return *(it->second);`
`2676`	`2672`	`}`
`2677`	`2673`	`}`
`2678`	`2674`
`2679`		`- ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);`
	`2675`	`+ ParForInfo* new_pfi = new ParForInfo(*this, nghost);`
`2680`	`2676`	`m_TheParForCache.insert(er_it.second,`
`2681`	`2677`	`std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));`
`2682`	`2678`	`return *new_pfi;`
Original file line number	Diff line number	Diff line change
`@@ -735,9 +735,8 @@ template<typename L>`
`735`	`735`	`void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,`
`736`	`736`	`gpuStream_t stream, L const& f) noexcept`
`737`	`737`	`{`
`738`		`- AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);`
`739`		`- AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,`
`740`		`- stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });`
	`738`	`+ AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,`
	`739`	`+ stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });`
`741`	`740`	`AMREX_GPU_ERROR_CHECK();`
`742`	`741`	`}`
`743`	`742`