diff --git a/Docs/sphinx_documentation/source/GPU.rst b/Docs/sphinx_documentation/source/GPU.rst
index 9e370b8ee2..c6da1a0ab3 100644
--- a/Docs/sphinx_documentation/source/GPU.rst
+++ b/Docs/sphinx_documentation/source/GPU.rst
@@ -229,9 +229,9 @@ Building with CMake
 
 To build AMReX with GPU support in CMake, add
 ``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
-HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
-block/group in most situations. This can be changed with
-``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
+HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
+in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
+with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
 
 Enabling CUDA support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
 
 By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
 per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
-with a default value of 256. The users can also explicitly specify the
+with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
 number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
 ``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
 the users to do performance tuning for individual kernels.
diff --git a/Src/Base/AMReX_Reduce.H b/Src/Base/AMReX_Reduce.H
index f6f7a8ed3d..dec0b46b06 100644
--- a/Src/Base/AMReX_Reduce.H
+++ b/Src/Base/AMReX_Reduce.H
@@ -12,6 +12,8 @@
 #include <functional>
 #include <limits>
 
+#define AMREX_GPU_MAX_THREADS_REDUCE 256
+
 namespace amrex {
 
 namespace Reduce::detail {
@@ -93,7 +95,7 @@ struct ReduceOpSum
         if (h.threadIdx() == 0) { d += r; }
     }
 #else
-    template <int MT=AMREX_GPU_MAX_THREADS, typename T>
+    template <int MT=AMREX_GPU_MAX_THREADS_REDUCE, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceSum<MT>(s);
@@ -121,7 +123,7 @@ struct ReduceOpMin
         if (h.threadIdx() == 0) { d = amrex::min(d,r); }
     }
 #else
-    template <int MT=AMREX_GPU_MAX_THREADS, typename T>
+    template <int MT=AMREX_GPU_MAX_THREADS_REDUCE, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMin<MT>(s);
@@ -154,7 +156,7 @@ struct ReduceOpMax
         if (h.threadIdx() == 0) { d = amrex::max(d,r); }
     }
 #else
-    template <int MT=AMREX_GPU_MAX_THREADS, typename T>
+    template <int MT=AMREX_GPU_MAX_THREADS_REDUCE, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMax<MT>(s);
@@ -188,7 +190,7 @@ struct ReduceOpLogicalAnd
         if (h.threadIdx() == 0) { d = d && r; }
     }
 #else
-    template <int MT=AMREX_GPU_MAX_THREADS, typename T>
+    template <int MT=AMREX_GPU_MAX_THREADS_REDUCE, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -220,7 +222,7 @@ struct ReduceOpLogicalOr
         if (h.threadIdx() == 0) { d = d || r; }
     }
 #else
-    template <int MT=AMREX_GPU_MAX_THREADS, typename T>
+    template <int MT=AMREX_GPU_MAX_THREADS_REDUCE, typename T>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -372,7 +374,7 @@ public:
         const int nboxes = mf.local_size();
         if (nboxes > 0) {
             auto const& parforinfo = mf.getParForInfo(nghost);
-            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
+            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS_REDUCE);
             AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
             const int nblocks = nblocks_per_box * nboxes;
             const BoxIndexer* dp_boxes = parforinfo.getBoxes();
@@ -387,14 +389,14 @@ public:
 #ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
             constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 Dim1 blockIdx {gh.blockIdx()};
                 Dim1 threadIdx{gh.threadIdx()};
 #else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
+            amrex::launch_global<AMREX_GPU_MAX_THREADS_REDUCE>
+                <<<nblocks_ec, AMREX_GPU_MAX_THREADS_REDUCE, 0, stream>>>
             ([=] AMREX_GPU_DEVICE () noexcept
             {
 #endif
@@ -406,7 +408,7 @@ public:
                 }
                 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                     int ibox = iblock / nblocks_per_box;
-                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS_REDUCE + threadIdx.x;
 
                     BoxIndexer const& indexer = dp_boxes[ibox];
                     if (icell < indexer.numPts()) {
@@ -496,21 +498,21 @@ public:
         const auto lenx = len.x;
         IndexType ixtype = box.ixType();
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -520,7 +522,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -554,21 +556,21 @@ public:
         const auto lenxy = len.x*len.y;
         const auto lenx = len.x;
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -578,7 +580,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -610,21 +612,21 @@ public:
         auto dp = reduce_data.devicePtr(stream);
         int& nblocks = reduce_data.nBlocks(stream);
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -634,7 +636,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -689,7 +691,7 @@ public:
 #else
             auto presult = hp;
 #endif
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 ReduceTuple r;
@@ -709,7 +711,7 @@ public:
             Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
 #endif
 #else
-            amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, 0, stream,
             [=] AMREX_GPU_DEVICE () noexcept
             {
                 ReduceTuple r;
@@ -717,7 +719,7 @@ public:
                 ReduceTuple dst = r;
                 for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                     auto dp_stream = dp+istream*maxblocks;
-                    for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+                    for (int i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                          i < nblocks[istream]; i += stride) {
                         Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                     }
@@ -851,7 +853,7 @@ bool AnyOf (N n, T const* v, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -860,7 +862,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -872,7 +874,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         }
     });
 #else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
         if (threadIdx.x == 0) { has_any = *dp; }
@@ -881,7 +883,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -912,7 +914,7 @@ bool AnyOf (Box const& box, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -921,7 +923,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -937,7 +939,7 @@ bool AnyOf (Box const& box, P const& pred)
         }
     });
 #else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
+    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS_REDUCE, ec.numBlocks, ec.numThreads, 0,
                         Gpu::gpuStream(), [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
         if (threadIdx.x == 0) { has_any = *dp; }
@@ -947,7 +949,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
diff --git a/Tools/CMake/AMReXOptions.cmake b/Tools/CMake/AMReXOptions.cmake
index ef55a1053c..51063b7dc1 100644
--- a/Tools/CMake/AMReXOptions.cmake
+++ b/Tools/CMake/AMReXOptions.cmake
@@ -132,8 +132,11 @@ endif ()
 if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
    message( STATUS "   AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")
 
-   # We might set different default for different GPUs in the future.
-   set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   if (AMReX_GPU_BACKEND STREQUAL CUDA)
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
+   else ()
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   endif ()
    set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
       "Maximum number of GPU threads per block" )
    message( STATUS "   AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")
diff --git a/Tools/GNUMake/Make.defs b/Tools/GNUMake/Make.defs
index ed3a1a4df9..e077424fa6 100644
--- a/Tools/GNUMake/Make.defs
+++ b/Tools/GNUMake/Make.defs
@@ -269,8 +269,12 @@ else
 endif
 
 # Maximum number of GPU threads per block.
-CUDA_MAX_THREADS ?= 256
-GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+CUDA_MAX_THREADS ?= 128
+ifeq ($(USE_CUDA),TRUE)
+  GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+else
+  GPU_MAX_THREADS ?= 256
+endif
 
 ifeq ($(USE_CUDA),TRUE)
   # Set the default CUDA architecture version.
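
Reviewer note, not part of the patch: the per-kernel override described in the GPU.rst hunk is unaffected by the new defaults. Below is a minimal sketch, assuming a hypothetical helper function scale_box and arbitrary scaling factors, of pinning the block size for one kernel through ParallelFor's template parameter while other launches follow AMREX_GPU_MAX_THREADS (now 128 for CUDA, 256 for HIP and SYCL).

// Sketch only; scale_box and the factors are illustrative, not AMReX API.
#include <AMReX_Gpu.H>
#include <AMReX_Box.H>
#include <AMReX_Array4.H>
#include <AMReX_REAL.H>

void scale_box (amrex::Box const& bx, amrex::Array4<amrex::Real> const& a)
{
    // Default launch: AMREX_GPU_MAX_THREADS threads per block
    // (128 for CUDA, 256 for HIP/SYCL after this change).
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
    {
        a(i,j,k) *= amrex::Real(2.0);
    });

    // Explicit per-kernel block size; must be a multiple of the warp size.
    amrex::ParallelFor<256>(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
    {
        a(i,j,k) *= amrex::Real(0.5);
    });
}

The build-time default itself remains overridable with -DAMReX_GPU_MAX_THREADS=N (CMake) or GPU_MAX_THREADS=N (GNU Make), while device reductions now use the separate AMREX_GPU_MAX_THREADS_REDUCE constant fixed at 256 by the hunks above.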