AMREX_GPU_MAX_THREADS: 256 -> 128 for CUDA #4417

Draft: wants to merge 1 commit into base: development
8 changes: 4 additions & 4 deletions Docs/sphinx_documentation/source/GPU.rst
@@ -229,9 +229,9 @@ Building with CMake

To build AMReX with GPU support in CMake, add
``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
block/group in most situations. This can be changed with
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
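
For example, a CUDA build that keeps the previous default of 256 threads per
block could pass ``-DAMReX_GPU_BACKEND=CUDA -DAMReX_GPU_MAX_THREADS=256`` on
the ``cmake`` command line (other configure options omitted here).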

Enabling CUDA support
^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size

By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
with a default value of 256. The users can also explicitly specify the
with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
the users to do performance tuning for individual kernels.
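
For illustration, a launch that overrides the default block size might look
like the following minimal sketch, where ``box`` and the ``Array4`` ``a`` are
assumed to be defined already::

    amrex::ParallelFor<128>(box,
    [=] AMREX_GPU_DEVICE (int i, int j, int k) noexcept
    {
        a(i,j,k) += 1.0; // placeholder kernel body
    });

Here ``128`` replaces ``AMREX_GPU_MAX_THREADS`` as the number of threads per
block for this one kernel only.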
74 changes: 38 additions & 36 deletions Src/Base/AMReX_Reduce.H
@@ -12,6 +12,8 @@
#include <functional>
#include <limits>

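// Fixed block size for the reduction kernels in this header. It stays at 256
// and is independent of AMREX_GPU_MAX_THREADS, whose default becomes 128 for
// CUDA.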
#define AMREX_GPU_MAX_THREADS_REDUCE 256

namespace amrex {

namespace Reduce::detail {
@@ -93,7 +95,7 @@ struct ReduceOpSum
if (h.threadIdx() == 0) { d += r; }
}
#else
template <typename T, int MT=AMREX_GPU_MAX_THREADS>
template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void parallel_update (T& d, T const& s) const noexcept {
T r = Gpu::blockReduceSum<MT>(s);
@@ -121,7 +123,7 @@ struct ReduceOpMin
if (h.threadIdx() == 0) { d = amrex::min(d,r); }
}
#else
template <typename T, int MT=AMREX_GPU_MAX_THREADS>
template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void parallel_update (T& d, T const& s) const noexcept {
T r = Gpu::blockReduceMin<MT>(s);
@@ -154,7 +156,7 @@ struct ReduceOpMax
if (h.threadIdx() == 0) { d = amrex::max(d,r); }
}
#else
template <typename T, int MT=AMREX_GPU_MAX_THREADS>
template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
void parallel_update (T& d, T const& s) const noexcept {
T r = Gpu::blockReduceMax<MT>(s);
@@ -188,7 +190,7 @@ struct ReduceOpLogicalAnd
if (h.threadIdx() == 0) { d = d && r; }
}
#else
template <typename T, int MT=AMREX_GPU_MAX_THREADS>
template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
std::enable_if_t<std::is_integral<T>::value>
parallel_update (T& d, T s) const noexcept {
@@ -220,7 +222,7 @@ struct ReduceOpLogicalOr
if (h.threadIdx() == 0) { d = d || r; }
}
#else
template <typename T, int MT=AMREX_GPU_MAX_THREADS>
template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
AMREX_GPU_DEVICE AMREX_FORCE_INLINE
std::enable_if_t<std::is_integral<T>::value>
parallel_update (T& d, T s) const noexcept {
@@ -372,7 +374,7 @@ public:
const int nboxes = mf.local_size();
if (nboxes > 0) {
auto const& parforinfo = mf.getParForInfo(nghost);
auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS_REDUCE);
AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
const int nblocks = nblocks_per_box * nboxes;
const BoxIndexer* dp_boxes = parforinfo.getBoxes();
@@ -387,14 +389,14 @@
#ifdef AMREX_USE_SYCL
// device reduce needs local(i.e., shared) memory
constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
Dim1 blockIdx {gh.blockIdx()};
Dim1 threadIdx{gh.threadIdx()};
#else
amrex::launch_global<AMREX_GPU_MAX_THREADS>
<<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
amrex::launch_global<AMREX_GPU_MAX_THREADS_REDUCE>
<<<nblocks_ec, AMREX_GPU_MAX_THREADS_REDUCE, 0, stream>>>
([=] AMREX_GPU_DEVICE () noexcept
{
#endif
@@ -406,7 +408,7 @@ public:
}
for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
int ibox = iblock / nblocks_per_box;
auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS_REDUCE + threadIdx.x;

BoxIndexer const& indexer = dp_boxes[ibox];
if (icell < indexer.numPts()) {
@@ -496,21 +498,21 @@ public:
const auto lenx = len.x;
IndexType ixtype = box.ixType();
constexpr int nitems_per_thread = 4;
int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS);
int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
// device reduce needs local(i.e., shared) memory
constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
Dim1 blockIdx {gh.blockIdx()};
Dim1 threadIdx{gh.threadIdx()};
Dim1 gridDim {gh.gridDim()};
#else
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
[=] AMREX_GPU_DEVICE () noexcept
{
#endif
@@ -520,7 +522,7 @@ public:
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
dst = r;
}
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
@@ -554,21 +556,21 @@ public:
const auto lenxy = len.x*len.y;
const auto lenx = len.x;
constexpr int nitems_per_thread = 4;
int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS);
int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
// device reduce needs local(i.e., shared) memory
constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
Dim1 blockIdx {gh.blockIdx()};
Dim1 threadIdx{gh.threadIdx()};
Dim1 gridDim {gh.gridDim()};
#else
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
[=] AMREX_GPU_DEVICE () noexcept
{
#endif
@@ -578,7 +580,7 @@ public:
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
dst = r;
}
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
icell < ncells; icell += stride) {
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
@@ -610,21 +612,21 @@ public:
auto dp = reduce_data.devicePtr(stream);
int& nblocks = reduce_data.nBlocks(stream);
constexpr int nitems_per_thread = 4;
int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS);
int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
/ (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
reduce_data.updateMaxStreamIndex(stream);
#ifdef AMREX_USE_SYCL
// device reduce needs local(i.e., shared) memory
constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
Dim1 blockIdx {gh.blockIdx()};
Dim1 threadIdx{gh.threadIdx()};
Dim1 gridDim {gh.gridDim()};
#else
amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
[=] AMREX_GPU_DEVICE () noexcept
{
#endif
@@ -634,7 +636,7 @@ public:
if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
dst = r;
}
for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
i < n; i += stride) {
auto pr = f(i);
Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -689,7 +691,7 @@ public:
#else
auto presult = hp;
#endif
amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, shared_mem_bytes, stream,
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
{
ReduceTuple r;
@@ -709,15 +711,15 @@ public:
Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
#endif
#else
amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, 0, stream,
[=] AMREX_GPU_DEVICE () noexcept
{
ReduceTuple r;
Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
ReduceTuple dst = r;
for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
auto dp_stream = dp+istream*maxblocks;
for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (int i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
i < nblocks[istream]; i += stride) {
Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
}
@@ -851,7 +853,7 @@ bool AnyOf (N n, T const* v, P const& pred)
#ifdef AMREX_USE_SYCL
const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
const std::size_t shared_mem_bytes = num_ints*sizeof(int);
amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -860,7 +862,7 @@ bool AnyOf (N n, T const* v, P const& pred)
if (!(*has_any))
{
int r = false;
for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
for (N i = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
i < n && !r; i += stride)
{
r = pred(v[i]) ? 1 : 0;
@@ -872,7 +874,7 @@ bool AnyOf (N n, T const* v, P const& pred)
}
});
#else
amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, 0, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
__shared__ int has_any;
if (threadIdx.x == 0) { has_any = *dp; }
@@ -881,7 +883,7 @@ bool AnyOf (N n, T const* v, P const& pred)
if (!has_any)
{
int r = false;
for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
i < n && !r; i += stride)
{
r = pred(v[i]) ? 1 : 0;
@@ -912,7 +914,7 @@ bool AnyOf (Box const& box, P const& pred)
#ifdef AMREX_USE_SYCL
const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
const std::size_t shared_mem_bytes = num_ints*sizeof(int);
amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -921,7 +923,7 @@ bool AnyOf (Box const& box, P const& pred)
if (!(*has_any))
{
int r = false;
for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
icell < ncells && !r; icell += stride) {
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
@@ -937,7 +939,7 @@ bool AnyOf (Box const& box, P const& pred)
}
});
#else
AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS_REDUCE, ec.numBlocks, ec.numThreads, 0,
Gpu::gpuStream(),
[=] AMREX_GPU_DEVICE () noexcept {
__shared__ int has_any;
@@ -947,7 +949,7 @@ bool AnyOf (Box const& box, P const& pred)
if (!has_any)
{
int r = false;
for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
icell < ncells && !r; icell += stride) {
int k = icell / lenxy;
int j = (icell - k*lenxy) / lenx;
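
The launches above back AMReX's ``ReduceOps``/``ReduceData`` interface and now
run with ``AMREX_GPU_MAX_THREADS_REDUCE`` threads per block. As a minimal
sketch of how that interface is used (assuming a filled single-component
``MultiFab`` named ``mf``; not part of this diff):

    #include <AMReX_MultiFab.H>
    #include <AMReX_Reduce.H>

    // Minimal sketch: local (per-rank) sum of a MultiFab computed on the GPU.
    amrex::Real local_sum (amrex::MultiFab const& mf)
    {
        using namespace amrex;
        ReduceOps<ReduceOpSum> reduce_op;
        ReduceData<Real> reduce_data(reduce_op);
        using ReduceTuple = typename decltype(reduce_data)::Type;
        for (MFIter mfi(mf); mfi.isValid(); ++mfi) {
            const Box& bx = mfi.validbox();
            auto const& fab = mf.const_array(mfi);
            reduce_op.eval(bx, reduce_data,
                [=] AMREX_GPU_DEVICE (int i, int j, int k) -> ReduceTuple
                {
                    return { fab(i,j,k) }; // contribution of one cell
                });
        }
        ReduceTuple hv = reduce_data.value(reduce_op);
        return amrex::get<0>(hv);
    }
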
7 changes: 5 additions & 2 deletions Tools/CMake/AMReXOptions.cmake
@@ -132,8 +132,11 @@ endif ()
if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
message( STATUS " AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")

# We might set different default for different GPUs in the future.
set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
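# CUDA defaults to 128 threads per block; HIP and SYCL keep the previous 256.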
if (AMReX_GPU_BACKEND STREQUAL CUDA)
set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
else ()
set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
endif ()
set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
"Maximum number of GPU threads per block" )
message( STATUS " AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")
8 changes: 6 additions & 2 deletions Tools/GNUMake/Make.defs
@@ -269,8 +269,12 @@ else
endif

# Maximum number of GPU threads per block.
CUDA_MAX_THREADS ?= 256
GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
CUDA_MAX_THREADS ?= 128
ifeq ($(USE_CUDA),TRUE)
GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
else
GPU_MAX_THREADS ?= 256
endif
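# GPU_MAX_THREADS can also be overridden at build time, e.g.
#   make USE_CUDA=TRUE GPU_MAX_THREADS=256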

ifeq ($(USE_CUDA),TRUE)
# Set the default CUDA architecture version.