Commit 57af087

AMREX_GPU_MAX_THREADS: 256 -> 128 for CUDA
1 parent 8976d06 commit 57af087

4 files changed: +53 -44 lines changed

Docs/sphinx_documentation/source/GPU.rst (+4 -4)
@@ -229,9 +229,9 @@ Building with CMake
 
 To build AMReX with GPU support in CMake, add
 ``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
-HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
-block/group in most situations. This can be changed with
-``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
+HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
+in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
+with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
 
 Enabling CUDA support
 ^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
 
 By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
 per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
-with a default value of 256. The users can also explicitly specify the
+with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
 number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
 ``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
 the users to do performance tuning for individual kernels.
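
As a concrete illustration of the documented behavior, here is a minimal
sketch of the two launch modes, assuming an existing ``Box bx`` and
``Array4<Real> fab`` (hypothetical names, not part of this commit):

    // Default launch: AMREX_GPU_MAX_THREADS threads per block
    // (after this commit: 128 for CUDA, 256 for HIP and SYCL).
    amrex::ParallelFor(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
    {
        fab(i,j,k) += 1.0;
    });

    // Per-kernel override: request 256 threads per block for this launch
    // only. The block size must be a multiple of the warp size.
    amrex::ParallelFor<256>(bx, [=] AMREX_GPU_DEVICE (int i, int j, int k)
    {
        fab(i,j,k) += 1.0;
    });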

Src/Base/AMReX_Reduce.H (+38 -36)
@@ -12,6 +12,8 @@
 #include <functional>
 #include <limits>
 
+#define AMREX_GPU_MAX_THREADS_REDUCE 256
+
 namespace amrex {
 
 namespace Reduce::detail {
@@ -93,7 +95,7 @@ struct ReduceOpSum
         if (h.threadIdx() == 0) { d += r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceSum<MT>(s);
@@ -121,7 +123,7 @@ struct ReduceOpMin
         if (h.threadIdx() == 0) { d = amrex::min(d,r); }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMin<MT>(s);
@@ -154,7 +156,7 @@ struct ReduceOpMax
         if (h.threadIdx() == 0) { d = amrex::max(d,r); }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMax<MT>(s);
@@ -188,7 +190,7 @@ struct ReduceOpLogicalAnd
         if (h.threadIdx() == 0) { d = d && r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -220,7 +222,7 @@ struct ReduceOpLogicalOr
         if (h.threadIdx() == 0) { d = d || r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -372,7 +374,7 @@ public:
         const int nboxes = mf.local_size();
         if (nboxes > 0) {
             auto const& parforinfo = mf.getParForInfo(nghost);
-            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
+            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS_REDUCE);
             AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
             const int nblocks = nblocks_per_box * nboxes;
             const BoxIndexer* dp_boxes = parforinfo.getBoxes();
@@ -387,14 +389,14 @@
 #ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
             constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 Dim1 blockIdx {gh.blockIdx()};
                 Dim1 threadIdx{gh.threadIdx()};
 #else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
+            amrex::launch_global<AMREX_GPU_MAX_THREADS_REDUCE>
+                <<<nblocks_ec, AMREX_GPU_MAX_THREADS_REDUCE, 0, stream>>>
             ([=] AMREX_GPU_DEVICE () noexcept
             {
 #endif
@@ -406,7 +408,7 @@ public:
             }
             for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                 int ibox = iblock / nblocks_per_box;
-                auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS_REDUCE + threadIdx.x;
 
                 BoxIndexer const& indexer = dp_boxes[ibox];
                 if (icell < indexer.numPts()) {
@@ -496,21 +498,21 @@ public:
         const auto lenx = len.x;
         IndexType ixtype = box.ixType();
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -520,7 +522,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -554,21 +556,21 @@ public:
         const auto lenxy = len.x*len.y;
         const auto lenx = len.x;
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -578,7 +580,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -610,21 +612,21 @@ public:
         auto dp = reduce_data.devicePtr(stream);
         int& nblocks = reduce_data.nBlocks(stream);
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
        [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -634,7 +636,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -689,7 +691,7 @@ public:
 #else
         auto presult = hp;
 #endif
-        amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             ReduceTuple r;
@@ -709,15 +711,15 @@ public:
         Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
 #endif
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
             ReduceTuple dst = r;
             for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                 auto dp_stream = dp+istream*maxblocks;
-                for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+                for (int i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                      i < nblocks[istream]; i += stride) {
                     Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                 }
@@ -851,7 +853,7 @@ bool AnyOf (N n, T const* v, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -860,7 +862,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -872,7 +874,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         }
     });
 #else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
         if (threadIdx.x == 0) { has_any = *dp; }
@@ -881,7 +883,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -912,7 +914,7 @@ bool AnyOf (Box const& box, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -921,7 +923,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -937,7 +939,7 @@ bool AnyOf (Box const& box, P const& pred)
         }
     });
 #else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
+    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS_REDUCE, ec.numBlocks, ec.numThreads, 0,
                         Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
@@ -947,7 +949,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
        {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
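
Every change in this file feeds the same grid-stride pattern: the block size
appears once in a thread's starting index and once in the stride, so a fixed
number of blocks still covers an arbitrarily large range. A standalone CUDA
sketch of that pattern (hypothetical names; a simplified stand-in for the
ReduceData machinery above, not the AMReX implementation):

    #define MAX_THREADS_REDUCE 256

    // Computes one partial sum per block; must be launched with
    // MAX_THREADS_REDUCE threads per block, e.g.
    //   partial_sums<<<nblocks, MAX_THREADS_REDUCE>>>(n, v, block_sums);
    __global__ void partial_sums (int n, double const* v, double* block_sums)
    {
        __shared__ double s[MAX_THREADS_REDUCE];
        double r = 0.0;
        // Grid-stride loop: thread t of block b starts at
        // MAX_THREADS_REDUCE*b + t and advances by MAX_THREADS_REDUCE*gridDim.x.
        for (int i = MAX_THREADS_REDUCE*blockIdx.x + threadIdx.x,
                 stride = MAX_THREADS_REDUCE*gridDim.x;
             i < n; i += stride) {
            r += v[i];
        }
        s[threadIdx.x] = r;
        __syncthreads();
        // Tree reduction of the per-thread partials within the block.
        for (int off = MAX_THREADS_REDUCE/2; off > 0; off /= 2) {
            if (threadIdx.x < off) { s[threadIdx.x] += s[threadIdx.x + off]; }
            __syncthreads();
        }
        if (threadIdx.x == 0) { block_sums[blockIdx.x] = s[0]; }
    }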

Tools/CMake/AMReXOptions.cmake (+5 -2)
@@ -132,8 +132,11 @@ endif ()
 if (NOT AMReX_GPU_BACKEND STREQUAL NONE)
    message( STATUS "   AMReX_GPU_BACKEND = ${AMReX_GPU_BACKEND}")
 
-   # We might set different default for different GPUs in the future.
-   set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   if (AMReX_GPU_BACKEND STREQUAL CUDA)
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "128")
+   else ()
+      set(AMReX_GPU_MAX_THREADS_DEFAULT "256")
+   endif ()
    set(AMReX_GPU_MAX_THREADS ${AMReX_GPU_MAX_THREADS_DEFAULT} CACHE STRING
       "Maximum number of GPU threads per block" )
    message( STATUS "   AMReX_GPU_MAX_THREADS = ${AMReX_GPU_MAX_THREADS}")
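
The net effect for CMake users: configuring with ``-DAMReX_GPU_BACKEND=CUDA``
now defaults the cache variable to 128, and the previous behavior can be
restored per build with, for example,
``cmake -DAMReX_GPU_BACKEND=CUDA -DAMReX_GPU_MAX_THREADS=256 /path/to/amrex``
(the source path here is a placeholder).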

Tools/GNUMake/Make.defs (+6 -2)
@@ -269,8 +269,12 @@ else
 endif
 
 # Maximum number of GPU threads per block.
-CUDA_MAX_THREADS ?= 256
-GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+CUDA_MAX_THREADS ?= 128
+ifeq ($(USE_CUDA),TRUE)
+  GPU_MAX_THREADS ?= $(CUDA_MAX_THREADS)
+else
+  GPU_MAX_THREADS ?= 256
+endif
 
 ifeq ($(USE_CUDA),TRUE)
 # Set the default CUDA architecture version.
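
Likewise for GNU Make builds: ``GPU_MAX_THREADS`` is assigned with ``?=``, so
setting it in a project's make inputs or on the command line, e.g.
``make USE_CUDA=TRUE GPU_MAX_THREADS=256``, still overrides the new CUDA
default of 128.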
