Skip to content

Commit be801b9

Browse files
committed
Future base
1 parent d3540b9 commit be801b9

19 files changed

+137
-166
lines changed

Docs/sphinx_documentation/source/GPU.rst

Lines changed: 4 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -229,9 +229,9 @@ Building with CMake
229229

230230
To build AMReX with GPU support in CMake, add
231231
``-DAMReX_GPU_BACKEND=CUDA|HIP|SYCL`` to the ``cmake`` invocation, for CUDA,
232-
HIP and SYCL, respectively. By default, AMReX uses 256 threads per GPU
233-
block/group in most situations. This can be changed with
234-
``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 128 for example.
232+
HIP and SYCL, respectively. By default, AMReX uses 128 threads per GPU block
233+
in most situations for CUDA, and 256 for HIP and SYCL. This can be changed
234+
with ``-DAMReX_GPU_MAX_THREADS=N``, where ``N`` is 256 or 128 for example.
235235

236236
Enabling CUDA support
237237
^^^^^^^^^^^^^^^^^^^^^
@@ -1166,7 +1166,7 @@ GPU block size
11661166

11671167
By default, :cpp:`ParallelFor` launches ``AMREX_GPU_MAX_THREADS`` threads
11681168
per GPU block, where ``AMREX_GPU_MAX_THREADS`` is a compile-time constant
1169-
with a default value of 256. The users can also explicitly specify the
1169+
with a default value of 128 for CUDA and 256 for HIP and SYCL. The users can also explicitly specify the
11701170
number of threads per block by :cpp:`ParallelFor<MY_BLOCK_SIZE>(...)`, where
11711171
``MY_BLOCK_SIZE`` is a multiple of the warp size (e.g., 128). This allows
11721172
the users to do performance tuning for individual kernels.

Src/AmrCore/AMReX_TagBox.cpp

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -447,8 +447,8 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
447447
const int ncells = fai.fabbox().numPts();
448448
const char* tags = (*this)[fai].dataPtr();
449449
#ifdef AMREX_USE_SYCL
450-
amrex::launch(nblocks[li], block_size, sizeof(int)*Gpu::Device::warp_size,
451-
Gpu::Device::gpuStream(),
450+
amrex::launch<block_size>(nblocks[li], sizeof(int)*Gpu::Device::warp_size,
451+
Gpu::Device::gpuStream(),
452452
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
453453
{
454454
int bid = h.item->get_group_linear_id();
@@ -467,7 +467,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
467467
}
468468
});
469469
#else
470-
amrex::launch(nblocks[li], block_size, Gpu::Device::gpuStream(),
470+
amrex::launch<block_size>(nblocks[li], Gpu::Device::gpuStream(),
471471
[=] AMREX_GPU_DEVICE () noexcept
472472
{
473473
int bid = blockIdx.x;
@@ -525,7 +525,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
525525
const int ncells = bx.numPts();
526526
const char* tags = (*this)[fai].dataPtr();
527527
#ifdef AMREX_USE_SYCL
528-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
528+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
529529
[=] AMREX_GPU_DEVICE (Gpu::Handler const& h) noexcept
530530
{
531531
int bid = h.item->get_group(0);
@@ -553,7 +553,7 @@ TagBoxArray::local_collate_gpu (Gpu::PinnedVector<IntVect>& v) const
553553
}
554554
});
555555
#else
556-
amrex::launch(nblocks[li], block_size, sizeof(unsigned int), Gpu::Device::gpuStream(),
556+
amrex::launch<block_size>(nblocks[li], sizeof(unsigned int), Gpu::Device::gpuStream(),
557557
[=] AMREX_GPU_DEVICE () noexcept
558558
{
559559
int bid = blockIdx.x;

Src/Base/AMReX_BaseFabUtility.H

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -38,14 +38,14 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
3838
if (Gpu::inLaunchRegion()) {
3939
BoxIndexer indexer(box);
4040
const auto ntotcells = std::uint64_t(box.numPts());
41-
int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
41+
constexpr int nthreads_per_block = (STRUCTSIZE <= 8) ? 256 : 128;
4242
std::uint64_t nblocks_long = (ntotcells+nthreads_per_block-1)/nthreads_per_block;
4343
AMREX_ASSERT(nblocks_long <= std::uint64_t(std::numeric_limits<int>::max()));
4444
auto nblocks = int(nblocks_long);
4545
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(STRUCT);
4646
T* p = (T*)aos_fab.dataPtr();
4747
#ifdef AMREX_USE_SYCL
48-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
48+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
4949
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
5050
{
5151
auto const icell = std::uint64_t(handler.globalIdx());
@@ -66,7 +66,7 @@ void fill (BaseFab<STRUCT>& aos_fab, F const& f)
6666
}
6767
});
6868
#else
69-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
69+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
7070
[=] AMREX_GPU_DEVICE () noexcept
7171
{
7272
std::uint64_t const icell = std::uint64_t(blockDim.x)*blockIdx.x+threadIdx.x;

Src/Base/AMReX_BlockMutex.cpp

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ void BlockMutex::init_states (state_t* state, int N) noexcept {
99
amrex::ignore_unused(state,N);
1010
amrex::Abort("xxxxx SYCL todo");
1111
#else
12-
amrex::launch((N+255)/256, 256, Gpu::gpuStream(),
12+
amrex::launch<256>((N+255)/256, Gpu::gpuStream(),
1313
[=] AMREX_GPU_DEVICE () noexcept
1414
{
1515
int i = threadIdx.x + blockIdx.x*blockDim.x;

Src/Base/AMReX_FabArrayBase.H

Lines changed: 6 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -650,10 +650,12 @@ public:
650650
//! For ParallelFor(FabArray)
651651
struct ParForInfo
652652
{
653-
ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads);
653+
ParForInfo (const FabArrayBase& fa, const IntVect& nghost);
654654
~ParForInfo ();
655655

656-
std::pair<int*,int*> const& getBlocks () const { return m_nblocks_x; }
656+
int getNBlocksPerBox (int nthreads) const {
657+
return int((m_ncellsmax+nthreads-1)/nthreads);
658+
}
657659
BoxIndexer const* getBoxes () const { return m_boxes; }
658660

659661
ParForInfo () = delete;
@@ -664,14 +666,12 @@ public:
664666

665667
BATransformer m_bat;
666668
IntVect m_ng;
667-
int m_nthreads;
668-
std::pair<int*,int*> m_nblocks_x;
669+
Long m_ncellsmax = 0;
669670
BoxIndexer* m_boxes = nullptr;
670671
char* m_hp = nullptr;
671-
char* m_dp = nullptr;
672672
};
673673

674-
ParForInfo const& getParForInfo (const IntVect& nghost, int nthreads) const;
674+
ParForInfo const& getParForInfo (const IntVect& nghost) const;
675675

676676
static std::multimap<BDKey,ParForInfo*> m_TheParForCache;
677677

Src/Base/AMReX_FabArrayBase.cpp

Lines changed: 9 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -2635,15 +2635,12 @@ FabArrayBase::isFusingCandidate () const noexcept // NOLINT(readability-convert-
26352635

26362636
#ifdef AMREX_USE_GPU
26372637

2638-
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost, int nthreads)
2638+
FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& nghost)
26392639
: m_bat(fa.boxArray().transformer()),
2640-
m_ng(nghost),
2641-
m_nthreads(nthreads),
2642-
m_nblocks_x({nullptr,nullptr})
2640+
m_ng(nghost)
26432641
{
26442642
Vector<Box> boxes;
2645-
Vector<Long> ncells;
2646-
ncells.reserve(fa.indexArray.size());
2643+
m_ncellsmax = 0;
26472644
for (int K : fa.indexArray) {
26482645
Long N = 0;
26492646
Box b = fa.box(K);
@@ -2652,31 +2649,30 @@ FabArrayBase::ParForInfo::ParForInfo (const FabArrayBase& fa, const IntVect& ngh
26522649
N = b.numPts();
26532650
}
26542651
boxes.push_back(b);
2655-
ncells.push_back(N);
2652+
m_ncellsmax = std::max(m_ncellsmax, N);
26562653
}
2657-
detail::build_par_for_nblocks(m_hp, m_dp, m_nblocks_x, m_boxes, boxes, ncells, nthreads);
2654+
detail::build_par_for_boxes(m_hp, m_boxes, boxes);
26582655
}
26592656

26602657
FabArrayBase::ParForInfo::~ParForInfo ()
26612658
{
2662-
detail::destroy_par_for_nblocks(m_hp, m_dp);
2659+
detail::destroy_par_for_boxes(m_hp, (char*)m_boxes);
26632660
}
26642661

26652662
FabArrayBase::ParForInfo const&
2666-
FabArrayBase::getParForInfo (const IntVect& nghost, int nthreads) const
2663+
FabArrayBase::getParForInfo (const IntVect& nghost) const
26672664
{
26682665
AMREX_ASSERT(getBDKey() == m_bdkey);
26692666
auto er_it = m_TheParForCache.equal_range(m_bdkey);
26702667
for (auto it = er_it.first; it != er_it.second; ++it) {
26712668
if (it->second->m_bat == boxArray().transformer() &&
2672-
it->second->m_ng == nghost &&
2673-
it->second->m_nthreads == nthreads)
2669+
it->second->m_ng == nghost)
26742670
{
26752671
return *(it->second);
26762672
}
26772673
}
26782674

2679-
ParForInfo* new_pfi = new ParForInfo(*this, nghost, nthreads);
2675+
ParForInfo* new_pfi = new ParForInfo(*this, nghost);
26802676
m_TheParForCache.insert(er_it.second,
26812677
std::multimap<BDKey,ParForInfo*>::value_type(m_bdkey,new_pfi));
26822678
return *new_pfi;

Src/Base/AMReX_GpuContainers.H

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -433,11 +433,11 @@ namespace amrex::Gpu {
433433
unsigned long long, unsigned int>;
434434
constexpr Long nU = sizeof(T) / sizeof(U);
435435
auto pu = reinterpret_cast<U*>(p);
436-
int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
436+
constexpr int nthreads_per_block = (sizeof(T) <= 64) ? 256 : 128;
437437
int nblocks = static_cast<int>((N+nthreads_per_block-1)/nthreads_per_block);
438438
std::size_t shared_mem_bytes = nthreads_per_block * sizeof(T);
439439
#ifdef AMREX_USE_SYCL
440-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
440+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
441441
[=] AMREX_GPU_DEVICE (Gpu::Handler const& handler) noexcept
442442
{
443443
Long i = handler.globalIdx();
@@ -458,7 +458,7 @@ namespace amrex::Gpu {
458458
}
459459
});
460460
#else
461-
amrex::launch(nblocks, nthreads_per_block, shared_mem_bytes, Gpu::gpuStream(),
461+
amrex::launch<nthreads_per_block>(nblocks, shared_mem_bytes, Gpu::gpuStream(),
462462
[=] AMREX_GPU_DEVICE () noexcept
463463
{
464464
Long blockDimx = blockDim.x;

Src/Base/AMReX_GpuLaunch.H

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,9 +34,13 @@
3434
#ifdef AMREX_USE_CUDA
3535
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
3636
amrex::launch_global<MT><<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
37+
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
38+
amrex::launch_global <<<blocks, threads, sharedMem, stream>>>(__VA_ARGS__)
3739
#elif defined(AMREX_USE_HIP)
3840
# define AMREX_LAUNCH_KERNEL(MT, blocks, threads, sharedMem, stream, ... ) \
3941
hipLaunchKernelGGL(launch_global<MT>, blocks, threads, sharedMem, stream, __VA_ARGS__)
42+
# define AMREX_LAUNCH_KERNEL_NOBOUND(blocks, threads, sharedMem, stream, ... ) \
43+
hipLaunchKernelGGL(launch_global , blocks, threads, sharedMem, stream, __VA_ARGS__)
4044
#endif
4145

4246

Src/Base/AMReX_GpuLaunchFunctsG.H

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -735,9 +735,8 @@ template<typename L>
735735
void launch (int nblocks, int nthreads_per_block, std::size_t shared_mem_bytes,
736736
gpuStream_t stream, L const& f) noexcept
737737
{
738-
AMREX_ASSERT(nthreads_per_block <= AMREX_GPU_MAX_THREADS);
739-
AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, nblocks, nthreads_per_block, shared_mem_bytes,
740-
stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
738+
AMREX_LAUNCH_KERNEL_NOBOUND(nblocks, nthreads_per_block, shared_mem_bytes,
739+
stream, [=] AMREX_GPU_DEVICE () noexcept { f(); });
741740
AMREX_GPU_ERROR_CHECK();
742741
}
743742

Src/Base/AMReX_MFParallelFor.H

Lines changed: 5 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -68,7 +68,7 @@ std::enable_if_t<IsFabArray<MF>::value>
6868
ParallelFor (MF const& mf, F&& f)
6969
{
7070
#ifdef AMREX_USE_GPU
71-
detail::ParallelFor<MT>(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
71+
detail::ParallelFor<MT>(mf, IntVect(0), 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7272
#else
7373
detail::ParallelFor(mf, IntVect(0), FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
7474
#endif
@@ -119,7 +119,7 @@ std::enable_if_t<IsFabArray<MF>::value>
119119
ParallelFor (MF const& mf, IntVect const& ng, F&& f)
120120
{
121121
#ifdef AMREX_USE_GPU
122-
detail::ParallelFor<MT>(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
122+
detail::ParallelFor<MT>(mf, ng, 1, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
123123
#else
124124
detail::ParallelFor(mf, ng, FabArrayBase::mfiter_tile_size, false, std::forward<F>(f));
125125
#endif
@@ -225,7 +225,7 @@ std::enable_if_t<IsFabArray<MF>::value>
225225
ParallelFor (MF const& mf, TileSize const& ts, F&& f)
226226
{
227227
#ifdef AMREX_USE_GPU
228-
detail::ParallelFor<MT>(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
228+
detail::ParallelFor<MT>(mf, IntVect(0), 1, ts.tile_size, false, std::forward<F>(f));
229229
#else
230230
detail::ParallelFor(mf, IntVect(0), ts.tile_size, false, std::forward<F>(f));
231231
#endif
@@ -280,7 +280,7 @@ std::enable_if_t<IsFabArray<MF>::value>
280280
ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts, F&& f)
281281
{
282282
#ifdef AMREX_USE_GPU
283-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, false, std::forward<F>(f));
283+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, false, std::forward<F>(f));
284284
#else
285285
detail::ParallelFor(mf, ng, ts.tile_size, false, std::forward<F>(f));
286286
#endif
@@ -423,7 +423,7 @@ ParallelFor (MF const& mf, IntVect const& ng, TileSize const& ts,
423423
DynamicTiling dt, F&& f)
424424
{
425425
#ifdef AMREX_USE_GPU
426-
detail::ParallelFor<MT>(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
426+
detail::ParallelFor<MT>(mf, ng, 1, ts.tile_size, dt.dynamic, std::forward<F>(f));
427427
#else
428428
detail::ParallelFor(mf, ng, ts.tile_size, dt.dynamic, std::forward<F>(f));
429429
#endif

0 commit comments

Comments
 (0)