 #include <functional>
 #include <limits>
 
+#define AMREX_GPU_MAX_THREADS_REDUCE 256
+
 namespace amrex {
 
 namespace Reduce::detail {
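The hunks below make one mechanical change: every reduction-related kernel switches its block size from the general-purpose `AMREX_GPU_MAX_THREADS` to the new, reduction-specific `AMREX_GPU_MAX_THREADS_REDUCE` (256 threads). The value has to be a compile-time constant because the block-wide reduction sizes its scratch storage from the template parameter `MT`, and the launch configuration must use the very same thread count. The standalone CUDA C++ sketch below (not AMReX code; all names are illustrative) shows that coupling in miniature.

```cpp
// Standalone sketch, not AMReX code: why the reduction block size is a
// compile-time constant. The block-wide reduction sizes its shared memory
// from the template parameter MT, so the launch must use the same thread
// count -- the invariant the macro rename above preserves.
#include <cstdio>
#include <vector>

constexpr int MAX_THREADS_REDUCE = 256;   // stand-in for AMREX_GPU_MAX_THREADS_REDUCE

template <int MT>   // MT must be a power of two and equal blockDim.x
__device__ float block_reduce_sum (float x)
{
    __shared__ float sm[MT];              // shared memory sized by MT at compile time
    sm[threadIdx.x] = x;
    __syncthreads();
    for (int offset = MT/2; offset > 0; offset /= 2) {
        if (threadIdx.x < offset) { sm[threadIdx.x] += sm[threadIdx.x + offset]; }
        __syncthreads();
    }
    return sm[0];
}

template <int MT>
__global__ void sum_kernel (float const* v, int n, float* result)
{
    float r = 0.f;
    // grid-stride loop with stride MT*gridDim.x, the same pattern the diff
    // rewrites to use AMREX_GPU_MAX_THREADS_REDUCE
    for (int i = MT*blockIdx.x + threadIdx.x; i < n; i += MT*gridDim.x) {
        r += v[i];
    }
    r = block_reduce_sum<MT>(r);
    if (threadIdx.x == 0) { atomicAdd(result, r); }   // publish the block's partial sum
}

int main ()
{
    const int n = 1 << 20;
    std::vector<float> h(n, 1.0f);
    float *d_v, *d_r;
    cudaMalloc(&d_v, n*sizeof(float));
    cudaMalloc(&d_r, sizeof(float));
    cudaMemcpy(d_v, h.data(), n*sizeof(float), cudaMemcpyHostToDevice);
    cudaMemset(d_r, 0, sizeof(float));
    int nblocks = (n + MAX_THREADS_REDUCE - 1) / MAX_THREADS_REDUCE;
    // the block size in <<<...>>> must match the template argument
    sum_kernel<MAX_THREADS_REDUCE><<<nblocks, MAX_THREADS_REDUCE>>>(d_v, n, d_r);
    float r = 0.f;
    cudaMemcpy(&r, d_r, sizeof(float), cudaMemcpyDeviceToHost);
    std::printf("sum = %g (expected %d)\n", r, n);
    cudaFree(d_v); cudaFree(d_r);
}
```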
@@ -93,7 +95,7 @@ struct ReduceOpSum
         if (h.threadIdx() == 0) { d += r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceSum<MT>(s);
@@ -121,7 +123,7 @@ struct ReduceOpMin
         if (h.threadIdx() == 0) { d = amrex::min(d,r); }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMin<MT>(s);
@@ -154,7 +156,7 @@ struct ReduceOpMax
         if (h.threadIdx() == 0) { d = amrex::max(d,r); }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     void parallel_update (T& d, T const& s) const noexcept {
         T r = Gpu::blockReduceMax<MT>(s);
@@ -188,7 +190,7 @@ struct ReduceOpLogicalAnd
         if (h.threadIdx() == 0) { d = d && r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -220,7 +222,7 @@ struct ReduceOpLogicalOr
         if (h.threadIdx() == 0) { d = d || r; }
     }
 #else
-    template <typename T, int MT=AMREX_GPU_MAX_THREADS>
+    template <typename T, int MT=AMREX_GPU_MAX_THREADS_REDUCE>
     AMREX_GPU_DEVICE AMREX_FORCE_INLINE
     std::enable_if_t<std::is_integral<T>::value>
     parallel_update (T& d, T s) const noexcept {
@@ -372,7 +374,7 @@ public:
         const int nboxes = mf.local_size();
         if (nboxes > 0) {
             auto const& parforinfo = mf.getParForInfo(nghost);
-            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS);
+            auto nblocks_per_box = parforinfo.getNBlocksPerBox(AMREX_GPU_MAX_THREADS_REDUCE);
             AMREX_ASSERT(Long(nblocks_per_box)*Long(nboxes) < Long(std::numeric_limits<int>::max()));
             const int nblocks = nblocks_per_box * nboxes;
             const BoxIndexer* dp_boxes = parforinfo.getBoxes();
@@ -387,14 +389,14 @@ public:
 #ifdef AMREX_USE_SYCL
             // device reduce needs local(i.e., shared) memory
             constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-            amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+            amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
             [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
             {
                 Dim1 blockIdx {gh.blockIdx()};
                 Dim1 threadIdx{gh.threadIdx()};
 #else
-            amrex::launch_global<AMREX_GPU_MAX_THREADS>
-                <<<nblocks_ec, AMREX_GPU_MAX_THREADS, 0, stream>>>
+            amrex::launch_global<AMREX_GPU_MAX_THREADS_REDUCE>
+                <<<nblocks_ec, AMREX_GPU_MAX_THREADS_REDUCE, 0, stream>>>
             ([=] AMREX_GPU_DEVICE () noexcept
             {
 #endif
@@ -406,7 +408,7 @@ public:
                 }
                 for (int iblock = blockIdx.x; iblock < nblocks; iblock += nblocks_ec) {
                     int ibox = iblock / nblocks_per_box;
-                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS + threadIdx.x;
+                    auto icell = std::uint64_t(iblock-ibox*nblocks_per_box)*AMREX_GPU_MAX_THREADS_REDUCE + threadIdx.x;
 
                     BoxIndexer const& indexer = dp_boxes[ibox];
                     if (icell < indexer.numPts()) {
@@ -496,21 +498,21 @@ public:
         const auto lenx = len.x;
         IndexType ixtype = box.ixType();
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
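Before each of these launches, the grid is sized so that every thread covers roughly `nitems_per_thread` (= 4) cells of the grid-stride loop, then clamped by the reduce buffer's block capacity. A tiny host-side check of that arithmetic (illustrative values; `max_blocks` is a made-up stand-in for `reduce_data.maxBlocks()`):

```cpp
// Illustrative sizing arithmetic only; names mirror the diff above.
#include <algorithm>
#include <cstdio>

int main ()
{
    constexpr int block_size = 256;        // AMREX_GPU_MAX_THREADS_REDUCE
    constexpr int nitems_per_thread = 4;   // each thread targets ~4 cells
    int ncells = 100000;                   // e.g. number of cells in one box

    // ceil(ncells / (4*256)): 100000 cells with 1024 cells per block -> 98 blocks
    int nblocks_ec = (ncells + nitems_per_thread*block_size - 1)
        / (nitems_per_thread*block_size);

    int max_blocks = 4096;                 // stand-in for reduce_data.maxBlocks()
    nblocks_ec = std::min(nblocks_ec, max_blocks);

    std::printf("nblocks_ec = %d\n", nblocks_ec);   // prints 98
}
```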
@@ -520,7 +522,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
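Inside the kernel, each cell of the box is addressed by a flattened 1D index `icell`; `k` and `j` are recovered by dividing by `lenxy = len.x*len.y` and `lenx = len.x`. A short host-side round-trip check of that decomposition (illustrative extents, x-fastest ordering as in the loop above):

```cpp
// Host-side check of the flattened-index decomposition used in the loop above.
// The extents are arbitrary; icell runs over a lenx * leny * lenz box.
#include <cassert>
#include <cstdio>

int main ()
{
    const int lenx = 8, leny = 4, lenz = 3;    // arbitrary box extents
    const int lenxy = lenx*leny;
    const int ncells = lenxy*lenz;
    for (int icell = 0; icell < ncells; ++icell) {
        int k =  icell / lenxy;
        int j = (icell - k*lenxy) / lenx;
        int i = (icell - k*lenxy) - j*lenx;
        assert(i + lenx*(j + leny*k) == icell); // round-trips back to the flat index
    }
    std::printf("decomposition round-trips for %d cells\n", ncells);
}
```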
@@ -554,21 +556,21 @@ public:
         const auto lenxy = len.x*len.y;
         const auto lenx = len.x;
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (ncells + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -578,7 +580,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -610,21 +612,21 @@ public:
         auto dp = reduce_data.devicePtr(stream);
         int& nblocks = reduce_data.nBlocks(stream);
         constexpr int nitems_per_thread = 4;
-        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS-1)
-            / (nitems_per_thread*AMREX_GPU_MAX_THREADS);
+        int nblocks_ec = (n + nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE-1)
+            / (nitems_per_thread*AMREX_GPU_MAX_THREADS_REDUCE);
         nblocks_ec = std::min(nblocks_ec, reduce_data.maxBlocks());
         reduce_data.updateMaxStreamIndex(stream);
 #ifdef AMREX_USE_SYCL
         // device reduce needs local(i.e., shared) memory
         constexpr std::size_t shared_mem_bytes = sizeof(unsigned long long)*Gpu::Device::warp_size;
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             Dim1 blockIdx {gh.blockIdx()};
             Dim1 threadIdx{gh.threadIdx()};
             Dim1 gridDim {gh.gridDim()};
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(nblocks_ec, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(nblocks_ec, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
 #endif
@@ -634,7 +636,7 @@ public:
             if (threadIdx.x == 0 && static_cast<int>(blockIdx.x) >= nblocks) {
                 dst = r;
             }
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n; i += stride) {
                 auto pr = f(i);
                 Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r,pr);
@@ -689,7 +691,7 @@ public:
 #else
         auto presult = hp;
 #endif
-        amrex::launch<AMREX_GPU_MAX_THREADS>(1, shared_mem_bytes, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, shared_mem_bytes, stream,
         [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept
         {
             ReduceTuple r;
@@ -709,15 +711,15 @@ public:
         Gpu::dtoh_memcpy_async(hp, dtmp.data(), sizeof(ReduceTuple));
 #endif
 #else
-        amrex::launch<AMREX_GPU_MAX_THREADS>(1, 0, stream,
+        amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(1, 0, stream,
         [=] AMREX_GPU_DEVICE () noexcept
         {
             ReduceTuple r;
             Reduce::detail::for_each_init<0, ReduceTuple, Ps...>(r);
             ReduceTuple dst = r;
             for (int istream = 0, nstreams = nblocks.size(); istream < nstreams; ++istream) {
                 auto dp_stream = dp+istream*maxblocks;
-                for (int i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+                for (int i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                      i < nblocks[istream]; i += stride) {
                     Reduce::detail::for_each_local<0, ReduceTuple, Ps...>(r, dp_stream[i]);
                 }
@@ -851,7 +853,7 @@ bool AnyOf (N n, T const* v, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -860,7 +862,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -872,7 +874,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         }
     });
 #else
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, 0, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, 0, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
         if (threadIdx.x == 0) { has_any = *dp; }
@@ -881,7 +883,7 @@ bool AnyOf (N n, T const* v, P const& pred)
         if (!has_any)
         {
             int r = false;
-            for (N i = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (N i = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  i < n && !r; i += stride)
             {
                 r = pred(v[i]) ? 1 : 0;
@@ -912,7 +914,7 @@ bool AnyOf (Box const& box, P const& pred)
 #ifdef AMREX_USE_SYCL
     const int num_ints = std::max(Gpu::Device::warp_size, int(ec.numThreads.x)/Gpu::Device::warp_size) + 1;
     const std::size_t shared_mem_bytes = num_ints*sizeof(int);
-    amrex::launch<AMREX_GPU_MAX_THREADS>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
+    amrex::launch<AMREX_GPU_MAX_THREADS_REDUCE>(ec.numBlocks.x, shared_mem_bytes, Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE (Gpu::Handler const& gh) noexcept {
         int* has_any = &(static_cast<int*>(gh.sharedMemory())[num_ints-1]);
         if (gh.threadIdx() == 0) { *has_any = *dp; }
@@ -921,7 +923,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!(*has_any))
         {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS*gh.gridDim();
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*gh.blockIdx()+gh.threadIdx(), stride = AMREX_GPU_MAX_THREADS_REDUCE*gh.gridDim();
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
@@ -937,7 +939,7 @@ bool AnyOf (Box const& box, P const& pred)
         }
     });
 #else
-    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS, ec.numBlocks, ec.numThreads, 0,
+    AMREX_LAUNCH_KERNEL(AMREX_GPU_MAX_THREADS_REDUCE, ec.numBlocks, ec.numThreads, 0,
                         Gpu::gpuStream(),
     [=] AMREX_GPU_DEVICE () noexcept {
         __shared__ int has_any;
@@ -947,7 +949,7 @@ bool AnyOf (Box const& box, P const& pred)
         if (!has_any)
        {
             int r = false;
-            for (int icell = AMREX_GPU_MAX_THREADS*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS*gridDim.x;
+            for (int icell = AMREX_GPU_MAX_THREADS_REDUCE*blockIdx.x+threadIdx.x, stride = AMREX_GPU_MAX_THREADS_REDUCE*gridDim.x;
                  icell < ncells && !r; icell += stride) {
                 int k = icell / lenxy;
                 int j = (icell - k*lenxy) / lenx;
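The `AnyOf` kernels use an early-exit pattern: a per-block shared flag is seeded from the device-side result, every thread scans with a grid-stride loop, and the loop stops as soon as a match is found. The standalone CUDA C++ sketch below (not AMReX code; it publishes the hit with an atomic instead of AMReX's block reduction) illustrates that pattern with the same 256-thread stride.

```cpp
// Standalone sketch of the AnyOf early-exit pattern (not AMReX code).
#include <cstdio>

constexpr int MAX_THREADS_REDUCE = 256;   // stand-in for AMREX_GPU_MAX_THREADS_REDUCE

__global__ void any_of_kernel (float const* v, int n, float threshold, int* d_found)
{
    __shared__ int has_any;
    if (threadIdx.x == 0) { has_any = *d_found; }   // skip the block if already found
    __syncthreads();
    if (!has_any) {
        int r = 0;
        // grid-stride scan that stops for this thread as soon as it finds a match
        for (int i = MAX_THREADS_REDUCE*blockIdx.x + threadIdx.x,
                 stride = MAX_THREADS_REDUCE*gridDim.x;
             i < n && !r; i += stride)
        {
            r = (v[i] > threshold) ? 1 : 0;         // the predicate
        }
        if (r) { atomicAdd(d_found, 1); }           // publish the hit
    }
}

int main ()
{
    const int n = 1 << 16;
    float* d_v; int* d_found;
    cudaMalloc(&d_v, n*sizeof(float));
    cudaMalloc(&d_found, sizeof(int));
    cudaMemset(d_v, 0, n*sizeof(float));            // all zeros: nothing exceeds 0.5
    cudaMemset(d_found, 0, sizeof(int));
    int nblocks = (n + MAX_THREADS_REDUCE - 1) / MAX_THREADS_REDUCE;
    any_of_kernel<<<nblocks, MAX_THREADS_REDUCE>>>(d_v, n, 0.5f, d_found);
    int found = 0;
    cudaMemcpy(&found, d_found, sizeof(int), cudaMemcpyDeviceToHost);
    std::printf("any element > 0.5? %s\n", found ? "yes" : "no");
    cudaFree(d_v); cudaFree(d_found);
}
```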