Skip to content

Commit ed1e146

Browse files
Reduce warpspeed scan to 256 threads/block on NVHPC (#7892)
Fixes: #7700
1 parent 29ebe15 commit ed1e146

File tree

1 file changed

+13
-2
lines changed

1 file changed

+13
-2
lines changed

cub/cub/device/dispatch/tuning/tuning_scan.cuh

Lines changed: 13 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -573,8 +573,15 @@ struct policy_hub
573573
static constexpr int num_threads_per_warp = 32;
574574

575575
// TODO(bgruber): tune this
576-
static constexpr int num_reduce_warps = 4;
577-
static constexpr int num_scan_stor_warps = 4;
576+
# if _CCCL_COMPILER(NVHPC)
577+
// need to reduce the number of threads to <= 256, so each thread can use up to 255 registers. This avoids an
578+
// error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700.
579+
static constexpr int num_reduce_warps = 2;
580+
static constexpr int num_scan_stor_warps = 2;
581+
# else // _CCCL_COMPILER(NVHPC)
582+
static constexpr int num_reduce_warps = 4;
583+
static constexpr int num_scan_stor_warps = 4;
584+
# endif // _CCCL_COMPILER(NVHPC)
578585
static constexpr int num_load_warps = 1;
579586
static constexpr int num_sched_warps = 1;
580587
static constexpr int num_look_ahead_warps = 1;
@@ -587,6 +594,10 @@ struct policy_hub
587594
num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
588595
static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
589596

597+
# if _CCCL_COMPILER(NVHPC)
598+
static_assert(num_total_threads <= 256);
599+
# endif // _CCCL_COMPILER(NVHPC)
600+
590601
static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
591602

592603
// 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)

0 commit comments

Comments
 (0)