@@ -573,8 +573,15 @@ struct policy_hub
573573 static constexpr int num_threads_per_warp = 32 ;
574574
575575 // TODO(bgruber): tune this
576- static constexpr int num_reduce_warps = 4 ;
577- static constexpr int num_scan_stor_warps = 4 ;
576+ # if _CCCL_COMPILER(NVHPC)
577+ // With NVHPC, the total number of threads must stay <= 256, so each thread can use up to 255 registers. This avoids
578+ // an error in ptxas, see also: https://github.com/NVIDIA/cccl/issues/7700.
579+ static constexpr int num_reduce_warps = 2 ;
580+ static constexpr int num_scan_stor_warps = 2 ;
581+ # else // _CCCL_COMPILER(NVHPC)
582+ static constexpr int num_reduce_warps = 4 ;
583+ static constexpr int num_scan_stor_warps = 4 ;
584+ # endif // _CCCL_COMPILER(NVHPC)
578585 static constexpr int num_load_warps = 1 ;
579586 static constexpr int num_sched_warps = 1 ;
580587 static constexpr int num_look_ahead_warps = 1 ;
@@ -587,6 +594,10 @@ struct policy_hub
587594 num_reduce_warps + num_scan_stor_warps + num_load_warps + num_sched_warps + num_look_ahead_warps;
588595 static constexpr int num_total_threads = num_total_warps * num_threads_per_warp;
589596
597+ # if _CCCL_COMPILER(NVHPC)
598+ static_assert (num_total_threads <= 256 );
599+ # endif // _CCCL_COMPILER(NVHPC)
600+
590601 static constexpr int squad_reduce_thread_count = num_reduce_warps * num_threads_per_warp;
591602
592603 // 256 / sizeof(InputValueT) - 1 should minimize bank conflicts (and fits into 48KiB SMEM)
0 commit comments