88import numpy as np
99
1010
11+ class CCCLStream :
12+ "Class to work around https://github.com/NVIDIA/cccl/issues/5144"
13+
14+ def __init__ (self , ptr ):
15+ self ._ptr = ptr
16+
17+ def __cuda_stream__ (self ):
18+ return (0 , self ._ptr )
19+
20+
1121def as_core_Stream (cs : nvbench .CudaStream ) -> core .Stream :
1222 return core .Stream .from_handle (cs .addressof ())
1323
1424
25+ def as_cccl_Stream (cs : nvbench .CudaStream ) -> CCCLStream :
26+ return CCCLStream (cs .addressof ())
27+
28+
29+ def as_cp_ExternalStream (
30+ cs : nvbench .CudaStream , dev_id : int = - 1
31+ ) -> cp .cuda .ExternalStream :
32+ h = cs .addressof ()
33+ return cp .cuda .ExternalStream (h , dev_id )
34+
35+
1536def segmented_reduce (state : nvbench .State ):
1637 "Benchmark segmented_reduce example"
1738 n_elems = state .getInt64 ("numElems" )
@@ -21,8 +42,12 @@ def segmented_reduce(state: nvbench.State):
2142 state .add_summary ("numRows" , n_rows )
2243 state .collectCUPTIMetrics ()
2344
24- rng = cp .random .default_rng ()
25- mat = rng .integers (low = - 31 , high = 32 , dtype = np .int32 , size = (n_rows , n_cols ))
45+ dev_id = state .getDevice ()
46+ cp_stream = as_cp_ExternalStream (state .getStream (), dev_id )
47+
48+ with cp_stream :
49+ rng = cp .random .default_rng ()
50+ mat = rng .integers (low = - 31 , high = 32 , dtype = np .int32 , size = (n_rows , n_cols ))
2651
2752 def add_op (a , b ):
2853 return a + b
@@ -41,22 +66,29 @@ def scale(row_id):
4166
4267 end_offsets = start_offsets + 1
4368
44- d_input = mat
4569 h_init = np .zeros (tuple (), dtype = np .int32 )
46- d_output = cp .empty (n_rows , dtype = d_input .dtype )
70+ with cp_stream :
71+ d_input = mat
72+ d_output = cp .empty (n_rows , dtype = d_input .dtype )
4773
4874 alg = algorithms .segmented_reduce (
4975 d_input , d_output , start_offsets , end_offsets , add_op , h_init
5076 )
5177
78+ # print(1)
79+ cccl_stream = as_cccl_Stream (state .getStream ())
80+ # print(2, core_stream, core_stream.__cuda_stream__())
5281 # query size of temporary storage and allocate
5382 temp_nbytes = alg (
54- None , d_input , d_output , n_rows , start_offsets , end_offsets , h_init
83+ None , d_input , d_output , n_rows , start_offsets , end_offsets , h_init , cccl_stream
5584 )
56- temp_storage = cp .empty (temp_nbytes , dtype = cp .uint8 )
85+ h_init = np .zeros (tuple (), dtype = np .int32 )
86+ # print(3)
87+ with cp_stream :
88+ temp_storage = cp .empty (temp_nbytes , dtype = cp .uint8 )
5789
5890 def launcher (launch : nvbench .Launch ):
59- s = as_core_Stream (launch .getStream ())
91+ s = as_cccl_Stream (launch .getStream ())
6092 alg (
6193 temp_storage ,
6294 d_input ,
0 commit comments