@@ -68,7 +68,7 @@ void copy_sweep_grid_shape(nvbench::state &state)
6868 state.add_global_memory_writes <nvbench::int32_t >(num_values);
6969
7070 // Allocate device memory:
71- thrust::device_vector<nvbench::int32_t > in (num_values, 0 );
71+ thrust::device_vector<nvbench::int32_t > in (num_values, 1 );
7272 thrust::device_vector<nvbench::int32_t > out (num_values, 0 );
7373
7474 state.exec ([block_size,
@@ -102,7 +102,7 @@ void copy_type_sweep(nvbench::state &state, nvbench::type_list<ValueType>)
102102 state.add_global_memory_writes <ValueType>(num_values);
103103
104104 // Allocate device memory:
105- thrust::device_vector<ValueType> in (num_values, 0 );
105+ thrust::device_vector<ValueType> in (num_values, ValueType{ 17 } );
106106 thrust::device_vector<ValueType> out (num_values, 0 );
107107
108108 state.exec ([num_values,
@@ -171,3 +171,42 @@ using ctcs_types = nvbench::type_list<nvbench::int8_t,
171171 nvbench::float64_t >;
172172NVBENCH_BENCH_TYPES (copy_type_conversion_sweep, NVBENCH_TYPE_AXES(ctcs_types, ctcs_types))
173173 .set_type_axes_names({" In" , " Out" });
174+
175+ // ==================================================================================
176+ // Passing list of typenames and `enum_type_list` to build cartesian product
177+ // of typenames and integral constants
178+
// Constant-wrapper helper: lifts the compile-time value `Value` into a type.
// `cw_t<Value>` names `std::integral_constant<decltype(Value), Value>`; the
// second template argument can be supplied to pick a different value type.
template <auto Value, typename ValueType = decltype(Value)>
using cw_t = std::integral_constant<ValueType, Value>;
182+
183+ template <typename ValueT, unsigned BLOCK_DIM>
184+ void copy_type_and_block_size_sweep (nvbench::state &state,
185+ nvbench::type_list<ValueT, cw_t <BLOCK_DIM>>)
186+ {
187+ const std::size_t nelems = 256 * 1024 * 1024 / sizeof (ValueT);
188+ ValueT fill_value{42 };
189+ thrust::device_vector<ValueT> inp (nelems, fill_value);
190+ thrust::device_vector<ValueT> out (nelems, ValueT{});
191+
192+ // use cuda::ceil_div(nelems, BLOCK_DIM) with CCCL 2.8 and newer
193+ const auto gridSize = (nelems + BLOCK_DIM - 1 ) / BLOCK_DIM;
194+
195+ const ValueT *inp_p = thrust::raw_pointer_cast (inp.data ());
196+ ValueT *out_p = thrust::raw_pointer_cast (out.data ());
197+
198+ state.add_element_count (nelems, " ElementCount" );
199+ state.add_global_memory_reads <ValueT>(nelems, " Input" );
200+ state.add_global_memory_writes <ValueT>(nelems, " Output" );
201+
202+ state.exec ([&](nvbench::launch &launch) {
203+ nvbench::copy_kernel<<<gridSize, BLOCK_DIM, 0 , launch.get_stream()>>> (inp_p, out_p, nelems);
204+ });
205+ }
206+
207+ template <auto ... V>
208+ using cw_list = nvbench::type_list<cw_t <V>...>;
209+
210+ using block_sizes = cw_list<64u , 128u , 196u , 256u , 320u , 512u >;
211+ NVBENCH_BENCH_TYPES (copy_type_and_block_size_sweep, NVBENCH_TYPE_AXES(ctcs_types, block_sizes))
212+ .set_type_axes_names({" Type" , " BlockSize" });
0 commit comments