Skip to content

Commit 1bbe440

Browse files
authored
Add keep option to distinct nvbench (#16497)
This PR adopts some work from @srinivasyadav18 with additional modifications. This is meant to complement #16484. Authors: - Bradley Dice (https://github.com/bdice) - Srinivas Yadav (https://github.com/srinivasyadav18) Approvers: - Yunsong Wang (https://github.com/PointKernel) - Srinivas Yadav (https://github.com/srinivasyadav18) URL: #16497
1 parent 792dd06 commit 1bbe440

File tree

5 files changed

+113
-32
lines changed

5 files changed

+113
-32
lines changed

cpp/benchmarks/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -162,6 +162,7 @@ ConfigureNVBench(
162162
stream_compaction/distinct.cpp
163163
stream_compaction/distinct_count.cpp
164164
stream_compaction/stable_distinct.cpp
165+
stream_compaction/stream_compaction_common.cpp
165166
stream_compaction/unique.cpp
166167
stream_compaction/unique_count.cpp
167168
)

cpp/benchmarks/stream_compaction/distinct.cpp

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2020-2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
1515
*/
1616

1717
#include <benchmarks/common/generate_input.hpp>
18+
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
1819

1920
#include <cudf/column/column_view.hpp>
2021
#include <cudf/lists/list_view.hpp>
@@ -23,15 +24,29 @@
2324

2425
#include <nvbench/nvbench.cuh>
2526

27+
#include <limits>
28+
2629
NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
2730

2831
template <typename Type>
2932
void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
3033
{
31-
cudf::size_type const num_rows = state.get_int64("NumRows");
34+
cudf::size_type const num_rows = state.get_int64("NumRows");
35+
auto const keep = get_keep(state.get_string("keep"));
36+
cudf::size_type const cardinality = state.get_int64("cardinality");
37+
38+
if (cardinality > num_rows) {
39+
state.skip("cardinality > num_rows");
40+
return;
41+
}
3242

33-
data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
34-
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
43+
data_profile profile = data_profile_builder()
44+
.cardinality(cardinality)
45+
.null_probability(0.01)
46+
.distribution(cudf::type_to_id<Type>(),
47+
distribution_id::UNIFORM,
48+
static_cast<Type>(0),
49+
std::numeric_limits<Type>::max());
3550

3651
auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
3752

@@ -40,27 +55,27 @@ void nvbench_distinct(nvbench::state& state, nvbench::type_list<Type>)
4055

4156
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
4257
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
43-
auto result = cudf::distinct(input_table,
44-
{0},
45-
cudf::duplicate_keep_option::KEEP_ANY,
46-
cudf::null_equality::EQUAL,
47-
cudf::nan_equality::ALL_EQUAL);
58+
auto result = cudf::distinct(
59+
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
4860
});
4961
}
5062

51-
using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
63+
using data_type = nvbench::type_list<int32_t, int64_t>;
5264

5365
NVBENCH_BENCH_TYPES(nvbench_distinct, NVBENCH_TYPE_AXES(data_type))
5466
.set_name("distinct")
5567
.set_type_axes_names({"Type"})
56-
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
68+
.add_string_axis("keep", {"any", "first", "last", "none"})
69+
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
70+
.add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
5771

5872
template <typename Type>
5973
void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
6074
{
6175
auto const size = state.get_int64("ColumnSize");
6276
auto const dtype = cudf::type_to_id<Type>();
6377
double const null_probability = state.get_float64("null_probability");
78+
auto const keep = get_keep(state.get_string("keep"));
6479

6580
auto builder = data_profile_builder().null_probability(null_probability);
6681
if (dtype == cudf::type_id::LIST) {
@@ -80,17 +95,15 @@ void nvbench_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
8095

8196
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
8297
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
83-
auto result = cudf::distinct(*table,
84-
{0},
85-
cudf::duplicate_keep_option::KEEP_ANY,
86-
cudf::null_equality::EQUAL,
87-
cudf::nan_equality::ALL_EQUAL);
98+
auto result =
99+
cudf::distinct(*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
88100
});
89101
}
90102

91103
NVBENCH_BENCH_TYPES(nvbench_distinct_list,
92104
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
93105
.set_name("distinct_list")
94106
.set_type_axes_names({"Type"})
107+
.add_string_axis("keep", {"any", "first", "last", "none"})
95108
.add_float64_axis("null_probability", {0.0, 0.1})
96109
.add_int64_axis("ColumnSize", {100'000'000});

cpp/benchmarks/stream_compaction/stable_distinct.cpp

Lines changed: 29 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
/*
2-
* Copyright (c) 2023, NVIDIA CORPORATION.
2+
* Copyright (c) 2023-2024, NVIDIA CORPORATION.
33
*
44
* Licensed under the Apache License, Version 2.0 (the "License");
55
* you may not use this file except in compliance with the License.
@@ -15,6 +15,7 @@
1515
*/
1616

1717
#include <benchmarks/common/generate_input.hpp>
18+
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
1819

1920
#include <cudf/column/column_view.hpp>
2021
#include <cudf/lists/list_view.hpp>
@@ -23,15 +24,29 @@
2324

2425
#include <nvbench/nvbench.cuh>
2526

27+
#include <limits>
28+
2629
NVBENCH_DECLARE_TYPE_STRINGS(cudf::timestamp_ms, "cudf::timestamp_ms", "cudf::timestamp_ms");
2730

2831
template <typename Type>
2932
void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
3033
{
31-
cudf::size_type const num_rows = state.get_int64("NumRows");
34+
cudf::size_type const num_rows = state.get_int64("NumRows");
35+
auto const keep = get_keep(state.get_string("keep"));
36+
cudf::size_type const cardinality = state.get_int64("cardinality");
37+
38+
if (cardinality > num_rows) {
39+
state.skip("cardinality > num_rows");
40+
return;
41+
}
3242

33-
data_profile profile = data_profile_builder().cardinality(0).null_probability(0.01).distribution(
34-
cudf::type_to_id<Type>(), distribution_id::UNIFORM, 0, 100);
43+
data_profile profile = data_profile_builder()
44+
.cardinality(cardinality)
45+
.null_probability(0.01)
46+
.distribution(cudf::type_to_id<Type>(),
47+
distribution_id::UNIFORM,
48+
static_cast<Type>(0),
49+
std::numeric_limits<Type>::max());
3550

3651
auto source_column = create_random_column(cudf::type_to_id<Type>(), row_count{num_rows}, profile);
3752

@@ -40,27 +55,27 @@ void nvbench_stable_distinct(nvbench::state& state, nvbench::type_list<Type>)
4055

4156
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
4257
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
43-
auto result = cudf::stable_distinct(input_table,
44-
{0},
45-
cudf::duplicate_keep_option::KEEP_ANY,
46-
cudf::null_equality::EQUAL,
47-
cudf::nan_equality::ALL_EQUAL);
58+
auto result = cudf::stable_distinct(
59+
input_table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
4860
});
4961
}
5062

51-
using data_type = nvbench::type_list<bool, int8_t, int32_t, int64_t, float, cudf::timestamp_ms>;
63+
using data_type = nvbench::type_list<int32_t, int64_t>;
5264

5365
NVBENCH_BENCH_TYPES(nvbench_stable_distinct, NVBENCH_TYPE_AXES(data_type))
5466
.set_name("stable_distinct")
5567
.set_type_axes_names({"Type"})
56-
.add_int64_axis("NumRows", {10'000, 100'000, 1'000'000, 10'000'000});
68+
.add_string_axis("keep", {"any", "first", "last", "none"})
69+
.add_int64_axis("cardinality", {100, 100'000, 10'000'000, 1'000'000'000})
70+
.add_int64_axis("NumRows", {100, 100'000, 10'000'000, 1'000'000'000});
5771

5872
template <typename Type>
5973
void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type>)
6074
{
6175
auto const size = state.get_int64("ColumnSize");
6276
auto const dtype = cudf::type_to_id<Type>();
6377
double const null_probability = state.get_float64("null_probability");
78+
auto const keep = get_keep(state.get_string("keep"));
6479

6580
auto builder = data_profile_builder().null_probability(null_probability);
6681
if (dtype == cudf::type_id::LIST) {
@@ -80,17 +95,15 @@ void nvbench_stable_distinct_list(nvbench::state& state, nvbench::type_list<Type
8095

8196
state.set_cuda_stream(nvbench::make_cuda_stream_view(cudf::get_default_stream().value()));
8297
state.exec(nvbench::exec_tag::sync, [&](nvbench::launch& launch) {
83-
auto result = cudf::stable_distinct(*table,
84-
{0},
85-
cudf::duplicate_keep_option::KEEP_ANY,
86-
cudf::null_equality::EQUAL,
87-
cudf::nan_equality::ALL_EQUAL);
98+
auto result = cudf::stable_distinct(
99+
*table, {0}, keep, cudf::null_equality::EQUAL, cudf::nan_equality::ALL_EQUAL);
88100
});
89101
}
90102

91103
NVBENCH_BENCH_TYPES(nvbench_stable_distinct_list,
92104
NVBENCH_TYPE_AXES(nvbench::type_list<int32_t, cudf::list_view>))
93105
.set_name("stable_distinct_list")
94106
.set_type_axes_names({"Type"})
107+
.add_string_axis("keep", {"any", "first", "last", "none"})
95108
.add_float64_axis("null_probability", {0.0, 0.1})
96109
.add_int64_axis("ColumnSize", {100'000'000});
cpp/benchmarks/stream_compaction/stream_compaction_common.cpp

Lines changed: 35 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,35 @@
1+
/*
2+
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <benchmarks/stream_compaction/stream_compaction_common.hpp>
18+
19+
#include <cudf/stream_compaction.hpp>
20+
#include <cudf/utilities/error.hpp>
21+
22+
/**
 * @brief Translate a keep-option name into a cudf::duplicate_keep_option.
 *
 * The recognized names are "any", "first", "last" and "none"; each one maps to
 * the KEEP_* enumerator of the same name. The comparison is an exact,
 * case-sensitive std::string equality check.
 *
 * @param keep_str Name of the keep option to translate.
 * @return The cudf::duplicate_keep_option enumerator matching `keep_str`.
 *
 * Any other string is rejected through CUDF_FAIL("Unsupported keep option.").
 */
cudf::duplicate_keep_option get_keep(std::string const& keep_str)
23+
{
24+
if (keep_str == "any") {
25+
return cudf::duplicate_keep_option::KEEP_ANY;
26+
} else if (keep_str == "first") {
27+
return cudf::duplicate_keep_option::KEEP_FIRST;
28+
} else if (keep_str == "last") {
29+
return cudf::duplicate_keep_option::KEEP_LAST;
30+
} else if (keep_str == "none") {
31+
return cudf::duplicate_keep_option::KEEP_NONE;
32+
} else {
33+
// No recognized name matched: report the error rather than silently picking a default.
CUDF_FAIL("Unsupported keep option.");
34+
}
35+
}
cpp/benchmarks/stream_compaction/stream_compaction_common.hpp

Lines changed: 19 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,19 @@
1+
/*
2+
* Copyright (c) 2020-2024, NVIDIA CORPORATION.
3+
*
4+
* Licensed under the Apache License, Version 2.0 (the "License");
5+
* you may not use this file except in compliance with the License.
6+
* You may obtain a copy of the License at
7+
*
8+
* http://www.apache.org/licenses/LICENSE-2.0
9+
*
10+
* Unless required by applicable law or agreed to in writing, software
11+
* distributed under the License is distributed on an "AS IS" BASIS,
12+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
* See the License for the specific language governing permissions and
14+
* limitations under the License.
15+
*/
16+
17+
#include <cudf/stream_compaction.hpp>
18+
19+
/**
 * @brief Map a keep-option name to the corresponding cudf::duplicate_keep_option.
 *
 * @param keep_str Name of the keep option.
 * @return The cudf::duplicate_keep_option selected by `keep_str`.
 */
// NOTE(review): std::string is used here, but the only include visible in this
// header is <cudf/stream_compaction.hpp>; presumably <string> arrives
// transitively. Consider including <string> directly. TODO confirm.
cudf::duplicate_keep_option get_keep(std::string const& keep_str);

0 commit comments

Comments
 (0)