-
Notifications
You must be signed in to change notification settings - Fork 359
Expand file tree
/
Copy pathbasic.cu
More file actions
95 lines (82 loc) · 3.82 KB
/
basic.cu
File metadata and controls
95 lines (82 loc) · 3.82 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
//===----------------------------------------------------------------------===//
//
// Part of CUDA Experimental in CUDA C++ Core Libraries,
// under the Apache License v2.0 with LLVM Exceptions.
// See https://llvm.org/LICENSE.txt for license information.
// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
// SPDX-FileCopyrightText: Copyright (c) 2026 NVIDIA CORPORATION & AFFILIATES.
//
//===----------------------------------------------------------------------===//
#include <thrust/device_vector.h>
#include <thrust/execution_policy.h>
#include <thrust/merge.h>
#include <thrust/sort.h>
#include <cuda/memory_pool>
#include <cuda/std/execution>
#include <cuda/stream>
#include "nvbench_helper.cuh"
template <typename T>
static void basic(nvbench::state& state, nvbench::type_list<T>)
{
  // Benchmark cuda::std::merge with the default (ascending) ordering.
  // Problem-size knobs come from the nvbench axes registered below.
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto ratio        = static_cast<std::size_t>(state.get_int64("InputSizeRatio"));
  const auto bit_entropy  = str_to_entropy(state.get_string("Entropy"));

  // Split point: the first `lhs_size` elements form the left run, the
  // remainder the right run. `ratio` is a percentage of the total size.
  const auto lhs_size = static_cast<std::size_t>(static_cast<double>(ratio * num_elements) / 100.0);

  thrust::device_vector<T> input = generate(num_elements, bit_entropy);
  thrust::device_vector<T> result(num_elements);

  // merge requires both input runs to be sorted under the merge ordering.
  thrust::sort(input.begin(), input.begin() + lhs_size);
  thrust::sort(input.begin() + lhs_size, input.end());

  // Every element is read once (across both runs) and written once.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::merge(cuda_policy(alloc, launch),
                                input.cbegin(),
                                input.cbegin() + lhs_size,
                                input.cbegin() + lhs_size,
                                input.cend(),
                                result.begin());
             });
}
// Register the "base" benchmark over all fundamental types, sweeping:
//  - Elements: 2^16 .. 2^28 (step of 2^4)
//  - Entropy: fully random keys ("1.000") vs. low-entropy keys ("0.201")
//  - InputSizeRatio: percentage of elements placed in the left input run
NVBENCH_BENCH_TYPES(basic, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("base")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.201"})
.add_int64_axis("InputSizeRatio", {25, 50, 75});
template <typename T>
static void with_comp(nvbench::state& state, nvbench::type_list<T>)
{
  // Benchmark cuda::std::merge with a user-supplied comparator
  // (::cuda::std::greater<T>, i.e. descending order).
  const auto num_elements = static_cast<std::size_t>(state.get_int64("Elements"));
  const auto ratio        = static_cast<std::size_t>(state.get_int64("InputSizeRatio"));
  const auto bit_entropy  = str_to_entropy(state.get_string("Entropy"));

  // Split point: first `lhs_size` elements are the left run, the rest the
  // right run. `ratio` is a percentage of the total element count.
  const auto lhs_size = static_cast<std::size_t>(static_cast<double>(ratio * num_elements) / 100.0);

  thrust::device_vector<T> input = generate(num_elements, bit_entropy);
  thrust::device_vector<T> result(num_elements);

  // Both runs must be sorted with the same comparator the merge uses.
  thrust::sort(input.begin(), input.begin() + lhs_size, ::cuda::std::greater<T>{});
  thrust::sort(input.begin() + lhs_size, input.end(), ::cuda::std::greater<T>{});

  // Every element is read once (across both runs) and written once.
  state.add_element_count(num_elements);
  state.add_global_memory_reads<T>(num_elements);
  state.add_global_memory_writes<T>(num_elements);

  caching_allocator_t alloc;
  state.exec(nvbench::exec_tag::gpu | nvbench::exec_tag::no_batch | nvbench::exec_tag::sync,
             [&](nvbench::launch& launch) {
               cuda::std::merge(cuda_policy(alloc, launch),
                                input.cbegin(),
                                input.cbegin() + lhs_size,
                                input.cbegin() + lhs_size,
                                input.cend(),
                                result.begin(),
                                ::cuda::std::greater<T>{});
             });
}
// Register the "with_comp" benchmark (merge with an explicit greater<T>
// comparator) over all fundamental types, using the same axis sweep as
// the "base" benchmark so the two are directly comparable.
NVBENCH_BENCH_TYPES(with_comp, NVBENCH_TYPE_AXES(fundamental_types))
.set_name("with_comp")
.set_type_axes_names({"T{ct}"})
.add_int64_power_of_two_axis("Elements", nvbench::range(16, 28, 4))
.add_string_axis("Entropy", {"1.000", "0.201"})
.add_int64_axis("InputSizeRatio", {25, 50, 75});