NVIDIA
diff --git a/‎docs/benchmarks.md‎
Lines changed: 31 additions & 6 deletions b/‎docs/benchmarks.md‎
Lines changed: 31 additions & 6 deletions
diff --git a/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎examples/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎examples/custom_iteration_spaces.cu‎
Lines changed: 247 additions & 0 deletions b/‎examples/custom_iteration_spaces.cu‎
Lines changed: 247 additions & 0 deletions
diff --git a/‎nvbench/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions b/‎nvbench/CMakeLists.txt‎
Lines changed: 1 addition & 0 deletions
diff --git a/‎nvbench/axes_metadata.cuh‎
Lines changed: 28 additions & 1 deletion b/‎nvbench/axes_metadata.cuh‎
Lines changed: 28 additions & 1 deletion
@@ -237,9 +237,34 @@ NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types))
 ```
 
 This would generate a total of 36 configurations and instantiate the benchmark 6
-times. Keep the rapid growth of these combinations in mind when choosing the
-number of values in an axis. See the section about combinatorial explosion for
-more examples and information.
+times.
+
+Keep the rapid growth of combinations due to multiple parameter axes in mind when
+choosing the number of values in an axis. See the section about combinatorial
+explosion for more examples and information.
+
+## Zipped/Tied Iteration of Value Axes
+
+At times multiple value axes need to be iterated in lockstep, as if they were a
+single axis of tuples (i.e. zipped together), rather than as a cartesian product.
+To enable this behavior you can request that the axes be 'tied' together.
+
+```cpp
+// InputTypes: {char, int, unsigned int}
+// OutputTypes: {float, double}
+// NumInputs: {1000, 10000, 100000, 200000, 200000, 200000}
+// Quality: {0.05, 0.1, 0.25, 0.5, 0.75, 1.}
+
+using input_types = nvbench::type_list<char, int, unsigned int>;
+using output_types = nvbench::type_list<float, double>;
+NVBENCH_BENCH_TYPES(benchmark, NVBENCH_TYPE_AXES(input_types, output_types))
+  .set_type_axes_names({"InputType", "OutputType"})
+  .add_int64_axis("NumInputs", {1000, 10000, 100000, 200000, 200000, 200000})
+  .add_float64_axis("Quality", {0.05, 0.1, 0.25, 0.5, 0.75, 1.})
+  .tie_axes({"NumInputs", "Quality"});
+```
+
+Tying these two axes reduces the total number of configurations from 216
+(3 * 2 * 6 * 6) to 36 (3 * 2 * 6), reducing the combinatorial explosion.
 
 # Throughput Measurements
 
@@ -426,9 +451,9 @@ NVBENCH_BENCH_TYPES(my_benchmark,
 ```
 
 For large configuration spaces like this, pruning some of the less useful
-combinations (e.g. `sizeof(init_type) < sizeof(output)`) using the techniques
-described in the "Skip Uninteresting / Invalid Benchmarks" section can help
-immensely with keeping compile / run times manageable.
+combinations using the techniques described in the "Zipped/Tied Iteration of Value Axes"
+or "Skip Uninteresting / Invalid Benchmarks" sections can help immensely with
+keeping compile / run times manageable.
 
 Splitting a single large configuration space into multiple, more focused
 benchmarks with reduced dimensionality will likely be worth the effort as well.
@@ -7,6 +7,7 @@ set(example_srcs
   stream.cu
   throughput.cu
   auto_throughput.cu
+  custom_iteration_spaces.cu
 )
 
 # Metatarget for all examples:
 
@@ -0,0 +1,247 @@
+/*
+ *  Copyright 2021 NVIDIA Corporation
+ *
+ *  Licensed under the Apache License, Version 2.0 with the LLVM exception
+ *  (the "License"); you may not use this file except in compliance with
+ *  the License.
+ *
+ *  You may obtain a copy of the License at
+ *
+ *      http://llvm.org/foundation/relicensing/LICENSE.txt
+ *
+ *  Unless required by applicable law or agreed to in writing, software
+ *  distributed under the License is distributed on an "AS IS" BASIS,
+ *  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ *  See the License for the specific language governing permissions and
+ *  limitations under the License.
+ */
+
+#include <nvbench/nvbench.cuh>
+
+// Grab some testing kernels from NVBench:
+#include <nvbench/test_kernels.cuh>
+
+// Thrust vectors simplify memory management:
+#include <thrust/device_vector.h>
+
+#include <random>
+
+//==============================================================================
+// Multiple parameters:
+// Varies block_size and num_blocks while invoking a naive copy of 256 MiB worth
+// of int32_t. Shared by the tied_/user_ benchmark entry points in this file.
+void copy_sweep_grid_shape(nvbench::state &state)
+{
+  // Read the current "BlockSize" / "NumBlocks" axis values, narrowed to int:
+  const int block_size = static_cast<int>(state.get_int64("BlockSize"));
+  const int num_blocks = static_cast<int>(state.get_int64("NumBlocks"));
+
+  // Number of int32s in 256 MiB (fixed; independent of the axis values):
+  const std::size_t num_values = 256 * 1024 * 1024 / sizeof(nvbench::int32_t);
+
+  // Report throughput stats: num_values elements, each counted as one read and one write:
+  state.add_element_count(num_values);
+  state.add_global_memory_reads<nvbench::int32_t>(num_values);
+  state.add_global_memory_writes<nvbench::int32_t>(num_values);
+
+  // Allocate device memory (both buffers hold num_values zeros):
+  thrust::device_vector<nvbench::int32_t> in(num_values, 0);
+  thrust::device_vector<nvbench::int32_t> out(num_values, 0);
+
+  state.exec( // launches copy_kernel on the stream supplied through `launch`
+    [block_size,
+     num_blocks,
+     num_values,
+     in_ptr  = thrust::raw_pointer_cast(in.data()),
+     out_ptr = thrust::raw_pointer_cast(out.data())](nvbench::launch &launch) {
+      nvbench::copy_kernel<<<num_blocks, block_size, 0, launch.get_stream()>>>(
+        in_ptr,
+        out_ptr,
+        num_values);
+    });
+}
+
+//==============================================================================
+// Tied iteration space allows you to iterate two or more axes at the same
+// time (zipped together), allowing for sparse exploration of the search space.
+// Can also be used to test the diagonal of a square matrix.
+// The benchmark body simply forwards to copy_sweep_grid_shape.
+void tied_copy_sweep_grid_shape(nvbench::state &state)
+{
+  copy_sweep_grid_shape(state);
+}
+NVBENCH_BENCH(tied_copy_sweep_grid_shape)
+  // BlockSize ascends 32->256 while NumBlocks descends 1024->128 (4 values each):
+  .add_int64_axis("BlockSize", {32,64,128,256})
+  .add_int64_axis("NumBlocks", {1024,512,256,128})
+  .tie_axes({"BlockSize", "NumBlocks"});
+
+//==============================================================================
+// under_diag:
+// Custom iteration space that only visits the `X` locations of two axes
+// [- - - - X]
+// [- - - X X]
+// [- - X X X]
+// [- X X X X]
+// [X X X X X]
+// For each y, x runs from y up to the end of the first axis (x >= y).
+struct under_diag final : nvbench::user_axis_space
+{
+  under_diag(std::vector<std::size_t> input_indices,
+             std::vector<std::size_t> output_indices)
+      : nvbench::user_axis_space(std::move(input_indices), std::move(output_indices))
+  {}
+
+  mutable std::size_t x_pos   = 0; // current index into info[0]
+  mutable std::size_t y_pos   = 0; // current index into info[1]
+  mutable std::size_t x_start = 0; // first x index of the current pass
+
+  nvbench::detail::axis_space_iterator do_iter(axes_info info) const
+  {
+    // Increment function. NOTE(review): `[&]` captures `this` for x_pos/y_pos/x_start — confirm lifetime.
+    auto adv_func = [&, info](std::size_t &inc_index,
+                              std::size_t /*len*/) -> bool {
+      inc_index++;
+      x_pos++;
+      if (x_pos == info[0].size) // ran off the end of the first axis
+      {
+        x_pos = ++x_start; // next pass starts one position further along
+        y_pos = x_start;
+        return true; // presumably signals the end of a pass — confirm against make_space_iterator
+      }
+      return false;
+    };
+
+    // Update function: writes the current (x_pos, y_pos) into the two output index slots.
+    std::vector<std::size_t> locs = m_output_indices;
+    auto diag_under =
+      [&, locs, info](std::size_t,
+                      std::vector<nvbench::detail::axis_index> &indices) {
+        nvbench::detail::axis_index temp = info[0];
+        temp.index                       = x_pos;
+        indices[locs[0]]                 = temp;
+
+        temp             = info[1];
+        temp.index       = y_pos;
+        indices[locs[1]] = temp;
+      };
+
+    const size_t iteration_length = ((info[0].size * (info[1].size + 1)) / 2); // n*(n+1)/2 when both sizes are n; NOTE(review): assumes equal sizes — confirm
+    return nvbench::detail::make_space_iterator(2, // presumably the number of axes covered — confirm
+                                                iteration_length,
+                                                adv_func,
+                                                diag_under);
+  }
+
+  std::size_t do_size(const axes_info &info) const
+  {
+    return ((info[0].size * (info[1].size + 1)) / 2); // same formula as iteration_length in do_iter
+  }
+
+  std::size_t do_valid_count(const axes_info &info) const
+  {
+    return ((info[0].size * (info[1].size + 1)) / 2); // same value as do_size
+  }
+
+  std::unique_ptr<nvbench::axis_space_base> do_clone() const
+  {
+    return std::make_unique<under_diag>(*this); // copy includes the current x_pos/y_pos/x_start
+  }
+};
+
+void user_copy_sweep_grid_shape(nvbench::state &state)
+{
+  copy_sweep_grid_shape(state); // same benchmark body; only the iteration space differs
+}
+NVBENCH_BENCH(user_copy_sweep_grid_shape)
+  // Every power of two from 64->1024 (2^6..2^10), on both axes:
+  .add_int64_power_of_two_axis("BlockSize", nvbench::range(6, 10))
+  .add_int64_power_of_two_axis("NumBlocks", nvbench::range(6, 10))
+  .user_iteration_axes({"NumBlocks", "BlockSize"}, // presumably info[0] = NumBlocks, info[1] = BlockSize — confirm
+                       [](auto... args)
+                         -> std::unique_ptr<nvbench::axis_space_base> {
+                         return std::make_unique<under_diag>(args...); // factory: forwards args to under_diag's constructor
+                       });
+
+
+//==============================================================================
+// gauss:
+// Custom iteration space that uses a gauss distribution to
+// sample the points near the middle of the index space
+//
+struct gauss final : nvbench::user_axis_space
+{
+  // Forwards the input/output axis index lists to the user_axis_space base.
+  gauss(std::vector<std::size_t> input_indices,
+        std::vector<std::size_t> output_indices)
+      : nvbench::user_axis_space(std::move(input_indices), std::move(output_indices))
+  {}
+
+  // Builds an iterator over info[0].size randomly drawn indices into the axis.
+  // Indices are sampled from a normal distribution centered on the middle of
+  // the axis (standard deviation 2) and clamped to the valid index range
+  // [0, size - 1]. The generator is seeded from std::random_device, so the
+  // sampled points may differ from run to run.
+  nvbench::detail::axis_space_iterator do_iter(axes_info info) const
+  {
+    const std::size_t iteration_length = info[0].size;
+    const double mid_point = static_cast<double>((iteration_length / 2));
+
+    // Largest index that may be produced. Clamping to `size` itself would
+    // yield an index one past the last axis value, so clamp to `size - 1`
+    // (guarding the empty axis against unsigned wrap-around).
+    const double max_index =
+      iteration_length == 0 ? 0.0 : static_cast<double>(iteration_length - 1);
+
+    std::random_device rd{};
+    std::mt19937 gen{rd()};
+    std::normal_distribution<> d{mid_point, 2};
+
+    std::vector<std::size_t> gauss_indices(iteration_length);
+    for (auto &g : gauss_indices)
+    {
+      auto v = std::min(max_index, d(gen));
+      v      = std::max(0.0, v);
+      g      = static_cast<std::size_t>(v); // truncates toward zero
+    }
+
+    // our update function: stores the pre-sampled index for this iteration
+    // step into the single output slot owned by this space
+    std::vector<std::size_t> locs = m_output_indices;
+    auto gauss_func               = [=](std::size_t index,
+                          std::vector<nvbench::detail::axis_index> &indices) {
+      nvbench::detail::axis_index temp = info[0];
+      temp.index                       = gauss_indices[index];
+      indices[locs[0]]                 = temp;
+    };
+
+    return nvbench::detail::make_space_iterator(1,
+                                                iteration_length,
+                                                gauss_func);
+  }
+
+  // One sample is drawn per axis value, so the space is as long as the axis.
+  std::size_t do_size(const axes_info &info) const { return info[0].size; }
+
+  // Same value as do_size.
+  std::size_t do_valid_count(const axes_info &info) const
+  {
+    return info[0].size;
+  }
+
+  // Returns a heap-allocated copy of this space.
+  std::unique_ptr<axis_space_base> do_clone() const
+  {
+    return std::make_unique<gauss>(*this);
+  }
+};
+//==============================================================================
+// Dual parameter sweep: launches sleep_kernel with Duration_A + Duration_B, each axis sampled through its own gauss space.
+void dual_float64_axis(nvbench::state &state)
+{
+  const auto duration_A = state.get_float64("Duration_A"); // current value of the first axis
+  const auto duration_B = state.get_float64("Duration_B"); // current value of the second axis
+
+  state.exec([duration_A, duration_B](nvbench::launch &launch) {
+    nvbench::sleep_kernel<<<1, 1, 0, launch.get_stream()>>>(duration_A +
+                                                            duration_B);
+  });
+}
+NVBENCH_BENCH(dual_float64_axis)
+  .add_float64_axis("Duration_A", nvbench::range(0., 1e-4, 1e-5))
+  .add_float64_axis("Duration_B", nvbench::range(0., 1e-4, 1e-5))
+  .user_iteration_axes({"Duration_A"}, // Duration_A gets its own single-axis gauss space
+                       [](auto... args)
+                         -> std::unique_ptr<nvbench::axis_space_base> {
+                         return std::make_unique<gauss>(args...);
+                       })
+  .user_iteration_axes({"Duration_B"}, // Duration_B gets a separate gauss space instance
+                       [](auto... args)
+                         -> std::unique_ptr<nvbench::axis_space_base> {
+                         return std::make_unique<gauss>(args...);
+                       });
@@ -1,6 +1,7 @@
 set(srcs
   axes_metadata.cxx
   axis_base.cxx
+  axis_iteration_space.cxx
   benchmark_base.cxx
   benchmark_manager.cxx
   blocking_kernel.cu
 
@@ -18,12 +18,14 @@
 
 #pragma once
 
+#include <nvbench/axis_iteration_space.cuh>
 #include <nvbench/float64_axis.cuh>
 #include <nvbench/int64_axis.cuh>
 #include <nvbench/string_axis.cuh>
 #include <nvbench/type_axis.cuh>
 #include <nvbench/types.cuh>
 
+#include <functional>
 #include <memory>
 #include <stdexcept>
 #include <string>
@@ -37,6 +39,8 @@ namespace nvbench
 struct axes_metadata
 {
   using axes_type = std::vector<std::unique_ptr<nvbench::axis_base>>;
+  using axes_iteration_space =
+    std::vector<std::unique_ptr<nvbench::axis_space_base>>;
 
   template <typename... TypeAxes>
   explicit axes_metadata(nvbench::type_list<TypeAxes...>);
@@ -58,6 +62,21 @@ struct axes_metadata
 
   void add_string_axis(std::string name, std::vector<std::string> data);
 
+  void tie_axes(std::vector<std::string> names);
+
+  void
+  user_iteration_axes(std::vector<std::string> names,
+                      std::function<nvbench::make_user_space_signature> make);
+
+  [[nodiscard]] const axes_iteration_space &get_type_iteration_space() const
+  {
+    return m_type_space;
+  }
+  [[nodiscard]] const axes_iteration_space &get_value_iteration_space() const
+  {
+    return m_value_space;
+  }
+
   [[nodiscard]] const nvbench::int64_axis &
   get_int64_axis(std::string_view name) const;
   [[nodiscard]] nvbench::int64_axis &get_int64_axis(std::string_view name);
@@ -93,6 +112,9 @@ struct axes_metadata
 
 private:
   axes_type m_axes;
+  std::size_t m_type_axe_count = 0;
+  axes_iteration_space m_type_space;
+  axes_iteration_space m_value_space;
 };
 
 template <typename... TypeAxes>
@@ -105,11 +127,15 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
 
   auto names_iter = names.begin(); // contents will be moved from
   nvbench::tl::foreach<type_axes_list>(
-    [&axes = m_axes, &names_iter]([[maybe_unused]] auto wrapped_type) {
+    [&axes = m_axes, &spaces = m_type_space, &names_iter](
+      [[maybe_unused]] auto wrapped_type) {
       // This is always called before other axes are added, so the length of the
       // axes vector will be the type axis index:
       const std::size_t type_axis_index = axes.size();
 
+      spaces.push_back(
+        std::make_unique<linear_axis_space>(type_axis_index, type_axis_index));
+
       // Note:
       // The word "type" appears 6 times in the next line.
       // Every. Single. Token.
@@ -119,6 +145,7 @@ axes_metadata::axes_metadata(nvbench::type_list<TypeAxes...>)
       axis->template set_inputs<type_list>();
       axes.push_back(std::move(axis));
     });
+  m_type_axe_count = m_axes.size();
 }
 
 } // namespace nvbench
Original file line number	Diff line number	Diff line change
`@@ -7,6 +7,7 @@ set(example_srcs`
`7`	`7`	`stream.cu`
`8`	`8`	`throughput.cu`
`9`	`9`	`auto_throughput.cu`
	`10`	`+ custom_iteration_spaces.cu`
`10`	`11`	`)`
`11`	`12`
`12`	`13`	`# Metatarget for all examples:`