cub: test DeviceTransform aligned_size_t vectorized store (same-width/narrowing/widening)

nanan-nvidia · nanan-nvidia · commit c6559fb7e99b · 2026-06-15T22:44:36.000-07:00
diff --git a/cub/test/catch2_test_device_transform_aligned.cu b/cub/test/catch2_test_device_transform_aligned.cu
@@ -0,0 +1,103 @@
+// SPDX-FileCopyrightText: Copyright (c) 2024, NVIDIA CORPORATION. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_transform.cuh>
+
+#include <cuda/__memory/aligned_size.h>
+
+#include <algorithm>
+
+#include "catch2_test_launch_helper.h"
+#include <c2h/catch2_test_helper.h>
+#include <c2h/test_util_vec.h>
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceTransform::Transform, transform_many); // tests call Transform via transform_many
+// Item counts for all tests: 0 (empty) and multiples of 16, presumably as cuda::aligned_size_t<16> requires.
+#define ALIGNED_ITEM_COUNTS 0, 16, 128, 4096, 4112, 65536, 99'984
+
+template <typename Out>
+struct cast_to // unary conversion functor, callable from host and device code
+{
+  template <typename In>
+  __host__ __device__ auto operator()(In value) const -> Out
+  {
+    return static_cast<Out>(value); // explicit static_cast of the argument to Out
+  }
+};
+
+// Adds two same-typed inputs element-wise, passing the item count as cuda::aligned_size_t<16>, and requires the
+// device output to equal a reference computed on the host with the same functor.
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> same-width",
+         "[device][transform]",
+         c2h::type_list<std::uint8_t, std::uint16_t, std::uint32_t, std::uint64_t, float, double, uchar3>)
+{
+  using type               = c2h::get<0, TestType>;
+  using offset_t           = cuda::std::int64_t;
+  using add_op_t           = cuda::std::plus<type>;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<type>(), num_items);
+
+  c2h::device_vector<type> lhs(num_items, thrust::no_init);
+  c2h::device_vector<type> rhs(num_items, thrust::no_init);
+  c2h::device_vector<type> result(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), lhs);
+  c2h::gen(C2H_SEED(1), rhs);
+  transform_many(
+    cuda::std::make_tuple(lhs.begin(), rhs.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), add_op_t{});
+
+  const c2h::host_vector<type> lhs_h = lhs;
+  const c2h::host_vector<type> rhs_h = rhs;
+  c2h::host_vector<type> reference_h(num_items, thrust::no_init);
+  std::transform(lhs_h.begin(), lhs_h.end(), rhs_h.begin(), reference_h.begin(), add_op_t{});
+  REQUIRE(reference_h == result);
+}
+
+// Casts wider unsigned inputs down to uint8 with an aligned_size_t<16> count; output must equal the host reference.
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> narrowing to uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = c2h::get<0, TestType>;
+  using out_t              = std::uint8_t;
+  using offset_t           = cuda::std::int64_t;
+  using convert_t          = cast_to<out_t>;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<in_t>(), num_items);
+
+  c2h::device_vector<in_t> src(num_items, thrust::no_init);
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), src);
+  transform_many(cuda::std::make_tuple(src.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), convert_t{});
+
+  const c2h::host_vector<in_t> src_h = src;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(src_h.begin(), src_h.end(), reference_h.begin(), convert_t{});
+  REQUIRE(reference_h == result);
+}
+
+// Widens uint8 inputs to larger unsigned types with an aligned_size_t<16> count; output must match the host reference.
+C2H_TEST("DeviceTransform::Transform aligned_size_t<16> widening from uint8",
+         "[device][transform]",
+         c2h::type_list<std::uint16_t, std::uint32_t, std::uint64_t>)
+{
+  using in_t               = std::uint8_t;
+  using out_t              = c2h::get<0, TestType>;
+  using offset_t           = cuda::std::int64_t;
+  using convert_t          = cast_to<out_t>;
+  const offset_t num_items = GENERATE(ALIGNED_ITEM_COUNTS);
+  CAPTURE(c2h::type_name<out_t>(), num_items);
+
+  c2h::device_vector<in_t> src(num_items, thrust::no_init);
+  c2h::device_vector<out_t> result(num_items, thrust::no_init);
+  c2h::gen(C2H_SEED(1), src);
+  transform_many(cuda::std::make_tuple(src.begin()), result.begin(), cuda::aligned_size_t<16>(num_items), convert_t{});
+
+  const c2h::host_vector<in_t> src_h = src;
+  c2h::host_vector<out_t> reference_h(num_items, thrust::no_init);
+  std::transform(src_h.begin(), src_h.end(), reference_h.begin(), convert_t{});
+  REQUIRE(reference_h == result);
+}