-
Notifications
You must be signed in to change notification settings - Fork 355
Add env DeviceMemcpy::Batched and tests #7966
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from all commits
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change | ||||
|---|---|---|---|---|---|---|
| @@ -0,0 +1,131 @@ | ||||||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||||||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||||||
|
|
||||||
| // Should precede any includes | ||||||
| struct stream_registry_factory_t; | ||||||
| #define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t | ||||||
|
|
||||||
| #include "insert_nested_NVTX_range_guard.h" | ||||||
|
|
||||||
| #include <cub/device/device_memcpy.cuh> | ||||||
|
|
||||||
| #include <thrust/device_vector.h> | ||||||
| #include <thrust/iterator/counting_iterator.h> | ||||||
| #include <thrust/iterator/transform_iterator.h> | ||||||
|
|
||||||
| #include "catch2_test_env_launch_helper.h" | ||||||
|
|
||||||
| DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, device_memcpy_batched); | ||||||
|
|
||||||
| // %PARAM% TEST_LAUNCH lid 0:1:2 | ||||||
|
|
||||||
| #include <cuda/__execution/require.h> | ||||||
|
|
||||||
| #include <c2h/catch2_test_helper.h> | ||||||
|
|
||||||
| namespace stdexec = cuda::std::execution; | ||||||
|
|
||||||
| template <typename T> | ||||||
| struct index_to_ptr | ||||||
| { | ||||||
| T* base; | ||||||
| const int* offsets; | ||||||
| __host__ __device__ __forceinline__ T* operator()(int index) const | ||||||
| { | ||||||
| return base + offsets[index]; | ||||||
| } | ||||||
| }; | ||||||
|
|
||||||
| struct get_size | ||||||
| { | ||||||
| const int* offsets; | ||||||
| __host__ __device__ __forceinline__ int operator()(int index) const | ||||||
| { | ||||||
| return (offsets[index + 1] - offsets[index]) * static_cast<int>(sizeof(int)); | ||||||
| } | ||||||
| }; | ||||||
|
|
||||||
| #if TEST_LAUNCH == 0 | ||||||
|
|
||||||
| // NOTE(review): exercises the classic single-phase overload directly (no env, no launch wrapper); | ||||||
| // guarded by TEST_LAUNCH == 0 above since it bypasses the launch-wrapper machinery. | ||||||
| TEST_CASE("DeviceMemcpy::Batched works with default environment", "[memcpy][device]") | ||||||
| { | ||||||
| // 3 buffers: [10, 20], [30, 40, 50], [60] | ||||||
| auto d_src = c2h::device_vector<int>{10, 20, 30, 40, 50, 60}; | ||||||
| auto d_dst = c2h::device_vector<int>(6, 0); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion:
Suggested change
|
||||||
| auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6}; // buffer i spans elements [offsets[i], offsets[i+1]) | ||||||
|
|
||||||
| int num_buffers = 3; | ||||||
|
|
||||||
| thrust::counting_iterator<int> iota(0); | ||||||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: Please prefer cuda iterators over thrust iterators. |
||||||
| // Per-buffer pointers and byte counts are generated on the fly from the offsets array. | ||||||
| auto input_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto output_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto sizes = thrust::make_transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
|
|
||||||
| // Single-phase overload: temporary storage is handled internally by CUB. | ||||||
| REQUIRE(cudaSuccess == cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers)); | ||||||
|
|
||||||
| REQUIRE(d_dst == d_src); | ||||||
| } | ||||||
|
|
||||||
| #endif | ||||||
|
|
||||||
| C2H_TEST("DeviceMemcpy::Batched uses environment", "[memcpy][device]") | ||||||
| { | ||||||
| // 3 buffers: [10, 20], [30, 40, 50], [60] | ||||||
| auto d_src = c2h::device_vector<int>{10, 20, 30, 40, 50, 60}; | ||||||
| auto d_dst = c2h::device_vector<int>(6, 0); | ||||||
| auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6}; | ||||||
|
|
||||||
| int num_buffers = 3; | ||||||
|
|
||||||
| thrust::counting_iterator<int> iota(0); | ||||||
| auto input_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto output_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto sizes = thrust::make_transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
|
|
||||||
| size_t expected_bytes_allocated{}; | ||||||
| REQUIRE(cudaSuccess | ||||||
| == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers)); | ||||||
|
|
||||||
| auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)}; | ||||||
|
|
||||||
| device_memcpy_batched(input_it, output_it, sizes, num_buffers, env); | ||||||
|
|
||||||
| REQUIRE(d_dst == d_src); | ||||||
| } | ||||||
|
|
||||||
| // Verifies the env-based overload honors a stream supplied via cuda::get_stream_t. | ||||||
TEST_CASE("DeviceMemcpy::Batched uses custom stream", "[memcpy][device]") | ||||||
| { | ||||||
| // 3 buffers: [10, 20], [30, 40, 50], [60] | ||||||
| auto d_src = c2h::device_vector<int>{10, 20, 30, 40, 50, 60}; | ||||||
| auto d_dst = c2h::device_vector<int>(6, 0); | ||||||
| auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6}; | ||||||
|
|
||||||
| int num_buffers = 3; | ||||||
|
|
||||||
| thrust::counting_iterator<int> iota(0); | ||||||
| auto input_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto output_it = thrust::make_transform_iterator( | ||||||
| iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
| auto sizes = thrust::make_transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); | ||||||
|
|
||||||
| cudaStream_t custom_stream; | ||||||
| REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream)); | ||||||
|
Comment on lines
+116
to
+117
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Important: Please use |
||||||
|
|
||||||
| size_t expected_bytes_allocated{}; | ||||||
| REQUIRE(cudaSuccess | ||||||
| == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers)); | ||||||
|
|
||||||
| // Attach the custom stream to the env via the cuda::get_stream_t query. | ||||||
| auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}}; | ||||||
| auto env = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)}; | ||||||
|
|
||||||
| device_memcpy_batched(input_it, output_it, sizes, num_buffers, env); | ||||||
|
|
||||||
| // Synchronize before comparing since the copy ran on the custom stream. | ||||||
| REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream)); | ||||||
| REQUIRE(d_dst == d_src); | ||||||
| REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream)); | ||||||
| } | ||||||
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,72 @@ | ||
| // SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved. | ||
| // SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception | ||
|
|
||
| #include "insert_nested_NVTX_range_guard.h" | ||
|
|
||
| #include <cub/device/device_memcpy.cuh> | ||
|
|
||
| #include <thrust/detail/raw_pointer_cast.h> | ||
| #include <thrust/device_vector.h> | ||
| #include <thrust/iterator/counting_iterator.h> | ||
| #include <thrust/iterator/transform_iterator.h> | ||
|
|
||
| #include <cuda/devices> | ||
| #include <cuda/stream> | ||
|
|
||
| #include <iostream> | ||
|
|
||
| #include <c2h/catch2_test_helper.h> | ||
|
|
||
| template <typename T> | ||
| struct index_to_ptr | ||
| { | ||
| T* base; | ||
| const int* offsets; | ||
| __host__ __device__ __forceinline__ T* operator()(int index) const | ||
| { | ||
| return base + offsets[index]; | ||
| } | ||
| }; | ||
|
|
||
| struct get_size | ||
| { | ||
| const int* offsets; | ||
| __host__ __device__ __forceinline__ int operator()(int index) const | ||
| { | ||
| return (offsets[index + 1] - offsets[index]) * static_cast<int>(sizeof(int)); | ||
| } | ||
| }; | ||
|
|
||
| // Documentation example: passing an execution environment carrying a stream_ref. | ||
| C2H_TEST("cub::DeviceMemcpy::Batched accepts env with stream", "[memcpy][env]") | ||
| { | ||
| // example-begin memcpy-batched-env | ||
| // 3 buffers of different sizes: [10, 20], [30, 40, 50], [60] | ||
| auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60}; | ||
| auto d_dst = thrust::device_vector<int>(6, 0); | ||
| auto d_offsets = thrust::device_vector<int>{0, 2, 5, 6}; | ||
|
|
||
| int num_buffers = 3; | ||
|
|
||
| thrust::counting_iterator<int> iota(0); | ||
| auto input_it = thrust::make_transform_iterator( | ||
| iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||
| auto output_it = thrust::make_transform_iterator( | ||
| iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())}); | ||
| auto sizes = thrust::make_transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())}); | ||
|
|
||
| // An env holding a stream_ref routes the algorithm onto that stream. | ||
| cuda::stream stream{cuda::devices[0]}; | ||
| cuda::stream_ref stream_ref{stream}; | ||
| auto env = cuda::std::execution::env{stream_ref}; | ||
|
|
||
| auto error = cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers, env); | ||
| if (error != cudaSuccess) | ||
| { | ||
| std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl; | ||
| } | ||
|
|
||
| thrust::device_vector<int> expected{10, 20, 30, 40, 50, 60}; | ||
| // example-end memcpy-batched-env | ||
|
Comment on lines
+43
to
+68
Contributor
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Suggestion: I don't think this is a great API example. We should focus on the API's design of taking an iterator to pointers. Something like: auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};
auto d_src_pointers = thrust::device_vector<int*>{d_src[0], d_src[2], d_src[5]};
|
||
|
|
||
| REQUIRE(error == cudaSuccess); | ||
| REQUIRE(d_dst == expected); | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Q: What prevents us from running the below unit test for launch id 1 and 2?