diff --git a/cub/cub/device/device_memcpy.cuh b/cub/cub/device/device_memcpy.cuh
index cd57f11ac35..32679ce03e6 100644
--- a/cub/cub/device/device_memcpy.cuh
+++ b/cub/cub/device/device_memcpy.cuh
@@ -16,8 +16,10 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/env_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_batch_memcpy.cuh>
 
+#include <cuda/std/__execution/env.h>
 #include <cuda/std/__type_traits/is_pointer.h>
 #include <cuda/std/cstdint>
 
@@ -169,6 +171,96 @@ struct DeviceMemcpy
       DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
         d_temp_storage, temp_storage_bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
   }
+
+  //! @rst
+  //! Copies data from a batch of given source buffers to their corresponding destination buffer.
+  //!
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! .. note::
+  //!
+  //!    If any input buffer aliases memory from any output buffer the behavior is undefined.
+  //!    If any output buffer aliases memory of another output buffer the behavior is undefined.
+  //!    Input buffers can alias one another.
+  //!
+  //! Snippet
+  //! +++++++
+  //!
+  //! The code snippet below illustrates usage of DeviceMemcpy::Batched with an environment:
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_memcpy_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin memcpy-batched-env
+  //!     :end-before: example-end memcpy-batched-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam InputBufferIt
+  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
+  //!   the source memory buffers
+  //!
+  //! @tparam OutputBufferIt
+  //!   **[inferred]** Device-accessible random-access input iterator type providing the pointers to
+  //!   the destination memory buffers
+  //!
+  //! @tparam BufferSizeIteratorT
+  //!   **[inferred]** Device-accessible random-access input iterator type providing the number of bytes
+  //!   to be copied for each pair of buffers
+  //!
+  //! @tparam EnvT
+  //!   **[inferred]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
+  //! @param[in] input_buffer_it
+  //!   Device-accessible iterator providing the pointers to the source memory buffers
+  //!
+  //! @param[in] output_buffer_it
+  //!   Device-accessible iterator providing the pointers to the destination memory buffers
+  //!
+  //! @param[in] buffer_sizes
+  //!   Device-accessible iterator providing the number of bytes to be copied for each pair of buffers
+  //!
+  //! @param[in] num_buffers
+  //!   The total number of buffer pairs
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  template <typename InputBufferIt,
+            typename OutputBufferIt,
+            typename BufferSizeIteratorT,
+            typename EnvT = ::cuda::std::execution::env<>,
+            ::cuda::std::enable_if_t<!::cuda::std::is_same_v<InputBufferIt, void*>, int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION _CCCL_FORCEINLINE static cudaError_t
+  Batched(InputBufferIt input_buffer_it,
+          OutputBufferIt output_buffer_it,
+          BufferSizeIteratorT buffer_sizes,
+          ::cuda::std::int64_t num_buffers,
+          EnvT env = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMemcpy::Batched");
+    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<InputBufferIt>>,
+                  "DeviceMemcpy::Batched only supports copying of memory buffers."
+                  "Please consider using DeviceCopy::Batched instead.");
+    static_assert(::cuda::std::is_pointer_v<cub::detail::it_value_t<OutputBufferIt>>,
+                  "DeviceMemcpy::Batched only supports copying of memory buffers."
+                  "Please consider using DeviceCopy::Batched instead.");
+
+    using BlockOffsetT = uint32_t;
+
+    return detail::dispatch_with_env(env, [&]([[maybe_unused]] auto tuning, void* storage, size_t& bytes, auto stream) {
+      return detail::
+        DispatchBatchMemcpy<InputBufferIt, OutputBufferIt, BufferSizeIteratorT, BlockOffsetT, CopyAlg::Memcpy>::Dispatch(
+          storage, bytes, input_buffer_it, output_buffer_it, buffer_sizes, num_buffers, stream);
+    });
+  }
 };
 
 CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_memcpy_env.cu b/cub/test/catch2_test_device_memcpy_env.cu
new file mode 100644
index 00000000000..a3a4b428cd8
--- /dev/null
+++ b/cub/test/catch2_test_device_memcpy_env.cu
@@ -0,0 +1,130 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Should precede any includes
+struct stream_registry_factory_t;
+#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_memcpy.cuh>
+
+#include <thrust/device_vector.h>
+
+#include <cuda/iterator>
+#include <cuda/stream>
+
+#include "catch2_test_env_launch_helper.h"
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceMemcpy::Batched, device_memcpy_batched);
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+#include <cuda/__execution/require.h>
+
+#include <c2h/catch2_test_helper.h>
+
+namespace stdexec = cuda::std::execution;
+
+template <typename T>
+struct index_to_ptr
+{
+  T* base;
+  const int* offsets;
+  __host__ __device__ __forceinline__ T* operator()(int index) const
+  {
+    return base + offsets[index];
+  }
+};
+
+struct get_size
+{
+  const int* offsets;
+  __host__ __device__ __forceinline__ int operator()(int index) const
+  {
+    return (offsets[index + 1] - offsets[index]) * static_cast<int>(sizeof(int));
+  }
+};
+
+#if TEST_LAUNCH == 0
+
+TEST_CASE("DeviceMemcpy::Batched works with default environment", "[memcpy][device]")
+{
+  // 3 buffers: [10, 20], [30, 40, 50], [60]
+  auto d_src     = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
+  auto d_dst     = c2h::device_vector<int>(6);
+  auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6};
+
+  int num_buffers = 3;
+
+  cuda::counting_iterator<int> iota(0);
+  auto input_it = cuda::transform_iterator(
+    iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto output_it = cuda::transform_iterator(
+    iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())});
+
+  REQUIRE(cudaSuccess == cub::DeviceMemcpy::Batched(input_it, output_it, sizes, num_buffers));
+
+  REQUIRE(d_dst == d_src);
+}
+
+#endif
+
+C2H_TEST("DeviceMemcpy::Batched uses environment", "[memcpy][device]")
+{
+  // 3 buffers: [10, 20], [30, 40, 50], [60]
+  auto d_src     = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
+  auto d_dst     = c2h::device_vector<int>(6, 0);
+  auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6};
+
+  int num_buffers = 3;
+
+  cuda::counting_iterator<int> iota(0);
+  auto input_it = cuda::transform_iterator(
+    iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto output_it = cuda::transform_iterator(
+    iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())});
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(cudaSuccess
+          == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers));
+
+  auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
+
+  device_memcpy_batched(input_it, output_it, sizes, num_buffers, env);
+
+  REQUIRE(d_dst == d_src);
+}
+
+TEST_CASE("DeviceMemcpy::Batched uses custom stream", "[memcpy][device]")
+{
+  // 3 buffers: [10, 20], [30, 40, 50], [60]
+  auto d_src     = c2h::device_vector<int>{10, 20, 30, 40, 50, 60};
+  auto d_dst     = c2h::device_vector<int>(6, 0);
+  auto d_offsets = c2h::device_vector<int>{0, 2, 5, 6};
+
+  int num_buffers = 3;
+
+  cuda::counting_iterator<int> iota(0);
+  auto input_it = cuda::transform_iterator(
+    iota, index_to_ptr<const int>{thrust::raw_pointer_cast(d_src.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto output_it = cuda::transform_iterator(
+    iota, index_to_ptr<int>{thrust::raw_pointer_cast(d_dst.data()), thrust::raw_pointer_cast(d_offsets.data())});
+  auto sizes = cuda::transform_iterator(iota, get_size{thrust::raw_pointer_cast(d_offsets.data())});
+
+  cuda::stream custom_stream(cuda::device_ref{0});
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(cudaSuccess
+          == cub::DeviceMemcpy::Batched(nullptr, expected_bytes_allocated, input_it, output_it, sizes, num_buffers));
+
+  auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}};
+  auto env         = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)};
+
+  device_memcpy_batched(input_it, output_it, sizes, num_buffers, env);
+
+  custom_stream.sync();
+  REQUIRE(d_dst == d_src);
+}
diff --git a/cub/test/catch2_test_device_memcpy_env_api.cu b/cub/test/catch2_test_device_memcpy_env_api.cu
new file mode 100644
index 00000000000..7f11263591d
--- /dev/null
+++ b/cub/test/catch2_test_device_memcpy_env_api.cu
@@ -0,0 +1,69 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_memcpy.cuh>
+
+#include <thrust/detail/raw_pointer_cast.h>
+#include <thrust/device_vector.h>
+
+#include <cuda/devices>
+#include <cuda/stream>
+
+#include <iostream>
+
+#include <c2h/catch2_test_helper.h>
+
+C2H_TEST("cub::DeviceMemcpy::Batched accepts env with stream", "[memcpy][env]")
+{
+  // example-begin memcpy-batched-env
+  // Source data: 3 buffers of different sizes laid out contiguously
+  // Buffer 0: [10, 20]     Buffer 1: [30, 40, 50]     Buffer 2: [60]
+  auto d_src = thrust::device_vector<int>{10, 20, 30, 40, 50, 60};
+
+  // Copy into two separate destination buffers to highlight the API's flexibility
+  auto d_dst_a = thrust::device_vector<int>(5, 0);
+  auto d_dst_b = thrust::device_vector<int>(1, 0);
+
+  // Source pointers: one per buffer, pointing into d_src
+  auto d_src_ptrs = thrust::device_vector<const int*>{
+    thrust::raw_pointer_cast(d_src.data()) + 0,
+    thrust::raw_pointer_cast(d_src.data()) + 2,
+    thrust::raw_pointer_cast(d_src.data()) + 5};
+
+  // Destination pointers: buffers 0,1 go to d_dst_a, buffer 2 goes to d_dst_b
+  auto d_dst_ptrs = thrust::device_vector<int*>{
+    thrust::raw_pointer_cast(d_dst_a.data()) + 0,
+    thrust::raw_pointer_cast(d_dst_a.data()) + 2,
+    thrust::raw_pointer_cast(d_dst_b.data()) + 0};
+
+  // Sizes in bytes for each buffer
+  auto d_sizes = thrust::device_vector<int>{
+    2 * static_cast<int>(sizeof(int)), 3 * static_cast<int>(sizeof(int)), 1 * static_cast<int>(sizeof(int))};
+
+  int num_buffers = 3;
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{stream_ref};
+
+  auto error = cub::DeviceMemcpy::Batched(
+    thrust::raw_pointer_cast(d_src_ptrs.data()),
+    thrust::raw_pointer_cast(d_dst_ptrs.data()),
+    thrust::raw_pointer_cast(d_sizes.data()),
+    num_buffers,
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceMemcpy::Batched failed with status: " << error << std::endl;
+  }
+
+  thrust::device_vector<int> expected_a{10, 20, 30, 40, 50};
+  thrust::device_vector<int> expected_b{60};
+  // example-end memcpy-batched-env
+
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(d_dst_a == expected_a);
+  REQUIRE(d_dst_b == expected_b);
+}