diff --git a/cub/cub/device/device_merge.cuh b/cub/cub/device/device_merge.cuh
index 2d26bde0822..dc223c9f31b 100644
--- a/cub/cub/device/device_merge.cuh
+++ b/cub/cub/device/device_merge.cuh
@@ -13,9 +13,11 @@
 #  pragma system_header
 #endif // no system header
 
+#include <cub/detail/env_dispatch.cuh>
 #include <cub/device/dispatch/dispatch_merge.cuh>
 #include <cub/util_namespace.cuh>
 
+#include <cuda/__execution/require.h>
 #include <cuda/std/__functional/operations.h>
 #include <cuda/std/cstdint>
 
@@ -103,6 +105,111 @@ struct DeviceMerge
       stream);
   }
 
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Merges two sorted sequences of values (called keys) into a sorted output sequence. Merging is unstable,
+  //! which means any two equivalent values (neither value is ordered before the other) may be written to the output
+  //! sequence in any order.
+  //!
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Snippet
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin merge-keys-env
+  //!     :end-before: example-end merge-keys-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyIteratorIn1
+  //!   **[deduced]** Random access iterator to the first sorted input sequence. Must have the same
+  //!   value type as KeyIteratorIn2.
+  //!
+  //! @tparam KeyIteratorIn2
+  //!   **[deduced]** Random access iterator to the second sorted input sequence. Must have the
+  //!   same value type as KeyIteratorIn1.
+  //!
+  //! @tparam KeyIteratorOut
+  //!   **[deduced]** Random access iterator to the output sequence.
+  //!
+  //! @tparam CompareOp
+  //!   **[deduced]** Binary predicate to compare the input iterator's value types. Must have a
+  //!   signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering].
+  //!
+  //! @tparam EnvT
+  //!   **[deduced]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
+  //! @param[in] keys_in1
+  //!   Iterator to the beginning of the first sorted input sequence.
+  //!
+  //! @param[in] num_keys1
+  //!   Number of keys in the first input sequence.
+  //!
+  //! @param[in] keys_in2
+  //!   Iterator to the beginning of the second sorted input sequence.
+  //!
+  //! @param[in] num_keys2
+  //!   Number of keys in the second input sequence.
+  //!
+  //! @param[out] keys_out
+  //!   Iterator to the beginning of the output sequence.
+  //!
+  //! @param[in] compare_op
+  //!   Comparison function object, returning true if the first argument is ordered before the
+  //!   second. Must establish a [strict weak ordering].
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
+  template <typename KeyIteratorIn1,
+            typename KeyIteratorIn2,
+            typename KeyIteratorOut,
+            typename CompareOp            = ::cuda::std::less<>,
+            typename EnvT                 = ::cuda::std::execution::env<>,
+            ::cuda::std::enable_if_t<!::cuda::std::is_same_v<KeyIteratorIn1, void*>
+                                       && !::cuda::std::is_same_v<KeyIteratorIn1, ::cuda::std::nullptr_t>
+                                       && !::cuda::std::is_arithmetic_v<CompareOp>,
+                                     int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MergeKeys(
+    KeyIteratorIn1 keys_in1,
+    ::cuda::std::int64_t num_keys1,
+    KeyIteratorIn2 keys_in2,
+    ::cuda::std::int64_t num_keys2,
+    KeyIteratorOut keys_out,
+    CompareOp compare_op = {},
+    EnvT env             = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMerge::MergeKeys");
+
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
+        return detail::merge::dispatch(
+          d_temp_storage,
+          temp_storage_bytes,
+          keys_in1,
+          static_cast<NullType*>(nullptr),
+          num_keys1,
+          keys_in2,
+          static_cast<NullType*>(nullptr),
+          num_keys2,
+          keys_out,
+          static_cast<NullType*>(nullptr),
+          compare_op,
+          stream);
+      });
+  }
+
   //! @rst
   //! Overview
   //! +++++++++++++++++++++++++++++++++++++++++++++
@@ -191,6 +298,138 @@ struct DeviceMerge
       compare_op,
       stream);
   }
+
+  //! @rst
+  //! Overview
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //! Merges two sorted sequences of key-value pairs into a sorted output sequence. Merging is unstable,
+  //! which means any two equivalent values (neither value is ordered before the other) may be written to the output
+  //! sequence in any order.
+  //!
+  //! .. versionadded:: 3.4.0
+  //!    First appears in CUDA Toolkit 13.4.
+  //!
+  //! This is an environment-based API that allows customization of:
+  //!
+  //! - Stream: Query via ``cuda::get_stream``
+  //! - Memory resource: Query via ``cuda::mr::get_memory_resource``
+  //!
+  //! Snippet
+  //! +++++++++++++++++++++++++++++++++++++++++++++
+  //!
+  //! .. literalinclude:: ../../../cub/test/catch2_test_device_merge_env_api.cu
+  //!     :language: c++
+  //!     :dedent:
+  //!     :start-after: example-begin merge-pairs-env
+  //!     :end-before: example-end merge-pairs-env
+  //!
+  //! @endrst
+  //!
+  //! @tparam KeyIteratorIn1
+  //!   **[deduced]** Random access iterator to the keys of the first sorted input sequence. Must
+  //!   have the same value type as KeyIteratorIn2.
+  //!
+  //! @tparam ValueIteratorIn1
+  //!   **[deduced]** Random access iterator to the values of the first sorted input sequence.
+  //!   Must have the same value type as ValueIteratorIn2.
+  //!
+  //! @tparam KeyIteratorIn2
+  //!   **[deduced]** Random access iterator to the second sorted input sequence. Must have the
+  //!   same value type as KeyIteratorIn1.
+  //!
+  //! @tparam ValueIteratorIn2
+  //!   **[deduced]** Random access iterator to the values of the second sorted input sequence.
+  //!   Must have the same value type as ValueIteratorIn1.
+  //!
+  //! @tparam KeyIteratorOut
+  //!   **[deduced]** Random access iterator to the keys of the output sequence.
+  //!
+  //! @tparam ValueIteratorOut
+  //!   **[deduced]** Random access iterator to the values of the output sequence.
+  //!
+  //! @tparam CompareOp
+  //!   **[deduced]** Binary predicate to compare the key input iterator's value types. Must have a
+  //!   signature equivalent to `bool operator()(Key lhs, Key rhs)` and establish a [strict weak ordering].
+  //!
+  //! @tparam EnvT
+  //!   **[deduced]** Environment type (e.g., `cuda::std::execution::env<...>`)
+  //!
+  //! @param[in] keys_in1
+  //!   Iterator to the beginning of the keys of the first sorted input sequence.
+  //!
+  //! @param[in] values_in1
+  //!   Iterator to the beginning of the values of the first sorted input sequence.
+  //!
+  //! @param[in] num_pairs1
+  //!   Number of key-value pairs in the first input sequence.
+  //!
+  //! @param[in] keys_in2
+  //!   Iterator to the beginning of the keys of the second sorted input sequence.
+  //!
+  //! @param[in] values_in2
+  //!   Iterator to the beginning of the values of the second sorted input sequence.
+  //!
+  //! @param[in] num_pairs2
+  //!   Number of key-value pairs in the second input sequence.
+  //!
+  //! @param[out] keys_out
+  //!   Iterator to the beginning of the keys of the output sequence.
+  //!
+  //! @param[out] values_out
+  //!   Iterator to the beginning of the values of the output sequence.
+  //!
+  //! @param[in] compare_op
+  //!   Comparison function object, returning true if the first argument is ordered before the
+  //!   second. Must establish a [strict weak ordering].
+  //!
+  //! @param[in] env
+  //!   @rst
+  //!   **[optional]** Execution environment. Default is ``cuda::std::execution::env{}``.
+  //!   @endrst
+  //! [strict weak ordering]: https://en.cppreference.com/w/cpp/concepts/strict_weak_order
+  template <typename KeyIteratorIn1,
+            typename ValueIteratorIn1,
+            typename KeyIteratorIn2,
+            typename ValueIteratorIn2,
+            typename KeyIteratorOut,
+            typename ValueIteratorOut,
+            typename CompareOp            = ::cuda::std::less<>,
+            typename EnvT                 = ::cuda::std::execution::env<>,
+            ::cuda::std::enable_if_t<!::cuda::std::is_same_v<KeyIteratorIn1, void*>
+                                       && !::cuda::std::is_same_v<KeyIteratorIn1, ::cuda::std::nullptr_t>
+                                       && !::cuda::std::is_arithmetic_v<CompareOp>,
+                                     int> = 0>
+  [[nodiscard]] CUB_RUNTIME_FUNCTION static cudaError_t MergePairs(
+    KeyIteratorIn1 keys_in1,
+    ValueIteratorIn1 values_in1,
+    ::cuda::std::int64_t num_pairs1,
+    KeyIteratorIn2 keys_in2,
+    ValueIteratorIn2 values_in2,
+    ::cuda::std::int64_t num_pairs2,
+    KeyIteratorOut keys_out,
+    ValueIteratorOut values_out,
+    CompareOp compare_op = {},
+    EnvT env             = {})
+  {
+    _CCCL_NVTX_RANGE_SCOPE("cub::DeviceMerge::MergePairs");
+
+    return detail::dispatch_with_env(
+      env, [&]([[maybe_unused]] auto tuning, void* d_temp_storage, size_t& temp_storage_bytes, cudaStream_t stream) {
+        return detail::merge::dispatch(
+          d_temp_storage,
+          temp_storage_bytes,
+          keys_in1,
+          values_in1,
+          num_pairs1,
+          keys_in2,
+          values_in2,
+          num_pairs2,
+          keys_out,
+          values_out,
+          compare_op,
+          stream);
+      });
+  }
 };
 
 CUB_NAMESPACE_END
diff --git a/cub/test/catch2_test_device_merge_env.cu b/cub/test/catch2_test_device_merge_env.cu
new file mode 100644
index 00000000000..dee1b9f2e61
--- /dev/null
+++ b/cub/test/catch2_test_device_merge_env.cu
@@ -0,0 +1,242 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+// Should precede any includes
+struct stream_registry_factory_t;
+#define CUB_DETAIL_DEFAULT_KERNEL_LAUNCHER_FACTORY stream_registry_factory_t
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_merge.cuh>
+
+#include <thrust/device_vector.h>
+
+#include "catch2_test_env_launch_helper.h"
+
+DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergeKeys, merge_keys);
+DECLARE_LAUNCH_WRAPPER(cub::DeviceMerge::MergePairs, merge_pairs);
+
+// %PARAM% TEST_LAUNCH lid 0:1:2
+
+#include <cuda/__execution/require.h>
+
+#include <c2h/catch2_test_helper.h>
+
+namespace stdexec = cuda::std::execution;
+
+#if TEST_LAUNCH == 0
+
+TEST_CASE("DeviceMerge::MergeKeys works with default environment", "[merge][device]")
+{
+  auto keys1  = c2h::device_vector<int>{0, 2, 5};
+  auto keys2  = c2h::device_vector<int>{0, 3, 3, 4};
+  auto result = c2h::device_vector<int>(7);
+
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergeKeys(
+      keys1.begin(), static_cast<int>(keys1.size()), keys2.begin(), static_cast<int>(keys2.size()), result.begin()));
+
+  c2h::device_vector<int> expected{0, 0, 2, 3, 3, 4, 5};
+  REQUIRE(result == expected);
+}
+
+TEST_CASE("DeviceMerge::MergePairs works with default environment", "[merge][device]")
+{
+  auto keys1   = c2h::device_vector<int>{0, 2, 5};
+  auto values1 = c2h::device_vector<char>{'a', 'b', 'c'};
+  auto keys2   = c2h::device_vector<int>{0, 3, 3, 4};
+  auto values2 = c2h::device_vector<char>{'A', 'B', 'C', 'D'};
+
+  auto result_keys   = c2h::device_vector<int>(7);
+  auto result_values = c2h::device_vector<char>(7);
+
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergePairs(
+      keys1.begin(),
+      values1.begin(),
+      static_cast<int>(keys1.size()),
+      keys2.begin(),
+      values2.begin(),
+      static_cast<int>(keys2.size()),
+      result_keys.begin(),
+      result_values.begin()));
+
+  c2h::device_vector<int> expected_keys{0, 0, 2, 3, 3, 4, 5};
+  c2h::device_vector<char> expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'};
+  REQUIRE(result_keys == expected_keys);
+  REQUIRE(result_values == expected_values);
+}
+
+#endif
+
+C2H_TEST("DeviceMerge::MergeKeys uses environment", "[merge][device]")
+{
+  auto keys1  = c2h::device_vector<int>{0, 2, 5};
+  auto keys2  = c2h::device_vector<int>{0, 3, 3, 4};
+  auto result = c2h::device_vector<int>(7);
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergeKeys(
+      nullptr,
+      expected_bytes_allocated,
+      keys1.begin(),
+      static_cast<int>(keys1.size()),
+      keys2.begin(),
+      static_cast<int>(keys2.size()),
+      result.begin()));
+
+  auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
+
+  merge_keys(keys1.begin(),
+             static_cast<int>(keys1.size()),
+             keys2.begin(),
+             static_cast<int>(keys2.size()),
+             result.begin(),
+             cuda::std::less<>{},
+             env);
+
+  c2h::device_vector<int> expected{0, 0, 2, 3, 3, 4, 5};
+  REQUIRE(result == expected);
+}
+
+TEST_CASE("DeviceMerge::MergeKeys uses custom stream", "[merge][device]")
+{
+  auto keys1  = c2h::device_vector<int>{0, 2, 5};
+  auto keys2  = c2h::device_vector<int>{0, 3, 3, 4};
+  auto result = c2h::device_vector<int>(7);
+
+  cudaStream_t custom_stream;
+  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergeKeys(
+      nullptr,
+      expected_bytes_allocated,
+      keys1.begin(),
+      static_cast<int>(keys1.size()),
+      keys2.begin(),
+      static_cast<int>(keys2.size()),
+      result.begin()));
+
+  auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}};
+  auto env         = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)};
+
+  merge_keys(keys1.begin(),
+             static_cast<int>(keys1.size()),
+             keys2.begin(),
+             static_cast<int>(keys2.size()),
+             result.begin(),
+             cuda::std::less<>{},
+             env);
+
+  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
+
+  c2h::device_vector<int> expected{0, 0, 2, 3, 3, 4, 5};
+  REQUIRE(result == expected);
+
+  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
+}
+
+C2H_TEST("DeviceMerge::MergePairs uses environment", "[merge][device]")
+{
+  auto keys1   = c2h::device_vector<int>{0, 2, 5};
+  auto values1 = c2h::device_vector<char>{'a', 'b', 'c'};
+  auto keys2   = c2h::device_vector<int>{0, 3, 3, 4};
+  auto values2 = c2h::device_vector<char>{'A', 'B', 'C', 'D'};
+
+  auto result_keys   = c2h::device_vector<int>(7);
+  auto result_values = c2h::device_vector<char>(7);
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergePairs(
+      nullptr,
+      expected_bytes_allocated,
+      keys1.begin(),
+      values1.begin(),
+      static_cast<int>(keys1.size()),
+      keys2.begin(),
+      values2.begin(),
+      static_cast<int>(keys2.size()),
+      result_keys.begin(),
+      result_values.begin()));
+
+  auto env = stdexec::env{expected_allocation_size(expected_bytes_allocated)};
+
+  merge_pairs(
+    keys1.begin(),
+    values1.begin(),
+    static_cast<int>(keys1.size()),
+    keys2.begin(),
+    values2.begin(),
+    static_cast<int>(keys2.size()),
+    result_keys.begin(),
+    result_values.begin(),
+    cuda::std::less<>{},
+    env);
+
+  c2h::device_vector<int> expected_keys{0, 0, 2, 3, 3, 4, 5};
+  c2h::device_vector<char> expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'};
+  REQUIRE(result_keys == expected_keys);
+  REQUIRE(result_values == expected_values);
+}
+
+TEST_CASE("DeviceMerge::MergePairs uses custom stream", "[merge][device]")
+{
+  auto keys1   = c2h::device_vector<int>{0, 2, 5};
+  auto values1 = c2h::device_vector<char>{'a', 'b', 'c'};
+  auto keys2   = c2h::device_vector<int>{0, 3, 3, 4};
+  auto values2 = c2h::device_vector<char>{'A', 'B', 'C', 'D'};
+
+  auto result_keys   = c2h::device_vector<int>(7);
+  auto result_values = c2h::device_vector<char>(7);
+
+  cudaStream_t custom_stream;
+  REQUIRE(cudaSuccess == cudaStreamCreate(&custom_stream));
+
+  size_t expected_bytes_allocated{};
+  REQUIRE(
+    cudaSuccess
+    == cub::DeviceMerge::MergePairs(
+      nullptr,
+      expected_bytes_allocated,
+      keys1.begin(),
+      values1.begin(),
+      static_cast<int>(keys1.size()),
+      keys2.begin(),
+      values2.begin(),
+      static_cast<int>(keys2.size()),
+      result_keys.begin(),
+      result_values.begin()));
+
+  auto stream_prop = stdexec::prop{cuda::get_stream_t{}, cuda::stream_ref{custom_stream}};
+  auto env         = stdexec::env{stream_prop, expected_allocation_size(expected_bytes_allocated)};
+
+  merge_pairs(
+    keys1.begin(),
+    values1.begin(),
+    static_cast<int>(keys1.size()),
+    keys2.begin(),
+    values2.begin(),
+    static_cast<int>(keys2.size()),
+    result_keys.begin(),
+    result_values.begin(),
+    cuda::std::less<>{},
+    env);
+
+  REQUIRE(cudaSuccess == cudaStreamSynchronize(custom_stream));
+
+  c2h::device_vector<int> expected_keys{0, 0, 2, 3, 3, 4, 5};
+  c2h::device_vector<char> expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'};
+  REQUIRE(result_keys == expected_keys);
+  REQUIRE(result_values == expected_values);
+
+  REQUIRE(cudaSuccess == cudaStreamDestroy(custom_stream));
+}
diff --git a/cub/test/catch2_test_device_merge_env_api.cu b/cub/test/catch2_test_device_merge_env_api.cu
new file mode 100644
index 00000000000..6e75f2221fe
--- /dev/null
+++ b/cub/test/catch2_test_device_merge_env_api.cu
@@ -0,0 +1,86 @@
+// SPDX-FileCopyrightText: Copyright (c) 2026, NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+// SPDX-License-Identifier: Apache-2.0 WITH LLVM-exception
+
+#include "insert_nested_NVTX_range_guard.h"
+
+#include <cub/device/device_merge.cuh>
+
+#include <thrust/device_vector.h>
+
+#include <cuda/devices>
+#include <cuda/stream>
+
+#include <iostream>
+
+#include <c2h/catch2_test_helper.h>
+
+C2H_TEST("cub::DeviceMerge::MergeKeys accepts env with stream", "[merge][env]")
+{
+  // example-begin merge-keys-env
+  auto keys1  = thrust::device_vector<int>{0, 2, 5};
+  auto keys2  = thrust::device_vector<int>{0, 3, 3, 4};
+  auto result = thrust::device_vector<int>(7);
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{stream_ref};
+
+  auto error = cub::DeviceMerge::MergeKeys(
+    keys1.begin(),
+    static_cast<int>(keys1.size()),
+    keys2.begin(),
+    static_cast<int>(keys2.size()),
+    result.begin(),
+    cuda::std::less<>{},
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceMerge::MergeKeys failed with status: " << error << std::endl;
+  }
+
+  thrust::device_vector<int> expected{0, 0, 2, 3, 3, 4, 5};
+  // example-end merge-keys-env
+
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(result == expected);
+}
+
+C2H_TEST("cub::DeviceMerge::MergePairs accepts env with stream", "[merge][env]")
+{
+  // example-begin merge-pairs-env
+  auto keys1   = thrust::device_vector<int>{0, 2, 5};
+  auto values1 = thrust::device_vector<char>{'a', 'b', 'c'};
+  auto keys2   = thrust::device_vector<int>{0, 3, 3, 4};
+  auto values2 = thrust::device_vector<char>{'A', 'B', 'C', 'D'};
+
+  auto result_keys   = thrust::device_vector<int>(7);
+  auto result_values = thrust::device_vector<char>(7);
+
+  cuda::stream stream{cuda::devices[0]};
+  cuda::stream_ref stream_ref{stream};
+  auto env = cuda::std::execution::env{stream_ref};
+
+  auto error = cub::DeviceMerge::MergePairs(
+    keys1.begin(),
+    values1.begin(),
+    static_cast<int>(keys1.size()),
+    keys2.begin(),
+    values2.begin(),
+    static_cast<int>(keys2.size()),
+    result_keys.begin(),
+    result_values.begin(),
+    cuda::std::less<>{},
+    env);
+  if (error != cudaSuccess)
+  {
+    std::cerr << "cub::DeviceMerge::MergePairs failed with status: " << error << std::endl;
+  }
+
+  thrust::device_vector<int> expected_keys{0, 0, 2, 3, 3, 4, 5};
+  thrust::device_vector<char> expected_values{'a', 'A', 'b', 'B', 'C', 'D', 'c'};
+  // example-end merge-pairs-env
+
+  REQUIRE(error == cudaSuccess);
+  REQUIRE(result_keys == expected_keys);
+  REQUIRE(result_values == expected_values);
+}