Skip to content

Commit 278c38a

Browse files
committed
Merge branch 'branch-25.06' of github.com:rapidsai/rapidsmpf into multi-packed-data-chunk
2 parents baad9c0 + 94d5257 commit 278c38a

File tree

24 files changed

+775
-249
lines changed

24 files changed

+775
-249
lines changed

cpp/include/rapidsmpf/buffer/buffer.hpp

Lines changed: 92 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -5,17 +5,22 @@
55
#pragma once
66

77
#include <array>
8+
#include <atomic>
89
#include <memory>
10+
#include <mutex>
911
#include <variant>
1012
#include <vector>
1113

14+
#include <cuda_runtime.h>
15+
1216
#include <rmm/device_buffer.hpp>
1317

1418
#include <rapidsmpf/error.hpp>
1519

1620
namespace rapidsmpf {
1721

1822
class BufferResource;
23+
class Event;
1924

2025
/// @brief Enum representing the type of memory.
2126
enum class MemoryType : int {
@@ -26,17 +31,81 @@ enum class MemoryType : int {
2631
/// @brief Array of all the different memory types.
2732
constexpr std::array<MemoryType, 2> MEMORY_TYPES{{MemoryType::DEVICE, MemoryType::HOST}};
2833

34+
namespace {
35+
/// @brief Helper for overloaded lambdas using std::visit.
36+
template <class... Ts>
37+
struct overloaded : Ts... {
38+
using Ts::operator()...;
39+
};
40+
/// @brief Explicit deduction guide
41+
template <class... Ts>
42+
overloaded(Ts...) -> overloaded<Ts...>;
43+
44+
} // namespace
45+
2946
/**
3047
* @brief Buffer representing device or host memory.
3148
*
3249
* @note The constructors are private, use `BufferResource` to construct buffers.
3350
* @note The memory type (e.g., host or device) is constant and cannot change during
3451
* the buffer's lifetime.
52+
* @note A buffer is a stream-ordered object; when passing to a library which is
53+
* not stream-aware, one must ensure that `is_ready` returns `true`, otherwise
54+
* behaviour is undefined.
3555
*/
3656
class Buffer {
3757
friend class BufferResource;
3858

3959
public:
60+
/**
61+
* @brief CUDA event to provide synchronization among set of chunks.
62+
*
63+
* This event is used to serve as a synchronization point for a set of chunks
64+
* given a user-specified stream.
65+
*
66+
* @note To prevent undefined behavior due to unfinished memory operations, events
67+
* should be used in the following cases, if any of the operations below was
68+
* performed *asynchronously with respect to the host*:
69+
* 1. Before addressing a device buffer's allocation.
70+
* 2. Before accessing a device buffer's data that has been copied from
71+
* any location, or that has been processed by a CUDA kernel.
72+
* 3. Before accessing a host buffer's data that has been copied from device,
73+
* or processed by a CUDA kernel.
74+
*/
75+
class Event {
76+
public:
77+
/**
78+
* @brief Construct a CUDA event for a given stream.
79+
*
80+
* @param stream CUDA stream used for device memory operations
81+
*/
82+
Event(rmm::cuda_stream_view stream);
83+
84+
/**
85+
* @brief Destructor for Event.
86+
*
87+
* Cleans up the CUDA event if one was created.
88+
*/
89+
~Event();
90+
91+
/**
92+
* @brief Check if the CUDA event has been completed.
93+
*
94+
* @return true if the event has been completed, false otherwise.
95+
*/
96+
[[nodiscard]] bool is_ready();
97+
98+
private:
99+
cudaEvent_t event_; ///< CUDA event used to track device memory allocation
100+
std::atomic<bool> done_{false
101+
}; ///< Cache of the event status to avoid unnecessary queries.
102+
mutable std::mutex mutex_; ///< Protects access to event_
103+
std::atomic<bool> destroying_{false
104+
}; ///< Flag to indicate destruction in progress
105+
std::atomic<int> active_readers_{0
106+
}; ///< Number of threads currently executing is_ready()
107+
};
108+
40109
/// @brief Storage type for the device buffer.
41110
using DeviceStorageT = std::unique_ptr<rmm::device_buffer>;
42111

@@ -48,15 +117,6 @@ class Buffer {
48117
*/
49118
using StorageT = std::variant<DeviceStorageT, HostStorageT>;
50119

51-
/// @brief Helper for overloaded lambdas for Storage types in StorageT
52-
template <class... Ts>
53-
struct overloaded : Ts... {
54-
using Ts::operator()...;
55-
};
56-
/// @brief Explicit deduction guide
57-
template <class... Ts>
58-
overloaded(Ts...) -> overloaded<Ts...>;
59-
60120
/**
61121
* @brief Access the underlying host memory buffer (const).
62122
*
@@ -112,7 +172,7 @@ class Buffer {
112172
*
113173
* @throws std::logic_error if the buffer is not initialized.
114174
*/
115-
MemoryType constexpr mem_type() const {
175+
[[nodiscard]] MemoryType constexpr mem_type() const {
116176
return std::visit(
117177
overloaded{
118178
[](const HostStorageT&) -> MemoryType { return MemoryType::HOST; },
@@ -122,8 +182,16 @@ class Buffer {
122182
);
123183
}
124184

125-
/// @brief Buffer has a move ctor but no copy or assign operator.
126-
Buffer(Buffer&&) = default;
185+
/**
186+
* @brief Check if the device memory operation has completed.
187+
*
188+
* @return true if the device memory operation has completed or no device
189+
* memory operation was performed, false if it is still in progress.
190+
*/
191+
[[nodiscard]] bool is_ready() const;
192+
193+
/// @brief Delete move and copy constructors and assignment operators.
194+
Buffer(Buffer&&) = delete;
127195
Buffer(Buffer const&) = delete;
128196
Buffer& operator=(Buffer& o) = delete;
129197
Buffer& operator=(Buffer&& o) = delete;
@@ -143,13 +211,20 @@ class Buffer {
143211
* @brief Construct a Buffer from device memory.
144212
*
145213
* @param device_buffer A unique pointer to a device buffer.
214+
* @param stream CUDA stream used for the device buffer allocation.
146215
* @param br Buffer resource for memory allocation.
216+
* @param event The shared event to use for the buffer.
147217
*
148218
* @throws std::invalid_argument if `device_buffer` is null.
149219
* @throws std::invalid_argument if `stream` or `br->mr` isn't the same used by
150220
* `device_buffer`.
151221
*/
152-
Buffer(std::unique_ptr<rmm::device_buffer> device_buffer, BufferResource* br);
222+
Buffer(
223+
std::unique_ptr<rmm::device_buffer> device_buffer,
224+
rmm::cuda_stream_view stream,
225+
BufferResource* br,
226+
std::shared_ptr<Event> event = nullptr
227+
);
153228

154229
/**
155230
* @brief Access the underlying host memory buffer.
@@ -184,7 +259,7 @@ class Buffer {
184259
/**
185260
* @brief Create a copy of this buffer using the same memory type.
186261
*
187-
* @param stream CUDA stream used for device memory operations.
262+
* @param stream CUDA stream used for the device buffer allocation and copy.
188263
* @return A unique pointer to a new Buffer containing the copied data.
189264
*/
190265
[[nodiscard]] std::unique_ptr<Buffer> copy(rmm::cuda_stream_view stream) const;
@@ -193,7 +268,7 @@ class Buffer {
193268
* @brief Create a copy of this buffer using the specified memory type.
194269
*
195270
* @param target The target memory type.
196-
* @param stream CUDA stream used for device memory operations.
271+
* @param stream CUDA stream used for device buffer allocation and copy.
197272
* @return A unique pointer to a new Buffer containing the copied data.
198273
*/
199274
[[nodiscard]] std::unique_ptr<Buffer> copy(
@@ -208,6 +283,8 @@ class Buffer {
208283
/// @brief The underlying storage host memory or device memory buffer (where
209284
/// applicable).
210285
StorageT storage_;
286+
/// @brief CUDA event used to track copy operations
287+
std::shared_ptr<Event> event_;
211288
};
212289

213290
} // namespace rapidsmpf

cpp/include/rapidsmpf/buffer/resource.hpp

Lines changed: 11 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -256,9 +256,15 @@ class BufferResource {
256256
* @brief Move device buffer data into a Buffer.
257257
*
258258
* @param data A unique pointer to the device buffer.
259+
* @param stream CUDA stream used for the data allocation, copy, and/or move.
260+
* @param event The event to use for the buffer.
259261
* @return A unique pointer to the resulting Buffer.
260262
*/
261-
std::unique_ptr<Buffer> move(std::unique_ptr<rmm::device_buffer> data);
263+
std::unique_ptr<Buffer> move(
264+
std::unique_ptr<rmm::device_buffer> data,
265+
rmm::cuda_stream_view stream,
266+
std::shared_ptr<Buffer::Event> event = nullptr
267+
);
262268

263269
/**
264270
* @brief Move a Buffer to the specified memory type.
@@ -267,7 +273,7 @@ class BufferResource {
267273
*
268274
* @param target The target memory type.
269275
* @param buffer The buffer to move.
270-
* @param stream CUDA stream for the operation.
276+
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
271277
* @param reservation The reservation to use for memory allocations.
272278
* @return A unique pointer to the moved Buffer.
273279
*
@@ -287,7 +293,7 @@ class BufferResource {
287293
* If and only if moving between different memory types will this perform a copy.
288294
*
289295
* @param buffer The buffer to move.
290-
* @param stream CUDA stream for the operation.
296+
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
291297
* @param reservation The reservation to use for memory allocations.
292298
* @return A unique pointer to the resulting device buffer.
293299
*
@@ -307,7 +313,7 @@ class BufferResource {
307313
* If and only if moving between different memory types will this perform a copy.
308314
*
309315
* @param buffer The buffer to move.
310-
* @param stream CUDA stream for the operation.
316+
* @param stream CUDA stream used for the buffer allocation, copy, and/or move.
311317
* @param reservation The reservation to use for memory allocations.
312318
* @return A unique pointer to the resulting host vector.
313319
*
@@ -328,7 +334,7 @@ class BufferResource {
328334
*
329335
* @param target The target memory type.
330336
* @param buffer The buffer to copy.
331-
* @param stream CUDA stream for the operation.
337+
* @param stream CUDA stream used for the buffer allocation and copy.
332338
* @param reservation The reservation to use for memory allocations.
333339
* @return A unique pointer to the new Buffer.
334340
*

cpp/include/rapidsmpf/communicator/communicator.hpp

Lines changed: 12 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -426,6 +426,12 @@ class Communicator {
426426
* @param rank The destination rank.
427427
* @param tag Message tag for identification.
428428
* @return A unique pointer to a `Future` representing the asynchronous operation.
429+
*
430+
* @warning The caller is responsible for ensuring the underlying `Buffer` allocation
431+
* and data are already valid before calling, for example, when a CUDA allocation
432+
* and/or copy are done asynchronously. Specifically, the caller should ensure
433+
* `Buffer::is_ready()` returns true before calling this function, if not, a
434+
* warning is printed and the application will terminate.
429435
*/
430436
[[nodiscard]] virtual std::unique_ptr<Future> send(
431437
std::unique_ptr<Buffer> msg, Rank rank, Tag tag
@@ -438,6 +444,12 @@ class Communicator {
438444
* @param tag Message tag for identification.
439445
* @param recv_buffer The receive buffer.
440446
* @return A unique pointer to a `Future` representing the asynchronous operation.
447+
*
448+
* @warning The caller is responsible to ensure the underlying `Buffer` allocation
449+
* is already valid before calling, for example, when a CUDA allocation
450+
* and/or copy are done asynchronously. Specifically, the caller should ensure
451+
* `Buffer::is_ready()` returns true before calling this function, if not, a
452+
* warning is printed and the application will terminate.
441453
*/
442454
[[nodiscard]] virtual std::unique_ptr<Future> recv(
443455
Rank rank, Tag tag, std::unique_ptr<Buffer> recv_buffer

cpp/include/rapidsmpf/shuffler/chunk.hpp

Lines changed: 41 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -4,7 +4,9 @@
44
*/
55
#pragma once
66

7+
#include <atomic>
78
#include <memory>
9+
#include <mutex>
810
#include <sstream>
911
#include <vector>
1012

@@ -14,6 +16,7 @@
1416
#include <cudf/table/table.hpp>
1517

1618
#include <rapidsmpf/buffer/buffer.hpp>
19+
#include <rapidsmpf/communicator/communicator.hpp>
1720
#include <rapidsmpf/shuffler/partition.hpp>
1821

1922
namespace rapidsmpf::shuffler::detail {
@@ -272,6 +275,7 @@ class ChunkBatch {
272275
class Chunk {
273276
public:
274277
PartID const pid; ///< Partition ID that this chunk belongs to.
278+
275279
ChunkID const cid; ///< Unique ID of this chunk.
276280

277281
/// If not zero, the number of chunks of the partition expected to get from the
@@ -292,8 +296,6 @@ class Chunk {
292296
*
293297
* @param pid The ID of the partition this chunk is part of.
294298
* @param cid The ID of the chunk.
295-
* @param expected_num_chunks If not zero, the number of chunks of the partition
296-
* expected to get from the sending rank. Ignored when it is zero.
297299
* @param gpu_data_size If known, the size of the gpu data buffer (in bytes).
298300
* @param metadata The metadata of the packed `cudf::table` that makes up this
299301
* chunk.
@@ -303,7 +305,6 @@ class Chunk {
303305
Chunk(
304306
PartID pid,
305307
ChunkID cid,
306-
std::size_t expected_num_chunks,
307308
std::size_t gpu_data_size,
308309
std::unique_ptr<std::vector<uint8_t>> metadata,
309310
std::unique_ptr<Buffer> gpu_data
@@ -371,6 +372,43 @@ class Chunk {
371372
std::size_t max_nbytes = 512,
372373
rmm::cuda_stream_view stream = cudf::get_default_stream()
373374
) const;
375+
376+
/**
377+
* @brief Returns true if the chunk is ready for consumption.
378+
*
379+
* Checks that the gpu_data's CUDA event is ready, if gpu_data contains a valid
380+
* buffer. The CUDA event is used to synchronize the chunk's data to ensure
381+
* any allocation or copy (e.g., spilling) is complete before the chunk is
382+
* consumed. If expected_num_chunks is greater than 0, or gpu_data_size is 0,
383+
* the chunk is considered always ready as it should not have any CUDA data
384+
* to receive.
385+
*
386+
* @return true if the chunk is ready, false otherwise.
387+
*/
388+
[[nodiscard]] bool is_ready() const;
389+
390+
private:
391+
/**
392+
* @brief Construct a new chunk of a partition.
393+
*
394+
* @param pid The ID of the partition this chunk is part of.
395+
* @param cid The ID of the chunk.
396+
* @param expected_num_chunks If not zero, the number of chunks of the partition
397+
* expected to get from the sending rank. Ignored when it is zero.
398+
* @param gpu_data_size If known, the size of the gpu data buffer (in bytes).
399+
* @param metadata The metadata of the packed `cudf::table` that makes up this
400+
* chunk.
401+
* @param gpu_data The gpu_data of the packed `cudf::table` that makes up this
402+
* chunk.
403+
*/
404+
Chunk(
405+
PartID pid,
406+
ChunkID cid,
407+
std::size_t expected_num_chunks,
408+
std::size_t gpu_data_size,
409+
std::unique_ptr<std::vector<uint8_t>> metadata,
410+
std::unique_ptr<Buffer> gpu_data
411+
);
374412
};
375413

376414
/**

cpp/include/rapidsmpf/shuffler/postbox.hpp

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -83,11 +83,11 @@ class PostBox {
8383
std::unordered_map<ChunkID, Chunk> extract_by_key(KeyType key);
8484

8585
/**
86-
* @brief Extracts all chunks from the PostBox.
86+
* @brief Extracts all ready chunks from the PostBox.
8787
*
88-
* @return A vector of all chunks in the PostBox.
88+
* @return A vector of all ready chunks in the PostBox.
8989
*/
90-
std::vector<Chunk> extract_all();
90+
std::vector<Chunk> extract_all_ready();
9191

9292
/**
9393
* @brief Checks if the PostBox is empty.

0 commit comments

Comments
 (0)