Commit 7534c2e
bdicemeta-codesync[bot] authored and committed
fix(cudf): Refactor CudfToVelox output batching to avoid O(n) D->H syncs (facebookincubator#16620)
Summary: This PR implements an optimized `CudfToVelox` batching strategy that results in fewer device-to-host copies and corresponding stream synchronizations, regardless of the input/output batch sizes. The previous implementation split large GPU inputs by doing `cudf::split` + two `cudf::table` deep-copies per output batch, resulting in `O(n_batches)` GPU kernel launches and D->H synchronizations. With maxOutBatchRows=1 and 1000 rows this took ~31s and would timeout under compute-sanitizer. New strategy (see block comment above `getOutput`): **(A)** Large input (`>= targetBatchSize` rows): convert the GPU table to Velox once via a single `to_arrow_host` + synchronize, store it in `veloxBuffer_`, then emit CPU-side slices via `BaseVector::slice()` on subsequent `getOutput()` calls. Zero additional D->H work per slice. **(B)** Small inputs (`< targetBatchSize` rows, e.g. `CudfFilterProject` with high filter selectivity): GPU-concatenate `inputs_` until we reach `targetBatchSize`, then convert the merged table in one D->H transfer. This preserves the GPU-side batching that avoids emitting many undersized Velox batches downstream. Both paths issue exactly one `toVeloxColumn` + `stream.synchronize()` per output batch. The `outputBatchRows` test runtime decreased from around 30 seconds to around 3 seconds. Pull Request resolved: facebookincubator#16620 Reviewed By: srsuryadev Differential Revision: D99330510 Pulled By: peterenescu fbshipit-source-id: a97e283c72f78d9d8733362ec7a3f0587fd3fa76
1 parent f736ec1 commit 7534c2e
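The two-path strategy in the commit message can be sketched with a CPU-only stand-in. This is a minimal illustration, not the Velox implementation: `Batcher`, its members, and `getOutput(target)` are hypothetical names, a `std::vector<int>` plays the role of a GPU batch, and `conversions` counts the simulated `to_arrow_host` + synchronize calls.

```cpp
#include <algorithm>
#include <cstddef>
#include <deque>
#include <vector>

// Hypothetical stand-in for CudfToVelox batching: each std::vector<int>
// models one GPU input batch; "conversion" (one D->H transfer + sync) is
// counted explicitly so the O(1)-per-output-batch property is visible.
struct Batcher {
  std::deque<std::vector<int>> inputs;  // pending "GPU" batches
  std::vector<int> veloxBuffer;         // converted CPU-side buffer
  std::size_t veloxOffset = 0;          // next slice offset into the buffer
  int conversions = 0;                  // simulated D->H sync count

  // Returns the next output batch of at most `target` rows; empty when done.
  std::vector<int> getOutput(std::size_t target) {
    if (veloxOffset >= veloxBuffer.size()) {
      // Buffer drained: refill it from pending inputs.
      veloxBuffer.clear();
      veloxOffset = 0;
      if (inputs.empty()) {
        return {};
      }
      if (inputs.front().size() >= target) {
        // Case A: large front input. One conversion, then CPU-side slicing.
        veloxBuffer = std::move(inputs.front());
        inputs.pop_front();
      } else {
        // Case B: concatenate small inputs up to target, then convert once.
        while (!inputs.empty() && veloxBuffer.size() < target) {
          auto& in = inputs.front();
          veloxBuffer.insert(veloxBuffer.end(), in.begin(), in.end());
          inputs.pop_front();
        }
      }
      ++conversions;  // models the single to_arrow_host + stream.synchronize()
    }
    // Emit the next CPU-side slice; no further "D->H" work per slice.
    const std::size_t take = std::min(target, veloxBuffer.size() - veloxOffset);
    std::vector<int> slice(
        veloxBuffer.begin() + veloxOffset,
        veloxBuffer.begin() + veloxOffset + take);
    veloxOffset += take;
    return slice;
  }
};
```

Draining one 10-row input with a 3-row target yields four output batches from a single simulated conversion (case A); six 1-row inputs with a 4-row target are concatenated and converted once (case B).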

File tree

3 files changed: +114 -89 lines changed

velox/experimental/cudf/exec/CudfConversion.cpp

Lines changed: 107 additions & 88 deletions
@@ -26,8 +26,7 @@
 #include "velox/exec/Operator.h"
 #include "velox/vector/ComplexVector.h"
 
-#include <cudf/copying.hpp>
-#include <cudf/table/table.hpp>
+#include <cudf/types.hpp>
 #include <cudf/utilities/default_stream.hpp>
 
 namespace facebook::velox::cudf_velox {
@@ -204,116 +203,136 @@ std::optional<uint64_t> CudfToVelox::averageRowSize() {
   return averageRowSize_;
 }
 
+// Pop inputs_.front(), convert its GPU table to a Velox RowVector via a
+// single to_arrow_host + synchronize, and return it. The caller is
+// responsible for any further slicing.
+RowVectorPtr CudfToVelox::convertFrontToVelox() {
+  auto cudfVector = std::move(inputs_.front());
+  inputs_.pop_front();
+  auto stream = cudfVector->stream();
+  auto tableView = cudfVector->getTableView();
+  auto output = with_arrow::toVeloxColumn(
+      tableView, pool(), outputType_, "", stream, get_temp_mr());
+  stream.synchronize();
+  output->setType(outputType_);
+  return output;
+}
+
+// Output batching strategy
+// ========================
+// The key constraint is minimising D->H (device-to-host) transfers.
+// Each call to toVeloxColumn / to_arrow_host triggers one D->H copy per
+// column, so calling it once per output batch (rather than once per row
+// or once per input batch) is critical for performance.
+//
+// Two cases arise depending on the size of the front GPU input relative
+// to targetBatchSize:
+//
+// (A) Front input >= targetBatchSize (e.g. CudfOrderBy: one large sorted
+//     table). We convert the whole input to Velox in one shot and then
+//     slice it purely on the CPU using BaseVector::slice(). Subsequent
+//     getOutput() calls return successive CPU slices with no additional
+//     D->H work until veloxBuffer_ is exhausted.
+//
+// (B) Front input < targetBatchSize (e.g. CudfFilterProject with high
+//     selectivity: many small GPU batches). We concatenate inputs on device
+//     until we accumulate targetBatchSize rows, then convert the concat
+//     result to Velox in one shot. This preserves the GPU-side merge
+//     that avoids emitting many undersized Velox batches downstream.
+//
+// In both cases exactly one toVeloxColumn + stream.synchronize() is issued
+// per output batch, regardless of how many GPU inputs were consumed.
 RowVectorPtr CudfToVelox::getOutput() {
   VELOX_NVTX_OPERATOR_FUNC_RANGE();
-  if (finished_ || inputs_.empty()) {
-    finished_ = noMoreInput_ && inputs_.empty();
+  if (finished_) {
     return nullptr;
   }
 
-  // Get the target batch size
-  const auto targetBatchSize = outputBatchRows(averageRowSize());
-
-  // Process single input directly in these cases:
-  // 1. In passthrough mode
-  // 2. If we only have one input and it's smaller than or equal to the target
-  //    batch size
-  if (isPassthroughMode() ||
-      (inputs_.size() == 1 && inputs_.front()->size() <= targetBatchSize)) {
-    // Move the CudfVector out to keep it alive while we use the view.
-    // This avoids expensive materialization when constructed from packed_table.
-    auto cudfVector = std::move(inputs_.front());
-    inputs_.pop_front();
-
-    auto tableView = cudfVector->getTableView();
-    auto stream = cudfVector->stream();
-    if (tableView.num_rows() == 0) {
-      finished_ = noMoreInput_ && inputs_.empty();
+  // Drain veloxBuffer_ (populated on a previous call) before consuming
+  // more GPU inputs.
+  if (!veloxBuffer_) {
+    if (inputs_.empty()) {
+      finished_ = noMoreInput_;
       return nullptr;
     }
-    RowVectorPtr output = with_arrow::toVeloxColumn(
-        tableView, pool(), outputType_, "", stream, get_temp_mr());
-    stream.synchronize();
-    finished_ = noMoreInput_ && inputs_.empty();
-    output->setType(outputType_);
-    // cudfVector goes out of scope here, freeing the GPU memory
-    return output;
-  }
 
-  // Calculate how many tables we need to concatenate to reach the target batch
-  // size and collect them in a vector
-  std::vector<CudfVectorPtr> selectedInputs;
-  vector_size_t totalSize = 0;
+    // Passthrough mode: emit each GPU input as a single Velox batch with no
+    // re-batching. Used when the caller knows the batch size is already
+    // correct (e.g. default pipeline without explicit batch-size overrides).
+    if (isPassthroughMode()) {
+      auto output = convertFrontToVelox();
+      finished_ = noMoreInput_ && inputs_.empty();
+      if (output->size() == 0) {
+        return nullptr;
+      }
+      return output;
+    }
 
-  while (!inputs_.empty() && totalSize < targetBatchSize) {
-    auto& input = inputs_.front();
-    if (totalSize + input->size() <= targetBatchSize) {
-      totalSize += input->size();
-      selectedInputs.push_back(std::move(input));
-      inputs_.pop_front();
+    const auto targetBatchSize = outputBatchRows(averageRowSize());
+
+    if (static_cast<vector_size_t>(inputs_.front()->size()) >=
+        targetBatchSize) {
+      // Case A: large input. Convert once; subsequent calls slice CPU-side.
+      veloxBuffer_ = convertFrontToVelox();
+      veloxOffset_ = 0;
+      averageRowSize_ = std::nullopt; // recompute from next input
    } else {
-      // If the next input would exceed targetBatchSize,
-      // we need to split it and only take what we need
-      auto cudfTableView = input->getTableView();
-      auto stream = input->stream();
-      auto partitions = std::vector<cudf::size_type>{
-          static_cast<cudf::size_type>(targetBatchSize - totalSize)};
-      auto tableSplits = cudf::split(cudfTableView, partitions, stream);
-
-      // Create new CudfVector from the first part
-      auto firstPart =
-          std::make_unique<cudf::table>(tableSplits[0], stream, get_temp_mr());
-      auto firstPartSize = firstPart->num_rows();
-      auto firstPartVector = std::make_shared<CudfVector>(
-          pool(), input->type(), firstPartSize, std::move(firstPart), stream);
-
-      // Create new CudfVector from the second part
-      auto secondPart =
-          std::make_unique<cudf::table>(tableSplits[1], stream, get_temp_mr());
-      auto secondPartSize = secondPart->num_rows();
-      auto secondPartVector = std::make_shared<CudfVector>(
-          pool(), input->type(), secondPartSize, std::move(secondPart), stream);
-
-      // Replace the original input with the second part
-      input = std::move(secondPartVector);
-
-      // Add the first part to selectedInputs
-      selectedInputs.push_back(std::move(firstPartVector));
-      totalSize += firstPartSize;
-      break;
+      // Case B: small inputs. GPU-concat until we reach targetBatchSize,
+      // then convert the merged table in one D->H transfer.
+      auto stream = inputs_.front()->stream();
+      std::vector<CudfVectorPtr> toConcat;
+      vector_size_t accumulated = 0;
+      while (!inputs_.empty() && accumulated < targetBatchSize) {
+        accumulated += static_cast<vector_size_t>(inputs_.front()->size());
+        toConcat.push_back(std::move(inputs_.front()));
+        inputs_.pop_front();
+      }
+      VELOX_CHECK_LE(
+          accumulated,
+          std::numeric_limits<cudf::size_type>::max(),
+          "Accumulated row count exceeds cudf int32 limit");
+      auto concatTable = getConcatenatedTable(
+          std::move(toConcat), outputType_, stream, get_temp_mr());
+      auto tableView = concatTable->view();
+      veloxBuffer_ = with_arrow::toVeloxColumn(
+          tableView, pool(), outputType_, "", stream, get_temp_mr());
+      stream.synchronize();
+      veloxBuffer_->setType(outputType_);
+      veloxOffset_ = 0;
+      averageRowSize_ = std::nullopt;
    }
  }
 
-  finished_ = noMoreInput_ && inputs_.empty();
-
-  // If we have no inputs to process, return nullptr
-  if (selectedInputs.empty()) {
+  // Slice veloxBuffer_ on the CPU to produce the next output batch.
+  const auto totalRows = static_cast<vector_size_t>(veloxBuffer_->size());
+  if (veloxOffset_ >= totalRows) {
+    veloxBuffer_.reset();
+    finished_ = noMoreInput_ && inputs_.empty();
    return nullptr;
  }
 
-  // Concatenate the selected tables on the GPU
-  auto stream = cudfGlobalStreamPool().get_stream();
-  auto resultTable = getConcatenatedTable(
-      std::move(selectedInputs), outputType_, stream, get_temp_mr());
+  const auto targetBatchSize = outputBatchRows(
+      veloxBuffer_->estimateFlatSize() /
+      static_cast<uint64_t>(std::max<vector_size_t>(totalRows, 1)));
+  const auto take = std::min(targetBatchSize, totalRows - veloxOffset_);
 
-  // Convert the concatenated table to a RowVector
-  const auto size = resultTable->num_rows();
-  VELOX_CHECK_NOT_NULL(resultTable);
-  if (size == 0) {
-    return nullptr;
+  auto slice = std::dynamic_pointer_cast<RowVector>(
+      veloxBuffer_->slice(veloxOffset_, take));
+  VELOX_CHECK_NOT_NULL(slice);
+  veloxOffset_ += take;
+
+  if (veloxOffset_ >= totalRows) {
+    veloxBuffer_.reset();
+    finished_ = noMoreInput_ && inputs_.empty();
  }
 
-  RowVectorPtr output = with_arrow::toVeloxColumn(
-      resultTable->view(), pool(), outputType_, "", stream, get_temp_mr());
-  stream.synchronize();
-  finished_ = noMoreInput_ && inputs_.empty();
-  output->setType(outputType_);
-  return output;
+  return slice;
 }
 
 void CudfToVelox::close() {
   exec::Operator::close();
   inputs_.clear();
+  veloxBuffer_.reset();
 }
 
 } // namespace facebook::velox::cudf_velox
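The CPU-side drain at the end of the new `getOutput` reduces to simple offset arithmetic: each call takes `min(targetBatchSize, totalRows - veloxOffset_)` rows and advances the offset until the buffer is exhausted. A standalone sketch of just that arithmetic, under the assumption of a fixed target per call (`planSlices` is a hypothetical helper, not part of the patch):

```cpp
#include <algorithm>
#include <utility>
#include <vector>

// Plan the (offset, length) of each output slice taken from a converted
// buffer of totalRows rows, with at most targetBatchSize rows per slice.
// Mirrors the veloxOffset_ bookkeeping in CudfToVelox::getOutput.
std::vector<std::pair<int, int>> planSlices(int totalRows, int targetBatchSize) {
  std::vector<std::pair<int, int>> slices;
  int offset = 0;
  while (offset < totalRows) {
    // Last slice may be shorter than the target.
    const int take = std::min(targetBatchSize, totalRows - offset);
    slices.emplace_back(offset, take);
    offset += take;
  }
  return slices;
}
```

For example, a 10-row buffer with a 3-row target yields slices at offsets 0, 3, 6, and 9, the last one 1 row long; the slices always cover the buffer exactly once.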

velox/experimental/cudf/exec/CudfConversion.h

Lines changed: 6 additions & 0 deletions
@@ -96,8 +96,14 @@ class CudfToVelox : public exec::Operator, public NvtxHelper {
  private:
   bool isPassthroughMode() const;
   std::optional<uint64_t> averageRowSize();
+  // Convert inputs_.front() to Velox once; slice it CPU-side per batch.
+  RowVectorPtr convertFrontToVelox();
   std::optional<uint64_t> averageRowSize_;
   std::deque<CudfVectorPtr> inputs_;
+  // Converted CPU-side buffer being drained by successive getOutput() calls.
+  RowVectorPtr veloxBuffer_;
+  // Current offset into veloxBuffer_ for the next slice.
+  vector_size_t veloxOffset_{0};
   bool finished_ = false;
 };

velox/experimental/cudf/exec/CudfOrderBy.cpp

Lines changed: 1 addition & 1 deletion
@@ -98,7 +98,7 @@ RowVectorPtr CudfOrderBy::getOutput() {
   if (finished_ || !noMoreInput_) {
     return nullptr;
   }
-  finished_ = noMoreInput_;
+  finished_ = true;
   return outputTable_;
 }
