Commit 4fbfd7c

implement single owner

1 parent d81633a commit 4fbfd7c
File tree

7 files changed: +104 additions, -296 deletions

7 files changed

+104
-296
lines changed

core/raydp-main/src/main/java/org/apache/spark/raydp/RayExecutorUtils.java

Lines changed: 0 additions & 28 deletions
@@ -25,7 +25,6 @@
 import io.ray.runtime.object.ObjectRefImpl;
 import java.util.List;
 import java.util.Map;
-import org.apache.spark.executor.PutRDDPartitionToBlockStoreArgs;
 import org.apache.spark.executor.RayDPExecutor;
 
 public class RayExecutorUtils {
@@ -87,33 +86,6 @@ public static ObjectRef<byte[]> getRDDPartition(
         .remote();
   }
 
-  public static ObjectRef<Boolean> putRDDPartitionToBlockStoreViaRegistry(
-      ActorHandle<RayDPExecutor> handle,
-      int rddId,
-      int partitionId,
-      String schema,
-      String driverAgentUrl,
-      String registryActorName,
-      String blockStoreActorName,
-      String batchKey,
-      double numCpus,
-      double memory,
-      double nodeAffinity) {
-    PutRDDPartitionToBlockStoreArgs args =
-        new PutRDDPartitionToBlockStoreArgs(
-            rddId,
-            partitionId,
-            schema,
-            driverAgentUrl,
-            registryActorName,
-            blockStoreActorName,
-            batchKey,
-            numCpus,
-            memory,
-            nodeAffinity);
-    return handle.task(RayDPExecutor::putRDDPartitionToBlockStoreViaRegistry, args).remote();
-  }
-
   public static void exitExecutor(ActorHandle<RayDPExecutor> handle) {
     handle.task(RayDPExecutor::stop).remote();
   }

core/raydp-main/src/main/java/org/apache/spark/raydp/SparkOnRayConfigs.java

Lines changed: 0 additions & 13 deletions
@@ -10,19 +10,6 @@ public class SparkOnRayConfigs {
   public static final String SPARK_MASTER_ACTOR_RESOURCE_PREFIX =
       "spark.ray.raydp_spark_master.actor.resource";
 
-  public static final String BLOCKSTORE_ACTOR_RESOURCE_CPU =
-      "spark.ray.raydp_blockstore.actor.resource.CPU";
-  public static final String BLOCKSTORE_ACTOR_RESOURCE_MEMORY =
-      "spark.ray.raydp_blockstore.actor.resource.memory";
-  /**
-   * Node affinity resource fraction used to pin per-executor BlockStore actors to the executor
-   * node via the special "node:<ip>" resource. Ray provides 1.0 of this resource per node.
-   *
-   * Defaults to 0.001 (allowing up to ~1000 such actors per node).
-   */
-  public static final String BLOCKSTORE_ACTOR_NODE_AFFINITY_RESOURCE =
-      "spark.ray.raydp_blockstore.actor.resource.node_affinity";
-
   /**
    * Extra JVM options for the RayDP AppMaster actor and gateway process.
    * This is useful for passing JDK 17+ --add-opens flags.

core/raydp-main/src/main/scala/org/apache/spark/executor/RayDPExecutor.scala

Lines changed: 27 additions & 59 deletions
@@ -20,16 +20,13 @@ package org.apache.spark.executor
 import java.io.{ByteArrayOutputStream, File}
 import java.nio.channels.Channels
 import java.nio.file.Paths
-import java.util.Optional
+import java.util.concurrent.ConcurrentHashMap
 import java.util.concurrent.atomic.AtomicBoolean
 
 import scala.reflect.classTag
 
 import com.intel.raydp.shims.SparkShimLoader
-import io.ray.api.PyActorHandle
 import io.ray.api.Ray
-import io.ray.api.call.PyActorTaskCaller
-import io.ray.api.function.PyActorMethod
 import io.ray.runtime.config.RayConfig
 import org.apache.arrow.vector.ipc.{ArrowStreamWriter, WriteChannel}
 import org.apache.arrow.vector.ipc.message.{IpcOption, MessageSerializer}
@@ -272,6 +269,22 @@
     Ray.exitActor
   }
 
+  /**
+   * Pop (remove and return) a previously stored Arrow IPC stream by key.
+   *
+   * This method is intended to be called from a Python "owner/registry" actor via Ray
+   * cross-language actor calls. Since the Python actor is the caller, Ray will assign
+   * ownership of the returned object to that Python actor.
+   */
+  def popArrowIPC(batchKey: String): Array[Byte] = {
+    val bytes = RayDPExecutor.popArrowIPC(batchKey)
+    if (bytes == null) {
+      throw new RayDPException(
+        s"Missing Arrow IPC bytes for batchKey=$batchKey on executorId=$executorId.")
+    }
+    bytes
+  }
+
   def getBlockLocations(rddId: Int, numPartitions: Int): Array[String] = {
     val env = SparkEnv.get
     val blockIds = (0 until numPartitions).map(i =>
@@ -353,63 +366,18 @@
     byteOut.close
     result
   }
+}
 
-  /**
-   * For recoverable Spark->Ray Dataset conversion:
-   * read cached Arrow IPC bytes from Spark block manager, then push them into a Python BlockStore
-   * actor created via a Python registry actor.
-   */
-  def putRDDPartitionToBlockStoreViaRegistry(
-      args: PutRDDPartitionToBlockStoreArgs): Boolean = {
-    val bytes = getRDDPartition(args.rddId, args.partitionId, args.schemaStr, args.driverAgentUrl)
+object RayDPExecutor {
+  // Per-executor in-memory buffer for Arrow IPC streams produced by Spark tasks.
+  // Stored in the executor (Ray actor) process; entries are removed by popArrowIPC.
+  private val arrowIpcByKey = new ConcurrentHashMap[String, Array[Byte]]()
 
-    val registryOpt = Ray.getActor(args.registryActorName).asInstanceOf[Optional[AnyRef]]
-    if (!registryOpt.isPresent) {
-      throw new RayDPException(s"Registry actor ${args.registryActorName} not found.")
-    }
-    val regAny: AnyRef = registryOpt.get()
-    if (!regAny.isInstanceOf[PyActorHandle]) {
-      throw new RayDPException(s"Registry actor ${args.registryActorName} is not a Python actor.")
-    }
-    val registryHandle = regAny.asInstanceOf[PyActorHandle]
-
-    val getActorMethod =
-      PyActorMethod.of("get_or_create_blockstore_actor", classOf[java.lang.Boolean])
-    val createArgs: Array[AnyRef] = Array(
-      args.blockStoreActorName,
-      nodeIp,
-      Double.box(args.numCpus),
-      Double.box(args.memory),
-      Double.box(args.nodeAffinity)
-    )
-    new PyActorTaskCaller(registryHandle, getActorMethod, createArgs).remote().get()
-
-    val bsOpt = Ray.getActor(args.blockStoreActorName).asInstanceOf[Optional[AnyRef]]
-    if (!bsOpt.isPresent) {
-      throw new RayDPException(s"BlockStore actor ${args.blockStoreActorName} not found.")
-    }
-    val bsAny: AnyRef = bsOpt.get()
-    if (!bsAny.isInstanceOf[PyActorHandle]) {
-      throw new RayDPException(
-        s"BlockStore actor ${args.blockStoreActorName} is not a Python actor.")
-    }
-    val bsHandle = bsAny.asInstanceOf[PyActorHandle]
+  def putArrowIPC(batchKey: String, bytes: Array[Byte]): Unit = {
+    arrowIpcByKey.put(batchKey, bytes)
+  }
 
-    val putMethod = PyActorMethod.of("put_arrow_ipc", classOf[java.lang.Boolean])
-    val putArgs: Array[AnyRef] = Array(args.batchKey, bytes.asInstanceOf[AnyRef])
-    new PyActorTaskCaller(bsHandle, putMethod, putArgs).remote().get()
-    true
+  def popArrowIPC(batchKey: String): Array[Byte] = {
+    arrowIpcByKey.remove(batchKey)
   }
 }
-
-case class PutRDDPartitionToBlockStoreArgs(
-    rddId: Int,
-    partitionId: Int,
-    schemaStr: String,
-    driverAgentUrl: String,
-    registryActorName: String,
-    blockStoreActorName: String,
-    batchKey: String,
-    numCpus: Double,
-    memory: Double,
-    nodeAffinity: Double) extends Serializable
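
The Python counterpart that consumes popArrowIPC is not included in this diff. Below is a minimal sketch of how an owner-side Python actor could pop and decode one buffered batch, assuming the RayDPExecutor actor is registered under a name of the form raydp-executor-<executorId> (as constructed in ObjectStoreWriter below) and can be reached via ray.get_actor and Ray's cross-language actor calls; the concrete actor name and batch key here are illustrative, not the commit's actual code.

import pyarrow as pa
import ray

# Illustrative only: pop one buffered Arrow IPC stream from the Java executor actor
# and decode it into a pyarrow.Table. Because this runs inside the Python owner actor,
# the ObjectRefs created here are owned by that actor.
executor = ray.get_actor("raydp-executor-1")                   # hypothetical executor actor name
ipc_bytes = ray.get(executor.popArrowIPC.remote("some-batch-key"))  # cross-language actor call
table = pa.ipc.open_stream(ipc_bytes).read_all()               # Arrow IPC stream -> pyarrow.Table
block_ref = ray.put(table)                                     # ref owned by the calling (owner) actor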

core/raydp-main/src/main/scala/org/apache/spark/sql/raydp/ObjectStoreWriter.scala

Lines changed: 19 additions & 82 deletions
@@ -19,12 +19,9 @@ package org.apache.spark.sql.raydp
 
 import com.intel.raydp.shims.SparkShimLoader
 import io.ray.api.{ActorHandle, ObjectRef, Ray}
-import io.ray.api.PyActorHandle
-import io.ray.api.function.PyActorMethod
 import io.ray.runtime.AbstractRayRuntime
-import io.ray.runtime.config.RayConfig
 import java.io.ByteArrayOutputStream
-import java.util.{List, Optional, UUID}
+import java.util.{List, UUID}
 import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue}
 import java.util.function.{Function => JFunction}
 import org.apache.arrow.vector.VectorSchemaRoot
@@ -39,7 +36,6 @@ import org.apache.spark.deploy.raydp._
 import org.apache.spark.executor.RayDPExecutor
 import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.raydp.{RayDPUtils, RayExecutorUtils}
-import org.apache.spark.raydp.SparkOnRayConfigs
 import org.apache.spark.sql.DataFrame
 import org.apache.spark.sql.execution.arrow.ArrowWriter
 import org.apache.spark.sql.execution.python.BatchIterator
@@ -66,76 +62,33 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
   def writeToRay(
       data: Array[Byte],
       numRecords: Int,
-      queue: ObjectRefHolder.Queue,
       ownerName: String): RecordBatch = {
 
-    // Owner-transfer only implementation:
-    // - ownerName must always be provided (non-empty) and refer to a Python actor
-    //   implemented RayDPBlockStoreActorRegistry.
-    // - JVM never creates/handles Ray ObjectRefs for the dataset blocks.
-    // - JVM returns only a per-batch key encoded in RecordBatch.objectId (bytes),
-    //   and Python will fetch the real ObjectRefs from the owner actor by key.
-
+    // Single-owner implementation:
+    // - Spark executor JVM actor produces Arrow IPC bytes (data).
+    // - Bytes are buffered inside the executor actor process keyed by batchKey.
+    // - JVM returns (executorActorName, batchKey) to Python.
+    // - A Python single owner actor later calls executorActor.popArrowIPC(batchKey),
+    //   decodes into pyarrow.Table and becomes the Ray owner of the resulting Dataset blocks.
+    //
+    // We keep ownerName non-empty for API consistency and to avoid accidental usage without
+    // a dedicated owner actor on the Python side.
     if (ownerName == null || ownerName.isEmpty) {
       throw new RayDPException("ownerName must be set for Spark->Ray conversion.")
     }
 
-    val registryActorOptional = Ray.getActor(ownerName).asInstanceOf[Optional[AnyRef]]
-    if (!registryActorOptional.isPresent) {
-      throw new RayDPException(s"Blobstore registry actor $ownerName not found.")
-    }
-    val registryActorHandle: AnyRef = registryActorOptional.get()
-    if (!registryActorHandle.isInstanceOf[PyActorHandle]) {
-      throw new RayDPException(
-        s"Blobstore registry actor $ownerName is not a Python actor.")
-    }
-
-    val appName = SparkEnv.get.conf.get("spark.app.name", "raydp")
-    val blockStoreActorName =
-      ObjectStoreWriter.getBlockStoreActorName(appName, SparkEnv.get.executorId)
-    val pyHandle = registryActorHandle.asInstanceOf[PyActorHandle]
-    val getActorMethod = PyActorMethod.of(
-      "get_or_create_blockstore_actor", classOf[java.lang.Boolean])
-
-    // Get config inside to retain backward compatibility since this is a public API.
-    val nodeIp = RayConfig.create().nodeIp
-    val cpuOpt =
-      SparkEnv.get.conf.getOption(SparkOnRayConfigs.BLOCKSTORE_ACTOR_RESOURCE_CPU)
-    val memOpt =
-      SparkEnv.get.conf.getOption(SparkOnRayConfigs.BLOCKSTORE_ACTOR_RESOURCE_MEMORY)
-    val nodeAffinityOpt =
-      SparkEnv.get.conf.getOption(SparkOnRayConfigs.BLOCKSTORE_ACTOR_NODE_AFFINITY_RESOURCE)
-    val numCpus = cpuOpt.map(_.toDouble).getOrElse(0.0)
-    val memory = memOpt.map(ObjectStoreWriter.parseMemoryBytes).getOrElse(0.0)
-    val nodeAffinity = nodeAffinityOpt.map(_.toDouble).getOrElse(0.001)
-
-    pyHandle
-      .task(
-        getActorMethod,
-        blockStoreActorName,
-        nodeIp,
-        Double.box(numCpus),
-        Double.box(memory),
-        Double.box(nodeAffinity))
-      .remote()
-      .get()
-    val blockStorageActorHandleOpt =
-      Ray.getActor(blockStoreActorName).asInstanceOf[Optional[PyActorHandle]]
-    if (!blockStorageActorHandleOpt.isPresent) {
-      throw new RayDPException(s"Actor $blockStoreActorName not found when putting dataset block.")
-    }
-    val blockStorageActorHandle = blockStorageActorHandleOpt.get()
-
+    val executorId = SparkEnv.get.executorId
+    val executorActorName = s"raydp-executor-${executorId}"
     val batchKey = UUID.randomUUID().toString
 
-    // put_arrow_ipc(batchKey, arrowBytes) -> boolean ack
-    val putArrowIPCMethod = PyActorMethod.of("put_arrow_ipc", classOf[java.lang.Boolean])
-    blockStorageActorHandle.task(putArrowIPCMethod, batchKey, data).remote().get()
+    // Buffer bytes in the executor actor process. The Python owner actor will pop them via
+    // cross-language actor call later.
+    RayDPExecutor.putArrowIPC(batchKey, data)
 
-    // RecordBatch payload is an application-level locator (not Ray object metadata):
-    // - ownerAddress encodes the BlockStore actor name (UTF-8)
+    // RecordBatch payload:
+    // - ownerAddress encodes the RayDPExecutor actor name (UTF-8)
     // - objectId encodes the batch key (UTF-8)
-    RecordBatch(blockStoreActorName.getBytes("UTF-8"), batchKey.getBytes("UTF-8"), numRecords)
+    RecordBatch(executorActorName.getBytes("UTF-8"), batchKey.getBytes("UTF-8"), numRecords)
   }
 
   /**
@@ -151,8 +104,6 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
     val schema = df.schema
 
     val objectIds = df.queryExecution.toRdd.mapPartitions{ iter =>
-      val queue = ObjectRefHolder.getQueue(uuid)
-
       // DO NOT use iter.grouped(). See BatchIterator.
       val batchIter = if (batchSize > 0) {
         new BatchIterator(iter, batchSize)
@@ -196,7 +147,7 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
 
         // get the wrote ByteArray and save to Ray ObjectStore
         val byteArray = byteOut.toByteArray
-        results += writeToRay(byteArray, numRecords, queue, ownerName)
+        results += writeToRay(byteArray, numRecords, ownerName)
         // end writes footer to the output stream and doesn't clean any resources.
         // It could throw exception if the output stream is closed, so it should be
        // in the try block.
@@ -267,20 +218,6 @@ object ObjectStoreWriter {
     }
   }
 
-  private def sanitizeActorName(name: String): String = {
-    if (name == null || name.isEmpty) {
-      "raydp"
-    } else {
-      // Ray named actor names should be reasonably simple; normalize to [A-Za-z0-9_].
-      name.replaceAll("[^A-Za-z0-9_]", "_")
-    }
-  }
-
-  private[spark] def getBlockStoreActorName(appName: String, executorId: String): String = {
-    val safeAppName = sanitizeActorName(appName)
-    s"${safeAppName}_BLOCKSTORE_${executorId}"
-  }
-
   def getAddress(): Array[Byte] = {
     if (address == null) {
       val objectRef = Ray.put(1)

python/raydp/spark/dataset.py

Lines changed: 8 additions & 14 deletions
@@ -150,25 +150,19 @@ def _save_spark_df_to_object_store(df: sql.DataFrame, use_batch: bool = True,
     records = object_store_writer.save(use_batch, actor_owner_name)
 
     # JVM returns List[RecordBatch] where:
-    # - record.ownerAddress() is UTF-8 bytes of the BlockStore actor name
+    # - record.ownerAddress() is UTF-8 bytes of the Spark executor (RayDPExecutor) actor name
    # - record.objectId() is UTF-8 bytes of the batch_key
     actor_names = [bytes(record.ownerAddress()).decode("utf-8") for record in records]
     batch_keys = [bytes(record.objectId()).decode("utf-8") for record in records]
     block_sizes = [record.numRecords() for record in records]
 
-    # Group by BlockStore actor, fetch refs, then restore original order.
-    keys_by_actor = {}
-    for actor_name, key in zip(actor_names, batch_keys):
-        keys_by_actor.setdefault(actor_name, []).append(key)
-
-    refs_by_key = {}
-    for actor_name, keys in keys_by_actor.items():
-        blockstore_actor = ray.get_actor(actor_name)
-        refs = ray.get(blockstore_actor.get_block_refs.remote(keys))
-        for k, ref in zip(keys, refs):
-            refs_by_key[k] = ref
-
-    blocks = [refs_by_key[k] for k in batch_keys]
+    # Materialize blocks via the owner actor so the owner actor becomes the Ray owner
+    # of the returned Dataset blocks (ObjectRefs), while the blocks are still produced
+    # on (and typically stored on) executor nodes for locality.
+    owner_actor = ray.get_actor(actor_owner_name)
+    blocks = ray.get(owner_actor.fetch_block_refs.remote(actor_names, batch_keys))
+    # Keep refs in owner actor state to prevent owner-side GC from releasing ownership.
+    owner.set_reference_as_state(owner_actor, blocks)
     return blocks, block_sizes
 
 def spark_dataframe_to_ray_dataset(df: sql.DataFrame,
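
The owner-side helpers referenced above (fetch_block_refs and owner.set_reference_as_state) live outside this diff. Below is a rough sketch of what such a single-owner actor could look like, under the same cross-language assumptions as the earlier example; the class and method names are illustrative rather than the commit's actual implementation.

import pyarrow as pa
import ray


@ray.remote
class SingleOwnerActor:
    """Illustrative single-owner actor sketch (not from this commit)."""

    def __init__(self):
        # Holding refs as actor state keeps this actor, the owner, from letting
        # the blocks go out of scope.
        self._held_refs = []

    def fetch_block_refs(self, actor_names, batch_keys):
        blocks = []
        for actor_name, batch_key in zip(actor_names, batch_keys):
            # Cross-language call into the Java RayDPExecutor actor that buffered the bytes.
            executor = ray.get_actor(actor_name)
            ipc_bytes = ray.get(executor.popArrowIPC.remote(batch_key))
            table = pa.ipc.open_stream(ipc_bytes).read_all()
            # ray.put from inside this actor makes this actor the owner of the block.
            blocks.append(ray.put(table))
        return blocks

    def set_reference(self, refs):
        self._held_refs.extend(refs)

In the dataset.py flow above, the driver would call fetch_block_refs once for all (actor_name, batch_key) pairs and then park the returned refs in the owner actor's state so they outlive the Spark job.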
