Commit b6510ac

use mixin base class

1 parent 30de99c commit b6510ac

4 files changed: +54 −35 lines changed

core/raydp-main/src/main/scala/org/apache/spark/sql/raydp/ObjectStoreWriter.scala

Lines changed: 32 additions & 8 deletions
@@ -19,6 +19,9 @@ package org.apache.spark.sql.raydp
 
 import com.intel.raydp.shims.SparkShimLoader
 import io.ray.api.{ActorHandle, ObjectRef, Ray}
+import io.ray.api.PyActorHandle
+import io.ray.api.call.PyActorTaskCaller
+import io.ray.api.function.PyActorMethod
 import io.ray.runtime.AbstractRayRuntime
 import java.io.ByteArrayOutputStream
 import java.util.{List, Optional, UUID}
@@ -65,12 +68,33 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
       ownerName: String): RecordBatch = {
 
     // NOTE: We intentionally do NOT pass an owner argument to Ray.put anymore.
-    // The default JVM path puts the serialized Arrow batch into Ray's object store
-    // from the Spark executor JVM process.
     //
-    // Ownership transfer to a long-lived Python actor is implemented on the Python side
-    // by "adopting" (re-putting) these ObjectRefs inside the target actor.
-    val objectRef: ObjectRef[Array[Byte]] = Ray.put(data)
+    // - When ownerName is empty, put the batch from the executor JVM (plain Ray.put).
+    // - When ownerName is set to a Python actor name (e.g. RayDPSparkMaster),
+    //   invoke that Python actor's put_data(data) method via Ray cross-language
+    //   calls so that the Python actor becomes the owner of the created object.
+    val objectRef: ObjectRef[_] =
+      if (ownerName == "") {
+        Ray.put(data)
+      } else {
+        // Ray.getActor(String) returns a raw java.util.Optional in Ray's Java API.
+        // If we don't cast it to an explicit reference type here, Scala may infer
+        // Optional[Nothing] and insert an invalid cast at runtime.
+        val opt = Ray.getActor(ownerName).asInstanceOf[Optional[AnyRef]]
+        if (!opt.isPresent) {
+          throw new RayDPException(s"Actor $ownerName not found when putting dataset block.")
+        }
+        val handleAny: AnyRef = opt.get()
+        if (!handleAny.isInstanceOf[PyActorHandle]) {
+          throw new RayDPException(
+            s"Actor $ownerName is not a Python actor; cannot invoke put_data."
+          )
+        }
+        val pyHandle = handleAny.asInstanceOf[PyActorHandle]
+        val method = PyActorMethod.of("put_data", classOf[AnyRef])
+        val refOfRef = pyHandle.task(method, data).remote()
+        refOfRef
+      }
 
     // add the objectRef to the objectRefHolder to avoid reference GC
     queue.add(objectRef)
@@ -171,7 +195,7 @@ class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
   /**
    * For test.
    */
-  def getRandomRef(): List[Array[Byte]] = {
+  def getRandomRef(): List[_] = {
 
     df.queryExecution.toRdd.mapPartitions { _ =>
       Iterator(ObjectRefHolder.getRandom(uuid))
@@ -270,7 +294,7 @@ object ObjectStoreWriter {
 }
 
 object ObjectRefHolder {
-  type Queue = ConcurrentLinkedQueue[ObjectRef[Array[Byte]]]
+  type Queue = ConcurrentLinkedQueue[ObjectRef[_]]
   private val dfToQueue = new ConcurrentHashMap[UUID, Queue]()
 
   def getQueue(df: UUID): Queue = {
@@ -295,7 +319,7 @@ object ObjectRefHolder {
     queue.size()
   }
 
-  def getRandom(df: UUID): Array[Byte] = {
+  def getRandom(df: UUID): Any = {
    val queue = checkQueueExists(df)
    val ref = RayDPUtils.convert(queue.peek())
    ref.get()
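For readers unfamiliar with Ray's cross-language support: `pyHandle.task(method, data).remote()` invokes a named Python actor's method from the JVM, and the resulting ObjectRef points at whatever that method returns, owned by the Python actor's process. A minimal sketch of the Python-side contract the Scala code above assumes (the actor name `block_owner` is illustrative, not RayDP's):

import ray
import pyarrow as pa

ray.init()

@ray.remote
class BlockOwner:
    # Name and arity must match the JVM call site:
    # PyActorMethod.of("put_data", classOf[AnyRef]).
    def put_data(self, data: bytes) -> "pa.Table":
        # data is an Arrow IPC stream produced by ArrowStreamWriter on the JVM;
        # the table returned here lands in the object store owned by this actor.
        return pa.ipc.open_stream(pa.BufferReader(data)).read_all()

# The JVM looks the actor up with Ray.getActor(ownerName), so it must be
# created as a *named* actor.
owner = BlockOwner.options(name="block_owner").remote()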

python/raydp/spark/dataset.py

Lines changed: 2 additions & 7 deletions
@@ -103,9 +103,7 @@ def get_raydp_master_owner(spark: Optional[SparkSession] = None) -> PartitionObj
     def raydp_master_set_reference_as_state(
             raydp_master_actor: ray.actor.ActorHandle,
             objects: List[ObjectRef]) -> ObjectRef:
-        # Adopt objects in the Python master actor so it becomes the owner of the
-        # dataset blocks without using Ray.put `_owner`.
-        return raydp_master_actor.adopt_objects.remote(uuid.uuid4(), objects)
+        return raydp_master_actor.add_objects.remote(uuid.uuid4(), objects)
 
     return PartitionObjectsOwner(
         obj_holder_name,
@@ -143,10 +141,7 @@ def _save_spark_df_to_object_store(df: sql.DataFrame, use_batch: bool = True,
 
     if owner is not None:
         actor_owner = ray.get_actor(actor_owner_name)
-        adopted = ray.get(owner.set_reference_as_state(actor_owner, blocks))
-        # If the owner callback returns a new list of refs (adoption), use it.
-        if adopted is not None:
-            blocks = adopted
+        ray.get(owner.set_reference_as_state(actor_owner, blocks))
 
     return blocks, block_sizes
 
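Since ownership is now established at put time on the JVM side, `set_reference_as_state` only pins the refs in the master actor's state via `add_objects`. A hedged end-to-end sketch of the master-owner path (cluster sizing values are illustrative; import locations follow this repo's layout):

import ray
import raydp
from raydp.spark import get_raydp_master_owner
from raydp.spark.dataset import spark_dataframe_to_ray_dataset

ray.init()
spark = raydp.init_spark(app_name="owner_demo", num_executors=1,
                         executor_cores=1, executor_memory="1G")

df = spark.range(0, 1000)
# RayDPSparkMaster ends up owning the dataset blocks, so the Ray dataset
# survives even after the Spark executors that produced it exit.
ds = spark_dataframe_to_ray_dataset(df, parallelism=2,
                                    owner=get_raydp_master_owner(spark))
ds.show(5)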

python/raydp/spark/ray_cluster_master.py

Lines changed: 17 additions & 13 deletions
@@ -29,6 +29,7 @@
 import ray
 from py4j.java_gateway import JavaGateway, GatewayParameters
 from raydp import versions
+import pyarrow as pa
 
 logger = logging.getLogger(__name__)
 
@@ -48,10 +49,25 @@
 SPARK_LOG4J_CONFIG_FILE_NAME = "spark.log4j.config.file.name"
 RAY_LOG4J_CONFIG_FILE_NAME = "spark.ray.log4j.config.file.name"
 
+class RayDPObjectOwnerMixin:
+    """Mixin for Python actors that can be used as dataset block owners.
+
+    The JVM side can invoke the actor method `put_data` via Ray's cross-language
+    actor call support so that this Python actor becomes the owner of the created
+    objects, without using Ray's experimental `ray.put(_owner=...)` API.
+    """
+
+    def put_data(self, data) -> "pa.Table":
+        """Put one serialized Arrow batch into the Ray object store."""
+        # data is Arrow IPC stream bytes written by ArrowStreamWriter
+        reader = pa.ipc.open_stream(pa.BufferReader(data))
+        table = reader.read_all()
+        return table
+
 
 
 @ray.remote
-class RayDPSparkMaster():
+class RayDPSparkMaster(RayDPObjectOwnerMixin):
     def __init__(self, configs):
         self._gateway = None
         self._app_master_java_bridge = None
@@ -224,18 +240,6 @@ def get_spark_home(self) -> str:
     def add_objects(self, timestamp, objects):
         self._objects[timestamp] = objects
 
-    def adopt_objects(self, timestamp, objects):
-        """Adopt objects by re-putting them inside this actor.
-
-        This makes this actor the owner of the newly created objects without
-        using the Ray.put `_owner` argument.
-
-        Returns the new ObjectRefs.
-        """
-        new_objects = [ray.put(ray.get(obj)) for obj in objects]
-        self._objects[timestamp] = new_objects
-        return new_objects
-
     def get_object(self, timestamp, idx):
         return self._objects[timestamp][idx]
 
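Because the owner behavior now lives in a mixin, any user-defined actor can opt in by inheriting it, which is exactly what the test below does. A standalone sketch of the `put_data` round trip driven from Python (actor name and sample table are made up):

import ray
import pyarrow as pa
from raydp.spark.ray_cluster_master import RayDPObjectOwnerMixin

@ray.remote
class CustomOwner(RayDPObjectOwnerMixin):
    def __init__(self):
        self.blocks = []

    def set_objects(self, objects):
        self.blocks = objects

ray.init()
owner = CustomOwner.options(name="custom_owner").remote()

# Build the same kind of payload ArrowStreamWriter produces on the JVM side.
table = pa.table({"x": [1, 2, 3]})
sink = pa.BufferOutputStream()
with pa.ipc.new_stream(sink, table.schema) as writer:
    writer.write_table(table)

# put_data returns an ObjectRef owned by the actor; the JVM reaches the same
# method cross-language via PyActorMethod.of("put_data", ...).
ref = owner.put_data.remote(sink.getvalue().to_pybytes())
assert ray.get(ref).num_rows == 3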

python/raydp/tests/test_data_owner_transfer.py

Lines changed: 3 additions & 7 deletions
@@ -12,6 +12,7 @@
 from raydp.spark import PartitionObjectsOwner
 from pyspark.sql import SparkSession
 from raydp.spark import get_raydp_master_owner
+from raydp.spark.ray_cluster_master import RayDPObjectOwnerMixin
 
 
 def gen_test_data(spark_session: SparkSession):
@@ -145,7 +146,7 @@ def test_custom_ownership_transfer_custom_actor(ray_cluster, jdk17_extra_spark_c
     """
 
     @ray.remote
-    class CustomActor:
+    class CustomActor(RayDPObjectOwnerMixin):
        objects: Any
 
        def wake(self):
@@ -154,11 +155,6 @@ def wake(self):
        def set_objects(self, objects):
            self.objects = objects
 
-       def adopt_objects(self, objects):
-           # Re-put inside this actor so this actor becomes the owner of the new objects.
-           self.objects = [ray.put(ray.get(o)) for o in objects]
-           return self.objects
-
    if ray_client.ray.is_connected():
        pytest.skip("Skip this test if using ray client")
 
@@ -190,7 +186,7 @@ def adopt_objects(self, objects):
    # and transfer data ownership to dedicated Object Holder (Singleton)
    ds = spark_dataframe_to_ray_dataset(df_train, parallelism=4, owner=PartitionObjectsOwner(
        owner_actor_name,
-       lambda actor, objects: actor.adopt_objects.remote(objects)))
+       lambda actor, objects: actor.set_objects.remote(objects)))
 
    # display data
    ds.show(5)
