
Commit 5a98b1e

remove usage of ray.put(data, _owner) and private ray object ownership manipulation API (#454)

* remove usage of ray.put(data, owner)
* use mixin base class
* fix test
* eliminate usage of deserialize_and_register_object_ref
* distributed owner actors
* fix test
* reimplement recoverable conversion
* make data fetch task resource configurable
* only support ray 2.37.0 and beyond
* fix pyspark internal cache
* add test against ray 2.50.0
* remove usage of dashboard_grpc_port
* fix read_parquet on ray 2.5x, make tf test work on Apple silicon
* implement single owner
* rename RayDPBlockStoreActorRegistry to RayDPDataOwner
* add test to gate data locality
* align everything use from_spark_recoverable

1 parent 3ef2bac commit 5a98b1e

File tree: 22 files changed, +436 -526 lines

.github/workflows/raydp.yml

Lines changed: 1 addition & 1 deletion
@@ -34,7 +34,7 @@ jobs:
         os: [ubuntu-latest]
         python-version: [3.9, 3.10.14]
         spark-version: [3.3.2, 3.4.0, 3.5.0]
-        ray-version: [2.34.0, 2.40.0]
+        ray-version: [2.37.0, 2.40.0, 2.50.0]
 
     runs-on: ${{ matrix.os }}

README.md

Lines changed: 8 additions & 11 deletions
@@ -153,28 +153,25 @@ Please refer to [NYC Taxi PyTorch Estimator](./examples/pytorch_nyctaxi.py) and
 
 ***Fault Tolerance***
 
-The ray dataset converted from spark dataframe like above is not fault-tolerant. This is because we implement it using `Ray.put` combined with spark `mapPartitions`. Objects created by `Ray.put` is not recoverable in Ray.
+RayDP now converts Spark DataFrames to Ray Datasets using a recoverable pipeline by default. This makes the resulting Ray Dataset resilient to Spark executor loss (the Arrow IPC bytes are cached in Spark and fetched via Ray tasks with lineage).
+
+The recoverable conversion is also available directly via `raydp.spark.from_spark_recoverable`, and it persists (caches) the Spark DataFrame. You can provide the storage level through the `storage_level` keyword parameter.
 
-RayDP now supports converting data in a way such that the resulting ray dataset is fault-tolerant. This feature is currently *experimental*. Here is how to use it:
 ```python
 import ray
 import raydp
 
 ray.init(address="auto")
-# set fault_tolerance_mode to True to enable the feature
-# this will connect pyspark driver to ray cluster
 spark = raydp.init_spark(app_name="RayDP Example",
                          num_executors=2,
                          executor_cores=2,
-                         executor_memory="4GB",
-                         fault_tolerance_mode=True)
-# df should be large enough so that result will be put into plasma
+                         executor_memory="4GB")
+
 df = spark.range(100000)
-# use this API instead of ray.data.from_spark
-ds = raydp.spark.from_spark_recoverable(df)
-# ds is now fault-tolerant.
+ds = raydp.spark.from_spark_recoverable(df) # fault-tolerant
 ```
-Notice that `from_spark_recoverable` will persist the converted dataframe. You can provide the storage level through keyword parameter `storage_level`. In addition, this feature is not available in ray client mode. If you need to use ray client, please wrap your application in a ray actor, as described in the ray client chapter.
+
+Note: recoverable conversion is not available in Ray client mode. If you need to use Ray client, wrap your application in a Ray actor as described in the Ray client docs.
 
 
 ## Getting Involved
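
To illustrate the `storage_level` keyword mentioned in the updated README, here is a minimal sketch; it assumes `from_spark_recoverable` accepts a PySpark `StorageLevel` value, and the `MEMORY_AND_DISK` choice is only an example, not a recommendation:

```python
import ray
import raydp
from pyspark import StorageLevel

ray.init(address="auto")
spark = raydp.init_spark(app_name="RayDP Example",
                         num_executors=2,
                         executor_cores=2,
                         executor_memory="4GB")

df = spark.range(100000)
# Persist the cached conversion data to memory and disk so partitions can be
# re-fetched after executor memory pressure; pick the level that fits your job.
ds = raydp.spark.from_spark_recoverable(df, storage_level=StorageLevel.MEMORY_AND_DISK)
```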

core/raydp-main/src/main/java/org/apache/spark/raydp/RayExecutorUtils.java

Lines changed: 14 additions & 23 deletions
@@ -21,18 +21,14 @@
 import io.ray.api.ObjectRef;
 import io.ray.api.Ray;
 import io.ray.api.call.ActorCreator;
-import java.util.Map;
-import java.util.List;
-
 import io.ray.api.placementgroup.PlacementGroup;
 import io.ray.runtime.object.ObjectRefImpl;
+import java.util.List;
+import java.util.Map;
 import org.apache.spark.executor.RayDPExecutor;
 
 public class RayExecutorUtils {
-  /**
-   * Convert from mbs -> memory units. The memory units in ray is byte
-   */
-
+  /** Convert from mbs -> memory units. The memory units in ray is byte. */
   private static double toMemoryUnits(int memoryInMB) {
     double result = 1.0 * memoryInMB * 1024 * 1024;
     return Math.round(result);
@@ -47,14 +43,13 @@ public static ActorHandle<RayDPExecutor> createExecutorActor(
       PlacementGroup placementGroup,
       int bundleIndex,
       List<String> javaOpts) {
-    ActorCreator<RayDPExecutor> creator = Ray.actor(
-        RayDPExecutor::new, executorId, appMasterURL);
+    ActorCreator<RayDPExecutor> creator = Ray.actor(RayDPExecutor::new, executorId, appMasterURL);
     creator.setName("raydp-executor-" + executorId);
     creator.setJvmOptions(javaOpts);
     creator.setResource("CPU", cores);
     creator.setResource("memory", toMemoryUnits(memoryInMB));
 
-    for (Map.Entry<String, Double> entry: resources.entrySet()) {
+    for (Map.Entry<String, Double> entry : resources.entrySet()) {
       creator.setResource(entry.getKey(), entry.getValue());
     }
     if (placementGroup != null) {
@@ -72,16 +67,12 @@ public static void setUpExecutor(
       String driverUrl,
       int cores,
       String classPathEntries) {
-    handler.task(RayDPExecutor::startUp,
-        appId, driverUrl, cores, classPathEntries).remote();
+    handler.task(RayDPExecutor::startUp, appId, driverUrl, cores, classPathEntries).remote();
   }
 
   public static String[] getBlockLocations(
-      ActorHandle<RayDPExecutor> handler,
-      int rddId,
-      int numPartitions) {
-    return handler.task(RayDPExecutor::getBlockLocations,
-        rddId, numPartitions).remote().get();
+      ActorHandle<RayDPExecutor> handler, int rddId, int numPartitions) {
+    return handler.task(RayDPExecutor::getBlockLocations, rddId, numPartitions).remote().get();
   }
 
   public static ObjectRef<byte[]> getRDDPartition(
@@ -90,14 +81,14 @@ public static ObjectRef<byte[]> getRDDPartition(
       int partitionId,
       String schema,
       String driverAgentUrl) {
-    return (ObjectRefImpl<byte[]>) handle.task(
-        RayDPExecutor::getRDDPartition,
-        rddId, partitionId, schema, driverAgentUrl).remote();
+    return (ObjectRefImpl<byte[]>)
+        handle.task(RayDPExecutor::getRDDPartition, rddId, partitionId, schema, driverAgentUrl)
+            .remote();
   }
 
-  public static void exitExecutor(
-      ActorHandle<RayDPExecutor> handle
-  ) {
+  public static void exitExecutor(ActorHandle<RayDPExecutor> handle) {
     handle.task(RayDPExecutor::stop).remote();
   }
 }
+
+

core/raydp-main/src/main/scala/org/apache/spark/sql/raydp/ObjectStoreWriter.scala

Lines changed: 73 additions & 130 deletions
@@ -18,156 +18,28 @@
 package org.apache.spark.sql.raydp
 
 import com.intel.raydp.shims.SparkShimLoader
-import io.ray.api.{ActorHandle, ObjectRef, PyActorHandle, Ray}
+import io.ray.api.{ActorHandle, ObjectRef, Ray}
 import io.ray.runtime.AbstractRayRuntime
-import java.io.ByteArrayOutputStream
 import java.util.{List, UUID}
 import java.util.concurrent.{ConcurrentHashMap, ConcurrentLinkedQueue}
 import java.util.function.{Function => JFunction}
-import org.apache.arrow.vector.VectorSchemaRoot
-import org.apache.arrow.vector.ipc.ArrowStreamWriter
 import org.apache.arrow.vector.types.pojo.Schema
 import scala.collection.JavaConverters._
 import scala.collection.mutable
-import scala.collection.mutable.ArrayBuffer
 
 import org.apache.spark.{RayDPException, SparkContext}
 import org.apache.spark.deploy.raydp._
 import org.apache.spark.executor.RayDPExecutor
+import org.apache.spark.network.util.JavaUtils
 import org.apache.spark.raydp.{RayDPUtils, RayExecutorUtils}
 import org.apache.spark.sql.DataFrame
-import org.apache.spark.sql.execution.arrow.ArrowWriter
-import org.apache.spark.sql.execution.python.BatchIterator
 import org.apache.spark.sql.internal.SQLConf
-import org.apache.spark.sql.util.ArrowUtils
 import org.apache.spark.storage.StorageLevel
-import org.apache.spark.util.Utils
-
-/**
- * A batch of record that has been wrote into Ray object store.
- * @param ownerAddress the owner address of the ray worker
- * @param objectId the ObjectId for the stored data
- * @param numRecords the number of records for the stored data
- */
-case class RecordBatch(
-    ownerAddress: Array[Byte],
-    objectId: Array[Byte],
-    numRecords: Int)
 
 class ObjectStoreWriter(@transient val df: DataFrame) extends Serializable {
 
   val uuid: UUID = ObjectStoreWriter.dfToId.getOrElseUpdate(df, UUID.randomUUID())
 
-  def writeToRay(
-      data: Array[Byte],
-      numRecords: Int,
-      queue: ObjectRefHolder.Queue,
-      ownerName: String): RecordBatch = {
-
-    var objectRef: ObjectRef[Array[Byte]] = null
-    if (ownerName == "") {
-      objectRef = Ray.put(data)
-    } else {
-      var dataOwner: PyActorHandle = Ray.getActor(ownerName).get()
-      objectRef = Ray.put(data, dataOwner)
-    }
-
-    // add the objectRef to the objectRefHolder to avoid reference GC
-    queue.add(objectRef)
-    val objectRefImpl = RayDPUtils.convert(objectRef)
-    val objectId = objectRefImpl.getId
-    val runtime = Ray.internal.asInstanceOf[AbstractRayRuntime]
-    val addressInfo = runtime.getObjectStore.getOwnershipInfo(objectId)
-    RecordBatch(addressInfo, objectId.getBytes, numRecords)
-  }
-
-  /**
-   * Save the DataFrame to Ray object store with Apache Arrow format.
-   */
-  def save(useBatch: Boolean, ownerName: String): List[RecordBatch] = {
-    val conf = df.queryExecution.sparkSession.sessionState.conf
-    val timeZoneId = conf.getConf(SQLConf.SESSION_LOCAL_TIMEZONE)
-    var batchSize = conf.getConf(SQLConf.ARROW_EXECUTION_MAX_RECORDS_PER_BATCH)
-    if (!useBatch) {
-      batchSize = 0
-    }
-    val schema = df.schema
-
-    val objectIds = df.queryExecution.toRdd.mapPartitions{ iter =>
-      val queue = ObjectRefHolder.getQueue(uuid)
-
-      // DO NOT use iter.grouped(). See BatchIterator.
-      val batchIter = if (batchSize > 0) {
-        new BatchIterator(iter, batchSize)
-      } else {
-        Iterator(iter)
-      }
-
-      val arrowSchema = SparkShimLoader.getSparkShims.toArrowSchema(schema, timeZoneId)
-      val allocator = ArrowUtils.rootAllocator.newChildAllocator(
-        s"ray object store writer", 0, Long.MaxValue)
-      val root = VectorSchemaRoot.create(arrowSchema, allocator)
-      val results = new ArrayBuffer[RecordBatch]()
-
-      val byteOut = new ByteArrayOutputStream()
-      val arrowWriter = ArrowWriter.create(root)
-      var numRecords: Int = 0
-
-      Utils.tryWithSafeFinally {
-        while (batchIter.hasNext) {
-          // reset the state
-          numRecords = 0
-          byteOut.reset()
-          arrowWriter.reset()
-
-          // write out the schema meta data
-          val writer = new ArrowStreamWriter(root, null, byteOut)
-          writer.start()
-
-          // get the next record batch
-          val nextBatch = batchIter.next()
-
-          while (nextBatch.hasNext) {
-            numRecords += 1
-            arrowWriter.write(nextBatch.next())
-          }
-
-          // set the write record count
-          arrowWriter.finish()
-          // write out the record batch to the underlying out
-          writer.writeBatch()
-
-          // get the wrote ByteArray and save to Ray ObjectStore
-          val byteArray = byteOut.toByteArray
-          results += writeToRay(byteArray, numRecords, queue, ownerName)
-          // end writes footer to the output stream and doesn't clean any resources.
-          // It could throw exception if the output stream is closed, so it should be
-          // in the try block.
-          writer.end()
-        }
-        arrowWriter.reset()
-        byteOut.close()
-      } {
-        // If we close root and allocator in TaskCompletionListener, there could be a race
-        // condition where the writer thread keeps writing to the VectorSchemaRoot while
-        // it's being closed by the TaskCompletion listener.
-        // Closing root and allocator here is cleaner because root and allocator is owned
-        // by the writer thread and is only visible to the writer thread.
-        //
-        // If the writer thread is interrupted by TaskCompletionListener, it should either
-        // (1) in the try block, in which case it will get an InterruptedException when
-        // performing io, and goes into the finally block or (2) in the finally block,
-        // in which case it will ignore the interruption and close the resources.
-
-        root.close()
-        allocator.close()
-      }
-
-      results.toIterator
-    }.collect()
-    objectIds.toSeq.asJava
-  }
-
   /**
    * For test.
    */
@@ -201,6 +73,15 @@ object ObjectStoreWriter {
     }
   }
 
+  private def parseMemoryBytes(value: String): Double = {
+    if (value == null || value.isEmpty) {
+      0.0
+    } else {
+      // Spark parser supports both plain numbers (bytes) and strings like "100M", "2g".
+      JavaUtils.byteStringAsBytes(value).toDouble
+    }
+  }
+
   def getAddress(): Array[Byte] = {
     if (address == null) {
       val objectRef = Ray.put(1)
@@ -218,6 +99,7 @@ object ObjectStoreWriter {
     SparkShimLoader.getSparkShims.toArrowSchema(df.schema, timeZoneId)
   }
 
+  @deprecated
   def fromSparkRDD(df: DataFrame, storageLevel: StorageLevel): Array[Array[Byte]] = {
     if (!Ray.isInitialized) {
       throw new RayDPException(
@@ -267,6 +149,67 @@ object ObjectStoreWriter {
     results
   }
 
+  /**
+   * Prepare a Spark ArrowBatch RDD for recoverable conversion and return metadata needed by
+   * Python to build reconstructable Ray Dataset blocks via Ray tasks.
+   *
+   * This method:
+   * - persists and materializes the ArrowBatch RDD in Spark (so partitions can be re-fetched)
+   * - computes per-partition executor locations (Spark executor IDs)
+   *
+   * It does NOT push any data to Ray.
+   */
+  def prepareRecoverableRDD(
+      df: DataFrame,
+      storageLevel: StorageLevel): RecoverableRDDInfo = {
+    if (!Ray.isInitialized) {
+      throw new RayDPException(
+        "Not yet connected to Ray! Please set fault_tolerant_mode=True when starting RayDP.")
+    }
+
+    val rdd = df.toArrowBatchRdd
+    rdd.persist(storageLevel)
+    rdd.count()
+
+    var executorIds = df.sqlContext.sparkContext.getExecutorIds.toArray
+    val numExecutors = executorIds.length
+    val appMasterHandle = Ray.getActor(RayAppMaster.ACTOR_NAME)
+      .get.asInstanceOf[ActorHandle[RayAppMaster]]
+    val restartedExecutors = RayAppMasterUtils.getRestartedExecutors(appMasterHandle)
+    if (!restartedExecutors.isEmpty) {
+      for (i <- 0 until numExecutors) {
+        if (restartedExecutors.containsKey(executorIds(i))) {
+          val oldId = restartedExecutors.get(executorIds(i))
+          executorIds(i) = oldId
+        }
+      }
+    }
+
+    val schemaJson = ObjectStoreWriter.toArrowSchema(df).toJson
+    val numPartitions = rdd.getNumPartitions
+
+    val handles = executorIds.map { id =>
+      Ray.getActor("raydp-executor-" + id)
+        .get
+        .asInstanceOf[ActorHandle[RayDPExecutor]]
+    }
+    val locations = RayExecutorUtils.getBlockLocations(handles(0), rdd.id, numPartitions)
+
+    RecoverableRDDInfo(rdd.id, numPartitions, schemaJson, driverAgentUrl, locations)
+  }
+
+}
+
+case class RecoverableRDDInfo(
+    rddId: Int,
+    numPartitions: Int,
+    schemaJson: String,
+    driverAgentUrl: String,
+    locations: Array[String])
+
+object RecoverableRDDInfo {
+  // Empty constructor for reflection / Java interop (some tools expect it).
+  def empty: RecoverableRDDInfo = RecoverableRDDInfo(0, 0, "", "", Array.empty[String])
 }
 
 object ObjectRefHolder {
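
For readers connecting the Scala changes above to the Python side, here is a minimal, hypothetical sketch of why the task-based conversion is recoverable: blocks come from Ray tasks (re-executable via lineage) that read Arrow IPC bytes cached in Spark, rather than from `Ray.put` with a manipulated owner. The helpers `fetch_partition_ipc_bytes`, `build_block`, and `dataset_from_recoverable_rdd` are illustrative names, not RayDP's actual internals.

```python
import io

import pyarrow as pa
import ray
import ray.data


def fetch_partition_ipc_bytes(rdd_id: int, partition_id: int) -> bytes:
    # Placeholder for the real RPC that asks a raydp-executor actor for the
    # Arrow IPC bytes of one cached ArrowBatch RDD partition
    # (see RayExecutorUtils.getRDDPartition above).
    raise NotImplementedError


@ray.remote(num_cpus=0)
def build_block(rdd_id: int, partition_id: int) -> pa.Table:
    # Because this runs as a Ray task, Ray can re-execute it through lineage
    # reconstruction if the block is lost, as long as Spark still caches the
    # persisted ArrowBatch RDD.
    data = fetch_partition_ipc_bytes(rdd_id, partition_id)
    with pa.ipc.open_stream(io.BytesIO(data)) as reader:
        return reader.read_all()


def dataset_from_recoverable_rdd(rdd_id: int, num_partitions: int) -> ray.data.Dataset:
    # Build one block per Spark partition and hand the object refs to Ray Data.
    refs = [build_block.remote(rdd_id, p) for p in range(num_partitions)]
    return ray.data.from_arrow_refs(refs)
```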
