add test

pang-wu · pang-wu · commit c71ec91c0fba · 2026-02-01T16:29:02.000-08:00
diff --git a/core/raydp-main/src/main/scala/org/apache/spark/executor/RayDPExecutor.scala b/core/raydp-main/src/main/scala/org/apache/spark/executor/RayDPExecutor.scala
@@ -383,7 +383,7 @@ class RayDPExecutor(
                     s"Forwarding fetch to executor $ownerSparkExecutorId " +
                     s"(ray actor id $ownerRayExecutorId).")
                 val otherHandle =
-                  Ray.getActor("raydp-executor-" + ownerRayExecutorId).get
+                  Ray.getActor("raydp-executor-" + ownerRayExecutorId).get()
                     .asInstanceOf[ActorHandle[RayDPExecutor]]
                 // One-hop forward only: call no-forward variant on the target executor and
                 // return the Arrow IPC bytes directly.
diff --git a/python/raydp/tests/test_recoverable_forwarding.py b/python/raydp/tests/test_recoverable_forwarding.py
@@ -15,64 +15,175 @@
 # limitations under the License.
 #
 
-import platform
-
+import os
 import pytest
+import pyarrow as pa
 from pyspark.storagelevel import StorageLevel
 import ray
+from ray.cluster_utils import Cluster
+from ray.data import from_arrow_refs
 import ray.util.client as ray_client
+import raydp
+
+try:
+    # Ray cross-language calls require enabling load_code_from_local.
+    # This is an internal Ray API; keep it isolated and optional.
+    from ray._private.worker import global_worker as _ray_global_worker  # type: ignore
+except Exception:  # pragma: no cover
+    _ray_global_worker = None
+
+@ray.remote(max_retries=-1)
+def _fetch_arrow_table_from_executor(
+    executor_actor_name: str,
+    rdd_id: int,
+    partition_id: int,
+    schema_json: str,
+    driver_agent_url: str,
+) -> pa.Table:
+    """Fetch Arrow IPC bytes from a JVM executor actor and decode them to a `pyarrow.Table`.
+
+    Test-local copy of RayDP's recoverable fetch task; defining it here avoids Ray remote
+    function registration issues when driver/workers import different `raydp` versions.
+    All arguments after `executor_actor_name` are passed unchanged to `getRDDPartition`.
+    """
+    if _ray_global_worker is not None:  # None when the internal Ray import failed
+        _ray_global_worker.set_load_code_from_local(True)
 
-from raydp.spark import dataset as spark_dataset
-
+    executor_actor = ray.get_actor(executor_actor_name)  # look up the actor by name
+    ipc_bytes = ray.get(
+        executor_actor.getRDDPartition.remote(
+            rdd_id, partition_id, schema_json, driver_agent_url
+        )
+    )
+    reader = pa.ipc.open_stream(pa.BufferReader(ipc_bytes))  # bytes -> Arrow IPC stream reader
+    table = reader.read_all()
+    # Match RayDP behavior: strip schema metadata for stability.
+    table = table.replace_schema_metadata()
+    return table
 
-if platform.system() == "Darwin":
-    # Spark-on-Ray recoverable path is unstable on macOS and can crash the raylet.
-    pytest.skip("Skip recoverable forwarding test on macOS", allow_module_level=True)
 
 
-@pytest.mark.parametrize("spark_on_ray_2_executors", ["local"], indirect=True)
-def test_recoverable_forwarding_via_fetch_task(spark_on_ray_2_executors):
+def test_recoverable_forwarding_via_fetch_task(jdk17_extra_spark_configs):
     """Verify JVM-side forwarding in recoverable Spark->Ray conversion.
 
-    We deliberately trigger the recoverable fetch task to contact an executor actor that is not
-    the current owner of the cached Spark block for the chosen partition. The request should still
-    succeed because the executor refreshes the block owner and forwards the fetch one hop.
+    The fetch deliberately targets the *wrong* Spark executor actor; it should still succeed
+    because `RayDPExecutor.getRDDPartition` refreshes the block owner and forwards one hop.
+    Debug env vars: RAYDP_TRACE_STOP_AFTER (phase to stop after), RAYDP_FETCH_MODE (driver|task).
     """
     if ray_client.ray.is_connected():
         pytest.skip("Skip forwarding test in Ray client mode")
 
-    spark = spark_on_ray_2_executors
-
-    # Create enough partitions so that at least two different executors own cached blocks.
-    df = spark.range(0, 10000, numPartitions=8)
-
-    sc = spark.sparkContext
-    storage_level = sc._getJavaStorageLevel(StorageLevel.MEMORY_AND_DISK)
-    object_store_writer = sc._jvm.org.apache.spark.sql.raydp.ObjectStoreWriter
-
-    info = object_store_writer.prepareRecoverableRDD(df._jdf, storage_level)
-    rdd_id = info.rddId()
-    schema_json = info.schemaJson()
-    driver_agent_url = info.driverAgentUrl()
-    locations = list(info.locations())
-
-    assert locations
-    unique_execs = sorted(set(locations))
-    assert len(unique_execs) >= 2, f"Need >=2 executors, got {unique_execs}"
-
-    # Pick a partition and intentionally target the *wrong* executor actor.
-    partition_id = 0
-    owner_executor_id = locations[partition_id]
-    wrong_executor_id = next(e for e in unique_execs if e != owner_executor_id)
-
-    # Ensure Ray cross-language calls are enabled for the worker side.
-    spark_dataset._enable_load_code_from_local()
-
-    wrong_executor_actor_name = f"raydp-executor-{wrong_executor_id}"
-    table = ray.get(
-        spark_dataset._fetch_arrow_table_from_executor.remote(
-            wrong_executor_actor_name, rdd_id, partition_id, schema_json, driver_agent_url
-        )
+    stop_after = os.environ.get("RAYDP_TRACE_STOP_AFTER", "").strip().lower()
+    fetch_mode = os.environ.get("RAYDP_FETCH_MODE", "task").strip().lower()
+    cluster = Cluster(
+        initialize_head=True,
+        head_node_args={
+            "num_cpus": 2,
+            "resources": {"master": 10},
+            "include_dashboard": True,
+            "dashboard_port": 0,
+        },
     )
-    assert table.num_rows > 0
+    cluster.add_node(num_cpus=4, resources={"spark_executor": 10})
+
+    def phase(name: str) -> None:
+        # Prints are the most reliable breadcrumb if the raylet crashes.
+        print(f"\n=== PHASE: {name} ===", flush=True)
+
+    def should_stop(name: str) -> bool:
+        return bool(stop_after) and stop_after == name.lower()
+
+    spark = None
+    try:
+        # Drop any existing Ray connection, then attach to the test cluster created above.
+        phase("ray.init")
+        ray.shutdown()
+        ray.init(address=cluster.address, include_dashboard=False)
+        if should_stop("ray.init"):
+            return
+
+        phase("raydp.init_spark")
+        node_ip = ray.util.get_node_ip_address()
+        spark = raydp.init_spark(
+            app_name="test_recoverable_forwarding_via_fetch_task",
+            num_executors=2,
+            executor_cores=1,
+            executor_memory="500M",
+            configs={
+                "spark.driver.host": node_ip,
+                "spark.driver.bindAddress": node_ip,
+                **jdk17_extra_spark_configs,
+            },
+        )
+        if should_stop("raydp.init_spark"):
+            return
+
+        phase("spark.range.count")
+        df = spark.range(0, 10000, numPartitions=8)
+        _ = df.count()
+        if should_stop("spark.range.count"):
+            return
+
+        phase("prepareRecoverableRDD")
+        sc = spark.sparkContext
+        storage_level = sc._getJavaStorageLevel(StorageLevel.MEMORY_AND_DISK)
+        object_store_writer = sc._jvm.org.apache.spark.sql.raydp.ObjectStoreWriter
+        info = object_store_writer.prepareRecoverableRDD(df._jdf, storage_level)
+        rdd_id = info.rddId()
+        schema_json = info.schemaJson()
+        driver_agent_url = info.driverAgentUrl()
+        locations = list(info.locations())
+        if should_stop("preparerecoverablerdd"):
+            return
+
+        assert locations
+        unique_execs = sorted(set(locations))
+        assert len(unique_execs) >= 2, f"Need >=2 executors, got {unique_execs}"
+
+        partition_id = 0
+        owner_executor_id = locations[partition_id]
+        wrong_executor_id = next(e for e in unique_execs if e != owner_executor_id)
+        wrong_executor_actor_name = f"raydp-executor-{wrong_executor_id}"
+
+        phase("fetch_wrong_executor")
+
+        phase("get_wrong_executor_actor")
+        wrong_executor_actor = ray.get_actor(wrong_executor_actor_name)
+        if should_stop("get_wrong_executor_actor"):
+            return
+
+        phase("call_fetch_task")
+        if fetch_mode == "driver":
+            phase("driver_call_java_actor")
+            if _ray_global_worker is not None:
+                _ray_global_worker.set_load_code_from_local(True)
+            ipc_bytes = ray.get(
+                wrong_executor_actor.getRDDPartition.remote(
+                    rdd_id, partition_id, schema_json, driver_agent_url
+                )
+            )
+            table = pa.ipc.open_stream(pa.BufferReader(ipc_bytes)).read_all()
+            table = table.replace_schema_metadata()
+            num_rows = table.num_rows  # pyarrow.Table exposes num_rows, not count()
+        else:
+            phase("task_call_java_actor")
+            refs: list[ray.ObjectRef] = []
+            refs.append(
+                _fetch_arrow_table_from_executor.remote(
+                    wrong_executor_actor_name,
+                    rdd_id,
+                    partition_id,
+                    schema_json,
+                    driver_agent_url,
+                )
+            )
+            num_rows = from_arrow_refs(refs).count()  # ray.data.Dataset row count
+        assert num_rows > 0
+    finally:
+        phase("teardown")
+        if spark is not None:  # still None if we stopped or failed before init_spark finished
+            spark.stop()
+        raydp.stop_spark()
+        ray.shutdown()
+        cluster.shutdown()