Commit 0c9e540 (1 parent: 8bee5ae)

Allow assign memory to RayDPSparkMaster actor (#448)

* only set spark master memory instead
* add test
* revert changes not needed
* update ghaction
* change parent project

File tree: 11 files changed (+2614 -11 lines)

.github/workflows/pypi.yml — 1 addition, 1 deletion

```diff
@@ -29,7 +29,7 @@ permissions: # added using https://github.com/step-security/secure-repo
 jobs:
   build-and-publish:
     # do not run in forks
-    if: ${{ github.repository_owner == 'oap-project' }}
+    if: ${{ github.repository_owner == 'ray-project' }}
     name: build wheel and upload
     runs-on: ubuntu-latest
     steps:
```

.github/workflows/ray_nightly_test.yml — 2 additions, 2 deletions

```diff
@@ -84,10 +84,10 @@ jobs:
           fi
           case $PYTHON_VERSION in
             3.9)
-              pip install "ray[train] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl"
+              pip install "ray[train,default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp39-cp39-manylinux2014_x86_64.whl"
               ;;
             3.10.14)
-              pip install "ray[train] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
+              pip install "ray[train,default] @ https://s3-us-west-2.amazonaws.com/ray-wheels/latest/ray-3.0.0.dev0-cp310-cp310-manylinux2014_x86_64.whl"
               ;;
           esac
           pip install pyarrow tqdm pytest tensorflow==2.13.1 tabulate grpcio-tools wget
```

.github/workflows/raydp.yml — 1 addition, 1 deletion

```diff
@@ -83,7 +83,7 @@ jobs:
           else
             pip install torch
           fi
-          pip install pyarrow "ray[train]==${{ matrix.ray-version }}" tqdm pytest tensorflow==2.13.1 tabulate grpcio-tools wget
+          pip install pyarrow "ray[train,default]==${{ matrix.ray-version }}" tqdm pytest tensorflow==2.13.1 tabulate grpcio-tools wget
           pip install "xgboost_ray[default]<=0.1.13"
           pip install "xgboost<=2.0.3"
           pip install torchmetrics
```

.github/workflows/raydp_nightly.yml — 1 addition, 1 deletion

```diff
@@ -29,7 +29,7 @@ permissions: # added using https://github.com/step-security/secure-repo
 jobs:
   build-and-publish:
     # do not run in forks
-    if: ${{ github.repository_owner == 'oap-project' }}
+    if: ${{ github.repository_owner == 'ray-project' }}
     name: build wheel and upload
     runs-on: ubuntu-latest
     steps:
```

README.md — 2 additions, 2 deletions

```diff
@@ -91,8 +91,8 @@ Spark features such as dynamic resource allocation, spark-submit script, etc are
 ## Spark + AI Pipeline on Ray

 RayDP provides APIs for converting a Spark DataFrame to a Ray Dataset which can be consumed by XGBoost, Ray Train, Horovod on Ray, etc. RayDP also provides high level scikit-learn style Estimator APIs for distributed training with PyTorch or Tensorflow. To get started with end-to-end Spark + AI pipeline, the easiest way is to run the following tutorials on Google Collab. More examples are also available in the `examples` folder.
-* [Spark + Ray Train Tutorial on Google Collab](https://colab.research.google.com/github/oap-project/raydp/blob/master/tutorials/raytrain_example.ipynb)
-* [Spark + TorchEstimator Tutorial on Google Collab](https://colab.research.google.com/github/oap-project/raydp/blob/master/tutorials/pytorch_example.ipynb)
+* [Spark + Ray Train Tutorial on Google Collab](https://colab.research.google.com/github/ray-project/raydp/blob/master/tutorials/raytrain_example.ipynb)
+* [Spark + TorchEstimator Tutorial on Google Collab](https://colab.research.google.com/github/ray-project/raydp/blob/master/tutorials/pytorch_example.ipynb)


 ***Spark DataFrame & Ray Dataset conversion***
```

core/pom.xml — 1 addition, 1 deletion

```diff
@@ -10,7 +10,7 @@
   <packaging>pom</packaging>

   <name>RayDP Parent Pom</name>
-  <url>https://github.com/oap-project/raydp.git</url>
+  <url>https://github.com/ray-project/raydp.git</url>

   <properties>
     <spark.version>3.3.3</spark.version>
```

python/raydp/spark/ray_cluster.py — 7 additions, 0 deletions

```diff
@@ -61,8 +61,15 @@ def _set_up_master(self, resources: Dict[str, float], kwargs: Dict[Any, Any]):
         if "CPU" in resources:
             num_cpu = resources["CPU"]
             resources.pop("CPU", None)
+
+            memory = None
+            if "memory" in resources:
+                memory = resources["memory"]
+                resources.pop("memory", None)
+
             self._spark_master_handle = RayDPSparkMaster.options(name=spark_master_name,
                                                                  num_cpus=num_cpu,
+                                                                 memory=memory,
                                                                  resources=resources) \
                 .remote(self._configs)
         else:
```
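The core of this patch is a resource-splitting step: RayDP pops the logical `CPU` and `memory` keys out of the resource dict (they map onto Ray's first-class `num_cpus` and `memory` actor options) and passes whatever remains as custom resources. A minimal standalone sketch of that logic — the helper name `split_master_resources` is ours for illustration, not a RayDP API:

```python
from typing import Dict, Optional, Tuple


def split_master_resources(
    resources: Dict[str, float],
) -> Tuple[Optional[float], Optional[float], Dict[str, float]]:
    """Mirror the patched _set_up_master logic: CPU and memory become
    first-class actor options; everything else stays a custom resource."""
    remaining = dict(resources)  # work on a copy, don't mutate the caller's dict
    num_cpus = remaining.pop("CPU", None)
    memory = remaining.pop("memory", None)
    return num_cpus, memory, remaining


# Example: a spec like the one built from the
# spark.ray.raydp_spark_master.actor.resource.* configs
num_cpus, memory, custom = split_master_resources(
    {"CPU": 2, "memory": 100 * 1024 * 1024, "master": 1}
)
```

With the diff applied, `RayDPSparkMaster.options(...)` would then receive `num_cpus=2`, `memory=104857600`, and `resources={"master": 1}` for this input, so Ray reserves the memory for the master actor instead of treating it as an unschedulable custom resource.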
New test file — 65 additions, 0 deletions

```python
import sys

import pytest
import ray
import raydp
from ray.cluster_utils import Cluster
from ray.util.state import list_actors


def test_spark_master_memory_custom(jdk17_extra_spark_configs):
    cluster = Cluster(
        initialize_head=True,
        head_node_args={
            "num_cpus": 2,
            "resources": {"master": 10},
            "include_dashboard": True,
            "dashboard_port": 8270,
        },
    )
    ray.init(address=cluster.address,
             dashboard_port=cluster.head_node.dashboard_grpc_port,
             include_dashboard=True)

    custom_memory = 100 * 1024 * 1024  # 100MB in bytes
    configs = jdk17_extra_spark_configs.copy()
    # Config under test: set Spark Master actor memory via RayDP config
    configs["spark.ray.raydp_spark_master.actor.resource.memory"] = str(custom_memory)
    # Also require the master custom resource so the actor is scheduled on the head
    configs["spark.ray.raydp_spark_master.actor.resource.master"] = "1"

    app_name = "test_spark_master_memory_custom"

    spark = raydp.init_spark(
        app_name=app_name,
        num_executors=1,
        executor_cores=1,
        executor_memory="500M",
        configs=configs,
    )

    # Trigger the Spark master / RayDPSparkMaster startup
    spark.createDataFrame([(1, 2)], ["a", "b"]).count()

    # RayDPSparkMaster name is app_name + RAYDP_SPARK_MASTER_SUFFIX
    master_actor_name = f"{app_name}_SPARK_MASTER"

    actor = ray.get_actor(master_actor_name)
    assert actor is not None

    # Query Ray state for this actor
    actor_state = list_actors(filters=[("actor_id", "=", actor._actor_id.hex())],
                              detail=True)[0]
    resources = actor_state.required_resources

    assert resources["memory"] == custom_memory
    assert resources["master"] == 1

    spark.stop()
    raydp.stop_spark()
    ray.shutdown()
    cluster.shutdown()


if __name__ == "__main__":
    sys.exit(pytest.main(["-v", __file__]))
```
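Note that the `...actor.resource.memory` config takes a raw byte count as a string, which is why the test above hardcodes `100 * 1024 * 1024`. A small convenience helper for turning Spark-style size strings into that byte count — our own sketch, not a RayDP API:

```python
_UNITS = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3, "T": 1024 ** 4}


def to_bytes(size: str) -> int:
    """Convert a Spark-style memory string like '500M' or '2G' to bytes."""
    size = size.strip().upper().rstrip("B")  # accept '500m', '500M', '500MB'
    if size[-1].isdigit():
        return int(size)  # bare byte count, e.g. '104857600'
    return int(float(size[:-1]) * _UNITS[size[-1]])


# Equivalent to the hardcoded value in the test:
custom_memory = to_bytes("100M")
configs = {
    # RayDP expects the byte count serialized as a string
    "spark.ray.raydp_spark_master.actor.resource.memory": str(custom_memory),
}
```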

python/setup.py — 1 addition, 1 deletion

```diff
@@ -108,7 +108,7 @@ def run(self):
     author="RayDP Developers",
     author_email="raydp-dev@googlegroups.com",
     license="Apache 2.0",
-    url="https://github.com/oap-project/raydp",
+    url="https://github.com/ray-project/raydp",
     keywords="raydp spark ray distributed data-processing",
     description="RayDP: Distributed Data Processing on Ray",
     long_description=io.open(
```

tutorials/pytorch_example.ipynb — 1156 additions, 1 deletion (large diff not rendered)
