This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit 61d293e

Add support for distributed dataset loading using Modin (#67)
* Re-factor data loading structure
* Better tests, better documentation
* Assign partitions to actor (WIP)
* Update xgboost_ray/data_sources/data_source.py (review suggestions co-authored by Richard Liaw <rliaw@berkeley.edu>)
* Update docs
* Move data source fetching into separate function
* Add partition assignment unit test
* Add cluster test
* Resolve breaking API change (plus follow-ups)
* Load according to data shards
* Fix tests, add example
* Merge
* Fix tests
* Docs
* Update requirements
* Disable cache dir for pip requirements
* Modin wheels
* Only enable for Modin >= 0.9.0
* Modin check in test matrix
* Skip test if Modin is not installed
* Disable Modin example if incompatible
* More CPUs
* Increase test size because of additional tests/examples

Co-authored-by: Richard Liaw <rliaw@berkeley.edu>
1 parent 6166dd5 commit 61d293e

12 files changed: +774 −71 lines

.github/workflows/test.yaml

Lines changed: 9 additions & 4 deletions

@@ -30,7 +30,7 @@ jobs:
 
   test_linux_ray_master:
     runs-on: ubuntu-latest
-    timeout-minutes: 12
+    timeout-minutes: 14
     strategy:
       matrix:
         python-version: [3.6.9, 3.7, 3.8]
@@ -64,6 +64,7 @@ jobs:
         pushd xgboost_ray/tests
         python -m pytest -vv -s --log-cli-level=DEBUG --durations=0 -x test_colocation.py
         python -m pytest -v --durations=0 -x test_matrix.py
+        python -m pytest -v --durations=0 -x test_data_source.py
         python -m pytest -v --durations=0 -x test_xgboost_api.py
         python -m pytest -v --durations=0 -x test_fault_tolerance.py
         python -m pytest -v --durations=0 -x test_end_to_end.py
@@ -74,13 +75,14 @@ jobs:
         ray stop || true
         echo "running simple.py" && python simple.py --smoke-test
         echo "running simple_predict.py" && python simple_predict.py
+        echo "running simple_modin.py" && python simple_modin.py --smoke-test
         echo "running simple_tune.py" && python simple_tune.py --smoke-test
         echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test
         # for f in *.py; do echo "running $f" && python "$f" || exit 1 ; done
 
   test_linux_ray_release:
     runs-on: ubuntu-latest
-    timeout-minutes: 12
+    timeout-minutes: 14
     strategy:
       matrix:
         python-version: [3.6.9, 3.7, 3.8]
@@ -95,7 +97,7 @@ jobs:
         python -m pip install --upgrade pip
         python -m pip install codecov
         python -m pip install -U ray
-        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
+        if [ -f requirements-test.txt ]; then python -m pip install -r requirements-test.txt; fi
     - name: Install package
       run: |
         python -m pip install -e .
@@ -107,6 +109,7 @@ jobs:
         pushd xgboost_ray/tests
         python -m pytest -vv -s --log-cli-level=DEBUG --durations=0 -x test_colocation.py
         python -m pytest -v --durations=0 -x test_matrix.py
+        python -m pytest -v --durations=0 -x test_data_source.py
         python -m pytest -v --durations=0 -x test_xgboost_api.py
         python -m pytest -v --durations=0 -x test_fault_tolerance.py
         python -m pytest -v --durations=0 -x test_end_to_end.py
@@ -117,6 +120,7 @@ jobs:
         ray stop || true
         echo "running simple.py" && python simple.py --smoke-test
         echo "running simple_predict.py" && python simple_predict.py
+        echo "running simple_modin.py" && python simple_modin.py --smoke-test
         echo "running simple_tune.py" && python simple_tune.py --smoke-test
         echo "running train_on_test_data.py" && python train_on_test_data.py --smoke-test
         # for f in *.py; do echo "running $f" && python "$f" || exit 1 ; done
@@ -125,7 +129,7 @@ jobs:
     # Test compatibility when some optional libraries are missing
     # Test runs on latest ray release
     runs-on: ubuntu-latest
-    timeout-minutes: 12
+    timeout-minutes: 14
     strategy:
       matrix:
         python-version: [3.6.9, 3.7, 3.8]
@@ -157,6 +161,7 @@ jobs:
         pushd xgboost_ray/tests
         python -m pytest -vv -s --log-cli-level=DEBUG --durations=0 -x test_colocation.py
         python -m pytest -v --durations=0 -x test_matrix.py
+        python -m pytest -v --durations=0 -x test_data_source.py
         python -m pytest -v --durations=0 -x test_xgboost_api.py
         python -m pytest -v --durations=0 -x test_fault_tolerance.py
         python -m pytest -v --durations=0 -x test_end_to_end.py
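In short, all three test jobs get the same treatment: the timeout is bumped from 12 to 14 minutes, the new test_data_source.py suite joins the pytest run, and the simple_modin.py smoke test runs alongside the existing examples.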

examples/simple_modin.py

Lines changed: 96 additions & 0 deletions

@@ -0,0 +1,96 @@
+import argparse
+
+import numpy as np
+import pandas as pd
+
+import ray
+
+from xgboost_ray import RayDMatrix, train, RayParams
+from xgboost_ray.data_sources.modin import MODIN_INSTALLED
+
+
+def main(cpus_per_actor, num_actors):
+    if not MODIN_INSTALLED:
+        print("Modin is not installed or installed in a version that is "
+              "not compatible with xgboost_ray (< 0.9.0).")
+        return
+
+    # Import modin after initializing Ray
+    from modin.distributed.dataframe.pandas import from_partitions
+
+    # Generate dataset
+    x = np.repeat(range(8), 16).reshape((32, 4))
+    # Even numbers --> 0, odd numbers --> 1
+    y = np.tile(np.repeat(range(2), 4), 4)
+
+    # Flip some bits to reduce max accuracy
+    bits_to_flip = np.random.choice(32, size=6, replace=False)
+    y[bits_to_flip] = 1 - y[bits_to_flip]
+
+    data = pd.DataFrame(x)
+    data["label"] = y
+
+    # Split into 4 partitions
+    partitions = [ray.put(part) for part in np.split(data, 4)]
+
+    # Create modin df here
+    modin_df = from_partitions(partitions, axis=0)
+
+    train_set = RayDMatrix(modin_df, "label")
+
+    evals_result = {}
+    # Set XGBoost config.
+    xgboost_params = {
+        "tree_method": "approx",
+        "objective": "binary:logistic",
+        "eval_metric": ["logloss", "error"],
+    }
+
+    # Train the classifier
+    bst = train(
+        params=xgboost_params,
+        dtrain=train_set,
+        evals=[(train_set, "train")],
+        evals_result=evals_result,
+        ray_params=RayParams(
+            max_actor_restarts=0,
+            gpus_per_actor=0,
+            cpus_per_actor=cpus_per_actor,
+            num_actors=num_actors),
+        verbose_eval=False,
+        num_boost_round=10)
+
+    model_path = "modin.xgb"
+    bst.save_model(model_path)
+    print("Final training error: {:.4f}".format(
+        evals_result["train"]["error"][-1]))
+
+
+if __name__ == "__main__":
+    parser = argparse.ArgumentParser()
+    parser.add_argument(
+        "--address",
+        required=False,
+        type=str,
+        help="the address to use for Ray")
+    parser.add_argument(
+        "--cpus-per-actor",
+        type=int,
+        default=1,
+        help="Sets number of CPUs per xgboost training worker.")
+    parser.add_argument(
+        "--num-actors",
+        type=int,
+        default=4,
+        help="Sets number of xgboost workers to use.")
+    parser.add_argument(
+        "--smoke-test", action="store_true", default=False, help="Smoke test")
+
+    args, _ = parser.parse_known_args()
+
+    if args.smoke_test:
+        ray.init(num_cpus=args.num_actors + 1)
+    else:
+        ray.init(address=args.address)
+
+    main(args.cpus_per_actor, args.num_actors)
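The example runs locally the same way the CI workflow invokes it: python simple_modin.py --smoke-test starts a fresh Ray instance with num_actors + 1 CPUs, while omitting the flag connects to an existing cluster via --address. It saves the trained booster to modin.xgb; a minimal follow-up sketch (standard xgboost API, not part of this commit) for loading it back and predicting:

import numpy as np
import xgboost as xgb

# Rebuild the feature matrix used by the example, then load the saved
# booster and score it with plain xgboost.
x = np.repeat(range(8), 16).reshape((32, 4))
bst = xgb.Booster(model_file="modin.xgb")
preds = bst.predict(xgb.DMatrix(x))
print(preds[:4])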

requirements-test.txt

Lines changed: 1 addition & 1 deletion

@@ -7,4 +7,4 @@ pytest
 pyarrow
 ray[tune]
 scikit-learn
-modin[ray]
+modin

xgboost_ray/data_sources/__init__.py

Lines changed: 5 additions & 2 deletions

@@ -6,10 +6,13 @@
 from xgboost_ray.data_sources.petastorm import Petastorm
 from xgboost_ray.data_sources.csv import CSV
 from xgboost_ray.data_sources.parquet import Parquet
+from xgboost_ray.data_sources.object_store import ObjectStore
 
-data_sources = [Numpy, Pandas, Modin, MLDataset, Petastorm, CSV, Parquet]
+data_sources = [
+    Numpy, Pandas, Modin, MLDataset, Petastorm, CSV, Parquet, ObjectStore
+]
 
 __all__ = [
     "DataSource", "RayFileType", "Numpy", "Pandas", "Modin", "MLDataset",
-    "Petastorm", "CSV", "Parquet"
+    "Petastorm", "CSV", "Parquet", "ObjectStore"
 ]
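For context on how this registry is consumed: the matrix loader walks the data_sources list in order and uses the first source that claims the input (e.g. Modin for a Modin DataFrame, Pandas for a pd.DataFrame). The lookup below is an illustrative sketch, not code from this commit; it assumes the DataSource classes expose an is_data_type() predicate for this dispatch.

from typing import Any, Optional, Type

from xgboost_ray.data_sources import DataSource, data_sources


def find_data_source(data: Any) -> Optional[Type[DataSource]]:
    # Return the first registered source that recognizes the input type.
    for source in data_sources:
        if source.is_data_type(data):
            return source
    return None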

xgboost_ray/data_sources/data_source.py

Lines changed: 27 additions & 3 deletions

@@ -1,8 +1,11 @@
+from typing import Any, Optional, Sequence, Tuple, Dict
+
 from enum import Enum
-from typing import Any, Optional, Sequence, Tuple
 
 import pandas as pd
 
+from ray.actor import ActorHandle
+
 
 class RayFileType(Enum):
     """Enum for different file types (used for overrides)."""
@@ -63,7 +66,7 @@ def get_filetype(data: Any) -> Optional[RayFileType]:
     @staticmethod
     def load_data(data: Any,
                   ignore: Optional[Sequence[str]] = None,
-                  indices: Optional[Sequence[int]] = None,
+                  indices: Optional[Sequence[Any]] = None,
                   **kwargs) -> pd.DataFrame:
         """
         Load data into a pandas dataframe.
@@ -73,7 +76,7 @@ def load_data(data: Any,
         Args:
             data (Any): Input data
             ignore (Optional[Sequence[str]]): Column names to ignore
-            indices (Optional[Sequence[int]]): Indices to select. What an
+            indices (Optional[Sequence[Any]]): Indices to select. What an
                 index indicates depends on the data source.
 
         Returns:
@@ -109,3 +112,24 @@ def get_column(cls, data: pd.DataFrame,
     def get_n(data: Any):
         """Get length of data source partitions for sharding."""
         return len(list(data))
+
+    @staticmethod
+    def get_actor_shards(
+            data: Any,
+            actors: Sequence[ActorHandle]) -> \
+            Tuple[Any, Optional[Dict[int, Any]]]:
+        """Get a dict mapping actor ranks to shards.
+
+        Args:
+            data (Any): Data to shard.
+
+        Returns:
+            Returns a tuple of which the first element indicates the new
+            data object that will overwrite the existing data object
+            in the RayDMatrix (e.g. when the object is not serializable).
+            The second element is a dict mapping actor ranks to shards.
+            These objects are usually passed to the ``load_data()`` method
+            for distributed loading, so that method needs to be able to
+            deal with the respective data.
+        """
+        return data, None
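To make the new hook concrete, here is a hypothetical subclass sketch. ShardedPandas and its np.array_split() strategy are illustrative assumptions, not part of this commit: it returns the data object unchanged plus a dict assigning one contiguous block of rows to each actor rank, which (per the docstring above) is then handed to load_data() on the actors.

from typing import Any, Dict, Optional, Sequence, Tuple

import numpy as np
import pandas as pd
from ray.actor import ActorHandle

from xgboost_ray.data_sources import DataSource


class ShardedPandas(DataSource):
    @staticmethod
    def get_actor_shards(
            data: pd.DataFrame,
            actors: Sequence[ActorHandle]) -> \
            Tuple[Any, Optional[Dict[int, Any]]]:
        # One contiguous row block per actor rank; returning `data`
        # unchanged keeps the original object in the RayDMatrix.
        shards = dict(enumerate(np.array_split(data, len(actors))))
        return data, shards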
