This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit 8668a77
Repartition Ray dataset if number of shards is too small (#283)
Currently we throw an error when the number of partitions in a data source is too small for the number of workers. However, in the case of Ray datasets, we can actually repartition the dataset ourselves. This will also ensure our quickstart examples, such as in https://docs.ray.io/en/latest/train/train.html#quick-start-to-distributed-training-with-ray-train will work out of the box.
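The behavior described above can be sketched as follows. This is a minimal illustration, not the library's actual code: `FakeDataset` is a hypothetical stand-in for `ray.data.Dataset`, whose real `repartition(num_blocks)` method it mimics, and `ensure_enough_shards` is an illustrative helper name.

```python
class FakeDataset:
    """Hypothetical stand-in for ray.data.Dataset."""

    def __init__(self, num_blocks):
        self._num_blocks = num_blocks

    def num_blocks(self):
        return self._num_blocks

    def repartition(self, num_blocks):
        # Ray would redistribute the rows across `num_blocks` blocks here.
        return FakeDataset(num_blocks)


def ensure_enough_shards(dataset, num_actors):
    """Repartition the dataset if it has fewer blocks than workers."""
    if dataset.num_blocks() < num_actors:
        dataset = dataset.repartition(num_actors)
    return dataset


ds = ensure_enough_shards(FakeDataset(num_blocks=1), num_actors=4)
print(ds.num_blocks())  # 4
```

A dataset that already has enough blocks is returned unchanged, so the repartition cost is only paid when the data source is genuinely under-partitioned.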
1 parent b45c5d9 commit 8668a77

File tree

3 files changed: +5 additions, −3 deletions

xgboost_ray/data_sources/data_source.py

Lines changed: 1 addition & 0 deletions
@@ -34,6 +34,7 @@ class methods are called directly.
 
     supports_central_loading = True
     supports_distributed_loading = False
+    needs_partitions = True
 
     @staticmethod
     def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:

xgboost_ray/data_sources/ray_dataset.py

Lines changed: 2 additions & 1 deletion
@@ -34,6 +34,7 @@ class RayDataset(DataSource):
 
     supports_central_loading = True
     supports_distributed_loading = True
+    needs_partitions = False
 
     @staticmethod
     def is_data_type(data: Any, filetype: Optional[RayFileType] = None) -> bool:

@@ -102,7 +103,7 @@ def get_actor_shards(
         }
 
     @staticmethod
-    def get_n(data: Any):
+    def get_n(data: "ray.data.dataset.Dataset"):
         """
         Return number of distributed blocks.
         """

xgboost_ray/matrix.py

Lines changed: 2 additions & 2 deletions
@@ -430,7 +430,7 @@ def load_data(
         data_source = self.get_data_source()
 
         max_num_shards = self._cached_n or data_source.get_n(self.data)
-        if num_actors > max_num_shards:
+        if num_actors > max_num_shards and data_source.needs_partitions:
             raise RuntimeError(
                 f"Trying to shard data for {num_actors} actors, but the "
                 f"maximum number of shards (i.e. the number of data rows) "

@@ -565,7 +565,7 @@ def assert_enough_shards_for_actors(self, num_actors: int):
             return
 
         max_num_shards = self._cached_n or data_source.get_n(self.data)
-        if num_actors > max_num_shards:
+        if num_actors > max_num_shards and data_source.needs_partitions:
             raise RuntimeError(
                 f"Trying to shard data for {num_actors} actors, but the "
                 f"maximum number of shards is {max_num_shards}. If you "
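The effect of the new `needs_partitions` guard can be sketched with self-contained stand-ins. All class and function names below are hypothetical illustrations, not the library's actual classes: only sources that cannot repartition themselves (`needs_partitions = True`) still raise when there are too few shards.

```python
class PandasLikeSource:
    # Hypothetical stand-in for a source that cannot repartition itself.
    needs_partitions = True

    @staticmethod
    def get_n(data):
        return len(data)


class RayLikeSource:
    # Hypothetical stand-in for a Ray dataset source, which can repartition.
    needs_partitions = False

    @staticmethod
    def get_n(data):
        return len(data)


def assert_enough_shards(data_source, data, num_actors):
    """Mirror of the guard: only error out if the source needs partitions."""
    max_num_shards = data_source.get_n(data)
    if num_actors > max_num_shards and data_source.needs_partitions:
        raise RuntimeError(
            f"Trying to shard data for {num_actors} actors, but the "
            f"maximum number of shards is {max_num_shards}.")


rows = [1, 2, 3]
# Passes: a Ray-like source can be repartitioned to 8 blocks later.
assert_enough_shards(RayLikeSource, rows, num_actors=8)
# Raises: a pandas-like source has no way to create more shards.
try:
    assert_enough_shards(PandasLikeSource, rows, num_actors=8)
except RuntimeError as exc:
    print(exc)
```

This is why `needs_partitions` defaults to `True` on the base `DataSource` and is only set to `False` for `RayDataset`, which can call repartitioning itself.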
