Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit e68ff63

Browse files
authored
Fix BATCH sharding (#123)
1 parent 57749f9 commit e68ff63

File tree

3 files changed

+51
-9
lines changed

3 files changed

+51
-9
lines changed

xgboost_ray/data_sources/petastorm.py

Lines changed: 4 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -65,7 +65,10 @@ def load_data(data: Union[str, Sequence[str]],
6565
**kwargs) -> pd.DataFrame:
6666
_assert_petastorm_installed()
6767
with petastorm.make_batch_reader(data) as reader:
68-
shards = [pd.DataFrame(batch._asdict()) for batch in reader]
68+
shards = [
69+
pd.DataFrame(batch._asdict()) for i, batch in enumerate(reader)
70+
if not indices or i in indices
71+
]
6972

7073
local_df = pd.concat(shards, copy=False)
7174

xgboost_ray/matrix.py

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -885,8 +885,9 @@ def _get_sharding_indices(sharding: RayShardingMode, rank: int,
885885
num_actors: int, n: int):
886886
"""Return indices that belong to worker with rank `rank`"""
887887
if sharding == RayShardingMode.BATCH:
888-
start_index = int(math.floor(rank / num_actors) * n)
889-
end_index = int(math.floor(rank + 1 / num_actors) * n)
888+
start_index = int(rank * math.ceil(n / num_actors))
889+
end_index = int((rank + 1) * math.ceil(n / num_actors))
890+
end_index = min(end_index, n)
890891
indices = list(range(start_index, end_index))
891892
elif sharding == RayShardingMode.INTERLEAVED:
892893
indices = list(range(rank, n, num_actors))
@@ -913,7 +914,7 @@ def combine_data(sharding: RayShardingMode, data: Iterable) -> np.ndarray:
913914
if data[0].ndim == 1:
914915
# most common case
915916
if sharding == RayShardingMode.BATCH:
916-
res = np.ravel(data)
917+
res = np.concatenate(data)
917918
elif sharding == RayShardingMode.INTERLEAVED:
918919
# Sometimes the lengths are off by 1 for uneven divisions
919920
min_len = min(len(d) for d in data)

xgboost_ray/tests/test_end_to_end.py

Lines changed: 43 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@
99
import ray
1010
from ray.exceptions import RayActorError, RayTaskError
1111

12-
from xgboost_ray import RayParams, train, RayDMatrix, predict
12+
from xgboost_ray import RayParams, train, RayDMatrix, predict, RayShardingMode
1313
from xgboost_ray.main import RayXGBoostTrainingError
1414
from xgboost_ray.callback import DistributedCallback
1515
from xgboost_ray.tests.utils import get_num_trees
@@ -134,20 +134,58 @@ def testHalfTraining(self):
134134
pred_test = bst.predict(test_X)
135135
self.assertSequenceEqual(test_y_second, list(pred_test))
136136

137-
def testJointTraining(self):
137+
def _testJointTraining(self,
138+
sharding=RayShardingMode.INTERLEAVED,
139+
softprob=False):
138140
"""Train with Ray. The data will be split, but the trees
139141
should be combined together and find the true model."""
140-
ray.init(num_cpus=2, num_gpus=0)
142+
params = self.params.copy()
143+
if softprob:
144+
params["objective"] = "multi:softprob"
141145

142146
bst = train(
143-
self.params,
144-
RayDMatrix(self.x, self.y),
147+
params,
148+
RayDMatrix(self.x, self.y, sharding=sharding),
145149
ray_params=RayParams(num_actors=2))
146150

147151
x_mat = xgb.DMatrix(self.x)
148152
pred_y = bst.predict(x_mat)
153+
if softprob:
154+
pred_y = np.argmax(pred_y, axis=1)
155+
pred_y = pred_y.astype(int)
149156
self.assertSequenceEqual(list(self.y), list(pred_y))
150157

158+
x_mat = RayDMatrix(self.x, sharding=sharding)
159+
pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2))
160+
if softprob:
161+
pred_y = np.argmax(pred_y, axis=1)
162+
pred_y = pred_y.astype(int)
163+
self.assertSequenceEqual(list(self.y), list(pred_y))
164+
165+
# try on an odd number of rows
166+
bst = train(
167+
params,
168+
RayDMatrix(self.x[:-1], self.y[:-1], sharding=sharding),
169+
ray_params=RayParams(num_actors=2))
170+
171+
x_mat = RayDMatrix(self.x[:-1], sharding=sharding)
172+
pred_y = predict(bst, x_mat, ray_params=RayParams(num_actors=2))
173+
if softprob:
174+
pred_y = np.argmax(pred_y, axis=1)
175+
pred_y = pred_y.astype(int)
176+
self.assertSequenceEqual(list(self.y[:-1]), list(pred_y))
177+
178+
def testJointTrainingInterleaved(self):
179+
ray.init(num_cpus=2, num_gpus=0)
180+
self._testJointTraining(sharding=RayShardingMode.INTERLEAVED)
181+
self._testJointTraining(
182+
sharding=RayShardingMode.INTERLEAVED, softprob=True)
183+
184+
def testJointTrainingBatch(self):
185+
ray.init(num_cpus=2, num_gpus=0)
186+
self._testJointTraining(sharding=RayShardingMode.BATCH)
187+
self._testJointTraining(sharding=RayShardingMode.BATCH, softprob=True)
188+
151189
def testTrainPredict(self,
152190
init=True,
153191
remote=None,

0 commit comments

Comments (0)