@@ -9,8 +9,6 @@
 from ray import ObjectRef
 
 from xgboost_ray.data_sources import Modin, Dask, Partitioned
-from xgboost_ray.data_sources.ray_dataset import RAY_DATASET_AVAILABLE, \
-    RayDataset
 from xgboost_ray.main import _RemoteRayXGBoostActor
 
 from xgboost_ray.data_sources.modin import MODIN_INSTALLED
@@ -436,150 +434,9 @@ def create_remote_df(arr): |
                     f"partition {i} is not partition with ID {part_id}.")
 
 
-@unittest.skipIf(
-    not RAY_DATASET_AVAILABLE,
-    reason="Ray datasets are not available in this version of Ray")
-class RayDatasetSourceTest(_DistributedDataSourceTest, unittest.TestCase):
-    def _testAssignPartitions(self, part_nodes, actor_nodes,
-                              expected_actor_parts):
-        partitions = [
-            ray.put(
-                pd.DataFrame(p, columns=[str(x) for x in range(p.shape[1])]))
-            for p in np.array_split(self.x, len(part_nodes))
-        ]
-
-        # Dict from partition (obj ref) to node host
-        part_to_node = dict(zip(partitions, [f"node{n}" for n in part_nodes]))
-
-        actors_to_node = dict(enumerate(f"node{n}" for n in actor_nodes))
-
-        actor_to_parts = self._getActorToParts(actors_to_node, partitions,
-                                               part_to_node, part_nodes)
-
-        print(expected_actor_parts)
-        print(actor_to_parts)
-
-        for actor_rank, part_ids in expected_actor_parts.items():
-            for i, part_id in enumerate(part_ids):
-                self.assertEqual(
-                    actor_to_parts[actor_rank][i],
-                    partitions[part_id],
-                    msg=f"Assignment failed: Actor rank {actor_rank}, "
-                    f"partition {i} is not partition with ID {part_id}.")
-
-    def _getActorToParts(self, actors_to_node, partitions, part_to_node,
-                         part_nodes):
-        def get_object_locations(data, *args, **kwargs):
-            return {
-                partitions[i]: {
-                    "node_ids": [part_nodes[i]],
-                    "object_size": 1
-                }
-                for i in range(len(data))
-            }
-
-        def node_map():
-            return [{
-                "NodeID": n,
-                "NodeManagerAddress": f"node{n}"
-            } for n in range(4)]
-
-        def actor_ranks(actors):
-            return actors_to_node
-
-        with patch(
-                "ray.experimental.get_object_locations"
-        ) as mock_locations, patch("ray.nodes") as mock_nodes, patch(
-                "xgboost_ray.data_sources.ray_dataset.get_actor_rank_ips"
-        ) as mock_ranks:
-            mock_locations.side_effect = get_object_locations
-            mock_nodes.side_effect = node_map
-            mock_ranks.side_effect = actor_ranks
-
-            if hasattr(ray.data, "from_pandas_refs"):
-                data = ray.data.from_pandas_refs(list(part_to_node.keys()))
-            else:
-                # Legacy API
-                data = ray.data.from_pandas(list(part_to_node.keys()))
-
-            _, actor_to_parts = RayDataset.get_actor_shards(
-                data=data, actors=[])
-
-        return actor_to_parts
-
-    def _testDataSourceAssignment(self, part_nodes, actor_nodes,
-                                  expected_actor_parts):
-        node_ips = [
-            node["NodeManagerAddress"] for node in ray.nodes() if node["Alive"]
-        ]
-        if len(node_ips) < max(max(actor_nodes), max(part_nodes)) + 1:
-            print("Not running on cluster, skipping rest of this test.")
-            return
-
-        actor_node_ips = [node_ips[nid] for nid in actor_nodes]
-        part_node_ips = [node_ips[nid] for nid in part_nodes]
-
-        # Initialize data frames on remote nodes
-        # This way we can control which partition is on which node
-        @ray.remote(num_cpus=0.1)
-        def create_remote_df(arr):
-            return ray.put(pd.DataFrame(arr))
-
-        partitions = np.array_split(self.x, len(part_nodes))
-        node_dfs: List[ObjectRef] = ray.get([
-            create_remote_df.options(resources={
-                f"node:{pip}": 0.1
-            }).remote(partitions[pid]) for pid, pip in enumerate(part_node_ips)
-        ])
-
-        # Create Ray dataset from distributed partitions
-        if hasattr(ray.data, "from_pandas_refs"):
-            ray_ds = ray.data.from_pandas_refs(node_dfs)
-            df_objs = ray_ds.to_pandas_refs()
-        else:
-            # Legacy API
-            ray_ds = ray.data.from_pandas(node_dfs)
-            df_objs = ray_ds.to_pandas()
-
-        ray.wait(df_objs)
-        locations = ray.experimental.get_object_locations(df_objs)
-
-        try:
-            self.assertSequenceEqual(
-                [df[0][0] for df in partitions],
-                [df[0][0] for df in ray.get(list(df_objs))],
-                msg="Ray datasets mixed up the partition order")
-
-            self.assertSequenceEqual(
-                part_node_ips,
-                locations,
-                msg="Ray datasets moved partitions to different IPs")
-        except AssertionError as exc:
-            print(f"Ray dataset part of the test failed: {exc}")
-            print("This is a stochastic test failure. Ignoring the rest "
-                  "of this test.")
-            return
-
-        # Create ray actors
-        actors = [
-            _RemoteRayXGBoostActor.options(resources={
-                f"node:{nip}": 0.1
-            }).remote(rank=rank, num_actors=len(actor_nodes))
-            for rank, nip in enumerate(actor_node_ips)
-        ]
-
-        # Calculate shards
-        _, actor_to_parts = RayDataset.get_actor_shards(ray_ds, actors)
-
-        for actor_rank, part_ids in expected_actor_parts.items():
-            for i, part_id in enumerate(part_ids):
-                assigned_df = ray.get(actor_to_parts[actor_rank][i])
-                part_df = pd.DataFrame(partitions[part_id])
-
-                self.assertTrue(
-                    assigned_df.equals(part_df),
-                    msg=f"Assignment failed: Actor rank {actor_rank}, "
-                    f"partition {i} is not partition with ID {part_id}.")
+# The Ray Datasets data source is not tested here, as it does not make use of
+# xgboost-ray's partition-to-actor assignment logic. Furthermore, xgboost-ray
+# with Ray Datasets is tested in ray-project/ray.
 
 
 class PartitionedSourceTest(_DistributedDataSourceTest, unittest.TestCase):
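For reference, here is a minimal sketch of the path that remains supported: passing a Ray Dataset to `RayDMatrix`, which resolves the Ray Datasets data source internally. This example is illustrative and not part of this change; the dataset contents and parameter values are arbitrary, and it assumes a Ray version where `ray.data.from_pandas` accepts a DataFrame directly. The end-to-end behavior is covered by the tests in ray-project/ray.

```python
# Illustrative sketch: feeding a Ray Dataset into xgboost-ray.
import pandas as pd
import ray
from xgboost_ray import RayDMatrix, RayParams, train

ray.init()

df = pd.DataFrame({
    "a": [1.0, 2.0, 3.0, 4.0],
    "label": [0, 1, 0, 1],
})
ds = ray.data.from_pandas(df)

# RayDMatrix picks the Ray Datasets data source; partition-to-actor
# assignment happens inside xgboost-ray / Ray Datasets, not in these tests.
dtrain = RayDMatrix(ds, label="label")

bst = train(
    {"objective": "binary:logistic"},
    dtrain,
    num_boost_round=2,
    ray_params=RayParams(num_actors=2),
)
```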