Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit 8d07e8d

Browse files
authored
Remove support for deprecated ray.util.data.MLDataset (#218)
Ray MLDataset has been deprecated. We should no longer maintain support for it in newer xgboost-ray versions.
1 parent 929b3e3 commit 8d07e8d

File tree

5 files changed

+8
-141
lines changed

5 files changed

+8
-141
lines changed

xgboost_ray/data_sources/__init__.py

Lines changed: 3 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
from xgboost_ray.data_sources.pandas import Pandas
44
from xgboost_ray.data_sources.modin import Modin
55
from xgboost_ray.data_sources.dask import Dask
6-
from xgboost_ray.data_sources.ml_dataset import MLDataset
76
from xgboost_ray.data_sources.petastorm import Petastorm
87
from xgboost_ray.data_sources.csv import CSV
98
from xgboost_ray.data_sources.parquet import Parquet
@@ -12,12 +11,11 @@
1211
from xgboost_ray.data_sources.partitioned import Partitioned
1312

1413
data_sources = [
15-
Numpy, Pandas, Partitioned, Modin, Dask, MLDataset, Petastorm, CSV,
16-
Parquet, ObjectStore, RayDataset
14+
Numpy, Pandas, Partitioned, Modin, Dask, Petastorm, CSV, Parquet,
15+
ObjectStore, RayDataset
1716
]
1817

1918
__all__ = [
2019
"DataSource", "RayFileType", "Numpy", "Pandas", "Modin", "Dask",
21-
"MLDataset", "Petastorm", "CSV", "Parquet", "ObjectStore", "RayDataset",
22-
"Partitioned"
20+
"Petastorm", "CSV", "Parquet", "ObjectStore", "RayDataset", "Partitioned"
2321
]

xgboost_ray/data_sources/data_source.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -24,7 +24,7 @@ class DataSource:
2424
"""Abstract class for data sources.
2525
2626
xgboost_ray supports reading from various sources, such as files
27-
(e.g. CSV, Parquet) or distributed datasets (Ray MLDataset, Modin).
27+
(e.g. CSV, Parquet) or distributed datasets (Modin).
2828
2929
This abstract class defines an interface to read from these sources.
3030
New data sources can be added by implementing this interface.

xgboost_ray/data_sources/ml_dataset.py

Lines changed: 0 additions & 83 deletions
This file was deleted.

xgboost_ray/matrix.py

Lines changed: 4 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -20,14 +20,8 @@
2020
from ray import logger
2121
from ray.util.annotations import PublicAPI, DeveloperAPI
2222

23-
from xgboost_ray.util import Unavailable
2423
from xgboost_ray.data_sources import DataSource, data_sources, RayFileType
2524

26-
try:
27-
from ray.util.data import MLDataset
28-
except ImportError:
29-
MLDataset = Unavailable
30-
3125
try:
3226
from ray.data.dataset import Dataset as RayDataset
3327
except (ImportError, ModuleNotFoundError):
@@ -46,7 +40,7 @@ class RayDataset:
4640
if TYPE_CHECKING:
4741
from xgboost_ray.xgb import xgboost as xgb
4842

49-
Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series, MLDataset]
43+
Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series]
5044

5145

5246
def concat_dataframes(dfs: List[Optional[pd.DataFrame]]):
@@ -404,7 +398,7 @@ def get_data_source(self) -> Type[DataSource]:
404398

405399
# Todo (krfricke): It would be good to have a more general way to
406400
# check for compatibility here. Combine with test below?
407-
if not (isinstance(self.data, (Iterable, MLDataset, RayDataset))
401+
if not (isinstance(self.data, (Iterable, RayDataset))
408402
or hasattr(self.data, "__partitioned__")) or invalid_data:
409403
raise ValueError(
410404
f"Distributed data loading only works with already "
@@ -444,7 +438,7 @@ def get_data_source(self) -> Type[DataSource]:
444438
f"with FileType: {self.filetype} for a distributed dataset."
445439
"\nFIX THIS by passing a supported data type. Supported "
446440
"data types for distributed datasets are a list of "
447-
"CSV or Parquet sources as well as Ray MLDatasets. If using "
441+
"CSV or Parquet sources. If using "
448442
"Modin, Dask, or Petastorm, make sure the library is "
449443
"installed.")
450444

@@ -586,7 +580,7 @@ class RayDMatrix:
586580
587581
Args:
588582
data: Data object. Can be a pandas dataframe, pandas series,
589-
numpy array, Ray MLDataset, modin dataframe, string pointing to
583+
numpy array, modin dataframe, string pointing to
590584
a csv or parquet file, or list of strings pointing to csv or
591585
parquet files.
592586
label: Optional label object. Can be a pandas series, numpy array,
@@ -874,13 +868,10 @@ def get_data(
874868

875869
def _can_load_distributed(source: Data) -> bool:
876870
"""Returns True if it might be possible to use distributed data loading"""
877-
from xgboost_ray.data_sources.ml_dataset import MLDataset
878871
from xgboost_ray.data_sources.modin import Modin
879872

880873
if isinstance(source, (int, float, bool)):
881874
return False
882-
elif MLDataset.is_data_type(source):
883-
return True
884875
elif Modin.is_data_type(source):
885876
return True
886877
elif isinstance(source, str):
@@ -902,12 +893,9 @@ def _can_load_distributed(source: Data) -> bool:
902893

903894
def _detect_distributed(source: Data) -> bool:
904895
"""Returns True if we should try to use distributed data loading"""
905-
from xgboost_ray.data_sources.ml_dataset import MLDataset
906896
from xgboost_ray.data_sources.modin import Modin
907897
if not _can_load_distributed(source):
908898
return False
909-
if MLDataset.is_data_type(source):
910-
return True
911899
if Modin.is_data_type(source):
912900
return True
913901
if isinstance(source, Iterable) and not isinstance(source, str) and \

xgboost_ray/tests/test_matrix.py

Lines changed: 0 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -289,32 +289,6 @@ def testFromMultiParquetString(self):
289289
self._testMatrixCreation(
290290
[data_file_1, data_file_2], "label", distributed=True)
291291

292-
def testFromMLDataset(self):
293-
try:
294-
from ray.util import data as ml_data
295-
except ImportError:
296-
self.skipTest("MLDataset not available in current Ray version.")
297-
return
298-
299-
with tempfile.TemporaryDirectory() as dir:
300-
data_file_1 = os.path.join(dir, "data_1.parquet")
301-
data_file_2 = os.path.join(dir, "data_2.parquet")
302-
303-
data_df = pd.DataFrame(self.x, columns=["a", "b", "c", "d"])
304-
data_df["label"] = pd.Series(self.y)
305-
306-
df_1 = data_df[0:len(data_df) // 2]
307-
df_2 = data_df[len(data_df) // 2:]
308-
309-
df_1.to_parquet(data_file_1)
310-
df_2.to_parquet(data_file_2)
311-
312-
dataset = ml_data.read_parquet(
313-
[data_file_1, data_file_2], num_shards=2)
314-
315-
self._testMatrixCreation(dataset, "label", distributed=False)
316-
self._testMatrixCreation(dataset, "label", distributed=True)
317-
318292
def testDetectDistributed(self):
319293
with tempfile.TemporaryDirectory() as dir:
320294
parquet_file = os.path.join(dir, "file.parquet")
@@ -339,16 +313,6 @@ def testDetectDistributed(self):
339313
mat = RayDMatrix([csv_file] * 3, lazy=True)
340314
self.assertTrue(mat.distributed)
341315

342-
try:
343-
from ray.util import data as ml_data
344-
mat = RayDMatrix(
345-
ml_data.read_parquet(parquet_file, num_shards=1),
346-
lazy=True)
347-
self.assertTrue(mat.distributed)
348-
except ImportError:
349-
print("MLDataset not available in current Ray version. "
350-
"Skipping part of test.")
351-
352316
def testTooManyActorsDistributed(self):
353317
"""Test error when too many actors are passed"""
354318
with self.assertRaises(RuntimeError):

0 commit comments

Comments (0)