 from ray import logger
 from ray.util.annotations import PublicAPI, DeveloperAPI
 
-from xgboost_ray.util import Unavailable
 from xgboost_ray.data_sources import DataSource, data_sources, RayFileType
 
-try:
-    from ray.util.data import MLDataset
-except ImportError:
-    MLDataset = Unavailable
-
 try:
     from ray.data.dataset import Dataset as RayDataset
 except (ImportError, ModuleNotFoundError):
@@ -46,7 +40,7 @@ class RayDataset:
 if TYPE_CHECKING:
     from xgboost_ray.xgb import xgboost as xgb
 
-Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series, MLDataset]
+Data = Union[str, List[str], np.ndarray, pd.DataFrame, pd.Series]
 
 
 def concat_dataframes(dfs: List[Optional[pd.DataFrame]]):
@@ -404,7 +398,7 @@ def get_data_source(self) -> Type[DataSource]:
 
         # Todo (krfricke): It would be good to have a more general way to
         # check for compatibility here. Combine with test below?
-        if not (isinstance(self.data, (Iterable, MLDataset, RayDataset))
+        if not (isinstance(self.data, (Iterable, RayDataset))
                 or hasattr(self.data, "__partitioned__")) or invalid_data:
             raise ValueError(
                 f"Distributed data loading only works with already "
@@ -444,7 +438,7 @@ def get_data_source(self) -> Type[DataSource]:
                 f"with FileType: {self.filetype} for a distributed dataset."
                 "\nFIX THIS by passing a supported data type. Supported "
                 "data types for distributed datasets are a list of "
-                "CSV or Parquet sources as well as Ray MLDatasets. If using "
+                "CSV or Parquet sources. If using "
                 "Modin, Dask, or Petastorm, make sure the library is "
                 "installed.")
 
@@ -586,7 +580,7 @@ class RayDMatrix:
 
     Args:
         data: Data object. Can be a pandas dataframe, pandas series,
-            numpy array, Ray MLDataset, modin dataframe, string pointing to
+            numpy array, modin dataframe, string pointing to
             a csv or parquet file, or list of strings pointing to csv or
             parquet files.
         label: Optional label object. Can be a pandas series, numpy array,
@@ -874,13 +868,10 @@ def get_data(
 
 def _can_load_distributed(source: Data) -> bool:
     """Returns True if it might be possible to use distributed data loading"""
-    from xgboost_ray.data_sources.ml_dataset import MLDataset
     from xgboost_ray.data_sources.modin import Modin
 
     if isinstance(source, (int, float, bool)):
         return False
-    elif MLDataset.is_data_type(source):
-        return True
     elif Modin.is_data_type(source):
         return True
     elif isinstance(source, str):
@@ -902,12 +893,9 @@ def _can_load_distributed(source: Data) -> bool:
 
 def _detect_distributed(source: Data) -> bool:
     """Returns True if we should try to use distributed data loading"""
-    from xgboost_ray.data_sources.ml_dataset import MLDataset
     from xgboost_ray.data_sources.modin import Modin
     if not _can_load_distributed(source):
         return False
-    if MLDataset.is_data_type(source):
-        return True
     if Modin.is_data_type(source):
         return True
     if isinstance(source, Iterable) and not isinstance(source, str) and \