Skip to content
This repository was archived by the owner on Jan 12, 2026. It is now read-only.

Commit ac30013

Browse files
atomicYard1
andauthored
Add sort dataframe logic on qid (#239)
* add sort dataframe logic on qid for centralized * move sorting logic to _split_dataframe * Update xgboost_ray/matrix.py Co-authored-by: Antoni Baum <antoni.baum@protonmail.com> Signed-off-by: atomic <atomic@users.noreply.github.com> * refactor sorting logic to method * logic for more cases of qid type and add integration test - add logic to include more case of qid data type (array, dataframe) - add 2 integration tests to cover behavior for sorting qid * raise exception for the case when qid DataFrame is using unexpected shape * fix lint * more lint fix * add unittest skip for xgboost 0.9 * Apply suggestions from code review Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> * Fix Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> * Test tweak Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> Signed-off-by: atomic <atomic@users.noreply.github.com> Signed-off-by: Antoni Baum <antoni.baum@protonmail.com> Co-authored-by: Antoni Baum <antoni.baum@protonmail.com>
1 parent 536b702 commit ac30013

File tree

2 files changed

+84
-1
lines changed

2 files changed

+84
-1
lines changed

xgboost_ray/matrix.py

Lines changed: 31 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -48,6 +48,31 @@ def concat_dataframes(dfs: List[Optional[pd.DataFrame]]):
4848
return pd.concat(filtered, ignore_index=True, copy=False)
4949

5050

51+
def ensure_sorted_by_qid(df: pd.DataFrame, qid: Data
                         ) -> Tuple[Union[pd.Series, str], pd.DataFrame]:
    """Return ``df`` ordered so that its query ids are non-decreasing.

    Args:
        df: Local data whose rows should be grouped by query id.
        qid: Either the name of the qid column inside ``df`` (``str``), or
            the qid values themselves as a ``np.ndarray``, a ``pd.Series``
            or a single-column ``pd.DataFrame`` with one value per row of
            ``df``.

    Returns:
        A ``(qid, df)`` tuple.

        * If the qid values are already non-decreasing, the qid values as a
          ``pd.Series`` and ``df`` itself, untouched.
        * If ``qid`` is a column name, that name and ``df`` sorted by the
          column (original index labels are kept).
        * Otherwise the sorted qid values as a ``pd.Series`` and ``df``
          reordered the same way, with a fresh ``0..n-1`` index.

    Raises:
        ValueError: If ``qid`` is a ``pd.DataFrame`` that does not have
            exactly one column.
        TypeError: If ``qid`` is of a type this function cannot interpret.
    """
    if isinstance(qid, str):
        _qid = df[qid]
    elif isinstance(qid, np.ndarray):
        _qid = pd.Series(qid)
    elif isinstance(qid, pd.DataFrame):
        # Validate the qid frame itself (not ``df``): exactly one column.
        if qid.shape[1] != 1:
            raise ValueError(f"qid argument of type pd.DataFrame is expected "
                             "to contain only 1 column of data "
                             f"but the qid passed in is of shape {qid.shape}.")
        _qid = qid.iloc[:, 0]
    elif isinstance(qid, pd.Series):
        _qid = qid
    else:
        raise TypeError("qid must be a column name, np.ndarray, pd.Series "
                        "or single-column pd.DataFrame, "
                        f"got {type(qid).__name__}.")

    # ``Series.is_monotonic`` was removed in pandas 2.0;
    # ``is_monotonic_increasing`` is the equivalent, long-standing spelling.
    if _qid.is_monotonic_increasing:
        return _qid, df

    # A stable sort keeps the relative order of rows inside each query group
    # and guarantees that qid and df are permuted identically below.
    if isinstance(qid, str):
        return qid, df.sort_values([qid], kind="mergesort")

    # qid is not part of df: sort both by the external qid values.
    sorted_qid = _qid.sort_values(kind="mergesort")
    sorted_df = df.set_index(_qid).sort_index(
        kind="mergesort").reset_index(drop=True)
    return sorted_qid, sorted_df
74+
75+
5176
@PublicAPI(stability="beta")
5277
class RayShardingMode(Enum):
5378
"""Enum for different modes of sharding the data.
@@ -227,6 +252,12 @@ def _split_dataframe(
227252
`label_upper_bound`
228253
229254
"""
255+
# sort dataframe by qid if exists (required by DMatrix)
256+
if self.qid is not None:
257+
_qid, local_data = ensure_sorted_by_qid(local_data, self.qid)
258+
if not isinstance(self.qid, str):
259+
self.qid = _qid
260+
230261
exclude_cols: Set[str] = set() # Exclude these columns from `x`
231262

232263
label, exclude = data_source.get_column(local_data, self.label)

xgboost_ray/tests/test_matrix.py

Lines changed: 53 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,6 +1,8 @@
1+
import inspect
12
import os
23
import tempfile
34
import unittest
5+
import xgboost as xgb
46

57
import numpy as np
68
import pandas as pd
@@ -27,7 +29,7 @@ def setUp(self):
2729

2830
@classmethod
2931
def setUpClass(cls):
30-
ray.init(num_cpus=1, local_mode=True)
32+
ray.init(local_mode=True)
3133

3234
@classmethod
3335
def tearDownClass(cls):
@@ -356,6 +358,56 @@ def testLegacyParams(self):
356358
label_lower_bound=label_lower_bound,
357359
label_upper_bound=label_upper_bound)
358360

361+
@unittest.skipIf("qid" not in inspect.signature(xgb.DMatrix).parameters,
                 f"not supported in xgb version {xgb.__version__}")
def testQidSortedBehaviorXGBoost(self):
    """Check that RayDMatrix output with unsorted qid loads into a DMatrix.

    The test first pins the baseline it relies on: building a plain
    ``DMatrix`` with unsorted qid is expected to raise ``ValueError``,
    while the same data with sorted qid is accepted. It then builds a
    ``RayDMatrix`` from the unsorted qid and feeds the parameters of its
    single shard to ``DMatrix``, which must not raise.
    """
    from xgboost import DMatrix

    features, labels = self.x, self.y
    shuffled_qid = np.array([1, 2] * 16)

    # Baseline behavior of plain xgboost.
    with self.assertRaises(ValueError):
        DMatrix(data=features, label=labels, qid=shuffled_qid)
    DMatrix(
        data=features, label=labels,
        qid=np.sort(shuffled_qid))  # no exception

    # RayDMatrix is expected to take care of the ordering by itself.
    ray_matrix = RayDMatrix(features, labels, qid=shuffled_qid)
    shard_params = ray_matrix.get_data(rank=0, num_actors=1)
    DMatrix(**shard_params)
381+
382+
@unittest.skipIf("qid" not in inspect.signature(xgb.DMatrix).parameters,
                 f"not supported in xgb version {xgb.__version__}")
def testQidSortedParquet(self):
    """Check that parquet input with unsorted qid loads into a DMatrix.

    Two parquet files are written, each carrying the same features and
    labels but a different, unsorted ``group`` column. A ``RayDMatrix``
    reading both files with ``qid="group"`` must yield shard parameters
    that ``DMatrix`` accepts without raising.
    """
    from xgboost import DMatrix

    feature_names = ["a", "b", "c", "d"]
    with tempfile.TemporaryDirectory() as tmp_dir:
        # (file name, unsorted qid values) for each parquet file.
        shards = (
            ("file1.parquet", np.array([2, 4] * 16)),
            ("file2.parquet", np.array([1, 3] * 16)),
        )

        parquet_paths = []
        for file_name, shuffled_qid in shards:
            frame = pd.DataFrame(self.x, columns=feature_names)
            frame["label"] = pd.Series(self.y)
            frame["group"] = pd.Series(shuffled_qid)
            target = os.path.join(tmp_dir, file_name)
            frame.to_parquet(target)
            parquet_paths.append(target)

        ray_matrix = RayDMatrix(
            parquet_paths,
            columns=feature_names + ["label", "group"],
            label="label",
            qid="group")
        shard_params = ray_matrix.get_data(rank=0, num_actors=1)
        DMatrix(**shard_params)
410+
359411

360412
if __name__ == "__main__":
361413
import pytest

0 commit comments

Comments
 (0)