Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
20 commits
Select commit Hold shift + click to select a range
3f692b5
Refactor sketch ReduceV merge path to explicit output reduction
RAMitchell Mar 4, 2026
1c776e6
Refine ReduceV sketch payload API and summary views
RAMitchell Mar 4, 2026
67986d5
Simplify ReduceV sketch merge to prune in-place
RAMitchell Mar 4, 2026
34b0fe2
Encapsulate WQSummary storage and avoid direct payload data writes
RAMitchell Mar 4, 2026
7baa1db
refactor: remove retained cuts metadata from allreduce summaries
RAMitchell Mar 5, 2026
ea6df23
refactor quantile payload and fix ReduceV root to rank 0
RAMitchell Mar 5, 2026
5f98530
refactor: use payload-based categorical allreduce
RAMitchell Mar 5, 2026
278a2aa
refactor(quantile): split numeric and categorical allreduce paths
RAMitchell Mar 5, 2026
c79a14d
test: drop distributed quantile benchmark tests
RAMitchell Mar 5, 2026
c102f78
fix: address CI failures in quantile reduce path
RAMitchell Mar 5, 2026
a9c8317
fix: address PR review feedback for ReduceV path
RAMitchell Mar 6, 2026
211dc46
refactor: rename variable-length allreduce primitive
RAMitchell Mar 8, 2026
f1ad572
Refactor HistogramCuts construction
RAMitchell Mar 9, 2026
be7e392
Remove stored HistogramCuts minimums
RAMitchell Mar 9, 2026
d3e563d
Merge branch 'master' of github.com:dmlc/xgboost into quantile-refactor
RAMitchell Mar 9, 2026
434c043
Fix i386 split evaluator build
RAMitchell Mar 9, 2026
2cdcd99
Fix quantile cut compatibility regressions
RAMitchell Mar 9, 2026
87e0355
Handle inf values in serialization tests
RAMitchell Mar 9, 2026
b3a02c5
Remove HistogramCuts cache format header
RAMitchell Mar 11, 2026
cd80557
Merge remote-tracking branch 'upstream/master' into quantile-refactor
RAMitchell Mar 11, 2026
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
49 changes: 16 additions & 33 deletions plugin/sycl/data/gradient_index.h
Original file line number Diff line number Diff line change
Expand Up @@ -5,12 +5,11 @@
#ifndef PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_
#define PLUGIN_SYCL_DATA_GRADIENT_INDEX_H_

#include <sycl/sycl.hpp>
#include <vector>

#include "../data.h"
#include "../../src/common/hist_util.h"

#include <sycl/sycl.hpp>
#include "../data.h"

namespace xgboost {
namespace sycl {
Expand All @@ -22,52 +21,40 @@ using BinTypeSize = ::xgboost::common::BinTypeSize;
* \brief Index data and offsets stored in USM buffers to provide access from device kernels
*/
struct Index {
Index() {
SetBinTypeSize(binTypeSize_);
}
Index() { SetBinTypeSize(binTypeSize_); }
Index(const Index& i) = delete;
Index& operator=(Index i) = delete;
Index(Index&& i) = delete;
Index& operator=(Index&& i) = delete;
void SetBinTypeSize(BinTypeSize binTypeSize) {
binTypeSize_ = binTypeSize;
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
CHECK(binTypeSize == BinTypeSize::kUint8BinsTypeSize ||
binTypeSize == BinTypeSize::kUint16BinsTypeSize ||
binTypeSize == BinTypeSize::kUint32BinsTypeSize);
}
BinTypeSize GetBinTypeSize() const {
return binTypeSize_;
}
BinTypeSize GetBinTypeSize() const { return binTypeSize_; }

template<typename T>
template <typename T>
T* data() {
return reinterpret_cast<T*>(data_.Data());
}

template<typename T>
template <typename T>
const T* data() const {
return reinterpret_cast<const T*>(data_.DataConst());
}

size_t Size() const {
return data_.Size() / (binTypeSize_);
}
size_t Size() const { return data_.Size() / (binTypeSize_); }

void Resize(::sycl::queue* qu, const size_t nBytesData) {
data_.Resize(qu, nBytesData);
}
void Resize(::sycl::queue* qu, const size_t nBytesData) { data_.Resize(qu, nBytesData); }

uint8_t* begin() const {
return data_.Begin();
}
uint8_t* begin() const { return data_.Begin(); }

uint8_t* end() const {
return data_.End();
}
uint8_t* end() const { return data_.End(); }

private:
USMVector<uint8_t, MemoryType::on_device> data_;
BinTypeSize binTypeSize_ {BinTypeSize::kUint8BinsTypeSize};
BinTypeSize binTypeSize_{BinTypeSize::kUint8BinsTypeSize};
};

/*!
Expand All @@ -84,20 +71,18 @@ struct GHistIndexMatrix {

USMVector<uint8_t, MemoryType::on_device> sort_buff;
/*! \brief The corresponding cuts */
xgboost::common::HistogramCuts cut;
xgboost::common::HistogramCuts cut{0};
size_t max_num_bins;
size_t min_num_bins;
size_t nbins;
size_t nfeatures;
size_t row_stride;

// Create a global histogram matrix based on a given DMatrix device wrapper
void Init(::sycl::queue* qu, Context const * ctx,
DMatrix *dmat, int max_num_bins);
void Init(::sycl::queue* qu, Context const* ctx, DMatrix* dmat, int max_num_bins);

template <typename BinIdxType, bool isDense>
void SetIndexData(::sycl::queue* qu, Context const * ctx, BinIdxType* index_data,
DMatrix *dmat);
void SetIndexData(::sycl::queue* qu, Context const* ctx, BinIdxType* index_data, DMatrix* dmat);

void ResizeIndex(::sycl::queue* qu, size_t n_index);

Expand All @@ -111,9 +96,7 @@ struct GHistIndexMatrix {
}
}
}
inline bool IsDense() const {
return isDense_;
}
inline bool IsDense() const { return isDense_; }

private:
bool isDense_;
Expand Down
36 changes: 10 additions & 26 deletions src/c_api/c_api.cc
Original file line number Diff line number Diff line change
Expand Up @@ -970,41 +970,25 @@ void GetCutImpl(Context const *ctx, std::shared_ptr<DMatrix> p_m,
auto &data = *p_data;
for (auto const &page : p_m->GetBatches<Page>(ctx, {})) {
auto const &cut = page.Cuts();

auto const &ptrs = cut.Ptrs();
auto const &vals = cut.Values();
auto ft = p_m->Info().feature_types.ConstHostSpan();

indptr.resize(ptrs.size());
data.clear();

auto const &vals = cut.Values();
auto const &mins = cut.MinValues();
for (bst_feature_t fidx = 0; fidx < p_m->Info().num_col_; ++fidx) {
indptr[fidx] = data.size();

bst_feature_t n_features = p_m->Info().num_col_;
auto ft = p_m->Info().feature_types.ConstHostSpan();
std::size_t n_categories = std::count_if(ft.cbegin(), ft.cend(),
[](auto t) { return t == FeatureType::kCategorical; });
data.resize(vals.size() + n_features - n_categories); // |vals| + |mins|
std::size_t i{0}, n_numeric{0};
for (bst_feature_t fidx = 0; fidx < n_features; ++fidx) {
CHECK_LT(i, data.size());
bool is_numeric = !common::IsCat(ft, fidx);
if (is_numeric) {
data[i] = mins[fidx];
i++;
if (!common::IsCat(ft, fidx)) {
data.push_back(common::HistogramCuts::NumericBinLowerBound(ptrs, vals, fidx, ptrs[fidx]));
}

auto beg = ptrs[fidx];
auto end = ptrs[fidx + 1];
CHECK_LE(end, data.size());
std::copy(vals.cbegin() + beg, vals.cbegin() + end, data.begin() + i);
i += (end - beg);
// shift by min values.
indptr[fidx] = ptrs[fidx] + n_numeric;
if (is_numeric) {
n_numeric++;
}
data.insert(data.end(), vals.cbegin() + beg, vals.cbegin() + end);
}
CHECK_EQ(n_numeric, n_features - n_categories);

indptr.back() = data.size();
CHECK_EQ(indptr.back(), vals.size() + mins.size() - n_categories);
break;
}
}
Expand Down
88 changes: 35 additions & 53 deletions src/common/hist_util.cc
Original file line number Diff line number Diff line change
Expand Up @@ -17,43 +17,40 @@
#include "xgboost/data.h" // for SparsePage, SortedCSCPage

#if defined(XGBOOST_MM_PREFETCH_PRESENT)
#include <xmmintrin.h>
#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char*>(addr), _MM_HINT_T0)
#include <xmmintrin.h>
#define PREFETCH_READ_T0(addr) _mm_prefetch(reinterpret_cast<const char *>(addr), _MM_HINT_T0)
#elif defined(XGBOOST_BUILTIN_PREFETCH_PRESENT)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char*>(addr), 0, 3)
#define PREFETCH_READ_T0(addr) __builtin_prefetch(reinterpret_cast<const char *>(addr), 0, 3)
#else // no SW pre-fetching available; PREFETCH_READ_T0 is no-op
#define PREFETCH_READ_T0(addr) do {} while (0)
#define PREFETCH_READ_T0(addr) \
do { \
} while (0)
#endif // defined(XGBOOST_MM_PREFETCH_PRESENT)

namespace xgboost::common {
HistogramCuts::HistogramCuts() {
cut_ptrs_.HostVector().emplace_back(0);
}
HistogramCuts::HistogramCuts(bst_feature_t n_features)
: cut_ptrs_(static_cast<std::size_t>(n_features) + 1, 0) {}

void HistogramCuts::Save(common::AlignedFileWriteStream *fo) const {
auto const &ptrs = this->Ptrs();
CHECK_LE(Span{ptrs}.size_bytes(), WriteVec(fo, ptrs));
auto const &vals = this->Values();
CHECK_LE(Span{vals}.size_bytes(), WriteVec(fo, vals));
auto const &mins = this->MinValues();
CHECK_LE(Span{mins}.size_bytes(), WriteVec(fo, mins));
CHECK_GE(fo->Write(has_categorical_), sizeof(has_categorical_));
CHECK_GE(fo->Write(max_cat_), sizeof(max_cat_));
}

[[nodiscard]] HistogramCuts *HistogramCuts::Load(common::AlignedResourceReadStream *fi) {
auto p_cuts = new HistogramCuts;
auto p_cuts = new HistogramCuts{0};
CHECK(ReadVec(fi, &p_cuts->cut_ptrs_.HostVector()));
CHECK(ReadVec(fi, &p_cuts->cut_values_.HostVector()));
CHECK(ReadVec(fi, &p_cuts->min_vals_.HostVector()));
CHECK(fi->Read(&p_cuts->has_categorical_));
CHECK(fi->Read(&p_cuts->max_cat_));
return p_cuts;
}

HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins, bool use_sorted,
Span<float const> hessian) {
HistogramCuts out;
auto const &info = m->Info();
auto n_threads = ctx->Threads();
std::vector<bst_idx_t> reduced(info.num_col_, 0);
Expand All @@ -73,20 +70,15 @@ HistogramCuts SketchOnDMatrix(Context const *ctx, DMatrix *m, bst_bin_t max_bins
for (auto const &page : m->GetBatches<SparsePage>()) {
container.PushRowPage(page, info, hessian);
}
container.MakeCuts(ctx, m->Info(), &out);
return container.MakeCuts(ctx, m->Info());
} else {
SortedSketchContainer container{ctx,
max_bins,
m->Info().feature_types.ConstHostSpan(),
reduced,
SortedSketchContainer container{ctx, max_bins, m->Info().feature_types.ConstHostSpan(), reduced,
HostSketchContainer::UseGroup(info)};
for (auto const &page : m->GetBatches<SortedCSCPage>(ctx)) {
container.PushColPage(page, info, hessian);
}
container.MakeCuts(ctx, m->Info(), &out);
return container.MakeCuts(ctx, m->Info());
}

return out;
}

/*!
Expand Down Expand Up @@ -118,9 +110,9 @@ void CopyHist(GHistRow dst, const GHistRow src, size_t begin, size_t end) {
*/
void SubtractionHist(GHistRow dst, const GHistRow src1, const GHistRow src2, size_t begin,
size_t end) {
double* pdst = reinterpret_cast<double*>(dst.data());
const double* psrc1 = reinterpret_cast<const double*>(src1.data());
const double* psrc2 = reinterpret_cast<const double*>(src2.data());
double *pdst = reinterpret_cast<double *>(dst.data());
const double *psrc1 = reinterpret_cast<const double *>(src1.data());
const double *psrc2 = reinterpret_cast<const double *>(src2.data());

for (size_t i = 2 * begin; i < 2 * end; ++i) {
pdst[i] = psrc1[i] - psrc2[i];
Expand All @@ -134,13 +126,10 @@ struct Prefetch {

private:
static constexpr size_t kNoPrefetchSize =
kPrefetchOffset + kCacheLineSize /
sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);
kPrefetchOffset + kCacheLineSize / sizeof(decltype(GHistIndexMatrix::row_ptr)::value_type);

public:
static size_t NoPrefetchSize(size_t rows) {
return std::min(rows, kNoPrefetchSize);
}
static size_t NoPrefetchSize(size_t rows) { return std::min(rows, kNoPrefetchSize); }

template <typename T>
static constexpr size_t GetPrefetchStep() {
Expand All @@ -156,9 +145,7 @@ struct RuntimeFlags {
const BinTypeSize bin_type_size;
};

template <bool _any_missing,
bool _first_page = false,
bool _read_by_column = false,
template <bool _any_missing, bool _first_page = false, bool _read_by_column = false,
typename BinIdxTypeName = uint8_t>
class GHistBuildingManager {
public:
Expand Down Expand Up @@ -192,7 +179,7 @@ class GHistBuildingManager {
* and forward the call there.
*/
template <typename Fn>
static void DispatchAndExecute(const RuntimeFlags& flags, Fn&& fn) {
static void DispatchAndExecute(const RuntimeFlags &flags, Fn &&fn) {
if (flags.first_page != kFirstPage) {
SetFirstPage<true>::Type::DispatchAndExecute(flags, std::forward<Fn>(fn));
} else if (flags.read_by_column != kReadByColumn) {
Expand Down Expand Up @@ -247,22 +234,19 @@ void RowsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t cons
// to work with gradient pairs as a singe row FP array

for (std::size_t i = 0; i < size; ++i) {
const size_t icol_start =
kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
const size_t icol_end =
kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;
const size_t icol_start = kAnyMissing ? get_row_ptr(rid[i]) : get_rid(rid[i]) * n_features;
const size_t icol_end = kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

const size_t row_size = icol_end - icol_start;
const size_t idx_gh = two * rid[i];

if (do_prefetch) {
const size_t icol_start_prefetch =
kAnyMissing
? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
: get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
const size_t icol_end_prefetch =
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;
kAnyMissing ? get_row_ptr(rid[i + Prefetch::kPrefetchOffset])
: get_rid(rid[i + Prefetch::kPrefetchOffset]) * n_features;
const size_t icol_end_prefetch = kAnyMissing
? get_row_ptr(rid[i + Prefetch::kPrefetchOffset] + 1)
: icol_start_prefetch + n_features;

PREFETCH_READ_T0(p_gpair + two * rid[i + Prefetch::kPrefetchOffset]);
for (size_t j = icol_start_prefetch; j < icol_end_prefetch;
Expand Down Expand Up @@ -301,7 +285,9 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t cons
auto get_row_ptr = [&](bst_idx_t ridx) {
return kFirstPage ? row_ptr[ridx] : row_ptr[ridx - base_rowid];
};
auto get_rid = [&](bst_idx_t ridx) { return kFirstPage ? ridx : (ridx - base_rowid); };
auto get_rid = [&](bst_idx_t ridx) {
return kFirstPage ? ridx : (ridx - base_rowid);
};

const size_t n_features = gmat.cut.Ptrs().size() - 1;
const size_t n_columns = n_features;
Expand All @@ -314,10 +300,8 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t cons
const uint32_t offset = kAnyMissing ? 0 : offsets[cid];
for (size_t i = 0; i < size; ++i) {
const size_t row_id = rid[i];
const size_t icol_start =
kAnyMissing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
const size_t icol_end =
kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;
const size_t icol_start = kAnyMissing ? get_row_ptr(row_id) : get_rid(row_id) * n_features;
const size_t icol_end = kAnyMissing ? get_row_ptr(rid[i] + 1) : icol_start + n_features;

if (cid < icol_end - icol_start) {
const BinIdxType *gr_index_local = gradient_index + icol_start;
Expand All @@ -327,7 +311,7 @@ void ColsWiseBuildHistKernel(Span<GradientPair const> gpair, Span<bst_idx_t cons
const size_t idx_gh = two * row_id;
// The trick with pgh_t buffer helps the compiler to generate faster binary.
const float pgh_t[] = {pgh[idx_gh], pgh[idx_gh + 1]};
*(hist_local) += pgh_t[0];
*(hist_local) += pgh_t[0];
*(hist_local + 1) += pgh_t[1];
}
}
Expand Down Expand Up @@ -369,7 +353,7 @@ void BuildHistDispatch(Span<GradientPair const> gpair, Span<bst_idx_t const> row
template <bool any_missing>
void BuildHist(Span<GradientPair const> gpair, Span<bst_idx_t const> row_indices,
const GHistIndexMatrix &gmat, GHistRow hist, bool read_by_column) {
bool first_page = gmat.base_rowid == 0;;
bool first_page = gmat.base_rowid == 0;
auto bin_type_size = gmat.index.GetBinTypeSize();

GHistBuildingManager<any_missing>::DispatchAndExecute(
Expand All @@ -380,10 +364,8 @@ void BuildHist(Span<GradientPair const> gpair, Span<bst_idx_t const> row_indices
}

template void BuildHist<true>(Span<GradientPair const> gpair, Span<bst_idx_t const> row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool read_by_column);
const GHistIndexMatrix &gmat, GHistRow hist, bool read_by_column);

template void BuildHist<false>(Span<GradientPair const> gpair, Span<bst_idx_t const> row_indices,
const GHistIndexMatrix &gmat, GHistRow hist,
bool read_by_column);
const GHistIndexMatrix &gmat, GHistRow hist, bool read_by_column);
} // namespace xgboost::common
Loading
Loading