@@ -61,10 +61,12 @@ struct DeviceAUCCache {
61
61
neg_pos.resize (sorted_idx.size ());
62
62
if (is_multi) {
63
63
predts_t .resize (sorted_idx.size ());
64
- reducer.reset (new dh::AllReducer);
65
- reducer->Init (rabit::GetRank ());
66
64
}
67
65
}
66
+ if (is_multi && !reducer) {
67
+ reducer.reset (new dh::AllReducer);
68
+ reducer->Init (device);
69
+ }
68
70
}
69
71
};
70
72
@@ -197,12 +199,48 @@ XGBOOST_DEVICE size_t LastOf(size_t group, common::Span<Idx> indptr) {
197
199
return indptr[group + 1 ] - 1 ;
198
200
}
199
201
202
+
203
/**
 * Fold per-class statistics into a single weighted score.
 *
 * If rabit reports a distributed run, `results` is sum-all-reduced in place through the
 * reducer held by `cache` before any per-class value is read.  Afterwards every class
 * `c` in `[0, n_classes)` whose `local_area[c]` is positive contributes
 * `auc[c] / local_area[c] * tp[c]` to the numerator and `tp[c]` to the denominator.  A
 * class whose area is not positive contributes NaN to the numerator, which makes the
 * whole result NaN.
 *
 * \param results    Buffer that is all-reduced in place in distributed mode.  The call
 *                   sites visible in this file pass the remaining spans as sub-spans of
 *                   this buffer, so the reduction updates them as well.
 * \param local_area Per-class divisor applied to `auc`.
 * \param fp         Not read by this function.
 * \param tp         Per-class weight used for the weighted average.
 * \param auc        Per-class value that is normalised by `local_area`.
 * \param cache      Owner of the reducer; only dereferenced in distributed mode.
 * \param n_classes  Number of per-class entries to reduce over.
 *
 * \return The weighted average, or NaN when the weights sum to zero or the weighted
 *         numerator is NaN.
 */
float ScaleClasses(common::Span<float> results, common::Span<float> local_area,
                   common::Span<float> fp, common::Span<float> tp,
                   common::Span<float> auc, std::shared_ptr<DeviceAUCCache> cache,
                   size_t n_classes) {
  dh::XGBDeviceAllocator<char> alloc;

  if (rabit::IsDistributed()) {
    // The reduction runs on the current device, so the buffer has to live there too.
    auto *buffer = results.data();
    CHECK_EQ(dh::CudaGetPointerDevice(buffer), dh::CurrentDevice());
    cache->reducer->AllReduceSum(buffer, buffer, results.size());
  }

  // Lazily maps a class index to the pair (weighted score, weight).
  auto per_class = dh::MakeTransformIterator<thrust::pair<float, float>>(
      thrust::make_counting_iterator(0), [=] __device__(size_t c) {
        float const area = local_area[c];
        if (!(area > 0)) {
          // No usable area for this class: poison the numerator, add no weight.
          return thrust::make_pair(std::numeric_limits<float>::quiet_NaN(), 0.0f);
        }
        float const weight = tp[c];
        return thrust::make_pair(auc[c] / area * weight, weight);
      });

  // Component-wise sum over all classes, starting from (0, 0).
  auto const totals = thrust::reduce(
      thrust::cuda::par(alloc), per_class, per_class + n_classes,
      thrust::make_pair(0.0f, 0.0f),
      [=] __device__(auto const &lhs, auto const &rhs) {
        return thrust::make_pair(lhs.first + rhs.first, lhs.second + rhs.second);
      });
  float const auc_sum = totals.first;
  float const tp_sum = totals.second;

  if (tp_sum == 0 || std::isnan(auc_sum)) {
    return std::numeric_limits<float>::quiet_NaN();
  }
  return auc_sum / tp_sum;
}
235
+
200
236
/* *
201
237
* MultiClass implementation is similar to binary classification, except we need to split
202
238
* up each class in all kernels.
203
239
*/
204
240
float GPUMultiClassAUCOVR (common::Span<float const > predts, MetaInfo const &info,
205
- int32_t device, std::shared_ptr<DeviceAUCCache>* p_cache) {
241
+ int32_t device, std::shared_ptr<DeviceAUCCache>* p_cache,
242
+ size_t n_classes) {
243
+ dh::safe_cuda (cudaSetDevice (device));
206
244
auto & cache = *p_cache;
207
245
if (!cache) {
208
246
cache.reset (new DeviceAUCCache);
@@ -213,8 +251,19 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
213
251
auto weights = info.weights_ .ConstDeviceSpan ();
214
252
215
253
size_t n_samples = labels.size ();
216
- size_t n_classes = predts.size () / labels.size ();
217
- CHECK_NE (n_classes, 0 );
254
+
255
+ if (n_samples == 0 ) {
256
+ dh::TemporaryArray<float > resutls (n_classes * 4 , 0 .0f );
257
+ auto d_results = dh::ToSpan (resutls);
258
+ dh::LaunchN (device, n_classes * 4 , [=]__device__ (size_t i) {
259
+ d_results[i] = 0 .0f ;
260
+ });
261
+ auto local_area = d_results.subspan (0 , n_classes);
262
+ auto fp = d_results.subspan (n_classes, n_classes);
263
+ auto tp = d_results.subspan (2 * n_classes, n_classes);
264
+ auto auc = d_results.subspan (3 * n_classes, n_classes);
265
+ return ScaleClasses (d_results, local_area, fp, tp, auc, cache, n_classes);
266
+ }
218
267
219
268
/* *
220
269
* Create sorted index for each class
@@ -377,32 +426,7 @@ float GPUMultiClassAUCOVR(common::Span<float const> predts, MetaInfo const &info
377
426
tp[c] = last.second ;
378
427
local_area[c] = last.first * last.second ;
379
428
});
380
- if (rabit::IsDistributed ()) {
381
- cache->reducer ->AllReduceSum (resutls.data ().get (), resutls.data ().get (),
382
- resutls.size ());
383
- }
384
- auto reduce_in = dh::MakeTransformIterator<thrust::pair<float , float >>(
385
- thrust::make_counting_iterator (0 ), [=] __device__ (size_t i) {
386
- if (local_area[i] > 0 ) {
387
- return thrust::make_pair (auc[i] / local_area[i] * tp[i], tp[i]);
388
- }
389
- return thrust::make_pair (std::numeric_limits<float >::quiet_NaN (), 0 .0f );
390
- });
391
-
392
- float tp_sum;
393
- float auc_sum;
394
- thrust::tie (auc_sum, tp_sum) = thrust::reduce (
395
- thrust::cuda::par (alloc), reduce_in, reduce_in + n_classes,
396
- thrust::make_pair (0 .0f , 0 .0f ),
397
- [=] __device__ (auto const &l, auto const &r) {
398
- return thrust::make_pair (l.first + r.first , l.second + r.second );
399
- });
400
- if (tp_sum != 0 && !std::isnan (auc_sum)) {
401
- auc_sum /= tp_sum;
402
- } else {
403
- return std::numeric_limits<float >::quiet_NaN ();
404
- }
405
- return auc_sum;
429
+ return ScaleClasses (d_results, local_area, fp, tp, auc, cache, n_classes);
406
430
}
407
431
408
432
namespace {
0 commit comments