 #ifdef __CUDACC__
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #endif
+#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
+

 #ifdef __CUDACC__
 namespace cudautils {
@@ -38,36 +40,51 @@ namespace cudautils {

   template <typename Histo, typename T>
   __global__
-  void fillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets,
-                      uint32_t *__restrict__ ws) {
+  void fillFromVector(Histo *__restrict__ h, uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets) {
     auto i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= offsets[nh]) return;
     auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
     assert((*off) > 0);
     int32_t ih = off - offsets - 1;
     assert(ih >= 0);
     assert(ih < nh);
-    (*h).fill(v[i], i, ws, ih);
+    (*h).fill(v[i], i, ih);
+  }
+
+  template <typename Histo>
+  void launchZero(Histo *__restrict__ h, cudaStream_t stream) {
+    uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+    cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
+  }
+
+  template <typename Histo>
+  void launchFinalize(Histo *__restrict__ h, uint8_t *__restrict__ ws, cudaStream_t stream) {
+    uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
+    size_t wss = Histo::wsSize();
+    CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
   }


   template <typename Histo, typename T>
-  void fillManyFromVector(Histo *__restrict__ h, typename Histo::Counter *__restrict__ ws,
+  void fillManyFromVector(Histo *__restrict__ h, uint8_t *__restrict__ ws,
                           uint32_t nh, T const *__restrict__ v, uint32_t const *__restrict__ offsets, uint32_t totSize,
                           int nthreads, cudaStream_t stream) {
-    uint32_t *off = (uint32_t *)((char *)(h) + offsetof(Histo, off));
-    cudaMemsetAsync(off, 0, 4 * Histo::totbins(), stream);
+    launchZero(h, stream);
     auto nblocks = (totSize + nthreads - 1) / nthreads;
     countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
     cudaCheck(cudaGetLastError());
-    size_t wss = Histo::totbins();
-    CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
-    cudaMemsetAsync(ws, 0, 4 * Histo::totbins(), stream);
-    fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets, ws);
+    launchFinalize(h, ws, stream);
+    fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
     cudaCheck(cudaGetLastError());
   }


+  template <typename Assoc>
+  __global__
+  void finalizeBulk(AtomicPairCounter const *apc, Assoc *__restrict__ assoc) {
+    assoc->bulkFinalizeFill(*apc);
+  }
+
 } // namespace cudautils

 #endif

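Note, not part of the diff: after this change the caller hands fillManyFromVector an opaque uint8_t workspace of Histo::wsSize() bytes for the CUB scan; the old per-bin Counter workspace is gone. Below is a minimal host-side sketch of driving the refactored entry point. The concrete template arguments and the names d_hist, d_ws, d_values, d_offsets, nh, totSize and stream are illustrative assumptions; only fillManyFromVector, launchZero/launchFinalize and Histo::wsSize() come from the code above.

// Hypothetical driver; values and offsets are assumed to be on the device already.
using Histo = HistoContainer<uint16_t, 128, 8192, 8 * sizeof(uint16_t), uint16_t, 10>;

Histo *d_hist;
uint8_t *d_ws;                                  // opaque CUB scan workspace
cudaCheck(cudaMalloc(&d_hist, sizeof(Histo)));
cudaCheck(cudaMalloc(&d_ws, Histo::wsSize()));

// offsets[0..nh] delimit the nh sub-vectors of v; offsets[nh] == totSize.
cudautils::fillManyFromVector(d_hist, d_ws, nh, d_values, d_offsets, totSize,
                              256, stream);     // zero + count + scan + fill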
@@ -149,8 +166,8 @@ class HistoContainer {
     uint32_t *v = nullptr;
     void *d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins() - 1);
-    return std::max(temp_storage_bytes, size_t(totbins()));
+    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
+    return temp_storage_bytes;
   }
 #endif

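The fixed wsSize() relies on CUB's standard size-query idiom: calling DeviceScan::InclusiveSum with a null d_temp_storage performs no scan and only writes the required byte count into temp_storage_bytes. Querying for totbins() items (rather than totbins() - 1) keeps the reported size consistent with the scan actually issued in launchFinalize above. A generic sketch of the idiom; d_in, d_out and num_items are placeholders:

int *d_in = /* device input  */ nullptr;
int *d_out = /* device output */ nullptr;
int num_items = 1024;

void  *d_temp_storage = nullptr;
size_t temp_storage_bytes = 0;
// First call: d_temp_storage == nullptr, so CUB only reports the size needed.
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);
cudaCheck(cudaMalloc(&d_temp_storage, temp_storage_bytes));
// Second call with a real buffer performs the scan.
cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, d_in, d_out, num_items);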
@@ -176,22 +193,79 @@ class HistoContainer {
 #endif
   }

+  static __host__ __device__
+  __forceinline__
+  uint32_t atomicDecrement(Counter &x) {
+#ifdef __CUDA_ARCH__
+    return atomicSub(&x, 1);
+#else
+    return x--;
+#endif
+  }
+
+  __host__ __device__
+  __forceinline__
+  void countDirect(T b) {
+    assert(b < nbins());
+    atomicIncrement(off[b]);
+  }
+
+  __host__ __device__
+  __forceinline__
+  void fillDirect(T b, index_type j) {
+    assert(b < nbins());
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w - 1] = j;
+  }
+
+
+#ifdef __CUDACC__
+  __device__
+  __forceinline__
+  uint32_t bulkFill(AtomicPairCounter &apc, index_type const *v, uint32_t n) {
+    auto c = apc.add(n);
+    off[c.m] = c.n;
+    for (int j = 0; j < n; ++j) bins[c.n + j] = v[j];
+    return c.m;
+  }
+
+  __device__
+  __forceinline__
+  void bulkFinalize(AtomicPairCounter const &apc) {
+    off[apc.get().m] = apc.get().n;
+  }
+
+  __device__
+  __forceinline__
+  void bulkFinalizeFill(AtomicPairCounter const &apc) {
+    auto m = apc.get().m;
+    auto n = apc.get().n;
+    auto i = m + blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= totbins()) return;
+    off[i] = n;
+  }
+
+
+#endif
+
+
   __host__ __device__
   __forceinline__
   void count(T t) {
     uint32_t b = bin(t);
     assert(b < nbins());
-    atomicIncrement(off[b + 1]);
+    atomicIncrement(off[b]);
   }

   __host__ __device__
   __forceinline__
-  void fill(T t, index_type j, Counter *ws) {
+  void fill(T t, index_type j) {
     uint32_t b = bin(t);
     assert(b < nbins());
-    auto w = atomicIncrement(ws[b]);
-    assert(w < size(b));
-    bins[off[b] + w] = j;
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w - 1] = j;
   }

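Not part of the diff, but worth spelling out: count() now increments off[b] itself, the inclusive scan turns the counts into end offsets, and fill() walks each bin backwards by decrementing off[b], so once every item is placed off[b] has converged to the begin offset of bin b and [off[b], off[b+1]) delimits its contents. The guard bin (totbins() includes one extra slot) is never counted, so after the scan it holds the total size, which is what size() reads. A self-contained host-side illustration of the protocol:

#include <cassert>
#include <cstdint>

int main() {
  constexpr uint32_t totbins = 5;               // 4 real bins + 1 guard bin
  uint32_t off[totbins] = {2, 3, 0, 1, 0};      // per-bin counts after count()
  uint32_t bins[6];

  assert(off[totbins - 1] == 0);                // guard bin never counted
  for (uint32_t i = 1; i < totbins; ++i)        // host finalize(): inclusive scan
    off[i] += off[i - 1];                       // off is now {2, 5, 5, 6, 6}
  assert(off[totbins - 1] == off[totbins - 2]); // guard keeps the total (6)

  // fill() for one item of bin 1: post-decrement mimics atomicSub's
  // return-the-old-value semantics; the item lands at the back of the bin.
  uint32_t w = off[1]--;                        // w == 5
  assert(w > 0);
  bins[w - 1] = 42;                             // stored in bins[4]

  // After all items are placed, [off[b], off[b+1]) delimits bin b and
  // off[totbins - 1] still holds the total number of filled items.
  return 0;
}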
@@ -202,31 +276,35 @@ class HistoContainer {
     assert(b < nbins());
     b += histOff(nh);
     assert(b < totbins());
-    atomicIncrement(off[b + 1]);
+    atomicIncrement(off[b]);
   }

   __host__ __device__
   __forceinline__
-  void fill(T t, index_type j, Counter *ws, uint32_t nh) {
+  void fill(T t, index_type j, uint32_t nh) {
     uint32_t b = bin(t);
     assert(b < nbins());
     b += histOff(nh);
     assert(b < totbins());
-    auto w = atomicIncrement(ws[b]);
-    assert(w < size(b));
-    bins[off[b] + w] = j;
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w - 1] = j;
   }

 #ifdef __CUDACC__
   __device__
   __forceinline__
   void finalize(Counter *ws) {
-    blockPrefixScan(off + 1, totbins() - 1, ws);
+    assert(off[totbins() - 1] == 0);
+    blockPrefixScan(off, totbins(), ws);
+    assert(off[totbins() - 1] == off[totbins() - 2]);
   }
   __host__
 #endif
   void finalize() {
-    for (uint32_t i = 2; i < totbins(); ++i) off[i] += off[i - 1];
+    assert(off[totbins() - 1] == 0);
+    for (uint32_t i = 1; i < totbins(); ++i) off[i] += off[i - 1];
+    assert(off[totbins() - 1] == off[totbins() - 2]);
   }

   constexpr auto size() const { return uint32_t(off[totbins() - 1]); }
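For orientation (also not in the diff): because each fill() drives off[b] down to the begin offset while off[b+1] stays at the end offset, a consumer can still iterate a bin the usual way. A sketch assuming the container's customary begin/end accessors, i.e. bins + off[b] and bins + off[b+1], which are not shown in this hunk:

// Visit every stored index j that fell into bin b of histogram h
// (begin()/end() assumed; process() and v are placeholders).
for (auto const *p = h.begin(b); p != h.end(b); ++p) {
  auto j = *p;       // index into the original input vector
  process(v[j]);
}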
@@ -245,4 +323,13 @@ class HistoContainer {
   index_type bins[capacity()];
 };


+
+template <
+    typename I,        // type stored in the container (usually an index in a vector of the input values)
+    uint32_t MAXONES,  // max number of "ones"
+    uint32_t MAXMANYS  // max number of "manys"
+    >
+using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
+
 #endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
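A closing remark, not in the diff: OneToManyAssoc reuses the histogram machinery as a generic one-to-many index map, and pairs naturally with the new bulk API, where apc.add(n) atomically reserves one "one" slot (c.m) plus n consecutive "many" slots starting at offset c.n, and finalizeBulk patches the remaining offsets. A hedged sketch under those assumptions; buildAssoc, d_apc, d_assoc and the toy payload are illustrative, not part of the commit:

using Assoc = OneToManyAssoc<uint16_t, 1024, 8 * 1024>;

__global__ void buildAssoc(AtomicPairCounter *apc, Assoc *assoc, uint32_t nOnes) {
  auto i = blockIdx.x * blockDim.x + threadIdx.x;
  if (i >= nOnes) return;
  // Toy payload: each "one" i owns two "many" indices (2*i, 2*i + 1).
  Assoc::index_type local[2] = {Assoc::index_type(2 * i), Assoc::index_type(2 * i + 1)};
  assoc->bulkFill(*apc, local, 2);  // returns the slot index of this "one"
}

// Host side: zero the offsets, run the producer, then fill the tail offsets.
cudautils::launchZero(d_assoc, stream);
buildAssoc<<<(nOnes + 255) / 256, 256, 0, stream>>>(d_apc, d_assoc, nOnes);
// Grid sized to cover Assoc::totbins() so bulkFinalizeFill reaches every bin.
cudautils::finalizeBulk<<<(Assoc::totbins() + 255) / 256, 256, 0, stream>>>(d_apc, d_assoc);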