Use cooperative groups to populate Associations (Histograms) in Pixel Patatrack #35713
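This PR adds a cooperative-kernel path for populating the HistoContainer / one-to-many associations: instead of launching separate count, finalize (prefix sum) and fill kernels, a single kernel is launched with cudaLaunchCooperativeKernel and uses grid-wide barriers (grid.sync()) between the phases. The standalone sketch below is not code from this PR; it only illustrates the underlying cooperative-groups pattern (the kernel name, the toy counter and the launch setup are invented for the example).

#include <algorithm>
#include <cstdio>
#include <cooperative_groups.h>

namespace cg = cooperative_groups;

// one kernel, two phases; grid.sync() plays the role of a kernel boundary
__global__ void twoPhaseKernel(int *counter, int n) {
  cg::grid_group grid = cg::this_grid();
  // phase 1: grid-stride loop, every element contributes to the counter
  for (int i = blockIdx.x * blockDim.x + threadIdx.x; i < n; i += gridDim.x * blockDim.x)
    atomicAdd(counter, 1);
  // grid-wide barrier: all blocks finish phase 1 before any block starts phase 2
  grid.sync();
  // phase 2: consume the result produced by phase 1
  if (blockIdx.x == 0 && threadIdx.x == 0)
    printf("counted %d items\n", *counter);
}

int main() {
  int n = 1 << 20;
  int *counter;
  cudaMallocManaged(&counter, sizeof(int));
  *counter = 0;

  int threads = 256;
  // a cooperative launch requires the whole grid to be resident on the device
  // (and a device that reports cooperativeLaunch support)
  int blocksPerSm = 0;
  cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, twoPhaseKernel, threads, 0);
  cudaDeviceProp prop;
  cudaGetDeviceProperties(&prop, 0);
  int blocks = std::min((n + threads - 1) / threads, blocksPerSm * prop.multiProcessorCount);

  void *args[] = {&counter, &n};
  cudaLaunchCooperativeKernel((void *)twoPhaseKernel, dim3(blocks), dim3(threads), args, 0, nullptr);
  cudaDeviceSynchronize();
  cudaFree(counter);
  return 0;
}

The code in the diff below additionally caps the grid size (via maxCoopBlocks) for the same reason: a cooperative launch cannot use more blocks than can be resident on the device at once.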
HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h (new file)
@@ -0,0 +1,171 @@
#ifndef HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
#define HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h

#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h"

#ifdef __CUDACC__
#include <cooperative_groups.h>  // for cooperative_groups::this_grid() used by the kernels below
#include "HeterogeneousCore/CUDAUtilities/interface/device_unique_ptr.h"
#include "HeterogeneousCore/CUDAUtilities/interface/maxCoopBlocks.h"
#endif

namespace cms {
  namespace cuda {

    template <template <CountOrFill> typename Func, typename Histo, typename... Args>
    __global__ void kernel_populate(typename Histo::View view, typename Histo::View::Counter *ws, Args... args) {
      namespace cg = cooperative_groups;
      auto grid = cg::this_grid();
      auto histo = static_cast<Histo *>(view.assoc);
      zeroAndInitCoop(view);
      grid.sync();
      Func<CountOrFill::count>::countOrFill(histo, std::forward<Args>(args)...);
      grid.sync();
      finalizeCoop(view, ws);
      grid.sync();
      Func<CountOrFill::fill>::countOrFill(histo, std::forward<Args>(args)...);
    }

    template <typename Histo, typename T, CountOrFill cof>
    __device__ __inline__ void countOrFillFromVector(Histo *__restrict__ h,
                                                     uint32_t nh,
                                                     T const *__restrict__ v,
                                                     uint32_t const *__restrict__ offsets) {
      int first = blockDim.x * blockIdx.x + threadIdx.x;
      for (int i = first, nt = offsets[nh]; i < nt; i += gridDim.x * blockDim.x) {
        auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
        assert((*off) > 0);
        int32_t ih = off - offsets - 1;
        assert(ih >= 0);
        assert(ih < int(nh));
        if constexpr (CountOrFill::count == cof)
          (*h).count(v[i], ih);
        else
          (*h).fill(v[i], i, ih);
      }
    }

    template <typename Histo, typename T, CountOrFill cof>
    __global__ void countOrFillFromVectorKernel(Histo *__restrict__ h,
                                                uint32_t nh,
                                                T const *__restrict__ v,
                                                uint32_t const *__restrict__ offsets) {
      countOrFillFromVector<Histo, T, cof>(h, nh, v, offsets);
    }

    template <typename Histo, typename T>
    inline __attribute__((always_inline)) void fillManyFromVector(Histo *__restrict__ h,
                                                                   uint32_t nh,
                                                                   T const *__restrict__ v,
                                                                   uint32_t const *__restrict__ offsets,
                                                                   int32_t totSize,
                                                                   int nthreads,
                                                                   typename Histo::index_type *mem,
                                                                   cudaStream_t stream
#ifndef __CUDACC__
                                                                   = cudaStreamDefault
#endif
    ) {
      typename Histo::View view = {h, nullptr, mem, -1, totSize};
      launchZero(view, stream);
#ifdef __CUDACC__
      auto nblocks = (totSize + nthreads - 1) / nthreads;
      assert(nblocks > 0);
      countOrFillFromVectorKernel<Histo, T, CountOrFill::count><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
      cudaCheck(cudaGetLastError());
      launchFinalize(view, stream);
      countOrFillFromVectorKernel<Histo, T, CountOrFill::fill><<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
      cudaCheck(cudaGetLastError());
#else
      countOrFillFromVectorKernel<Histo, T, CountOrFill::count>(h, nh, v, offsets);
      h->finalize();
      countOrFillFromVectorKernel<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
#endif
    }

#ifdef __CUDACC__
    template <typename Histo, typename T>
    __global__ void fillManyFromVectorCoopKernel(typename Histo::View view,
                                                 uint32_t nh,
                                                 T const *__restrict__ v,
                                                 uint32_t const *__restrict__ offsets,
                                                 int32_t totSize,
                                                 typename Histo::View::Counter *ws) {
      namespace cg = cooperative_groups;
      auto grid = cg::this_grid();
      auto h = static_cast<Histo *>(view.assoc);
      zeroAndInitCoop(view);
      grid.sync();
      countOrFillFromVector<Histo, T, CountOrFill::count>(h, nh, v, offsets);
      grid.sync();
      finalizeCoop(view, ws);
      grid.sync();
      countOrFillFromVector<Histo, T, CountOrFill::fill>(h, nh, v, offsets);
    }
#endif

    template <typename Histo, typename T>
    inline __attribute__((always_inline)) void fillManyFromVectorCoop(Histo *h,
                                                                      uint32_t nh,
                                                                      T const *v,
                                                                      uint32_t const *offsets,
                                                                      int32_t totSize,
                                                                      int nthreads,
                                                                      typename Histo::index_type *mem,
                                                                      cudaStream_t stream
#ifndef __CUDACC__
                                                                      = cudaStreamDefault
#endif
    ) {
      using View = typename Histo::View;
      View view = {h, nullptr, mem, -1, totSize};
#ifdef __CUDACC__
      auto kernel = fillManyFromVectorCoopKernel<Histo, T>;
      auto nblocks = (totSize + nthreads - 1) / nthreads;
      assert(nblocks > 0);
      auto nOnes = view.size();

Review comment (on the line above): "ok, a huge stack of boilerplate. could be partially encapsulated in a 'launch' interface as in launch.h."
Reply: "If you want to give it a try, there is …"
(A rough sketch of such a launch-style wrapper is given after the diff below.)

      auto nchunks = nOnes / nthreads + 1;
      auto ws = cms::cuda::make_device_unique<typename View::Counter[]>(nchunks, stream);
      auto wsp = ws.get();
      // FIXME: discuss with FW team: cuda calls are expensive and not needed for each event
      static int maxBlocks = maxCoopBlocks(kernel, nthreads, 0, 0);
      auto ncoopblocks = std::min(nblocks, maxBlocks);
      assert(ncoopblocks > 0);
      void *kernelArgs[] = {&view, &nh, &v, &offsets, &totSize, &wsp};
      dim3 dimBlock(nthreads, 1, 1);
      dim3 dimGrid(ncoopblocks, 1, 1);
      // launch
      cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, kernelArgs, 0, stream));
#else
      launchZero(view, stream);
      countFromVector(h, nh, v, offsets);
      h->finalize();
      fillFromVector(h, nh, v, offsets);
#endif
    }

    // iterate over N bins left and right of the one containing "v"
    template <typename Hist, typename V, typename Func>
    __host__ __device__ __forceinline__ void forEachInBins(Hist const &hist, V value, int n, Func func) {
      int bs = Hist::bin(value);
      int be = std::min(int(Hist::nbins() - 1), bs + n);
      bs = std::max(0, bs - n);
      assert(be >= bs);
      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
        func(*pj);
      }
    }

    // iterate over the bins containing all values in the window [wmin, wmax]
    template <typename Hist, typename V, typename Func>
    __host__ __device__ __forceinline__ void forEachInWindow(Hist const &hist, V wmin, V wmax, Func &&func) {
      auto bs = Hist::bin(wmin);
      auto be = Hist::bin(wmax);
      assert(be >= bs);
      for (auto pj = hist.begin(bs); pj < hist.end(be); ++pj) {
        func(*pj);
      }
    }
  }  // namespace cuda
}  // namespace cms

#endif  // HeterogeneousCore_CUDAUtilities_interface_HistoContainerAlgo_h
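
On the review suggestion to hide the cooperative-launch boilerplate behind a launch-style interface: the following is only a rough sketch of what such a helper could look like. Neither the name launchCooperative nor this signature exists in CMSSW or in this PR; the occupancy query stands in for the role maxCoopBlocks plays above, and the argument packing mimics the kind of void* array a launch.h-style helper has to build.

#include <algorithm>
#include <cuda_runtime.h>

namespace cms {
  namespace cuda {

    namespace detail {
      // block template-argument deduction on the trailing arguments, so the
      // kernel pointer alone fixes the parameter types
      template <typename T>
      struct identity {
        using type = T;
      };
    }  // namespace detail

    // hypothetical helper: cap the grid, pack the arguments, do the cooperative launch
    template <typename... Args>
    void launchCooperative(void (*kernel)(Args...),
                           int requestedBlocks,
                           int threadsPerBlock,
                           cudaStream_t stream,
                           typename detail::identity<Args>::type... args) {
      // cap the grid at what can be co-resident on the device, as required by
      // cudaLaunchCooperativeKernel
      int blocksPerSm = 0;
      cudaOccupancyMaxActiveBlocksPerMultiprocessor(&blocksPerSm, kernel, threadsPerBlock, 0);
      cudaDeviceProp prop;
      cudaGetDeviceProperties(&prop, 0);
      int blocks = std::min(requestedBlocks, std::max(1, blocksPerSm * prop.multiProcessorCount));

      // the runtime expects the kernel arguments as an array of pointers
      void *argPtrs[] = {(void *)(&args)...};
      cudaLaunchCooperativeKernel((void *)kernel, dim3(blocks), dim3(threadsPerBlock), argPtrs, 0, stream);
    }

  }  // namespace cuda
}  // namespace cms

With a helper along these lines, the body of fillManyFromVectorCoop would shrink to allocating the workspace and a single call such as launchCooperative(fillManyFromVectorCoopKernel<Histo, T>, nblocks, nthreads, stream, view, nh, v, offsets, totSize, wsp).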

Review comment: "This is not used (yet?) It may make the syntax more complex, not simpler."
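
Assuming this remark refers to the generic kernel_populate template at the top of the file, the extra syntax alluded to is the functor a caller would need to supply. The sketch below is a hypothetical illustration inferred from the kernel's signature, not code from this PR; FillFromVectorOp is an invented name, and it simply forwards to the countOrFillFromVector helper already defined in the header.

// path inferred from the header's include guard
#include "HeterogeneousCore/CUDAUtilities/interface/HistoContainerAlgo.h"

// Func must be a class template over CountOrFill exposing a static countOrFill()
// that ends up calling Histo::count or Histo::fill
template <cms::cuda::CountOrFill cof>
struct FillFromVectorOp {
  template <typename Histo, typename T>
  __device__ static void countOrFill(Histo *h, uint32_t nh, T const *v, uint32_t const *offsets) {
    // reuse the grid-stride helper defined in the header above
    cms::cuda::countOrFillFromVector<Histo, T, cof>(h, nh, v, offsets);
  }
};

// A cooperative launch of the generic kernel would then look roughly like
// (argument packing as for fillManyFromVectorCoopKernel above):
//   auto kernel = cms::cuda::kernel_populate<FillFromVectorOp, Histo, uint32_t, T const *, uint32_t const *>;
//   void *args[] = {&view, &wsp, &nh, &v, &offsets};
//   cudaCheck(cudaLaunchCooperativeKernel((void *)kernel, dimGrid, dimBlock, args, 0, stream));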