
Commit 7067416

VinInn authored and fwyzard committed
Full workflow from raw data to pixel tracks and vertices on GPUs (#216)
Port the full workflow from pixel raw data to pixel tracks and vertices to GPUs, and optimise it. Clean the pixel n-tuplets with the "fishbone" algorithm (GPU only).

Other changes:
- recover the Riemann fit updates lost during the merge with CMSSW 10.4.x;
- speed up clustering and track fitting;
- fix a minor bug to avoid a trivial regression with the optimised fit.
1 parent 68f320f, commit 7067416

75 files changed, +4805 -2504 lines changed


Geometry/TrackerGeometryBuilder/interface/phase1PixelTopology.h

+61
@@ -2,6 +2,7 @@
 #define Geometry_TrackerGeometryBuilder_phase1PixelTopology_h
 
 #include <cstdint>
+#include <array>
 
 namespace phase1PixelTopology {
 
@@ -29,6 +30,66 @@ namespace phase1PixelTopology {
   };
 
 
+  template<class Function, std::size_t... Indices>
+  constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
+    -> std::array<typename std::result_of<Function(std::size_t)>::type, sizeof...(Indices)>
+  {
+    return {{ f(Indices)... }};
+  }
+
+  template<int N, class Function>
+  constexpr auto map_to_array(Function f)
+    -> std::array<typename std::result_of<Function(std::size_t)>::type, N>
+  {
+    return map_to_array_helper(f, std::make_index_sequence<N>{});
+  }
+
+
+  constexpr uint32_t findMaxModuleStride() {
+    bool go = true;
+    int n = 2;
+    while (go) {
+      for (uint8_t i = 1; i < 11; ++i) {
+        if (layerStart[i] % n != 0) { go = false; break; }
+      }
+      if (!go) break;
+      n *= 2;
+    }
+    return n / 2;
+  }
+
+  constexpr uint32_t maxModuleStride = findMaxModuleStride();
+
+
+  constexpr uint8_t findLayer(uint32_t detId) {
+    for (uint8_t i = 0; i < 11; ++i) if (detId < layerStart[i + 1]) return i;
+    return 11;
+  }
+
+  constexpr uint8_t findLayerFromCompact(uint32_t detId) {
+    detId *= maxModuleStride;
+    for (uint8_t i = 0; i < 11; ++i) if (detId < layerStart[i + 1]) return i;
+    return 11;
+  }
+
+
+  constexpr uint32_t layerIndexSize = numberOfModules / maxModuleStride;
+  constexpr std::array<uint8_t, layerIndexSize> layer = map_to_array<layerIndexSize>(findLayerFromCompact);
+
+  constexpr bool validateLayerIndex() {
+    bool res = true;
+    for (auto i = 0U; i < numberOfModules; ++i) {
+      auto j = i / maxModuleStride;
+      res &= (layer[j] < 10);
+      res &= (i >= layerStart[layer[j]]);
+      res &= (i < layerStart[layer[j] + 1]);
+    }
+    return res;
+  }
+
+  static_assert(validateLayerIndex(), "layer from detIndex algo is buggy");
+
+
   // this is for the ROC n<512 (upgrade 1024)
   constexpr inline
   uint16_t divu52(uint16_t n) {
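
An editorial aside, not part of the commit: the block above builds a compile-time lookup table by mapping a constexpr function over an index sequence, then proves it correct with a static_assert. Below is a self-contained sketch of the same pattern with an invented 4-layer geometry; start, findLayer and layerOf are illustrative names, not the committed ones.

    // Standalone sketch: precompute a lookup table at compile time by
    // mapping a constexpr function over an index sequence, then validate
    // it with a static_assert. The geometry is made up for illustration.
    #include <array>
    #include <cstdint>
    #include <cstdio>
    #include <utility>

    template<class Function, std::size_t... Indices>
    constexpr auto map_to_array_helper(Function f, std::index_sequence<Indices...>)
        -> std::array<decltype(f(std::size_t{})), sizeof...(Indices)> {
      return {{ f(Indices)... }};
    }

    template<int N, class Function>
    constexpr auto map_to_array(Function f)
        -> std::array<decltype(f(std::size_t{})), N> {
      return map_to_array_helper(f, std::make_index_sequence<N>{});
    }

    // hypothetical module ranges: layer k covers [start[k], start[k+1])
    constexpr uint32_t start[5] = {0, 96, 320, 672, 1184};

    constexpr uint8_t findLayer(std::size_t module) {
      for (uint8_t k = 0; k < 4; ++k)
        if (module < start[k + 1]) return k;
      return 4;
    }

    // one byte per module, filled entirely at compile time
    constexpr auto layerOf = map_to_array<1184>(findLayer);

    static_assert(layerOf[0] == 0 && layerOf[96] == 1 && layerOf[1183] == 3,
                  "compile-time table must match findLayer");

    int main() {
      // no run-time table construction: a plain array lookup
      std::printf("module 500 is on layer %u\n", unsigned(layerOf[500]));
      return 0;
    }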

Geometry/TrackerGeometryBuilder/test/phase1PixelTopology_t.cpp

+8
@@ -141,5 +141,13 @@ int main() {
     assert(std::get<1>(ori)==bp);
   }
 
+  using namespace phase1PixelTopology;
+  for (auto i = 0U; i < numberOfModules; ++i) {
+    assert(layer[i] < 10);
+    assert(i >= layerStart[layer[i]]);
+    assert(i < layerStart[layer[i]+1]);
+  }
+
+
   return 0;
 }
HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h

+54

@@ -0,0 +1,54 @@
+#ifndef HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H
+#define HeterogeneousCoreCUDAUtilitiesAtomicPairCounter_H
+
+#include <cuda_runtime.h>
+#include <cstdint>
+
+class AtomicPairCounter {
+public:
+
+  using c_type = unsigned long long int;
+
+  AtomicPairCounter() {}
+  AtomicPairCounter(c_type i) { counter.ac = i; }
+
+  __device__ __host__
+  AtomicPairCounter & operator=(c_type i) { counter.ac = i; return *this; }
+
+  struct Counters {
+    uint32_t n; // in a "One to Many" association is the number of "One"
+    uint32_t m; // in a "One to Many" association is the total number of associations
+  };
+
+  union Atomic2 {
+    Counters counters;
+    c_type ac;
+  };
+
+#ifdef __CUDACC__
+
+  static constexpr c_type incr = 1UL << 32;
+
+  __device__ __host__
+  Counters get() const { return counter.counters; }
+
+  // increment n by 1 and m by i. return previous value
+  __device__
+  Counters add(uint32_t i) {
+    c_type c = i;
+    c += incr;
+    Atomic2 ret;
+    ret.ac = atomicAdd(&counter.ac, c);
+    return ret.counters;
+  }
+
+#endif
+
+private:
+
+  Atomic2 counter;
+
+};
+
+
+#endif
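
An aside on the trick, not part of the commit: the two 32-bit counters share one 64-bit word, so a single atomic add advances both at once. In add(), the low 32 bits grow by i while the high 32 bits grow by 1, and the returned previous value hands the caller both a block index and the first free slot. A host-only sketch of the same arithmetic with std::atomic follows; all names are illustrative.

    // Host-only sketch of the packed-pair-counter idea: two 32-bit
    // counters live in one 64-bit word, so one atomic add bumps both.
    #include <atomic>
    #include <cassert>
    #include <cstdint>

    int main() {
      std::atomic<uint64_t> packed{0};
      constexpr uint64_t incr = 1ULL << 32;  // +1 in the upper 32 bits

      // "allocate" blocks of 7, 3 and 12 slots: the upper word counts
      // blocks, the lower word accumulates the total number of slots
      for (uint32_t n : {7u, 3u, 12u}) {
        uint64_t prev = packed.fetch_add(incr + n);
        uint32_t firstSlot  = uint32_t(prev);        // start of this block
        uint32_t blockIndex = uint32_t(prev >> 32);  // index of this block
        (void)firstSlot; (void)blockIndex;
      }

      assert(uint32_t(packed.load()) == 22);       // 7 + 3 + 12 slots
      assert(uint32_t(packed.load() >> 32) == 3);  // three blocks
      return 0;
    }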

HeterogeneousCore/CUDAUtilities/interface/GPUSimpleVector.h

+2-1
@@ -80,7 +80,8 @@ template <class T> struct SimpleVector {
   }
 
 #endif // __CUDACC__
-
+  inline constexpr bool empty() const { return m_size==0;}
+  inline constexpr bool full() const { return m_size==m_capacity;}
   inline constexpr T& operator[](int i) { return m_data[i]; }
   inline constexpr const T& operator[](int i) const { return m_data[i]; }
   inline constexpr void reset() { m_size = 0; }

HeterogeneousCore/CUDAUtilities/interface/GPUVecArray.h

+4
@@ -80,6 +80,10 @@ template <class T, int maxSize> struct VecArray {
     return T();
   }
 
+  inline constexpr T const * begin() const { return m_data;}
+  inline constexpr T const * end() const { return m_data+m_size;}
+  inline constexpr T * begin() { return m_data;}
+  inline constexpr T * end() { return m_data+m_size;}
   inline constexpr int size() const { return m_size; }
   inline constexpr T& operator[](int i) { return m_data[i]; }
   inline constexpr const T& operator[](int i) const { return m_data[i]; }
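
Not from the diff: these begin()/end() accessors are what let a VecArray be used in range-based for loops. A minimal standalone illustration of the pattern; MiniArray is an invented stand-in, not the real VecArray.

    // Standalone sketch: a fixed-capacity array with begin()/end(),
    // so range-based for works on it. MiniArray is illustrative only.
    #include <cstdio>

    template <class T, int maxSize>
    struct MiniArray {
      T m_data[maxSize];
      int m_size = 0;

      constexpr bool push_back(T const& v) {
        if (m_size >= maxSize) return false;
        m_data[m_size++] = v;
        return true;
      }
      constexpr T const* begin() const { return m_data; }
      constexpr T const* end() const { return m_data + m_size; }
      constexpr T* begin() { return m_data; }
      constexpr T* end() { return m_data + m_size; }
    };

    int main() {
      MiniArray<int, 4> a;
      a.push_back(3); a.push_back(1); a.push_back(4);
      for (int v : a) std::printf("%d\n", v);  // relies on begin()/end()
      return 0;
    }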

HeterogeneousCore/CUDAUtilities/interface/HistoContainer.h

+111-24
@@ -19,6 +19,8 @@
 #ifdef __CUDACC__
 #include "HeterogeneousCore/CUDAUtilities/interface/prefixScan.h"
 #endif
+#include "HeterogeneousCore/CUDAUtilities/interface/AtomicPairCounter.h"
+
 
 #ifdef __CUDACC__
 namespace cudautils {
@@ -38,36 +40,51 @@ namespace cudautils {
 
   template<typename Histo, typename T>
   __global__
-  void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets,
-                      uint32_t * __restrict__ ws) {
+  void fillFromVector(Histo * __restrict__ h, uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets) {
     auto i = blockIdx.x * blockDim.x + threadIdx.x;
     if (i >= offsets[nh]) return;
     auto off = cuda_std::upper_bound(offsets, offsets + nh + 1, i);
     assert((*off) > 0);
     int32_t ih = off - offsets - 1;
     assert(ih >= 0);
     assert(ih < nh);
-    (*h).fill(v[i], i, ws, ih);
+    (*h).fill(v[i], i, ih);
+  }
+
+  template<typename Histo>
+  void launchZero(Histo * __restrict__ h, cudaStream_t stream) {
+    uint32_t * off = (uint32_t *)((char*)(h) + offsetof(Histo, off));
+    cudaMemsetAsync(off, 0, 4*Histo::totbins(), stream);
+  }
+
+  template<typename Histo>
+  void launchFinalize(Histo * __restrict__ h, uint8_t * __restrict__ ws, cudaStream_t stream) {
+    uint32_t * off = (uint32_t *)((char*)(h) + offsetof(Histo, off));
+    size_t wss = Histo::wsSize();
+    CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
   }
 
 
   template<typename Histo, typename T>
-  void fillManyFromVector(Histo * __restrict__ h, typename Histo::Counter * __restrict__ ws,
+  void fillManyFromVector(Histo * __restrict__ h, uint8_t * __restrict__ ws,
                           uint32_t nh, T const * __restrict__ v, uint32_t const * __restrict__ offsets, uint32_t totSize,
                           int nthreads, cudaStream_t stream) {
-    uint32_t * off = (uint32_t *)((char*)(h) + offsetof(Histo, off));
-    cudaMemsetAsync(off, 0, 4*Histo::totbins(), stream);
+    launchZero(h, stream);
     auto nblocks = (totSize + nthreads - 1) / nthreads;
     countFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
     cudaCheck(cudaGetLastError());
-    size_t wss = Histo::totbins();
-    CubDebugExit(cub::DeviceScan::InclusiveSum(ws, wss, off, off, Histo::totbins(), stream));
-    cudaMemsetAsync(ws, 0, 4*Histo::totbins(), stream);
-    fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets, ws);
+    launchFinalize(h, ws, stream);
+    fillFromVector<<<nblocks, nthreads, 0, stream>>>(h, nh, v, offsets);
     cudaCheck(cudaGetLastError());
   }
 
 
+  template<typename Assoc>
+  __global__
+  void finalizeBulk(AtomicPairCounter const * apc, Assoc * __restrict__ assoc) {
+    assoc->bulkFinalizeFill(*apc);
+  }
+
 } // namespace cudautils
 #endif
 
@@ -149,8 +166,8 @@ class HistoContainer {
     uint32_t * v = nullptr;
     void * d_temp_storage = nullptr;
     size_t temp_storage_bytes = 0;
-    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins()-1);
-    return std::max(temp_storage_bytes, size_t(totbins()));
+    cub::DeviceScan::InclusiveSum(d_temp_storage, temp_storage_bytes, v, v, totbins());
+    return temp_storage_bytes;
   }
 #endif
 
@@ -176,22 +193,79 @@ class HistoContainer {
 #endif
   }
 
+  static __host__ __device__
+  __forceinline__
+  uint32_t atomicDecrement(Counter & x) {
+#ifdef __CUDA_ARCH__
+    return atomicSub(&x, 1);
+#else
+    return x--;
+#endif
+  }
+
+  __host__ __device__
+  __forceinline__
+  void countDirect(T b) {
+    assert(b < nbins());
+    atomicIncrement(off[b]);
+  }
+
+  __host__ __device__
+  __forceinline__
+  void fillDirect(T b, index_type j) {
+    assert(b < nbins());
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w-1] = j;
+  }
+
+
+#ifdef __CUDACC__
+  __device__
+  __forceinline__
+  uint32_t bulkFill(AtomicPairCounter & apc, index_type const * v, uint32_t n) {
+    auto c = apc.add(n);
+    off[c.m] = c.n;
+    for (int j = 0; j < n; ++j) bins[c.n+j] = v[j];
+    return c.m;
+  }
+
+  __device__
+  __forceinline__
+  void bulkFinalize(AtomicPairCounter const & apc) {
+    off[apc.get().m] = apc.get().n;
+  }
+
+  __device__
+  __forceinline__
+  void bulkFinalizeFill(AtomicPairCounter const & apc) {
+    auto m = apc.get().m;
+    auto n = apc.get().n;
+    auto i = m + blockIdx.x * blockDim.x + threadIdx.x;
+    if (i >= totbins()) return;
+    off[i] = n;
+  }
+
+
+#endif
+
+
   __host__ __device__
   __forceinline__
   void count(T t) {
     uint32_t b = bin(t);
     assert(b < nbins());
-    atomicIncrement(off[b+1]);
+    atomicIncrement(off[b]);
   }
 
   __host__ __device__
   __forceinline__
-  void fill(T t, index_type j, Counter * ws) {
+  void fill(T t, index_type j) {
     uint32_t b = bin(t);
     assert(b < nbins());
-    auto w = atomicIncrement(ws[b]);
-    assert(w < size(b));
-    bins[off[b] + w] = j;
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w-1] = j;
   }
 
 
@@ -202,31 +276,35 @@ class HistoContainer {
     assert(b < nbins());
     b += histOff(nh);
     assert(b < totbins());
-    atomicIncrement(off[b+1]);
+    atomicIncrement(off[b]);
   }
 
   __host__ __device__
   __forceinline__
-  void fill(T t, index_type j, Counter * ws, uint32_t nh) {
+  void fill(T t, index_type j, uint32_t nh) {
     uint32_t b = bin(t);
     assert(b < nbins());
     b += histOff(nh);
     assert(b < totbins());
-    auto w = atomicIncrement(ws[b]);
-    assert(w < size(b));
-    bins[off[b] + w] = j;
+    auto w = atomicDecrement(off[b]);
+    assert(w > 0);
+    bins[w-1] = j;
  }
 
 #ifdef __CUDACC__
   __device__
   __forceinline__
   void finalize(Counter * ws) {
-    blockPrefixScan(off+1, totbins()-1, ws);
+    assert(off[totbins()-1] == 0);
+    blockPrefixScan(off, totbins(), ws);
+    assert(off[totbins()-1] == off[totbins()-2]);
   }
   __host__
 #endif
   void finalize() {
-    for (uint32_t i = 2; i < totbins(); ++i) off[i] += off[i-1];
+    assert(off[totbins()-1] == 0);
+    for (uint32_t i = 1; i < totbins(); ++i) off[i] += off[i-1];
+    assert(off[totbins()-1] == off[totbins()-2]);
  }
 
   constexpr auto size() const { return uint32_t(off[totbins()-1]); }
@@ -245,4 +323,13 @@ class HistoContainer {
   index_type bins[capacity()];
 };
 
+
+
+template<
+  typename I,        // type stored in the container (usually an index in a vector of the input values)
+  uint32_t MAXONES,  // max number of "ones"
+  uint32_t MAXMANYS  // max number of "manys"
+>
+using OneToManyAssoc = HistoContainer<uint32_t, MAXONES, MAXMANYS, sizeof(uint32_t) * 8, I, 1>;
+
 #endif // HeterogeneousCore_CUDAUtilities_HistoContainer_h
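
A note on the reworked fill scheme above, then a sketch: count() now increments off[b] itself, finalize() turns the per-bin counts into end offsets with an inclusive prefix scan, and fill() claims a slot by decrementing off[b], writing each bin back-to-front. When filling completes, off[b] has returned to the start offset of bin b, and the separate ws counter array is gone. Below is a host-only sketch of that count / scan / fill-by-decrement pattern in plain C++ with invented names, not the real class.

    // Host-only sketch of the count / prefix-scan / fill-by-decrement
    // histogram scheme; illustrative only.
    #include <cassert>
    #include <cstdint>
    #include <cstdio>
    #include <vector>

    int main() {
      const uint32_t nbins = 4;
      std::vector<uint8_t> values = {0, 3, 1, 0, 3, 3, 2, 1};  // bin of each item

      // 1) count: off[b] holds the number of entries in bin b
      std::vector<uint32_t> off(nbins, 0);
      for (auto b : values) ++off[b];

      // 2) finalize: inclusive prefix scan turns counts into end offsets
      for (uint32_t i = 1; i < nbins; ++i) off[i] += off[i-1];

      // 3) fill: claim a slot by decrementing the end offset of the bin,
      //    so each bin is written back-to-front (atomicSub on the GPU)
      std::vector<uint32_t> bins(values.size());
      for (uint32_t j = 0; j < values.size(); ++j) {
        uint32_t w = off[values[j]]--;  // previous value, like atomicSub
        assert(w > 0);
        bins[w-1] = j;                  // store the item's index
      }

      // after filling, off[b] is back to the *start* offset of bin b
      for (uint32_t b = 0; b < nbins; ++b)
        std::printf("bin %u starts at %u\n", b, off[b]);
      return 0;
    }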
