13
13
#include " traccc/clusterization/device/aggregate_cluster.hpp"
14
14
#include " traccc/clusterization/device/ccl_kernel_definitions.hpp"
15
15
#include " traccc/clusterization/device/reduce_problem_cell.hpp"
16
+ #include " traccc/device/concepts/barrier.hpp"
17
+ #include " traccc/device/concepts/thread_id.hpp"
16
18
#include " traccc/device/mutex.hpp"
17
19
#include " traccc/device/unique_lock.hpp"
18
20
#include " traccc/edm/cell.hpp"
@@ -40,13 +42,13 @@ namespace traccc::device {
40
42
// / iteration.
41
43
// / @param[in] barrier A generic object for block-wide synchronisation
42
44
// /
43
- template <device::concepts::barrier barrier_t >
44
- TRACCC_DEVICE void fast_sv_1 (vecmem::device_vector<details::index_t >& f,
45
+ template <device::concepts::barrier barrier_t ,
46
+ device::concepts::thread_id1 thread_id_t >
47
+ TRACCC_DEVICE void fast_sv_1 (const thread_id_t & thread_id,
48
+ vecmem::device_vector<details::index_t >& f,
45
49
vecmem::device_vector<details::index_t >& gf,
46
50
unsigned char * adjc, details::index_t * adjv,
47
51
details::index_t thread_cell_count,
48
- const details::index_t tid,
49
- const details::index_t blckDim,
50
52
barrier_t & barrier) {
51
53
/*
52
54
* The algorithm finishes if an iteration leaves the arrays unchanged.
@@ -70,7 +72,8 @@ TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
70
72
* together.
71
73
*/
72
74
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
73
- const details::index_t cid = tst * blckDim + tid;
75
+ const details::index_t cid =
76
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
74
77
75
78
TRACCC_ASSUME (adjc[tst] <= 8 );
76
79
for (unsigned char k = 0 ; k < adjc[tst]; ++k) {
@@ -90,7 +93,8 @@ TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
90
93
barrier.blockBarrier ();
91
94
92
95
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
93
- const details::index_t cid = tst * blckDim + tid;
96
+ const details::index_t cid =
97
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
94
98
/*
95
99
* The second stage is shortcutting, which is an optimisation that
96
100
* allows us to look at any shortcuts in the cluster IDs that we
@@ -107,7 +111,8 @@ TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
107
111
barrier.blockBarrier ();
108
112
109
113
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
110
- const details::index_t cid = tst * blckDim + tid;
114
+ const details::index_t cid =
115
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
111
116
/*
112
117
* Update the array for the next generation, keeping track of any
113
118
* changes we make.
@@ -128,11 +133,11 @@ TRACCC_DEVICE void fast_sv_1(vecmem::device_vector<details::index_t>& f,
128
133
} while (barrier.blockOr (gf_changed));
129
134
}
130
135
131
- template <device::concepts::barrier barrier_t >
136
+ template <device::concepts::barrier barrier_t ,
137
+ device::concepts::thread_id1 thread_id_t >
132
138
TRACCC_DEVICE inline void ccl_core (
133
- const details::index_t threadId, const details::index_t blckDim,
134
- std::size_t & partition_start, std::size_t & partition_end,
135
- vecmem::device_vector<details::index_t > f,
139
+ const thread_id_t & thread_id, std::size_t & partition_start,
140
+ std::size_t & partition_end, vecmem::device_vector<details::index_t > f,
136
141
vecmem::device_vector<details::index_t > gf,
137
142
vecmem::data::vector_view<unsigned int > cell_links, details::index_t * adjv,
138
143
unsigned char * adjc, const cell_collection_types::const_device cells_device,
@@ -145,20 +150,23 @@ TRACCC_DEVICE inline void ccl_core(
145
150
assert (size <= gf.size ());
146
151
147
152
details::index_t thread_cell_count =
148
- (size - threadId + blckDim - 1 ) / blckDim;
153
+ (size - thread_id.getLocalThreadIdX () + thread_id.getBlockDimX () - 1 ) /
154
+ thread_id.getBlockDimX ();
149
155
150
156
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
151
157
/*
152
158
* Look for adjacent cells to the current one.
153
159
*/
154
- const details::index_t cid = tst * blckDim + threadId;
160
+ const details::index_t cid =
161
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
155
162
adjc[tst] = 0 ;
156
163
reduce_problem_cell (cells_device, cid, partition_start, partition_end,
157
164
adjc[tst], &adjv[8 * tst]);
158
165
}
159
166
160
167
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
161
- const details::index_t cid = tst * blckDim + threadId;
168
+ const details::index_t cid =
169
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
162
170
/*
163
171
* At the start, the values of f and gf should be equal to the
164
172
* ID of the cell.
@@ -177,12 +185,13 @@ TRACCC_DEVICE inline void ccl_core(
177
185
* Run FastSV algorithm, which will update the father index to that of
178
186
* the cell belonging to the same cluster with the lowest index.
179
187
*/
180
- fast_sv_1 (f, gf, adjc, adjv, thread_cell_count, threadId, blckDim , barrier);
188
+ fast_sv_1 (thread_id, f, gf, adjc, adjv, thread_cell_count, barrier);
181
189
182
190
barrier.blockBarrier ();
183
191
184
192
for (details::index_t tst = 0 ; tst < thread_cell_count; ++tst) {
185
- const details::index_t cid = tst * blckDim + threadId;
193
+ const details::index_t cid =
194
+ tst * thread_id.getBlockDimX () + thread_id.getLocalThreadIdX ();
186
195
if (f.at (cid) == cid) {
187
196
// Add a new measurement to the output buffer, remembering its
188
197
// position inside the container.
@@ -196,10 +205,10 @@ TRACCC_DEVICE inline void ccl_core(
196
205
}
197
206
}
198
207
199
- template <device::concepts::barrier barrier_t >
208
+ template <device::concepts::barrier barrier_t ,
209
+ device::concepts::thread_id1 thread_id_t >
200
210
TRACCC_DEVICE inline void ccl_kernel (
201
- const clustering_config cfg, const details::index_t threadId,
202
- const details::index_t blckDim, const unsigned int blockId,
211
+ const clustering_config cfg, const thread_id_t & thread_id,
203
212
const cell_collection_types::const_view cells_view,
204
213
const cell_module_collection_types::const_view modules_view,
205
214
std::size_t & partition_start, std::size_t & partition_end, std::size_t & outi,
@@ -237,8 +246,9 @@ TRACCC_DEVICE inline void ccl_kernel(
237
246
* (to a later point in the array); start and end may be moved by different
238
247
* amounts.
239
248
*/
240
- if (threadId == 0 ) {
241
- std::size_t start = blockId * cfg.target_partition_size ();
249
+ if (thread_id.getLocalThreadIdX () == 0 ) {
250
+ std::size_t start =
251
+ thread_id.getBlockIdX () * cfg.target_partition_size ();
242
252
assert (start < num_cells);
243
253
std::size_t end =
244
254
std::min (num_cells, start + cfg.target_partition_size ());
@@ -313,24 +323,26 @@ TRACCC_DEVICE inline void ccl_kernel(
313
323
* rare edge case.
314
324
*/
315
325
if (size > cfg.max_partition_size ()) {
316
- if (threadId == 0 ) {
326
+ if (thread_id. getLocalThreadIdX () == 0 ) {
317
327
lock.lock ();
318
328
}
319
329
320
330
barrier.blockBarrier ();
321
331
322
- adjc = adjc_backup.data () + (threadId * cfg.max_cells_per_thread *
323
- cfg.backup_size_multiplier );
324
- adjv = adjv_backup.data () + (threadId * 8 * cfg.max_cells_per_thread *
325
- cfg.backup_size_multiplier );
332
+ adjc = adjc_backup.data () +
333
+ (thread_id.getLocalThreadIdX () * cfg.max_cells_per_thread *
334
+ cfg.backup_size_multiplier );
335
+ adjv = adjv_backup.data () +
336
+ (thread_id.getLocalThreadIdX () * 8 * cfg.max_cells_per_thread *
337
+ cfg.backup_size_multiplier );
326
338
use_scratch = true ;
327
339
} else {
328
340
adjc = _adjc;
329
341
adjv = _adjv;
330
342
use_scratch = false ;
331
343
}
332
344
333
- ccl_core (threadId, blckDim , partition_start, partition_end,
345
+ ccl_core (thread_id , partition_start, partition_end,
334
346
use_scratch ? f_backup : f_primary,
335
347
use_scratch ? gf_backup : gf_primary, cell_links, adjv, adjc,
336
348
cells_device, modules_device, measurements_device, barrier);
0 commit comments