@@ -6,12 +6,16 @@
  */
 
 // CUDA Library include(s).
+#include <cuda_runtime_api.h>
+#include <driver_types.h>
+
 #include "../sanity/contiguous_on.cuh"
 #include "../sanity/ordered_on.cuh"
 #include "../utils/barrier.hpp"
 #include "../utils/cuda_error_handling.hpp"
 #include "../utils/utils.hpp"
 #include "traccc/clusterization/clustering_config.hpp"
+#include "traccc/clusterization/device/ccl_debug_output.hpp"
 #include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
 #include "traccc/cuda/utils/thread_id.hpp"
@@ -21,6 +25,9 @@
 // Project include(s)
 #include "traccc/clusterization/device/ccl_kernel.hpp"
 
+// System include
+#include <iostream>
+
 // Vecmem include(s).
 #include <cstring>
 #include <vecmem/utils/copy.hpp>
@@ -40,7 +47,8 @@ __global__ void ccl_kernel(
     vecmem::data::vector_view<device::details::index_t> gf_backup_view,
     vecmem::data::vector_view<unsigned char> adjc_backup_view,
     vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
-    unsigned int* backup_mutex_ptr) {
+    unsigned int* backup_mutex_ptr,
+    device::details::ccl_debug_output* debug_output) {
 
     __shared__ std::size_t partition_start, partition_end;
     __shared__ std::size_t outi;
@@ -62,7 +70,7 @@ __global__ void ccl_kernel(
                        partition_start, partition_end, outi, f_view, gf_view,
                        f_backup_view, gf_backup_view, adjc_backup_view,
                        adjv_backup_view, backup_mutex, barry_r,
-                       measurements_view, cell_links);
+                       measurements_view, cell_links, debug_output);
 }
 
 }  // namespace kernels
@@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     assert(m_config.max_cells_per_thread <=
            device::details::CELLS_PER_THREAD_STACK_LIMIT);
 
+    // If necessary, allocate an object for storing the debug information
+    vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
+
+    if (m_config.enable_debug_output) {
+        debug_output =
+            vecmem::make_unique_alloc<device::details::ccl_debug_output>(
+                m_mr.main);
+
+        device::details::ccl_debug_output empty_output =
+            device::details::ccl_debug_output::init();
+
+        TRACCC_CUDA_ERROR_CHECK(
+            cudaMemcpyAsync(debug_output.get(), &empty_output,
+                            sizeof(device::details::ccl_debug_output),
+                            cudaMemcpyHostToDevice, stream));
+    }
+
     kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
                           2 * m_config.max_partition_size() *
                               sizeof(device::details::index_t),
-                          stream>>>(
-        m_config, cells, modules, measurements, cell_links, m_f_backup,
-        m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
+                          stream>>>(m_config, cells, modules, measurements,
+                                    cell_links, m_f_backup, m_gf_backup,
+                                    m_adjc_backup, m_adjv_backup,
+                                    m_backup_mutex.get(), debug_output.get());
     TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
 
+    if (debug_output) {
+        device::details::ccl_debug_output host_output;
+
+        TRACCC_CUDA_ERROR_CHECK(
+            cudaMemcpyAsync(&host_output, debug_output.get(),
+                            sizeof(device::details::ccl_debug_output),
+                            cudaMemcpyDeviceToHost, stream));
+
+        TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
+
+        if (host_output.num_oversized_partitions > 0) {
+            std::cout << "WARNING: @clusterization_algorithm: "
+                      << "Clustering encountered "
+                      << host_output.num_oversized_partitions
+                      << " oversized partitions; if this number is too large, "
+                         "it may cause performance problems."
+                      << std::endl;
+        }
+    }
+
     // Return the reconstructed measurements.
     return measurements;
 }
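
For context, the diff only uses ccl_debug_output through its static init() helper and its num_oversized_partitions member; the actual definition lives in the newly included traccc/clusterization/device/ccl_debug_output.hpp, which is not shown here. A minimal sketch of what such a type could look like, assuming it is a plain aggregate with a single zero-initialised counter that is safe to copy with cudaMemcpyAsync, is:

// Hypothetical sketch only -- the real type is defined in
// traccc/clusterization/device/ccl_debug_output.hpp. The single counter and
// the init() helper are inferred from how the code above uses the type.
namespace traccc::device::details {

struct ccl_debug_output {
    // Number of partitions whose cell count exceeded the configured
    // partition size and spilled into the global-memory backup arrays.
    unsigned int num_oversized_partitions;

    // Return a zero-initialised object, suitable for copying to the device
    // before the kernel launch.
    static ccl_debug_output init() {
        ccl_debug_output result;
        result.num_oversized_partitions = 0u;
        return result;
    }
};

}  // namespace traccc::device::details

On the device side such a counter would presumably be bumped with something like atomicAdd(&(debug_output->num_oversized_partitions), 1u) whenever a partition overflows; the kernel receives a raw pointer precisely so that it can be null when enable_debug_output is false and no allocation was made.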