6
6
*/
7
7
8
8
// CUDA Library include(s).
9
+ #include < cuda_runtime_api.h>
10
+ #include < driver_types.h>
11
+
9
12
#include " ../utils/barrier.hpp"
10
13
#include " ../utils/cuda_error_handling.hpp"
11
14
#include " ../utils/utils.hpp"
12
15
#include " traccc/clusterization/clustering_config.hpp"
16
+ #include " traccc/clusterization/device/ccl_debug_output.hpp"
13
17
#include " traccc/clusterization/device/ccl_kernel_definitions.hpp"
14
18
#include " traccc/cuda/clusterization/clusterization_algorithm.hpp"
15
19
16
20
// Project include(s)
17
21
#include " traccc/clusterization/device/ccl_kernel.hpp"
18
22
23
+ // System include(s).
24
+ #include < iostream>
25
+
19
26
// Vecmem include(s).
20
27
#include < cstring>
21
28
#include < vecmem/utils/copy.hpp>
@@ -35,7 +42,8 @@ __global__ void ccl_kernel(
35
42
vecmem::data::vector_view<device::details::index_t > gf_backup_view,
36
43
vecmem::data::vector_view<unsigned char > adjc_backup_view,
37
44
vecmem::data::vector_view<device::details::index_t > adjv_backup_view,
38
- unsigned int * backup_mutex_ptr) {
45
+ unsigned int * backup_mutex_ptr,
46
+ device::details::ccl_debug_output* debug_output) {
39
47
40
48
__shared__ std::size_t partition_start, partition_end;
41
49
__shared__ std::size_t outi;
@@ -56,7 +64,7 @@ __global__ void ccl_kernel(
56
64
modules_view, partition_start, partition_end, outi,
57
65
f_view, gf_view, f_backup_view, gf_backup_view,
58
66
adjc_backup_view, adjv_backup_view, backup_mutex,
59
- barry_r, measurements_view, cell_links);
67
+ barry_r, measurements_view, cell_links, debug_output );
60
68
}
61
69
62
70
} // namespace kernels
@@ -121,14 +129,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
121
129
assert (m_config.max_cells_per_thread <=
122
130
device::details::CELLS_PER_THREAD_STACK_LIMIT);
123
131
132
+ // If necessary, allocate an object for storing the debug information
133
+ vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
134
+
135
+ if (m_config.enable_debug_output ) {
136
+ debug_output =
137
+ vecmem::make_unique_alloc<device::details::ccl_debug_output>(
138
+ m_mr.main );
139
+
140
+ device::details::ccl_debug_output empty_output =
141
+ device::details::ccl_debug_output::init ();
142
+
143
+ TRACCC_CUDA_ERROR_CHECK (
144
+ cudaMemcpyAsync (debug_output.get (), &empty_output,
145
+ sizeof (device::details::ccl_debug_output),
146
+ cudaMemcpyHostToDevice, stream));
147
+ }
148
+
124
149
kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
125
150
2 * m_config.max_partition_size() *
126
151
sizeof (device::details::index_t ),
127
- stream>>> (
128
- m_config, cells, modules, measurements, cell_links, m_f_backup,
129
- m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get ());
152
+ stream>>> (m_config, cells, modules, measurements,
153
+ cell_links, m_f_backup, m_gf_backup,
154
+ m_adjc_backup, m_adjv_backup,
155
+ m_backup_mutex.get (), debug_output.get ());
130
156
TRACCC_CUDA_ERROR_CHECK (cudaGetLastError ());
131
157
158
+ if (debug_output) {
159
+ device::details::ccl_debug_output host_output;
160
+
161
+ TRACCC_CUDA_ERROR_CHECK (
162
+ cudaMemcpyAsync (&host_output, debug_output.get (),
163
+ sizeof (device::details::ccl_debug_output),
164
+ cudaMemcpyDeviceToHost, stream));
165
+
166
+ TRACCC_CUDA_ERROR_CHECK (cudaStreamSynchronize (stream));
167
+
168
+ if (host_output.num_oversized_partitions > 0 ) {
169
+ std::cout << " WARNING: @clusterization_algorithm: "
170
+ << " Clustering encountered "
171
+ << host_output.num_oversized_partitions
172
+ << " oversized partitions; if this number is too large, "
173
+ " it may cause performance problems."
174
+ << std::endl;
175
+ }
176
+ }
177
+
132
178
// Return the reconstructed measurements.
133
179
return measurements;
134
180
}
0 commit comments