Skip to content

Commit 9b6ca4a

Browse files
committed
Add debug output to clustering algorithms
In #595, I equipped the CCA code with some edge case handling which allows it to handle oversized partitions. Although this makes sure the algorithm works, it also risks to slow down execution. In order to better understand how much performance we might be losing, this commit adds the ability for the SYCL and CUDA algorithms to print some warnings if they ever encounter this edge case.
1 parent 693fd52 commit 9b6ca4a

File tree

10 files changed

+166
-16
lines changed

10 files changed

+166
-16
lines changed

core/include/traccc/clusterization/clustering_config.hpp

+7
Original file line numberDiff line numberDiff line change
@@ -54,6 +54,13 @@ struct clustering_config {
5454
*/
5555
unsigned int backup_size_multiplier;
5656

57+
/**
58+
* @brief Flag to enforce debug output.
59+
*
60+
* @warning This will slown down the clustering algorithm.
61+
*/
62+
bool enable_debug_output;
63+
5764
/**
5865
* @brief The maximum number of cells per partition.
5966
*/

device/alpaka/src/clusterization/clusterization_algorithm.cpp

+1-1
Original file line numberDiff line numberDiff line change
@@ -55,7 +55,7 @@ struct CCLKernel {
5555
cells_view, modules_view, max_cells_per_partition,
5656
target_cells_per_partition, partition_start,
5757
partition_end, outi, f_view, gf_view, barry_r,
58-
measurements_view, cell_links);
58+
measurements_view, cell_links, nullptr);
5959
}
6060
};
6161

Original file line numberDiff line numberDiff line change
@@ -0,0 +1,25 @@
1+
/**
2+
* traccc library, part of the ACTS project (R&D line)
3+
*
4+
* (c) 2024 CERN for the benefit of the ACTS project
5+
*
6+
* Mozilla Public License Version 2.0
7+
*/
8+
9+
#pragma once
10+
11+
#include <cstdint>
12+
13+
namespace traccc::device::details {
14+
struct ccl_debug_output {
15+
uint32_t num_oversized_partitions;
16+
17+
static ccl_debug_output init() {
18+
ccl_debug_output rv;
19+
20+
rv.num_oversized_partitions = 0;
21+
22+
return rv;
23+
}
24+
};
25+
} // namespace traccc::device::details

device/common/include/traccc/clusterization/device/ccl_kernel.hpp

+4-1
Original file line numberDiff line numberDiff line change
@@ -9,6 +9,7 @@
99

1010
// Project include(s).
1111
#include "traccc/clusterization/clustering_config.hpp"
12+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1213
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1314
#include "traccc/definitions/hints.hpp"
1415
#include "traccc/definitions/qualifiers.hpp"
@@ -54,6 +55,7 @@ namespace traccc::device {
5455
/// @param[out] measurements_view collection of measurements
5556
/// @param[out] cell_links collection of links to measurements each cell is
5657
/// put into
58+
/// @param[out] debug_output debug output location
5759
template <TRACCC_CONSTRAINT(device::concepts::barrier) barrier_t>
5860
TRACCC_DEVICE inline void ccl_kernel(
5961
const clustering_config cfg, details::index_t threadId,
@@ -69,7 +71,8 @@ TRACCC_DEVICE inline void ccl_kernel(
6971
vecmem::data::vector_view<details::index_t> adjv_backup_view,
7072
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
7173
measurement_collection_types::view measurements_view,
72-
vecmem::data::vector_view<unsigned int> cell_links);
74+
vecmem::data::vector_view<unsigned int> cell_links,
75+
details::ccl_debug_output* debug_output);
7376

7477
} // namespace traccc::device
7578

device/common/include/traccc/clusterization/device/impl/ccl_kernel.ipp

+10-1
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,7 @@
1111

1212
#include "traccc/clusterization/clustering_config.hpp"
1313
#include "traccc/clusterization/device/aggregate_cluster.hpp"
14+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1415
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1516
#include "traccc/clusterization/device/reduce_problem_cell.hpp"
1617
#include "traccc/device/mutex.hpp"
@@ -211,7 +212,8 @@ TRACCC_DEVICE inline void ccl_kernel(
211212
vecmem::data::vector_view<details::index_t> adjv_backup_view,
212213
vecmem::device_atomic_ref<uint32_t> backup_mutex, barrier_t& barrier,
213214
measurement_collection_types::view measurements_view,
214-
vecmem::data::vector_view<unsigned int> cell_links) {
215+
vecmem::data::vector_view<unsigned int> cell_links,
216+
details::ccl_debug_output* debug_output) {
215217
// Construct device containers around the views.
216218
const cell_collection_types::const_device cells_device(cells_view);
217219
const cell_module_collection_types::const_device modules_device(
@@ -315,6 +317,13 @@ TRACCC_DEVICE inline void ccl_kernel(
315317
if (size > cfg.max_partition_size()) {
316318
if (threadId == 0) {
317319
lock.lock();
320+
321+
if (debug_output) {
322+
vecmem::device_atomic_ref<uint32_t>
323+
num_oversized_partitions_atm(
324+
debug_output->num_oversized_partitions);
325+
num_oversized_partitions_atm.fetch_add(1);
326+
}
318327
}
319328

320329
barrier.blockBarrier();

device/cuda/src/clusterization/clusterization_algorithm.cu

+51-5
Original file line numberDiff line numberDiff line change
@@ -6,16 +6,23 @@
66
*/
77

88
// CUDA Library include(s).
9+
#include <cuda_runtime_api.h>
10+
#include <driver_types.h>
11+
912
#include "../utils/barrier.hpp"
1013
#include "../utils/cuda_error_handling.hpp"
1114
#include "../utils/utils.hpp"
1215
#include "traccc/clusterization/clustering_config.hpp"
16+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1317
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1418
#include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
1519

1620
// Project include(s)
1721
#include "traccc/clusterization/device/ccl_kernel.hpp"
1822

23+
// System include
24+
#include <iostream>
25+
1926
// Vecmem include(s).
2027
#include <cstring>
2128
#include <vecmem/utils/copy.hpp>
@@ -35,7 +42,8 @@ __global__ void ccl_kernel(
3542
vecmem::data::vector_view<device::details::index_t> gf_backup_view,
3643
vecmem::data::vector_view<unsigned char> adjc_backup_view,
3744
vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
38-
unsigned int* backup_mutex_ptr) {
45+
unsigned int* backup_mutex_ptr,
46+
device::details::ccl_debug_output* debug_output) {
3947

4048
__shared__ std::size_t partition_start, partition_end;
4149
__shared__ std::size_t outi;
@@ -56,7 +64,7 @@ __global__ void ccl_kernel(
5664
modules_view, partition_start, partition_end, outi,
5765
f_view, gf_view, f_backup_view, gf_backup_view,
5866
adjc_backup_view, adjv_backup_view, backup_mutex,
59-
barry_r, measurements_view, cell_links);
67+
barry_r, measurements_view, cell_links, debug_output);
6068
}
6169

6270
} // namespace kernels
@@ -121,14 +129,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
121129
assert(m_config.max_cells_per_thread <=
122130
device::details::CELLS_PER_THREAD_STACK_LIMIT);
123131

132+
// If necessary, allocate an object for storing the debug information
133+
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
134+
135+
if (m_config.enable_debug_output) {
136+
debug_output =
137+
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
138+
m_mr.main);
139+
140+
device::details::ccl_debug_output empty_output =
141+
device::details::ccl_debug_output::init();
142+
143+
TRACCC_CUDA_ERROR_CHECK(
144+
cudaMemcpyAsync(debug_output.get(), &empty_output,
145+
sizeof(device::details::ccl_debug_output),
146+
cudaMemcpyHostToDevice, stream));
147+
}
148+
124149
kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
125150
2 * m_config.max_partition_size() *
126151
sizeof(device::details::index_t),
127-
stream>>>(
128-
m_config, cells, modules, measurements, cell_links, m_f_backup,
129-
m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
152+
stream>>>(m_config, cells, modules, measurements,
153+
cell_links, m_f_backup, m_gf_backup,
154+
m_adjc_backup, m_adjv_backup,
155+
m_backup_mutex.get(), debug_output.get());
130156
TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
131157

158+
if (debug_output) {
159+
device::details::ccl_debug_output host_output;
160+
161+
TRACCC_CUDA_ERROR_CHECK(
162+
cudaMemcpyAsync(&host_output, debug_output.get(),
163+
sizeof(device::details::ccl_debug_output),
164+
cudaMemcpyDeviceToHost, stream));
165+
166+
TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
167+
168+
if (host_output.num_oversized_partitions > 0) {
169+
std::cout << "WARNING: @clusterization_algorithm: "
170+
<< "Clustering encountered "
171+
<< host_output.num_oversized_partitions
172+
<< " oversized partitions; if this number is too large, "
173+
"it may cause performance problems."
174+
<< std::endl;
175+
}
176+
}
177+
132178
// Return the reconstructed measurements.
133179
return measurements;
134180
}

device/sycl/src/clusterization/clusterization_algorithm.sycl

+50-7
Original file line numberDiff line numberDiff line change
@@ -8,6 +8,7 @@
88
// Local include(s).
99
#include "../utils/barrier.hpp"
1010
#include "../utils/get_queue.hpp"
11+
#include "traccc/clusterization/device/ccl_debug_output.hpp"
1112
#include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
1213
#include "traccc/sycl/clusterization/clusterization_algorithm.hpp"
1314

@@ -103,9 +104,29 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
103104
assert(m_config.max_cells_per_thread <=
104105
device::details::CELLS_PER_THREAD_STACK_LIMIT);
105106

107+
// If necessary, allocate an object for storing the debug information
108+
vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
109+
cl::sycl::event e100;
110+
111+
if (true // TODO: OR DEBUG OR CONFIG
112+
) {
113+
debug_output =
114+
vecmem::make_unique_alloc<device::details::ccl_debug_output>(
115+
m_mr.main);
116+
117+
device::details::ccl_debug_output empty_output =
118+
device::details::ccl_debug_output::init();
119+
120+
e100 = details::get_queue(m_queue).memcpy(
121+
debug_output.get(), &empty_output,
122+
sizeof(device::details::ccl_debug_output));
123+
}
124+
106125
// Run ccl kernel
107-
details::get_queue(m_queue)
108-
.submit([&](::sycl::handler& h) {
126+
cl::sycl::event e200 =
127+
details::get_queue(m_queue).submit([&](::sycl::handler& h) {
128+
h.depends_on(e100);
129+
109130
// Allocate shared memory for the kernel.
110131
vecmem::sycl::local_accessor<std::size_t> shared_uint(3, h);
111132
vecmem::sycl::local_accessor<device::details::index_t> shared_idx(
@@ -120,8 +141,8 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
120141
gf_backup_view = vecmem::get_data(m_gf_backup),
121142
adjc_backup_view = vecmem::get_data(m_adjc_backup),
122143
adjv_backup_view = vecmem::get_data(m_adjv_backup),
123-
mutex_ptr = m_backup_mutex.get(),
124-
cfg = m_config](::sycl::nd_item<1> item) {
144+
mutex_ptr = m_backup_mutex.get(), cfg = m_config,
145+
debug_output = debug_output.get()](::sycl::nd_item<1> item) {
125146
// Construct more readable variable names.
126147
vecmem::data::vector_view<device::details::index_t> f_view{
127148
static_cast<vector_size_t>(cfg.max_partition_size()),
@@ -148,10 +169,32 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
148169
partition_end, outi, f_view, gf_view, f_backup_view,
149170
gf_backup_view, adjc_backup_view, adjv_backup_view,
150171
backup_mutex, barry_r, measurements_view,
151-
cell_links_view);
172+
cell_links_view, debug_output);
152173
});
153-
})
154-
.wait_and_throw();
174+
});
175+
176+
cl::sycl::event e300;
177+
178+
if (debug_output) {
179+
device::details::ccl_debug_output host_output;
180+
181+
e300 = details::get_queue(m_queue).memcpy(
182+
&host_output, debug_output.get(),
183+
sizeof(device::details::ccl_debug_output), {e200});
184+
185+
e300.wait_and_throw();
186+
187+
if (host_output.num_oversized_partitions > 0) {
188+
std::cout << "WARNING: @clusterization_algorithm: "
189+
<< "Clustering encountered "
190+
<< host_output.num_oversized_partitions
191+
<< " oversized partitions; if this number is too large, "
192+
"it may cause performance problems."
193+
<< std::endl;
194+
}
195+
}
196+
197+
cl::sycl::event::wait_and_throw({e200, e300});
155198

156199
// Return the reconstructed measurements.
157200
return measurements;

examples/options/include/traccc/options/clusterization.hpp

+1
Original file line numberDiff line numberDiff line change
@@ -37,6 +37,7 @@ class clusterization
3737
unsigned int max_cells_per_thread;
3838
unsigned int target_cells_per_thread;
3939
unsigned int backup_size_multiplier;
40+
bool enable_debug_output;
4041
/// @}
4142

4243
/// Print the specific options of this class

examples/options/src/clusterization.cpp

+15-1
Original file line numberDiff line numberDiff line change
@@ -13,6 +13,14 @@
1313
// System include(s).
1414
#include <iostream>
1515

16+
namespace {
17+
#ifndef NDEBUG
18+
constexpr bool enable_cca_debug_default = true;
19+
#else
20+
constexpr bool enable_cca_debug_default = false;
21+
#endif
22+
} // namespace
23+
1624
namespace traccc::opts {
1725

1826
clusterization::clusterization() : interface("Clusterization Options") {
@@ -33,6 +41,10 @@ clusterization::clusterization() : interface("Clusterization Options") {
3341
boost::program_options::value(&backup_size_multiplier)
3442
->default_value(256),
3543
"The size multiplier of the backup scratch space");
44+
m_desc.add_options()("cca-debug",
45+
boost::program_options::value(&enable_debug_output)
46+
->default_value(enable_cca_debug_default),
47+
"The size multiplier of the backup scratch space");
3648
}
3749

3850
clusterization::operator clustering_config() const {
@@ -42,6 +54,7 @@ clusterization::operator clustering_config() const {
4254
rv.max_cells_per_thread = max_cells_per_thread;
4355
rv.target_cells_per_thread = target_cells_per_thread;
4456
rv.backup_size_multiplier = backup_size_multiplier;
57+
rv.enable_debug_output = enable_debug_output;
4558

4659
return rv;
4760
}
@@ -54,7 +67,8 @@ std::ostream& clusterization::print_impl(std::ostream& out) const {
5467
out << " Threads per partition: " << threads_per_partition << "\n";
5568
out << " Target cells per thread: " << target_cells_per_thread << "\n";
5669
out << " Max cells per thread: " << max_cells_per_thread << "\n";
57-
out << " Scratch space size mult.: " << backup_size_multiplier;
70+
out << " Scratch space size mult.: " << backup_size_multiplier << "\n";
71+
out << " Debug output printing: " << enable_debug_output << "\n";
5872
return out;
5973
}
6074

tests/common/tests/cca_test.hpp

+2
Original file line numberDiff line numberDiff line change
@@ -43,6 +43,7 @@ inline traccc::clustering_config default_ccl_test_config() {
4343
rv.max_cells_per_thread = 16;
4444
rv.target_cells_per_thread = 8;
4545
rv.backup_size_multiplier = 256;
46+
rv.enable_debug_output = false;
4647

4748
return rv;
4849
}
@@ -54,6 +55,7 @@ inline traccc::clustering_config tiny_ccl_test_config() {
5455
rv.max_cells_per_thread = 1;
5556
rv.target_cells_per_thread = 1;
5657
rv.backup_size_multiplier = 16384;
58+
rv.enable_debug_output = false;
5759

5860
return rv;
5961
}

0 commit comments

Comments
 (0)