@@ -6,12 +6,16 @@
  */
 
 // CUDA Library include(s).
+#include <cuda_runtime_api.h>
+#include <driver_types.h>
+
 #include "../sanity/contiguous_on.cuh"
 #include "../sanity/ordered_on.cuh"
 #include "../utils/barrier.hpp"
 #include "../utils/cuda_error_handling.hpp"
 #include "../utils/utils.hpp"
 #include "traccc/clusterization/clustering_config.hpp"
+#include "traccc/clusterization/device/ccl_debug_output.hpp"
 #include "traccc/clusterization/device/ccl_kernel_definitions.hpp"
 #include "traccc/cuda/clusterization/clusterization_algorithm.hpp"
 #include "traccc/cuda/utils/thread_id.hpp"
@@ -21,6 +25,9 @@
 // Project include(s)
 #include "traccc/clusterization/device/ccl_kernel.hpp"
 
+// System include
+#include <iostream>
+
 // Vecmem include(s).
 #include <cstring>
 #include <vecmem/utils/copy.hpp>
@@ -40,7 +47,8 @@ __global__ void ccl_kernel(
     vecmem::data::vector_view<device::details::index_t> gf_backup_view,
     vecmem::data::vector_view<unsigned char> adjc_backup_view,
     vecmem::data::vector_view<device::details::index_t> adjv_backup_view,
-    unsigned int* backup_mutex_ptr) {
+    unsigned int* backup_mutex_ptr,
+    device::details::ccl_debug_output* debug_output) {
 
     __shared__ std::size_t partition_start, partition_end;
     __shared__ std::size_t outi;
@@ -62,7 +70,7 @@ __global__ void ccl_kernel(
                        partition_start, partition_end, outi, f_view, gf_view,
                        f_backup_view, gf_backup_view, adjc_backup_view,
                        adjv_backup_view, backup_mutex, barry_r,
-                       measurements_view, cell_links);
+                       measurements_view, cell_links, debug_output);
 }
 
 }  // namespace kernels
@@ -132,14 +140,52 @@ clusterization_algorithm::output_type clusterization_algorithm::operator()(
     assert(m_config.max_cells_per_thread <=
            device::details::CELLS_PER_THREAD_STACK_LIMIT);
 
+    // If necessary, allocate an object for storing the debug information
+    vecmem::unique_alloc_ptr<device::details::ccl_debug_output> debug_output;
+
+    if (m_config.enable_debug_output) {
+        debug_output =
+            vecmem::make_unique_alloc<device::details::ccl_debug_output>(
+                m_mr.main);
+
+        device::details::ccl_debug_output empty_output =
+            device::details::ccl_debug_output::init();
+
+        TRACCC_CUDA_ERROR_CHECK(
+            cudaMemcpyAsync(debug_output.get(), &empty_output,
+                            sizeof(device::details::ccl_debug_output),
+                            cudaMemcpyHostToDevice, stream));
+    }
+
     kernels::ccl_kernel<<<num_blocks, m_config.threads_per_partition,
                           2 * m_config.max_partition_size() *
                               sizeof(device::details::index_t),
-                          stream>>>(
-        m_config, cells, modules, measurements, cell_links, m_f_backup,
-        m_gf_backup, m_adjc_backup, m_adjv_backup, m_backup_mutex.get());
+                          stream>>>(m_config, cells, modules, measurements,
+                                    cell_links, m_f_backup, m_gf_backup,
+                                    m_adjc_backup, m_adjv_backup,
+                                    m_backup_mutex.get(), debug_output.get());
     TRACCC_CUDA_ERROR_CHECK(cudaGetLastError());
 
+    if (debug_output) {
+        device::details::ccl_debug_output host_output;
+
+        TRACCC_CUDA_ERROR_CHECK(
+            cudaMemcpyAsync(&host_output, debug_output.get(),
+                            sizeof(device::details::ccl_debug_output),
+                            cudaMemcpyDeviceToHost, stream));
+
+        TRACCC_CUDA_ERROR_CHECK(cudaStreamSynchronize(stream));
+
+        if (host_output.num_oversized_partitions > 0) {
+            std::cout << "WARNING: @clusterization_algorithm: "
+                      << "Clustering encountered "
+                      << host_output.num_oversized_partitions
+                      << " oversized partitions; if this number is too large, "
+                         "it may cause performance problems."
+                      << std::endl;
+        }
+    }
+
     // Return the reconstructed measurements.
     return measurements;
 }
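
For context, the diff only uses ccl_debug_output through its static init() helper and its num_oversized_partitions member; the actual definition lives in the newly included traccc/clusterization/device/ccl_debug_output.hpp, which is not shown here. A minimal sketch of what such a type could look like, assuming it is a plain aggregate with a single zero-initialised counter that is safe to copy with cudaMemcpyAsync, is:

// Hypothetical sketch only -- the real type is defined in
// traccc/clusterization/device/ccl_debug_output.hpp. The single counter and
// the init() helper are inferred from how the code above uses the type.
namespace traccc::device::details {

struct ccl_debug_output {
    // Number of partitions whose cell count exceeded the configured
    // partition size and spilled into the global-memory backup arrays.
    unsigned int num_oversized_partitions;

    // Return a zero-initialised object, suitable for copying to the device
    // before the kernel launch.
    static ccl_debug_output init() {
        ccl_debug_output result;
        result.num_oversized_partitions = 0u;
        return result;
    }
};

}  // namespace traccc::device::details

On the device side such a counter would presumably be bumped with something like atomicAdd(&(debug_output->num_oversized_partitions), 1u) whenever a partition overflows; the kernel receives a raw pointer precisely so that it can be null when enable_debug_output is false and no allocation was made.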