kokkos · vbrunini · Apr 28, 2025 · Apr 29, 2025 · Apr 29, 2025 · May 1, 2025
diff --git a/profiling/space-time-stack/kp_space_time_stack.cpp b/profiling/space-time-stack/kp_space_time_stack.cpp
@@ -336,9 +336,11 @@ struct StackNode {
   void print_recursive(std::ostream& os, std::string my_indent,
                        std::string const& child_indent,
                        double tree_time) const {
-    auto percent = (total_runtime / tree_time) * 100.0;
+    const double comm_size = total_runtime / avg_runtime;
+    auto threshold_percent = ((max_runtime * comm_size) / tree_time) * 100.0;
+    auto percent           = (total_runtime / tree_time) * 100.0;
 
-    if (percent < output_threshold) return;
+    if (threshold_percent < output_threshold) return;
     if (!name.empty()) {
       os << my_indent;
       auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0;

diff --git a/tests/CMakeLists.txt b/tests/CMakeLists.txt
@@ -5,6 +5,7 @@
 # Arguments:
 #   TARGET_NAME       : name of the test (required)
 #   SOURCE_FILE       : source file, defaults to <TARGET_NAME>.cpp (optional)
+#   RANKS             : number of MPI ranks to use for the test, defaults to 1 (optional)
 #   KOKKOS_TOOLS_LIBS : the test environment will receive the variable 'KOKKOS_TOOLS_LIBS' that is set as the path
 #                       to the target file of this argument (optional)
 #   KOKKOS_TOOLS_SAMPLER_VERBOSE : the test environment will receive the variable 'KOKKOS_TOOLS_SAMPLER_VERBOSE' that is set as the value of 1 for printing the sample has been taken 
@@ -15,7 +16,7 @@
 
 
 function(kp_add_executable_and_test)
-	cmake_parse_arguments(kaeat_args "" "TARGET_NAME;SOURCE_FILE;KOKKOS_TOOLS_SAMPLER_VERBOSE;KOKKOS_TOOLS_GLOBALFENCES;KOKKOS_TOOLS_SAMPLER_SKIP;KOKKOS_TOOLS_SAMPLER_PROB;KOKKOS_TOOLS_RANDOM_SEED" "KOKKOS_TOOLS_LIBS" ${ARGN})
+	cmake_parse_arguments(kaeat_args "" "TARGET_NAME;SOURCE_FILE;RANKS;KOKKOS_TOOLS_SAMPLER_VERBOSE;KOKKOS_TOOLS_GLOBALFENCES;KOKKOS_TOOLS_SAMPLER_SKIP;KOKKOS_TOOLS_SAMPLER_PROB;KOKKOS_TOOLS_RANDOM_SEED" "KOKKOS_TOOLS_LIBS" ${ARGN})
     if(NOT DEFINED kaeat_args_TARGET_NAME)
         message(FATAL_ERROR "'TARGET_NAME' is a required argument.")
     endif()
@@ -24,6 +25,15 @@ function(kp_add_executable_and_test)
         set(kaeat_args_SOURCE_FILE "${kaeat_args_TARGET_NAME}.cpp")
     endif()
 
+    if(DEFINED kaeat_args_RANKS AND NOT KokkosTools_ENABLE_MPI)
+        message(WARNING "Skipping ${kaeat_args_TARGET_NAME} test because MPI is not enabled.")
+        return()
+    endif()
+
+    if(DEFINED kaeat_args_RANKS)
+        message(WARNING "Test ${kaeat_args_TARGET_NAME} will run on ${kaeat_args_RANKS} MPI ranks.")
+    endif()
+
     add_executable(${kaeat_args_TARGET_NAME})
 
     target_sources(
@@ -37,10 +47,18 @@ function(kp_add_executable_and_test)
             test_common
     )
 
-    add_test(
-        NAME ${kaeat_args_TARGET_NAME}
-        COMMAND $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
-    )
+    if(DEFINED kaeat_args_RANKS)
+        add_test(
+            NAME ${kaeat_args_TARGET_NAME}
+            COMMAND mpirun -n ${kaeat_args_RANKS} $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
+        )
+        set_property(TEST ${kaeat_args_TARGET_NAME} PROPERTY PROCESSORS ${kaeat_args_RANKS})
+    else()
+        add_test(
+            NAME ${kaeat_args_TARGET_NAME}
+            COMMAND $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
+        )
+    endif()
 
     if(DEFINED kaeat_args_KOKKOS_TOOLS_LIBS)
         set(TOOL_LIBS_FILES)

diff --git a/tests/space-time-stack/CMakeLists.txt b/tests/space-time-stack/CMakeLists.txt
@@ -3,3 +3,13 @@ kp_add_executable_and_test(
     SOURCE_FILE       test_demangling.cpp
     KOKKOS_TOOLS_LIBS kp_space_time_stack
 )
+
+if(KokkosTools_ENABLE_MPI)
+  kp_add_executable_and_test(
+      TARGET_NAME       test_threshold_with_imbalance
+      SOURCE_FILE       test_threshold_with_imbalance.cpp
+      KOKKOS_TOOLS_LIBS kp_space_time_stack
+      RANKS             2
+  )
+  target_link_libraries(test_threshold_with_imbalance PRIVATE MPI::MPI_CXX)
+endif()
diff --git a/tests/space-time-stack/test_deep_copy.cpp.orig b/tests/space-time-stack/test_deep_copy.cpp.orig
@@ -0,0 +1,60 @@
+#include <iostream>
+#include <sstream>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "Kokkos_Core.hpp"
+
+struct Tester {
+  struct TagNamed {};
+  struct TagUnnamed {};
+
+  template <typename execution_space>
+  explicit Tester(const execution_space& space) {
+    Kokkos::View<double *, execution_space> a("view_a", 10);
+    Kokkos::View<double *, execution_space> b("view_b", 10);
+
+    Kokkos::deep_copy(a, 1.5);
+    Kokkos::deep_copy(space, b, 2.0);
+    Kokkos::deep_copy(space, b, a);
+  }
+};
+
+static const std::vector<std::string> matchers{
+    // copy of scalar into view_a
+    "[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_a\"=\"Scalar\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
+    // copy of scalar into view_b, execution space overload which apparently reports (none) for the source instead of Scalar
+    "[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_b\"=\".*\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
+    // copy of view_a into view_b
+    "[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_b\"=\"view_a\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
+};
+
+/**
+ * @test This test checks that the tool outputs deep_copy statistics.
+ */
+TEST(SpaceTimeStackTest, deep_copy) {
+  //! Initialize @c Kokkos.
+  Kokkos::initialize();
+
+  //! Redirect output for later analysis.
+  std::cout.flush();
+  std::ostringstream output;
+  std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf());
+
+  //! Run tests. @todo Replace this with Google Test.
+  Tester tester(Kokkos::DefaultExecutionSpace{});
+
+  //! Finalize @c Kokkos.
+  Kokkos::finalize();
+
+  //! Restore output buffer.
+  std::cout.flush();
+  std::cout.rdbuf(coutbuf);
+  std::cout << output.str() << std::endl;
+
+  //! Analyze test output.
+  for (const auto& matcher : matchers) {
+    EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher));
+  }
+}
diff --git a/tests/space-time-stack/test_threshold_with_imbalance.cpp b/tests/space-time-stack/test_threshold_with_imbalance.cpp
@@ -0,0 +1,77 @@
+#include <chrono>
+#include <iostream>
+#include <sstream>
+
+#include "gmock/gmock.h"
+#include "gtest/gtest.h"
+
+#include "Kokkos_Core.hpp"
+#include "Kokkos_Profiling_ScopedRegion.hpp"
+#include "mpi.h"
+
+struct Tester {
+  explicit Tester() {
+    int rank;
+    MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+    // The threshold for filtering out lines from the stack is 0.1% of the total runtime.
+    // Do a 1s sleep on all ranks, then a 1.5 ms sleep on only rank 1. If the threshold
+    // is applied based on the average time across ranks the second region will be filtered out,
+    // but if it is based on the max time then it will be included (which is the desired behavior
+    // so that we see things which are slow only on a subset of ranks in cases like a coarse solve
+    // in a multigrid method that only runs on a subset of the ranks).
+    {
+      Kokkos::Profiling::ScopedRegion all_ranks_region("all_ranks_region");
+      std::this_thread::sleep_for(std::chrono::seconds(1));
+    }
+    {
+      Kokkos::Profiling::ScopedRegion rank_1_region("rank_1_region");
+      if(rank == 1) {
+        std::this_thread::sleep_for(std::chrono::microseconds(1500));
+      }
+    }
+  }
+};
+
+static const std::vector<std::string> matchers{
+    "[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.[0-9]+% 100.0% 0.00e\\+00 1 all_ranks_region \\[region\\]",
+    "[0-9.e]+ sec [0-9.]+% [0-9.]+% [1-9][0-9.]+% 100.0% 0.00e\\+00 1 rank_1_region \\[region\\]",
+};
+
+/**
+ * @test This test checks that the tool outputs deep_copy statistics.
+ */
+TEST(SpaceTimeStackTest, threshold_with_imbalance) {
+  MPI_Init(nullptr, nullptr);
+  int comm_size;
+  MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
+  ASSERT_EQ(2, comm_size) << " test requires exactly 2 MPI ranks.";
+
+  //! Initialize @c Kokkos.
+  Kokkos::initialize();
+
+  //! Redirect output for later analysis.
+  std::cout.flush();
+  std::ostringstream output;
+  std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf());
+
+  //! Run tests. @todo Replace this with Google Test.
+  Tester tester;
+
+  //! Finalize @c Kokkos.
+  Kokkos::finalize();
+
+  //! Restore output buffer.
+  std::cout.flush();
+  std::cout.rdbuf(coutbuf);
+  std::cout << output.str() << std::endl;
+
+  int rank;
+  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
+  //! Analyze test output.
+  if(rank == 0) {
+    for (const auto& matcher : matchers) {
+      EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher));
+    }
+  }
+  MPI_Finalize();
+}