Skip to content
Open
6 changes: 4 additions & 2 deletions profiling/space-time-stack/kp_space_time_stack.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -336,9 +336,11 @@ struct StackNode {
void print_recursive(std::ostream& os, std::string my_indent,
std::string const& child_indent,
double tree_time) const {
auto percent = (total_runtime / tree_time) * 100.0;
const double comm_size = total_runtime / avg_runtime;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

You might as well compute this properly once and store it in a member variable.

auto threshold_percent = ((max_runtime * comm_size) / tree_time) * 100.0;
auto percent = (total_runtime / tree_time) * 100.0;
Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Can you elaborate on why this is sensible? I understand the previous code but now we essentially do

((max_runtime / avg_runtime) * percent  < outout_threshold

In other words, why does multiplying by max_runtime / avg_runtime do what you want? Wouldn't you want to do something like

if (percent > first_threshold || (high_imbalance && percent > second_threshold))
  report

? It's not quite clear to me that this second threshold should be the same as the orignal one divided by the imbalance. Maybe it's sensible to report all kernels that have an imbalance greater than a certain factor?

Copy link
Copy Markdown
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

The thinking is to identify any kernels/regions that would be above the 0.1% threshold if we only looked at the runtimes on that rank. Assuming that total_runtime is evenly distributed across ranks (which I think is basically always true), then rank_wall_time = tree_time / comm_size and therefore max_runtime / rank_wall_time == (max_runtime * comm_size) / tree_time.

Copy link
Copy Markdown
Contributor

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

OK. You basically want to use max_runtime instead of avg_runtime for comparing with the threshold value. I think that's sensible. Not sure if we need a configuration option.
Note that there also is print_json_recursive that needs consistent logic.


if (percent < output_threshold) return;
if (threshold_percent < output_threshold) return;
if (!name.empty()) {
os << my_indent;
auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0;
Expand Down
28 changes: 23 additions & 5 deletions tests/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -5,6 +5,7 @@
# Arguments:
# TARGET_NAME : name of the test (required)
# SOURCE_FILE : source file, defaults to <TARGET_NAME>.cpp (optional)
# RANKS : number of MPI ranks to use for the test, defaults to 1 (optional)
# KOKKOS_TOOLS_LIBS : the test environment will receive the variable 'KOKKOS_TOOLS_LIBS' that is set as the path
# to the target file of this argument (optional)
# KOKKOS_TOOLS_SAMPLER_VERBOSE : the test environment will receive the variable 'KOKKOS_TOOLS_SAMPLER_VERBOSE' that is set as the value of 1 for printing the sample has been taken
Expand All @@ -15,7 +16,7 @@


function(kp_add_executable_and_test)
cmake_parse_arguments(kaeat_args "" "TARGET_NAME;SOURCE_FILE;KOKKOS_TOOLS_SAMPLER_VERBOSE;KOKKOS_TOOLS_GLOBALFENCES;KOKKOS_TOOLS_SAMPLER_SKIP;KOKKOS_TOOLS_SAMPLER_PROB;KOKKOS_TOOLS_RANDOM_SEED" "KOKKOS_TOOLS_LIBS" ${ARGN})
cmake_parse_arguments(kaeat_args "" "TARGET_NAME;SOURCE_FILE;RANKS;KOKKOS_TOOLS_SAMPLER_VERBOSE;KOKKOS_TOOLS_GLOBALFENCES;KOKKOS_TOOLS_SAMPLER_SKIP;KOKKOS_TOOLS_SAMPLER_PROB;KOKKOS_TOOLS_RANDOM_SEED" "KOKKOS_TOOLS_LIBS" ${ARGN})
if(NOT DEFINED kaeat_args_TARGET_NAME)
message(FATAL_ERROR "'TARGET_NAME' is a required argument.")
endif()
Expand All @@ -24,6 +25,15 @@ function(kp_add_executable_and_test)
set(kaeat_args_SOURCE_FILE "${kaeat_args_TARGET_NAME}.cpp")
endif()

if(DEFINED kaeat_args_RANKS AND NOT KokkosTools_ENABLE_MPI)
message(WARNING "Skipping ${kaeat_args_TARGET_NAME} test because MPI is not enabled.")
return()
endif()

if(DEFINED kaeat_args_RANKS)
message(WARNING "Test ${kaeat_args_TARGET_NAME} will run on ${kaeat_args_RANKS} MPI ranks.")
endif()

Comment thread
vbrunini marked this conversation as resolved.
Outdated
add_executable(${kaeat_args_TARGET_NAME})

target_sources(
Expand All @@ -37,10 +47,18 @@ function(kp_add_executable_and_test)
test_common
)

add_test(
NAME ${kaeat_args_TARGET_NAME}
COMMAND $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
)
if(DEFINED kaeat_args_RANKS)
add_test(
NAME ${kaeat_args_TARGET_NAME}
COMMAND mpirun -n ${kaeat_args_RANKS} $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
)
set_property(TEST ${kaeat_args_TARGET_NAME} PROPERTY PROCESSORS ${kaeat_args_RANKS})
else()
add_test(
NAME ${kaeat_args_TARGET_NAME}
COMMAND $<TARGET_FILE:${kaeat_args_TARGET_NAME}>
)
endif()

if(DEFINED kaeat_args_KOKKOS_TOOLS_LIBS)
set(TOOL_LIBS_FILES)
Expand Down
10 changes: 10 additions & 0 deletions tests/space-time-stack/CMakeLists.txt
Original file line number Diff line number Diff line change
Expand Up @@ -3,3 +3,13 @@ kp_add_executable_and_test(
SOURCE_FILE test_demangling.cpp
KOKKOS_TOOLS_LIBS kp_space_time_stack
)

if(KokkosTools_ENABLE_MPI)
kp_add_executable_and_test(
TARGET_NAME test_threshold_with_imbalance
SOURCE_FILE test_threshold_with_imbalance.cpp
KOKKOS_TOOLS_LIBS kp_space_time_stack
RANKS 2
)
target_link_libraries(test_threshold_with_imbalance PRIVATE MPI::MPI_CXX)
endif()
60 changes: 60 additions & 0 deletions tests/space-time-stack/test_deep_copy.cpp.orig
Comment thread
vbrunini marked this conversation as resolved.
Outdated
Original file line number Diff line number Diff line change
@@ -0,0 +1,60 @@
#include <iostream>
#include <sstream>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

#include "Kokkos_Core.hpp"

struct Tester {
struct TagNamed {};
struct TagUnnamed {};

template <typename execution_space>
explicit Tester(const execution_space& space) {
Kokkos::View<double *, execution_space> a("view_a", 10);
Kokkos::View<double *, execution_space> b("view_b", 10);

Kokkos::deep_copy(a, 1.5);
Kokkos::deep_copy(space, b, 2.0);
Kokkos::deep_copy(space, b, a);
}
};

static const std::vector<std::string> matchers{
// copy of scalar into view_a
"[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_a\"=\"Scalar\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
// copy of scalar into view_b, execution space overload which apparently reports (none) for the source instead of Scalar
"[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_b\"=\".*\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
// copy of view_a into view_b
"[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 \"view_b\"=\"view_a\" \\([A-Z]+->[A-Z]+\\) \\[copy\\]",
};

/**
* @test This test checks that the tool outputs deep_copy statistics.
*/
TEST(SpaceTimeStackTest, deep_copy) {
//! Initialize @c Kokkos.
Kokkos::initialize();

//! Redirect output for later analysis.
std::cout.flush();
std::ostringstream output;
std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf());

//! Run tests. @todo Replace this with Google Test.
Tester tester(Kokkos::DefaultExecutionSpace{});

//! Finalize @c Kokkos.
Kokkos::finalize();

//! Restore output buffer.
std::cout.flush();
std::cout.rdbuf(coutbuf);
std::cout << output.str() << std::endl;

//! Analyze test output.
for (const auto& matcher : matchers) {
EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher));
}
}
77 changes: 77 additions & 0 deletions tests/space-time-stack/test_threshold_with_imbalance.cpp
Original file line number Diff line number Diff line change
@@ -0,0 +1,77 @@
#include <chrono>
#include <iostream>
#include <sstream>

#include "gmock/gmock.h"
#include "gtest/gtest.h"

#include "Kokkos_Core.hpp"
#include "Kokkos_Profiling_ScopedRegion.hpp"
#include "mpi.h"

struct Tester {
explicit Tester() {
int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
// The threshold for filtering out lines from the stack is 0.1% of the total runtime.
// Do a 1s sleep on all ranks, then a 1.5 ms sleep on only rank 1. If the threshold
// is applied based on the average time across ranks the second region will be filtered out,
// but if it is based on the max time then it will be included (which is the desired behavior
// so that we see things which are slow only on a subset of ranks in cases like a coarse solve
// in a multigrid method that only runs on a subset of the ranks).
{
Kokkos::Profiling::ScopedRegion all_ranks_region("all_ranks_region");
std::this_thread::sleep_for(std::chrono::seconds(1));
}
{
Kokkos::Profiling::ScopedRegion rank_1_region("rank_1_region");
if(rank == 1) {
std::this_thread::sleep_for(std::chrono::microseconds(1500));
}
}
}
};

static const std::vector<std::string> matchers{
"[0-9.e]+ sec [0-9.]+% [0-9.]+% 0.[0-9]+% 100.0% 0.00e\\+00 1 all_ranks_region \\[region\\]",
"[0-9.e]+ sec [0-9.]+% [0-9.]+% [1-9][0-9.]+% 100.0% 0.00e\\+00 1 rank_1_region \\[region\\]",
};

/**
* @test This test checks that the tool outputs deep_copy statistics.
*/
TEST(SpaceTimeStackTest, threshold_with_imbalance) {
MPI_Init(nullptr, nullptr);
int comm_size;
MPI_Comm_size(MPI_COMM_WORLD, &comm_size);
ASSERT_EQ(2, comm_size) << " test requires exactly 2 MPI ranks.";

//! Initialize @c Kokkos.
Kokkos::initialize();

//! Redirect output for later analysis.
std::cout.flush();
std::ostringstream output;
std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf());

//! Run tests. @todo Replace this with Google Test.
Tester tester;

//! Finalize @c Kokkos.
Kokkos::finalize();

//! Restore output buffer.
std::cout.flush();
std::cout.rdbuf(coutbuf);
std::cout << output.str() << std::endl;

int rank;
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
//! Analyze test output.
if(rank == 0) {
for (const auto& matcher : matchers) {
EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher));
}
}
MPI_Finalize();
}
Loading