-
Notifications
You must be signed in to change notification settings - Fork 75
SpaceTimeStack: Account for MPI imbalance when applying printing threshold #284
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: develop
Are you sure you want to change the base?
Changes from 4 commits
3f05b4c
1e16db8
3988d6e
7c17a08
5cef33b
986f6ed
8442565
dcb58cd
2425310
f3350c9
04b560f
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -336,9 +336,11 @@ struct StackNode { | |
| void print_recursive(std::ostream& os, std::string my_indent, | ||
| std::string const& child_indent, | ||
| double tree_time) const { | ||
| auto percent = (total_runtime / tree_time) * 100.0; | ||
| const double comm_size = total_runtime / avg_runtime; | ||
| auto threshold_percent = ((max_runtime * comm_size) / tree_time) * 100.0; | ||
| auto percent = (total_runtime / tree_time) * 100.0; | ||
|
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. Can you elaborate on why this is sensible? I understand the previous code, but now we essentially do `(max_runtime / avg_runtime) * (total_runtime / tree_time) * 100.0 < output_threshold`. In other words, why does multiplying by `max_runtime / avg_runtime` make sense? It's not quite clear to me that this second threshold should be the same as the original one divided by the imbalance. Maybe it's sensible to report all kernels that have an imbalance greater than a certain factor?
Contributor
Author
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. The thinking is to identify any kernels/regions that would be above the 0.1% threshold if we only looked at the runtimes on that rank. Assuming that total_runtime is evenly distributed across ranks (which I think is basically always true), then `(max_runtime * comm_size) / tree_time` is the fraction of the slowest rank's runtime that was spent in that kernel/region.
Contributor
There was a problem hiding this comment. Choose a reason for hiding this comment. The reason will be displayed to describe this comment to others. Learn more. OK. You basically want to use |
||
|
|
||
| if (percent < output_threshold) return; | ||
| if (threshold_percent < output_threshold) return; | ||
| if (!name.empty()) { | ||
| os << my_indent; | ||
| auto imbalance = (max_runtime / avg_runtime - 1.0) * 100.0; | ||
|
|
||
|
vbrunini marked this conversation as resolved.
Outdated
|
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,60 @@ | ||
| #include <iostream> | ||
| #include <sstream> | ||
|
|
||
| #include "gmock/gmock.h" | ||
| #include "gtest/gtest.h" | ||
|
|
||
| #include "Kokkos_Core.hpp" | ||
|
|
||
| struct Tester { | ||
| struct TagNamed {}; | ||
| struct TagUnnamed {}; | ||
|
|
||
| template <typename execution_space> | ||
| explicit Tester(const execution_space& space) { | ||
| Kokkos::View<double *, execution_space> a("view_a", 10); | ||
| Kokkos::View<double *, execution_space> b("view_b", 10); | ||
|
|
||
| Kokkos::deep_copy(a, 1.5); | ||
| Kokkos::deep_copy(space, b, 2.0); | ||
| Kokkos::deep_copy(space, b, a); | ||
| } | ||
| }; | ||
|
|
||
// Regular expressions, one per expected deep_copy line in the tool output.
// Raw string literals keep the regex escapes readable.
static const std::vector<std::string> matchers{
    // copy of scalar into view_a
    R"re([0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 "view_a"="Scalar" \([A-Z]+->[A-Z]+\) \[copy\])re",
    // copy of scalar into view_b; the execution space overload apparently
    // reports (none) for the source instead of Scalar, so accept any source
    R"re([0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 "view_b"=".*" \([A-Z]+->[A-Z]+\) \[copy\])re",
    // copy of view_a into view_b
    R"re([0-9.e]+ sec [0-9.]+% [0-9.]+% 0.0% ------ 1 "view_b"="view_a" \([A-Z]+->[A-Z]+\) \[copy\])re",
};
|
|
||
| /** | ||
| * @test This test checks that the tool outputs deep_copy statistics. | ||
| */ | ||
| TEST(SpaceTimeStackTest, deep_copy) { | ||
| //! Initialize @c Kokkos. | ||
| Kokkos::initialize(); | ||
|
|
||
| //! Redirect output for later analysis. | ||
| std::cout.flush(); | ||
| std::ostringstream output; | ||
| std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf()); | ||
|
|
||
| //! Run tests. @todo Replace this with Google Test. | ||
| Tester tester(Kokkos::DefaultExecutionSpace{}); | ||
|
|
||
| //! Finalize @c Kokkos. | ||
| Kokkos::finalize(); | ||
|
|
||
| //! Restore output buffer. | ||
| std::cout.flush(); | ||
| std::cout.rdbuf(coutbuf); | ||
| std::cout << output.str() << std::endl; | ||
|
|
||
| //! Analyze test output. | ||
| for (const auto& matcher : matchers) { | ||
| EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher)); | ||
| } | ||
| } |
| Original file line number | Diff line number | Diff line change |
|---|---|---|
| @@ -0,0 +1,77 @@ | ||
| #include <chrono> | ||
| #include <iostream> | ||
| #include <sstream> | ||
|
|
||
| #include "gmock/gmock.h" | ||
| #include "gtest/gtest.h" | ||
|
|
||
| #include "Kokkos_Core.hpp" | ||
| #include "Kokkos_Profiling_ScopedRegion.hpp" | ||
| #include "mpi.h" | ||
|
|
||
| struct Tester { | ||
| explicit Tester() { | ||
| int rank; | ||
| MPI_Comm_rank(MPI_COMM_WORLD, &rank); | ||
| // The threshold for filtering out lines from the stack is 0.1% of the total runtime. | ||
| // Do a 1s sleep on all ranks, then a 1.5 ms sleep on only rank 1. If the threshold | ||
| // is applied based on the average time across ranks the second region will be filtered out, | ||
| // but if it is based on the max time then it will be included (which is the desired behavior | ||
| // so that we see things which are slow only on a subset of ranks in cases like a coarse solve | ||
| // in a multigrid method that only runs on a subset of the ranks). | ||
| { | ||
| Kokkos::Profiling::ScopedRegion all_ranks_region("all_ranks_region"); | ||
| std::this_thread::sleep_for(std::chrono::seconds(1)); | ||
| } | ||
| { | ||
| Kokkos::Profiling::ScopedRegion rank_1_region("rank_1_region"); | ||
| if(rank == 1) { | ||
| std::this_thread::sleep_for(std::chrono::microseconds(1500)); | ||
| } | ||
| } | ||
| } | ||
| }; | ||
|
|
||
// Regular expressions for the two region lines expected in the tool output.
// Raw string literals keep the regex escapes readable.
static const std::vector<std::string> matchers{
    // region in which every rank sleeps: fourth column must start with "0."
    R"re([0-9.e]+ sec [0-9.]+% [0-9.]+% 0.[0-9]+% 100.0% 0.00e\+00 1 all_ranks_region \[region\])re",
    // region in which only rank 1 sleeps: fourth column must start with a
    // non-zero digit
    R"re([0-9.e]+ sec [0-9.]+% [0-9.]+% [1-9][0-9.]+% 100.0% 0.00e\+00 1 rank_1_region \[region\])re",
};
|
|
||
| /** | ||
| * @test This test checks that the tool outputs deep_copy statistics. | ||
| */ | ||
| TEST(SpaceTimeStackTest, threshold_with_imbalance) { | ||
| MPI_Init(nullptr, nullptr); | ||
| int comm_size; | ||
| MPI_Comm_size(MPI_COMM_WORLD, &comm_size); | ||
| ASSERT_EQ(2, comm_size) << " test requires exactly 2 MPI ranks."; | ||
|
|
||
| //! Initialize @c Kokkos. | ||
| Kokkos::initialize(); | ||
|
|
||
| //! Redirect output for later analysis. | ||
| std::cout.flush(); | ||
| std::ostringstream output; | ||
| std::streambuf* coutbuf = std::cout.rdbuf(output.rdbuf()); | ||
|
|
||
| //! Run tests. @todo Replace this with Google Test. | ||
| Tester tester; | ||
|
|
||
| //! Finalize @c Kokkos. | ||
| Kokkos::finalize(); | ||
|
|
||
| //! Restore output buffer. | ||
| std::cout.flush(); | ||
| std::cout.rdbuf(coutbuf); | ||
| std::cout << output.str() << std::endl; | ||
|
|
||
| int rank; | ||
| MPI_Comm_rank(MPI_COMM_WORLD, &rank); | ||
| //! Analyze test output. | ||
| if(rank == 0) { | ||
| for (const auto& matcher : matchers) { | ||
| EXPECT_THAT(output.str(), ::testing::ContainsRegex(matcher)); | ||
| } | ||
| } | ||
| MPI_Finalize(); | ||
| } |
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
You might as well compute this properly once and store it in a member variable.