Skip to content

Commit ff1fc3d

Browse files
committed
#10: wip: call sensors before/after all benchmarks
1 parent 58d263c commit ff1fc3d

File tree

2 files changed

+71
-65
lines changed

2 files changed

+71
-65
lines changed

src/benchmarks.cc

Lines changed: 67 additions & 63 deletions
Original file line number | Diff line number | Diff line change
@@ -2,6 +2,10 @@
22
#include <mpi.h>
33
#include <Kokkos_Random.hpp>
44

5+
#include <unordered_map>
6+
7+
/// Result of a single benchmark run: (per-iteration timings, total time),
/// in the order the run* functions pack them with std::make_tuple.
using benchmark_results_t = std::tuple<std::vector<double>, double>;

/// Singular spelling of the same type, kept so code using either name compiles.
using benchmark_result_t = benchmark_results_t;

/// Results of several benchmarks, keyed by a benchmark label string.
using all_results_t = std::unordered_map<std::string, benchmark_results_t>;
59

610
template <>
711
std::string typeToString<double>() {
@@ -24,7 +28,7 @@ std::string benchmarkToString(const benchmarks& b) {
2428
}
2529

2630
template <typename T>
27-
std::tuple<std::vector<double>, double> runBenchmarkLevel1(int N, int iters) {
31+
benchmark_results_t runBenchmarkLevel1(int N, int iters) {
2832
std::cout << "-- level 1 benchmark [" << typeToString<T>() << "] -- " << std::endl;
2933

3034
Kokkos::View<T*> x("x", N);
@@ -63,7 +67,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel1(int N, int iters) {
6367
}
6468

6569
template <typename T>
66-
std::tuple<std::vector<double>, double> runBenchmarkLevel2(int M, int N, int iters) {
70+
benchmark_results_t runBenchmarkLevel2(int M, int N, int iters) {
6771
std::cout << "-- level 2 benchmark [" << typeToString<T>() << "] -- " << std::endl;
6872

6973
Kokkos::View<T*> x("x", M);
@@ -112,7 +116,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel2(int M, int N, int ite
112116
}
113117

114118
template <typename T>
115-
std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
119+
benchmark_results_t runBenchmarkLevel3() {
116120
std::cout << "-- level 3 benchmark [" << typeToString<T>() << "] -- " << std::endl;
117121
Kokkos::View<T**> A("A", M, N);
118122
Kokkos::View<T**> B("B", N, K);
@@ -165,7 +169,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
165169
}
166170

167171
template <typename T>
168-
std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
172+
benchmark_results_t runBenchmarkLevel3() {
169173
std::cout << "-- level 3 benchmark [" << typeToString<T>() << "] -- " << std::endl;
170174
Kokkos::View<T**> A("A", M, N);
171175
Kokkos::View<T**> B("B", N, K);
@@ -216,19 +220,9 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
216220

217221
return std::make_tuple(iter_timings, total_time);
218222
}
219-
Assistant
220-
221-
Certainly! Below is an implementation of a benchmark function that performs the MKL DPOTRF operation, which is used for Cholesky factorization of a symmetric positive definite matrix. This benchmark follows the same style as your provided runBenchmarkLevel3 function.
222-
223-
#include <iostream>
224-
#include <vector>
225-
#include <tuple>
226-
#include <mkl.h>
227-
#include <Kokkos_Core.hpp>
228-
#include <mpi.h>
229223

230224
template <typename T>
231-
std::tuple<std::vector<double>, double> runBenchmarkDPOTRF(int N, int iters) {
225+
benchmark_results_t runBenchmarkDPOTRF(int N, int iters) {
232226
std::cout << "-- dpotrf benchmark [" << typeToString<T>() << "] -- " << std::endl;
233227

234228
// Define matrix size
@@ -280,7 +274,7 @@ std::tuple<std::vector<double>, double> runBenchmarkDPOTRF(int N, int iters) {
280274
}
281275

282276
template <typename T>
283-
std::tuple<std::vector<double>, double> runBenchmark(benchmarks type, int M, int N, int K, int iters) {
277+
benchmark_results_t runBenchmark(benchmarks type, int M, int N, int K, int iters) {
284278
switch (type) {
285279
case level1:
286280
return runBenchmarkLevel1<T>(N, iters);
@@ -295,65 +289,75 @@ std::tuple<std::vector<double>, double> runBenchmark(benchmarks type, int M, int
295289
}
296290
}
297291

298-
void reduceAndPrintBenchmarkOutput(
299-
std::vector<double> iter_timings,
300-
double total_time,
301-
std::string benchmark)
292+
/**
 * Runs every benchmark in the `benchmarks` enumeration once, in double
 * precision, and collects the results.
 *
 * Enumerator values 0 .. benchmarks::num_benchmarks - 1 are each cast to
 * `benchmarks` and passed to runBenchmark<double>.
 *
 * @param M,N,K  Problem dimensions, forwarded unchanged to runBenchmark.
 * @param iters  Iteration count, forwarded unchanged to runBenchmark.
 * @return Map from "<benchmarkToString(b)>_double" to that benchmark's result.
 */
all_results_t runAllBenchmarks(int M, int N, int K, int iters) {
  all_results_t all_results;
  for (int i = 0; i < benchmarks::num_benchmarks; i++) {
    const auto b = static_cast<benchmarks>(i);
    // The "_double" suffix must stay in sync with the element type used in
    // the runBenchmark call below.
    const std::string benchmark_str = benchmarkToString(b) + "_double";
    // This function is not a template, so the element type is spelled out
    // (the previous `runBenchmark<T>` referred to an undeclared T).
    all_results[benchmark_str] = runBenchmark<double>(b, M, N, K, iters);
  }
  return all_results;
}
301+
302+
/**
 * Gathers every benchmark's timings onto rank 0 and prints them there.
 *
 * For each entry of the map, the calling rank's total time and its first
 * `iters` per-iteration timings are gathered to rank 0 of MPI_COMM_WORLD.
 * Rank 0 then prints one line per rank: rank index, processor name, total
 * time, and the per-iteration breakdown.
 *
 * The MPI_Gather calls are collective over MPI_COMM_WORLD, so every rank must
 * call this function.
 * NOTE(review): the gathers are issued in the map's iteration order; this
 * assumes all ranks iterate their map in the same order — confirm.
 *
 * @param benchmark_results Map from benchmark label to
 *        (per-iteration timings, total time) measured on the calling rank.
 */
void reduceAndPrintBenchmarkOutput(all_results_t benchmark_results)
{
  int rank = -1;
  int num_ranks = 0;
  MPI_Comm_rank(MPI_COMM_WORLD, &rank);
  MPI_Comm_size(MPI_COMM_WORLD, &num_ranks);

  // The processor name does not depend on the benchmark, so it is queried and
  // gathered once instead of once per map entry.
  char processor_name[MPI_MAX_PROCESSOR_NAME] = {};
  int name_len = 0;
  MPI_Get_processor_name(processor_name, &name_len);

  std::vector<char> all_processor_names(num_ranks * MPI_MAX_PROCESSOR_NAME);
  MPI_Gather(
    processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
    all_processor_names.data(), MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0,
    MPI_COMM_WORLD
  );

  // The binding is named `result` so it does not shadow the parameter.
  for (const auto& [benchmark_str, result] : benchmark_results) {
    const auto& [iter_timings, total_time] = result;

    // NOTE(review): `iters` is not a parameter of this function; it is assumed
    // to be a name visible at file scope, as the previous version of this
    // function also assumed, and iter_timings is assumed to hold at least
    // `iters` values — confirm.
    std::vector<double> all_times(num_ranks);
    std::vector<double> all_iter_times(num_ranks * iters);

    if (rank == 0) {
      std::cout << "num_ranks: " << num_ranks << std::endl;
    }

    MPI_Gather(
      &total_time, 1, MPI_DOUBLE,
      all_times.data(), 1, MPI_DOUBLE, 0,
      MPI_COMM_WORLD
    );

    // data() rather than &v[0]: indexing an empty vector is undefined.
    MPI_Gather(
      iter_timings.data(), iters, MPI_DOUBLE,
      all_iter_times.data(), iters, MPI_DOUBLE, 0,
      MPI_COMM_WORLD
    );

    if (rank == 0) {
      int cur_rank = 0;
      int cur = 0;  // offset of cur_rank's first timing in all_iter_times
      // Print the map key; the old `benchmark` parameter no longer exists.
      std::cout << "=== " << benchmark_str << " ===" << std::endl;
      for (auto&& time : all_times) {
        std::cout << "gather: " << cur_rank << " ("
                  << std::string(&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
                  << "): " << time << ": breakdown: ";
        for (int i = cur; i < iters + cur; i++) {
          std::cout << all_iter_times[i] << " ";
        }
        std::cout << std::endl;
        cur += iters;
        cur_rank++;
      }
    }
  }
}

src/slow_node.cc

Lines changed: 4 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -22,12 +22,14 @@ int main(int argc, char** argv) {
2222
Kokkos::initialize(argc, argv);
2323

2424
// Loop through all available benchmarks
25+
sensors::runSensorsAndReduceOutput(processor_name, "pre");
26+
auto all_benchmark_output = runAllBenchmarks(M, N, K, iters);
27+
sensors::runSensorsAndReduceOutput(processor_name, "post");
28+
printBenchmarkOutput(all_benchmark_output);
2529
for (int i=0; i < benchmarks::num_benchmarks; i++) {
2630
auto benchmark_type = static_cast<benchmarks>(i);
2731
std::string benchmark_str = benchmarkToString(benchmark_type) + "_double";
28-
sensors::runSensorsAndReduceOutput(processor_name, "pre_" + benchmark_str);
2932
auto const& [iter_timings, total_time] results = runBenchmark<double>(benchmark_type, M, N, K, iters);
30-
sensors::runSensorsAndReduceOutput(processor_name, "post" + benchmark_str);
3133
reduceAndPrintBenchmarkOutput(iter_timings, total_time, benchmark_str);
3234
}
3335

0 commit comments

Comments
 (0)