22#include < mpi.h>
33#include < Kokkos_Random.hpp>
44
5+ #include < unordered_map>
6+
// Result of one benchmark run: per-iteration timings plus total wall time.
// FIX: alias was declared `benchmark_result_t` but every use site in this
// file spells it `benchmark_results_t`; renamed to match the use sites.
using benchmark_results_t = std::tuple<std::vector<double>, double>;
// All benchmark results, keyed by a "<benchmark>_<type>" label
// (see runAllBenchmarks). FIX: added the missing terminating semicolon.
using all_results_t = std::unordered_map<std::string, benchmark_results_t>;
610template <>
711std::string typeToString<double >() {
@@ -24,7 +28,7 @@ std::string benchmarkToString(const benchmarks& b) {
2428}
2529
2630template <typename T>
27- std::tuple<std::vector< double >, double > runBenchmarkLevel1 (int N, int iters) {
31+ benchmark_results_t runBenchmarkLevel1 (int N, int iters) {
2832 std::cout << " -- level 1 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
2933
3034 Kokkos::View<T*> x (" x" , N);
@@ -63,7 +67,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel1(int N, int iters) {
6367}
6468
6569template <typename T>
66- std::tuple<std::vector< double >, double > runBenchmarkLevel2 (int M, int N, int iters) {
70+ benchmark_results_t runBenchmarkLevel2 (int M, int N, int iters) {
6771 std::cout << " -- level 2 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
6872
6973 Kokkos::View<T*> x (" x" , M);
@@ -112,7 +116,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel2(int M, int N, int ite
112116}
113117
114118template <typename T>
115- std::tuple<std::vector< double >, double > runBenchmarkLevel3 () {
119+ benchmark_results_t runBenchmarkLevel3 () {
116120 std::cout << " -- level 3 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
117121 Kokkos::View<T**> A (" A" , M, N);
118122 Kokkos::View<T**> B (" B" , N, K);
@@ -165,7 +169,7 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
165169}
166170
167171template <typename T>
168- std::tuple<std::vector< double >, double > runBenchmarkLevel3 () {
172+ benchmark_results_t runBenchmarkLevel3 () {
169173 std::cout << " -- level 3 benchmark [" << typeToString<T>() << " ] -- " << std::endl;
170174 Kokkos::View<T**> A (" A" , M, N);
171175 Kokkos::View<T**> B (" B" , N, K);
@@ -216,19 +220,9 @@ std::tuple<std::vector<double>, double> runBenchmarkLevel3() {
216220
217221 return std::make_tuple (iter_timings, total_time);
218222}
219- Assistant
220-
221- Certainly! Below is an implementation of a benchmark function that performs the MKL DPOTRF operation, which is used for Cholesky factorization of a symmetric positive definite matrix. This benchmark follows the same style as your provided runBenchmarkLevel3 function.
222-
223- #include < iostream>
224- #include < vector>
225- #include < tuple>
226- #include < mkl.h>
227- #include < Kokkos_Core.hpp>
228- #include < mpi.h>
229223
230224template <typename T>
231- std::tuple<std::vector< double >, double > runBenchmarkDPOTRF (int N, int iters) {
225+ benchmark_results_t runBenchmarkDPOTRF (int N, int iters) {
232226 std::cout << " -- dpotrf benchmark [" << typeToString<T>() << " ] -- " << std::endl;
233227
234228 // Define matrix size
@@ -280,7 +274,7 @@ std::tuple<std::vector<double>, double> runBenchmarkDPOTRF(int N, int iters) {
280274}
281275
282276template <typename T>
283- std::tuple<std::vector< double >, double > runBenchmark (benchmarks type, int M, int N, int K, int iters) {
277+ benchmark_results_t runBenchmark (benchmarks type, int M, int N, int K, int iters) {
284278 switch (type) {
285279 case level1:
286280 return runBenchmarkLevel1<T>(N, iters);
@@ -295,65 +289,75 @@ std::tuple<std::vector<double>, double> runBenchmark(benchmarks type, int M, int
295289 }
296290}
297291
298- void reduceAndPrintBenchmarkOutput (
299- std::vector<double > iter_timings,
300- double total_time,
301- std::string benchmark)
292+ all_results_t runAllBenchmarks (int M, int N, int K, int iters) {
293+ all_results_t all_results;
294+ for (int i=0 ; i < benchmarks::num_benchmarks; i++) {
295+ auto b = static_cast <benchmarks>(i);
296+ std::string benchmark_str = benchmarkToString (b) + " _double" ;
297+ all_results[benchmark_str] = runBenchmark<T>(b, M, N, K, iters);
298+ }
299+ return all_results;
300+ }
301+
302+ void reduceAndPrintBenchmarkOutput (all_results_t benchmark_results)
302303{
303304 int rank = -1 ;
304305 int num_ranks = 0 ;
305306 MPI_Comm_rank (MPI_COMM_WORLD, &rank);
306307 MPI_Comm_size (MPI_COMM_WORLD, &num_ranks);
307308
308- char processor_name[MPI_MAX_PROCESSOR_NAME];
309- int name_len;
310- MPI_Get_processor_name (processor_name, &name_len);
309+ for (const auto & [benchmark_str, benchmark_results]: benchmark_results) {
310+ auto const & [iter_timings, total_time] = benchmark_results;
311+ char processor_name[MPI_MAX_PROCESSOR_NAME];
312+ int name_len;
313+ MPI_Get_processor_name (processor_name, &name_len);
311314
312- std::vector<double > all_times;
313- all_times.resize (num_ranks);
315+ std::vector<double > all_times;
316+ all_times.resize (num_ranks);
314317
315- std::vector<double > all_iter_times;
316- all_iter_times.resize (num_ranks * iters);
318+ std::vector<double > all_iter_times;
319+ all_iter_times.resize (num_ranks * iters);
317320
318- std::vector<char > all_processor_names;
319- all_processor_names.resize (num_ranks * MPI_MAX_PROCESSOR_NAME);
321+ std::vector<char > all_processor_names;
322+ all_processor_names.resize (num_ranks * MPI_MAX_PROCESSOR_NAME);
320323
321- if (rank == 0 ) {
322- std::cout << " num_ranks: " << num_ranks << std::endl;
323- }
324+ if (rank == 0 ) {
325+ std::cout << " num_ranks: " << num_ranks << std::endl;
326+ }
324327
325- MPI_Gather (
326- &total_time, 1 , MPI_DOUBLE,
327- &all_times[0 ], 1 , MPI_DOUBLE, 0 ,
328- MPI_COMM_WORLD
329- );
330-
331- MPI_Gather (
332- &iter_timings[0 ], iters, MPI_DOUBLE,
333- &all_iter_times[0 ], iters, MPI_DOUBLE, 0 ,
334- MPI_COMM_WORLD
335- );
336-
337- MPI_Gather (
338- &processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
339- &all_processor_names[0 ], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0 ,
340- MPI_COMM_WORLD
341- );
342-
343- if (rank == 0 ) {
344- int cur_rank = 0 ;
345- int cur = 0 ;
346- std::cout << " === " << benchmark << " ===" << std::endl;
347- for (auto && time : all_times) {
348- std::cout << " gather: " << cur_rank << " ("
349- << std::string (&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
350- << " ): " << time << " : breakdown: " ;
351- for (int i = cur; i < iters + cur; i++) {
352- std::cout << all_iter_times[i] << " " ;
328+ MPI_Gather (
329+ &total_time, 1 , MPI_DOUBLE,
330+ &all_times[0 ], 1 , MPI_DOUBLE, 0 ,
331+ MPI_COMM_WORLD
332+ );
333+
334+ MPI_Gather (
335+ &iter_timings[0 ], iters, MPI_DOUBLE,
336+ &all_iter_times[0 ], iters, MPI_DOUBLE, 0 ,
337+ MPI_COMM_WORLD
338+ );
339+
340+ MPI_Gather (
341+ &processor_name, MPI_MAX_PROCESSOR_NAME, MPI_CHAR,
342+ &all_processor_names[0 ], MPI_MAX_PROCESSOR_NAME, MPI_CHAR, 0 ,
343+ MPI_COMM_WORLD
344+ );
345+
346+ if (rank == 0 ) {
347+ int cur_rank = 0 ;
348+ int cur = 0 ;
349+ std::cout << " === " << benchmark << " ===" << std::endl;
350+ for (auto && time : all_times) {
351+ std::cout << " gather: " << cur_rank << " ("
352+ << std::string (&all_processor_names[cur_rank * MPI_MAX_PROCESSOR_NAME])
353+ << " ): " << time << " : breakdown: " ;
354+ for (int i = cur; i < iters + cur; i++) {
355+ std::cout << all_iter_times[i] << " " ;
356+ }
357+ std::cout << std::endl;
358+ cur += iters;
359+ cur_rank++;
353360 }
354- std::cout << std::endl;
355- cur += iters;
356- cur_rank++;
357361 }
358362 }
359363}
0 commit comments