Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,30 @@ Parameters for a benchmark case are:
stride_A: stride for A matrix in gemm (optional, default k)
stride_B: stride for B matrix in gemm (optional, default m)
stride_C: stride for C matrix in gemm (optional, default m)
The supported operations are defined as:
BLAS algorithms:
copy (y = x),
axpy (y = y + a * x),
sub_scaled (y = y - a * x),
multiaxpy (like axpy, but a has one entry per column),
scal (y = a * y),
multiscal (like scal, but a has one entry per column),
  dot (a = x' * y),
norm (a = sqrt(x' * x)),
mm (C = A * B),
gemm (C = a * A * B + b * C)
Non-numerical algorithms:
prefix_sum32 (x_i <- sum_{j=0}^{i-1} x_i, 32 bit indices)
prefix_sum64 ( 64 bit indices)
where A has dimensions n x k, B has dimensions k x m,
C has dimensions n x m and x and y have dimensions n x r
)";
auto schema =
json::parse(std::ifstream(GKO_ROOT "/benchmark/schema/blas.json"));

initialize_argument_parsing(&argc, &argv, header, schema["examples"]);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information;
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
print_general_information(extra_information, exec);

Expand Down
99 changes: 45 additions & 54 deletions benchmark/blas/blas_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,6 @@
#include "core/components/prefix_sum_kernels.hpp"


// Command-line arguments
DEFINE_string(
operations, "copy,axpy,scal",
"A comma-separated list of operations to benchmark.\nCandidates are\n"
"BLAS algorithms:\n"
" copy (y = x),\n"
" axpy (y = y + a * x),\n"
" sub_scaled (y = y - a * x),\n"
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" norm (a = sqrt(x' * x)),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"Non-numerical algorithms:\n"
" prefix_sum32 (x_i <- sum_{j=0}^{i-1} x_i, 32 bit indices)\n"
" prefix_sum64 ( 64 bit indices)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
"C has dimensions n x m and x and y have dimensions n x r");


class BenchmarkOperation {
public:
virtual ~BenchmarkOperation() = default;
Expand Down Expand Up @@ -424,14 +402,12 @@ struct BlasBenchmark : Benchmark<dimensions> {
std::function<std::unique_ptr<BenchmarkOperation>(
std::shared_ptr<const gko::Executor>, dimensions)>>;
map_type operation_map;
std::vector<std::string> operations;
std::string name;
bool do_print;

BlasBenchmark(map_type operation_map, bool do_print = true)
: operation_map{std::move(operation_map)},
name{"blas"},
operations{split(FLAGS_operations)},
do_print{do_print}
{}

Expand Down Expand Up @@ -473,39 +449,54 @@ struct BlasBenchmark : Benchmark<dimensions> {
annotate_functor annotate, dimensions& dims,
const json& operation_case, json& result_case) const override
{
for (auto& operation_name : operations) {
result_case[operation_name] = json::object();
auto& op_result_case = result_case[operation_name];

auto op = operation_map.at(operation_name)(exec, dims);

IterationControl ic(timer);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}
}
auto op = operation_map.at(
operation_case["operation"].get<std::string>())(exec, dims);

// timed run
op->prepare();
for (auto _ : ic.run()) {
auto range = annotate("repetition");
IterationControl ic(timer);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}
const auto runtime = ic.compute_time(FLAGS_timer_method);
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
op_result_case["time"] = runtime;
op_result_case["flops"] = flops / runtime;
op_result_case["bandwidth"] = mem / runtime;
op_result_case["repetitions"] = repetitions;
}

// timed run
op->prepare();
for (auto _ : ic.run()) {
auto range = annotate("repetition");
op->run();
}
const auto runtime = ic.compute_time(FLAGS_timer_method);
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
result_case["time"] = runtime;
result_case["flops"] = flops / runtime;
result_case["bandwidth"] = mem / runtime;
result_case["repetitions"] = repetitions;
}

/**
 * Merges per-operation result entries that share identical benchmark
 * parameters into a single test case.
 *
 * Each input case carries one "operation" key plus its results stored
 * under `name`; cases that are equal apart from those two keys are
 * grouped, and the grouped results are emitted as one object keyed by
 * operation name.
 *
 * @param test_cases  the flat array of per-operation cases; replaced
 *                    in place by the merged array.
 */
void postprocess(json& test_cases) const override
{
    // Key: the test case stripped of its operation and result entries,
    // so that cases differing only in "operation" compare equal.
    std::map<json, json> grouped_results;
    for (const auto& test_case : test_cases) {
        auto parameters = test_case;
        parameters.erase("operation");
        parameters.erase(name);
        // Reuse the iterator returned by try_emplace instead of looking
        // the key up a second time via operator[].
        auto [it, inserted] =
            grouped_results.try_emplace(parameters, json::object());
        it->second[test_case["operation"]] = test_case[name];
    }
    auto merged_cases = json::array();
    for (const auto& [parameters, results] : grouped_results) {
        merged_cases.push_back(parameters);
        merged_cases.back()[name] = results;
    }
    test_cases = std::move(merged_cases);
}
};
13 changes: 11 additions & 2 deletions benchmark/blas/distributed/multi_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ Parameters for a benchmark case are:
stride: storage stride for both vectors (optional, default r)
stride_x: stride for input vector x (optional, default r)
stride_y: stride for in/out vector y (optional, default r)
The supported operations are defined as:
BLAS algorithms:
copy (y = x),
axpy (y = y + a * x),
sub_scaled (y = y - a * x),
multiaxpy (like axpy, but a has one entry per column),
scal (y = a * y),
multiscal (like scal, but a has one entry per column),
  dot (a = x' * y),
norm (a = sqrt(x' * x))
)";
auto schema = json::parse(
std::ifstream(GKO_ROOT "/benchmark/schema/blas-distributed.json"));
Expand All @@ -47,8 +57,7 @@ Parameters for a benchmark case are:
auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());

if (do_print) {
std::string extra_information =
"The operations are " + FLAGS_operations;
std::string extra_information;
print_general_information(extra_information, exec);
}

Expand Down
140 changes: 69 additions & 71 deletions benchmark/conversion/conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,31 +33,8 @@ using Generator = DefaultSystemGenerator<>;

struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
std::string name;
std::vector<std::string> operations;

ConversionBenchmark() : name{"conversion"}
{
auto ref_exec = gko::ReferenceExecutor::create();
auto formats = split(FLAGS_formats);
for (const auto& from_format : formats) {
operations.push_back(from_format + "-read");
auto from_mtx =
formats::matrix_type_factory.at(from_format)(ref_exec);
// all pairs of conversions that are supported by Ginkgo
for (const auto& to_format : formats) {
if (from_format == to_format) {
continue;
}
auto to_mtx =
formats::matrix_type_factory.at(to_format)(ref_exec);
try {
to_mtx->copy_from(from_mtx);
operations.push_back(from_format + "-" + to_format);
} catch (const std::exception& e) {
}
}
}
}
// The benchmark name is fixed; the conversion pairs are presumably read
// per test case from the "from"/"to" fields of the input JSON rather
// than being enumerated up front -- confirm against run().
ConversionBenchmark() : name{"conversion"} {}

const std::string& get_name() const override { return name; }

Expand All @@ -83,55 +60,80 @@ struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
gko::device_matrix_data<etype, itype>& data,
const json& operation_case, json& result_case) const override
{
for (const auto& operation_name : operations) {
result_case[operation_name] = json::object();
auto& op_result_case = result_case[operation_name];

auto split_it =
std::find(operation_name.begin(), operation_name.end(), '-');
std::string from_name{operation_name.begin(), split_it};
std::string to_name{split_it + 1, operation_name.end()};
auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
auto readable = gko::as<gko::ReadableFromMatrixData<etype, itype>>(
mtx_from.get());
IterationControl ic{timer};
if (to_name == "read") {
// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
readable->read(data);
exec->synchronize();
}
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
std::string from_name = operation_case["from"].get<std::string>();
std::string to_name = operation_case["to"].get<std::string>();
auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
auto readable =
gko::as<gko::ReadableFromMatrixData<etype, itype>>(mtx_from.get());

// check if conversion is supported on empty matrix first
if (from_name != to_name) {
auto to_mtx = formats::matrix_type_factory.at(to_name)(exec);
to_mtx->copy_from(mtx_from);
}

IterationControl ic{timer};
if (to_name == from_name) {
// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
readable->read(data);
exec->synchronize();
}
} else {
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
readable->read(data);
auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
mtx_to->copy_from(mtx_from);
exec->synchronize();
}
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
}
} else {
readable->read(data);
auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
mtx_to->copy_from(mtx_from);
exec->synchronize();
}
}
op_result_case["time"] = ic.compute_time(FLAGS_timer_method);
op_result_case["repetitions"] = ic.get_num_repetitions();
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
mtx_to->copy_from(mtx_from);
}
}
result_case["time"] = ic.compute_time(FLAGS_timer_method);
result_case["repetitions"] = ic.get_num_repetitions();
}

/**
 * Merges conversion result entries that share identical benchmark
 * parameters into a single test case.
 *
 * Cases whose result reports a `gko::NotSupported` error (unsupported
 * conversion pairs) are dropped. The remaining per-conversion results
 * are collected into an array under `name`, each entry annotated with
 * its "from" and "to" format names.
 *
 * @param test_cases  the flat array of per-conversion cases; replaced
 *                    in place by the merged array.
 */
void postprocess(json& test_cases) const override
{
    // Key: the test case stripped of the conversion pair and results,
    // so that cases differing only in "from"/"to" compare equal.
    std::map<json, json> grouped_results;
    for (const auto& test_case : test_cases) {
        // Skip conversions that are not supported by Ginkgo.
        if (test_case[name].contains("error_type") &&
            test_case[name]["error_type"] == "gko::NotSupported") {
            continue;
        }
        auto parameters = test_case;
        parameters.erase("to");
        parameters.erase("from");
        parameters.erase(name);
        // Reuse the iterator returned by try_emplace instead of looking
        // the same key up three more times via operator[].
        auto [it, inserted] =
            grouped_results.try_emplace(parameters, json::array());
        auto& results = it->second;
        results.push_back(test_case[name]);
        results.back()["to"] = test_case["to"];
        results.back()["from"] = test_case["from"];
    }
    auto merged_cases = json::array();
    for (auto& [parameters, results] : grouped_results) {
        merged_cases.push_back(parameters);
        // The grouped map is discarded afterwards, so move the results.
        merged_cases.back()[name] = std::move(results);
    }
    test_cases = std::move(merged_cases);
}
};

Expand All @@ -146,12 +148,8 @@ int main(int argc, char* argv[])

initialize_argument_parsing(&argc, &argv, header, schema["examples"]);

std::string extra_information =
std::string() + "The formats are " + FLAGS_formats;

auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
print_general_information(extra_information, exec);
auto formats = split(FLAGS_formats, ',');
print_general_information("", exec);

auto test_cases = json::parse(get_input_stream());

Expand Down
16 changes: 12 additions & 4 deletions benchmark/schema/blas-distributed.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,28 @@
],
"examples": [
{
"n": 1000
"n": 1000,
"operation": "axpy"
},
{
"n": 1000,
"operation": "dot"
},
{
"n": 1000,
"r": 1000
"r": 1000,
"operation": "copy"
},
{
"n": 1000,
"stride": 1024
"stride": 1024,
"operation": "scal"
},
{
"n": 1000,
"stride_x": 1024,
"stride_y": 2048
"stride_y": 2048,
"operation": "multiscal"
}
]
}
Loading
Loading