Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 18 additions & 1 deletion benchmark/blas/blas.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -105,13 +105,30 @@ Parameters for a benchmark case are:
stride_A: stride for A matrix in gemm (optional, default k)
stride_B: stride for B matrix in gemm (optional, default m)
stride_C: stride for C matrix in gemm (optional, default m)
The supported operations are defined as:
BLAS algorithms:
copy (y = x),
axpy (y = y + a * x),
sub_scaled (y = y - a * x),
multiaxpy (like axpy, but a has one entry per column),
scal (y = a * y),
multiscal (like scal, but a has one entry per column),
  dot (a = x' * y),
norm (a = sqrt(x' * x)),
mm (C = A * B),
gemm (C = a * A * B + b * C)
Non-numerical algorithms:
prefix_sum32 (x_i <- sum_{j=0}^{i-1} x_i, 32 bit indices)
prefix_sum64 ( 64 bit indices)
where A has dimensions n x k, B has dimensions k x m,
C has dimensions n x m and x and y have dimensions n x r
)";
auto schema =
json::parse(std::ifstream(GKO_ROOT "/benchmark/schema/blas.json"));

initialize_argument_parsing(&argc, &argv, header, schema["examples"]);

std::string extra_information = "The operations are " + FLAGS_operations;
std::string extra_information;
auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
print_general_information(extra_information, exec);

Expand Down
99 changes: 45 additions & 54 deletions benchmark/blas/blas_common.hpp
Original file line number Diff line number Diff line change
Expand Up @@ -21,28 +21,6 @@
#include "core/components/prefix_sum_kernels.hpp"


// Command-line arguments
DEFINE_string(
operations, "copy,axpy,scal",
"A comma-separated list of operations to benchmark.\nCandidates are\n"
"BLAS algorithms:\n"
" copy (y = x),\n"
" axpy (y = y + a * x),\n"
" sub_scaled (y = y - a * x),\n"
" multiaxpy (like axpy, but a has one entry per column),\n"
" scal (y = a * y),\n"
" multiscal (like scal, but a has one entry per column),\n"
" dot (a = x' * y),"
" norm (a = sqrt(x' * x)),\n"
" mm (C = A * B),\n"
" gemm (C = a * A * B + b * C)\n"
"Non-numerical algorithms:\n"
" prefix_sum32 (x_i <- sum_{j=0}^{i-1} x_i, 32 bit indices)\n"
" prefix_sum64 ( 64 bit indices)\n"
"where A has dimensions n x k, B has dimensions k x m,\n"
"C has dimensions n x m and x and y have dimensions n x r");


class BenchmarkOperation {
public:
virtual ~BenchmarkOperation() = default;
Expand Down Expand Up @@ -424,14 +402,12 @@ struct BlasBenchmark : Benchmark<dimensions> {
std::function<std::unique_ptr<BenchmarkOperation>(
std::shared_ptr<const gko::Executor>, dimensions)>>;
map_type operation_map;
std::vector<std::string> operations;
std::string name;
bool do_print;

BlasBenchmark(map_type operation_map, bool do_print = true)
: operation_map{std::move(operation_map)},
name{"blas"},
operations{split(FLAGS_operations)},
do_print{do_print}
{}

Expand Down Expand Up @@ -473,39 +449,54 @@ struct BlasBenchmark : Benchmark<dimensions> {
annotate_functor annotate, dimensions& dims,
const json& operation_case, json& result_case) const override
{
for (auto& operation_name : operations) {
result_case[operation_name] = json::object();
auto& op_result_case = result_case[operation_name];

auto op = operation_map.at(operation_name)(exec, dims);

IterationControl ic(timer);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}
}
auto op = operation_map.at(
operation_case["operation"].get<std::string>())(exec, dims);

// timed run
op->prepare();
for (auto _ : ic.run()) {
auto range = annotate("repetition");
IterationControl ic(timer);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
op->prepare();
exec->synchronize();
op->run();
exec->synchronize();
}
const auto runtime = ic.compute_time(FLAGS_timer_method);
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
op_result_case["time"] = runtime;
op_result_case["flops"] = flops / runtime;
op_result_case["bandwidth"] = mem / runtime;
op_result_case["repetitions"] = repetitions;
}

// timed run
op->prepare();
for (auto _ : ic.run()) {
auto range = annotate("repetition");
op->run();
}
const auto runtime = ic.compute_time(FLAGS_timer_method);
const auto flops = static_cast<double>(op->get_flops());
const auto mem = static_cast<double>(op->get_memory());
const auto repetitions = ic.get_num_repetitions();
result_case["time"] = runtime;
result_case["flops"] = flops / runtime;
result_case["bandwidth"] = mem / runtime;
result_case["repetitions"] = repetitions;
}

/**
 * Merges per-operation result entries that share identical benchmark
 * parameters into a single test case.
 *
 * Each input case carries one "operation" key plus its results stored
 * under `name`; cases that are equal apart from those two keys are
 * grouped, and the grouped results are emitted as one object keyed by
 * operation name.
 *
 * @param test_cases  the flat array of per-operation cases; replaced
 *                    in place by the merged array.
 */
void postprocess(json& test_cases) const override
{
    // Key: the test case stripped of its operation and result entries,
    // so that cases differing only in "operation" compare equal.
    std::map<json, json> grouped_results;
    for (const auto& test_case : test_cases) {
        auto parameters = test_case;
        parameters.erase("operation");
        parameters.erase(name);
        // Reuse the iterator returned by try_emplace instead of looking
        // the key up a second time via operator[].
        auto [it, inserted] =
            grouped_results.try_emplace(parameters, json::object());
        it->second[test_case["operation"]] = test_case[name];
    }
    auto merged_cases = json::array();
    for (const auto& [parameters, results] : grouped_results) {
        merged_cases.push_back(parameters);
        merged_cases.back()[name] = results;
    }
    test_cases = std::move(merged_cases);
}
};
13 changes: 11 additions & 2 deletions benchmark/blas/distributed/multi_vector.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,16 @@ Parameters for a benchmark case are:
stride: storage stride for both vectors (optional, default r)
stride_x: stride for input vector x (optional, default r)
stride_y: stride for in/out vector y (optional, default r)
The supported operations are defined as:
BLAS algorithms:
copy (y = x),
axpy (y = y + a * x),
sub_scaled (y = y - a * x),
multiaxpy (like axpy, but a has one entry per column),
scal (y = a * y),
multiscal (like scal, but a has one entry per column),
  dot (a = x' * y),
norm (a = sqrt(x' * x))
)";
auto schema = json::parse(
std::ifstream(GKO_ROOT "/benchmark/schema/blas-distributed.json"));
Expand All @@ -47,8 +57,7 @@ Parameters for a benchmark case are:
auto exec = executor_factory_mpi.at(FLAGS_executor)(comm.get());

if (do_print) {
std::string extra_information =
"The operations are " + FLAGS_operations;
std::string extra_information;
print_general_information(extra_information, exec);
}

Expand Down
140 changes: 69 additions & 71 deletions benchmark/conversion/conversion.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -33,31 +33,8 @@ using Generator = DefaultSystemGenerator<>;

struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
std::string name;
std::vector<std::string> operations;

ConversionBenchmark() : name{"conversion"}
{
auto ref_exec = gko::ReferenceExecutor::create();
auto formats = split(FLAGS_formats);
for (const auto& from_format : formats) {
operations.push_back(from_format + "-read");
auto from_mtx =
formats::matrix_type_factory.at(from_format)(ref_exec);
// all pairs of conversions that are supported by Ginkgo
for (const auto& to_format : formats) {
if (from_format == to_format) {
continue;
}
auto to_mtx =
formats::matrix_type_factory.at(to_format)(ref_exec);
try {
to_mtx->copy_from(from_mtx);
operations.push_back(from_format + "-" + to_format);
} catch (const std::exception& e) {
}
}
}
}
// The benchmark name is fixed; the conversion pairs are presumably read
// per test case from the "from"/"to" fields of the input JSON rather
// than being enumerated up front -- confirm against run().
ConversionBenchmark() : name{"conversion"} {}

const std::string& get_name() const override { return name; }

Expand All @@ -83,55 +60,80 @@ struct ConversionBenchmark : Benchmark<gko::device_matrix_data<etype, itype>> {
gko::device_matrix_data<etype, itype>& data,
const json& operation_case, json& result_case) const override
{
for (const auto& operation_name : operations) {
result_case[operation_name] = json::object();
auto& op_result_case = result_case[operation_name];

auto split_it =
std::find(operation_name.begin(), operation_name.end(), '-');
std::string from_name{operation_name.begin(), split_it};
std::string to_name{split_it + 1, operation_name.end()};
auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
auto readable = gko::as<gko::ReadableFromMatrixData<etype, itype>>(
mtx_from.get());
IterationControl ic{timer};
if (to_name == "read") {
// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
readable->read(data);
exec->synchronize();
}
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
std::string from_name = operation_case["from"].get<std::string>();
std::string to_name = operation_case["to"].get<std::string>();
auto mtx_from = formats::matrix_type_factory.at(from_name)(exec);
auto readable =
gko::as<gko::ReadableFromMatrixData<etype, itype>>(mtx_from.get());

// check if conversion is supported on empty matrix first
if (from_name != to_name) {
auto to_mtx = formats::matrix_type_factory.at(to_name)(exec);
to_mtx->copy_from(mtx_from);
}

IterationControl ic{timer};
if (to_name == from_name) {
// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
readable->read(data);
exec->synchronize();
}
} else {
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
readable->read(data);
auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
mtx_to->copy_from(mtx_from);
exec->synchronize();
}
}
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
}
} else {
readable->read(data);
auto mtx_to = formats::matrix_type_factory.at(to_name)(exec);

// warm run
{
auto range = annotate("warmup", FLAGS_warmup > 0);
for (auto _ : ic.warmup_run()) {
exec->synchronize();
mtx_to->copy_from(mtx_from);
exec->synchronize();
}
}
op_result_case["time"] = ic.compute_time(FLAGS_timer_method);
op_result_case["repetitions"] = ic.get_num_repetitions();
// timed run
for (auto _ : ic.run()) {
auto range = annotate("repetition");
mtx_to->copy_from(mtx_from);
}
}
result_case["time"] = ic.compute_time(FLAGS_timer_method);
result_case["repetitions"] = ic.get_num_repetitions();
}

/**
 * Merges conversion result entries that share identical benchmark
 * parameters into a single test case.
 *
 * Cases whose result reports a `gko::NotSupported` error (unsupported
 * conversion pairs) are dropped. The remaining per-conversion results
 * are collected into an array under `name`, each entry annotated with
 * its "from" and "to" format names.
 *
 * @param test_cases  the flat array of per-conversion cases; replaced
 *                    in place by the merged array.
 */
void postprocess(json& test_cases) const override
{
    // Key: the test case stripped of the conversion pair and results,
    // so that cases differing only in "from"/"to" compare equal.
    std::map<json, json> grouped_results;
    for (const auto& test_case : test_cases) {
        // Skip conversions that are not supported by Ginkgo.
        if (test_case[name].contains("error_type") &&
            test_case[name]["error_type"] == "gko::NotSupported") {
            continue;
        }
        auto parameters = test_case;
        parameters.erase("to");
        parameters.erase("from");
        parameters.erase(name);
        // Reuse the iterator returned by try_emplace instead of looking
        // the same key up three more times via operator[].
        auto [it, inserted] =
            grouped_results.try_emplace(parameters, json::array());
        auto& results = it->second;
        results.push_back(test_case[name]);
        results.back()["to"] = test_case["to"];
        results.back()["from"] = test_case["from"];
    }
    auto merged_cases = json::array();
    for (auto& [parameters, results] : grouped_results) {
        merged_cases.push_back(parameters);
        // The grouped map is discarded afterwards, so move the results.
        merged_cases.back()[name] = std::move(results);
    }
    test_cases = std::move(merged_cases);
}
};

Expand All @@ -146,12 +148,8 @@ int main(int argc, char* argv[])

initialize_argument_parsing(&argc, &argv, header, schema["examples"]);

std::string extra_information =
std::string() + "The formats are " + FLAGS_formats;

auto exec = executor_factory.at(FLAGS_executor)(FLAGS_gpu_timer);
print_general_information(extra_information, exec);
auto formats = split(FLAGS_formats, ',');
print_general_information("", exec);

auto test_cases = json::parse(get_input_stream());

Expand Down
16 changes: 12 additions & 4 deletions benchmark/schema/blas-distributed.json
Original file line number Diff line number Diff line change
Expand Up @@ -51,20 +51,28 @@
],
"examples": [
{
"n": 1000
"n": 1000,
"operation": "axpy"
},
{
"n": 1000,
"operation": "dot"
},
{
"n": 1000,
"r": 1000
"r": 1000,
"operation": "copy"
},
{
"n": 1000,
"stride": 1024
"stride": 1024,
"operation": "scal"
},
{
"n": 1000,
"stride_x": 1024,
"stride_y": 2048
"stride_y": 2048,
"operation": "multiscal"
}
]
}
Loading
Loading