Skip to content

Commit d386357

Browse files
committed
Catch2 Benchmarking
1 parent 2fc0f04 commit d386357

File tree

8 files changed

+265
-7
lines changed

8 files changed

+265
-7
lines changed

CMakeLists.txt

+4-1
Original file line numberDiff line numberDiff line change
@@ -41,6 +41,8 @@ set_property(GLOBAL PROPERTY USE_FOLDERS ON)
4141

4242
option(alpaka_BUILD_EXAMPLES "Build the examples" OFF)
4343

44+
option(alpaka_BUILD_BENCHMARK "Build the benchmarks." OFF)
45+
4446
option(BUILD_TESTING "Build the testing tree." OFF)
4547

4648
option(alpaka_INSTALL_TEST_HEADER "Install headers of the namespace alpaka::test. Attention, headers are not designed for production code, see documentation." OFF)
@@ -138,7 +140,8 @@ endif()
138140
if(alpaka_BUILD_EXAMPLES)
139141
add_subdirectory("example/")
140142
endif()
141-
if(BUILD_TESTING)
143+
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
144+
enable_testing()
142145
add_subdirectory("test/")
143146
endif()
144147

cmake/alpakaCommon.cmake

+2-2
Original file line numberDiff line numberDiff line change
@@ -617,7 +617,7 @@ if(alpaka_ACC_SYCL_ENABLE)
617617

618618
# Enable device-side printing to stdout
619619
cmake_dependent_option(alpaka_SYCL_ENABLE_IOSTREAM "Enable device-side printing to stdout" OFF "alpaka_ACC_SYCL_ENABLE" OFF)
620-
if(BUILD_TESTING)
620+
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
621621
set(alpaka_SYCL_ENABLE_IOSTREAM ON CACHE BOOL "Enable device-side printing to stdout" FORCE)
622622
endif()
623623

@@ -851,7 +851,7 @@ if(TARGET alpaka)
851851

852852
# the alpaka library itself
853853
# SYSTEM voids showing warnings produced by alpaka when used in user applications.
854-
if(BUILD_TESTING)
854+
if(BUILD_TESTING OR alpaka_BUILD_BENCHMARK)
855855
target_include_directories(alpaka INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
856856
else()
857857
target_include_directories(alpaka SYSTEM INTERFACE ${_alpaka_INCLUDE_DIRECTORY})
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,120 @@
1+
/* Copyright 2022 Benjamin Worpitz, Andrea Bocci, Bernhard Manfred Gruber
2+
*
3+
* This file is part of alpaka.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this
7+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8+
*/
9+
10+
#pragma once
11+
12+
#include <alpaka/alpaka.hpp>
13+
14+
#if defined(ALPAKA_ACC_GPU_CUDA_ENABLED) && !BOOST_LANG_CUDA
15+
# error If ALPAKA_ACC_GPU_CUDA_ENABLED is set, the compiler has to support CUDA!
16+
#endif
17+
18+
#if defined(ALPAKA_ACC_GPU_HIP_ENABLED) && !BOOST_LANG_HIP
19+
# error If ALPAKA_ACC_GPU_HIP_ENABLED is set, the compiler has to support HIP!
20+
#endif
21+
22+
#include <alpaka/test/Check.hpp>
23+
#include <alpaka/test/queue/Queue.hpp>
24+
25+
#include <catch2/catch.hpp>
26+
27+
#include <string>
28+
#include <utility>
29+
30+
namespace alpaka::test
31+
{
32+
//! The fixture for executing a kernel on a given accelerator.
33+
template<typename TAcc>
34+
class KernelExecutionBenchmarkFixture
35+
{
36+
public:
37+
using Acc = TAcc;
38+
using Dim = alpaka::Dim<Acc>;
39+
using Idx = alpaka::Idx<Acc>;
40+
using DevAcc = Dev<Acc>;
41+
using PltfAcc = Pltf<DevAcc>;
42+
using QueueAcc = test::DefaultQueue<DevAcc>;
43+
using WorkDiv = WorkDivMembers<Dim, Idx>;
44+
45+
KernelExecutionBenchmarkFixture(WorkDiv workDiv)
46+
: m_devHost(getDevByIdx<PltfCpu>(0u))
47+
, m_devAcc(getDevByIdx<PltfAcc>(0u))
48+
, m_queue(m_devAcc)
49+
, m_workDiv(std::move(workDiv))
50+
{
51+
}
52+
53+
template<typename TExtent>
54+
KernelExecutionBenchmarkFixture(TExtent const& extent)
55+
: KernelExecutionBenchmarkFixture(getValidWorkDiv<Acc>(
56+
getDevByIdx<PltfAcc>(0u),
57+
extent,
58+
Vec<Dim, Idx>::ones(),
59+
false,
60+
GridBlockExtentSubDivRestrictions::Unrestricted))
61+
{
62+
}
63+
64+
template<typename TKernelFnObj, typename... TArgs>
65+
auto operator()(
66+
TKernelFnObj const& kernelFnObj,
67+
std::string const& benchmarkName,
68+
float& result,
69+
TArgs&&... args) -> bool
70+
{
71+
// Allocate result buffers
72+
auto bufAccResult = allocBuf<float, Idx>(m_devAcc, static_cast<Idx>(1u));
73+
auto bufHostResult = allocBuf<float, Idx>(m_devHost, static_cast<Idx>(1u));
74+
75+
int numRuns = 0;
76+
result = 0.0f;
77+
78+
// The following block is executed unknown times during estimation phase, then once per benchmark sample
79+
BENCHMARK_ADVANCED(std::string(benchmarkName))(Catch::Benchmark::Chronometer meter)
80+
{
81+
numRuns++;
82+
memset(m_queue, bufAccResult, 0.0f);
83+
wait(m_queue);
84+
85+
// Only the following part is measured as the benchmark part
86+
meter.measure(
87+
[&]
88+
{
89+
exec<Acc>(
90+
m_queue,
91+
m_workDiv,
92+
kernelFnObj,
93+
getPtrNative(bufAccResult),
94+
std::forward<TArgs>(args)...); // run the measured kernel
95+
wait(m_queue); // wait for the kernel to actually run
96+
});
97+
98+
// Copy the result value to the host
99+
memcpy(m_queue, bufHostResult, bufAccResult);
100+
wait(m_queue);
101+
102+
auto const resultLocal = *getPtrNative(bufHostResult);
103+
result += resultLocal;
104+
return resultLocal; // make sure the benchmark call is not optimized away
105+
};
106+
result /= static_cast<float>(numRuns);
107+
108+
return true;
109+
// TODO: Can we return the result here and read it from Catch2's REQUIRE or something similar? Or are the
110+
// returns limited to bools?
111+
// return result;
112+
}
113+
114+
protected:
115+
DevCpu m_devHost;
116+
DevAcc m_devAcc;
117+
QueueAcc m_queue;
118+
WorkDiv m_workDiv;
119+
};
120+
} // namespace alpaka::test

test/CMakeLists.txt

+9-4
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,5 @@
11
#
2-
# Copyright 2015-2020 Benjamin Worpitz, Axel Huebl, Jan Stephan
2+
# Copyright 2015-2022 Benjamin Worpitz, Axel Huebl, Jan Stephan, Jiri Vyskocil
33
#
44
# This file is part of alpaka.
55
#
@@ -21,6 +21,11 @@ add_subdirectory(common)
2121

2222
list(APPEND _alpaka_TEST_OPTIONS --use-colour yes)
2323

24-
add_subdirectory(analysis)
25-
add_subdirectory(integ)
26-
add_subdirectory(unit)
24+
if(BUILD_TESTING)
25+
add_subdirectory(analysis)
26+
add_subdirectory(integ)
27+
add_subdirectory(unit)
28+
endif()
29+
if(alpaka_BUILD_BENCHMARK)
30+
add_subdirectory(benchmark)
31+
endif()

test/benchmark/CMakeLists.txt

+13
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,13 @@
1+
#
2+
# Copyright 2022 Jiri Vyskocil
3+
#
4+
# This file is part of alpaka.
5+
#
6+
# This Source Code Form is subject to the terms of the Mozilla Public
7+
# License, v. 2.0. If a copy of the MPL was not distributed with this
8+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
9+
#
10+
11+
cmake_minimum_required(VERSION 3.18)
12+
13+
add_subdirectory("rand/")

test/benchmark/rand/CMakeLists.txt

+29
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,29 @@
1+
#
2+
# Copyright 2022 Jiri Vyskocil
3+
#
4+
# This file is part of alpaka.
5+
#
6+
# This Source Code Form is subject to the terms of the Mozilla Public
7+
# License, v. 2.0. If a copy of the MPL was not distributed with this
8+
# file, You can obtain one at http://mozilla.org/MPL/2.0/.
9+
#
10+
11+
set(_TARGET_NAME "randBenchmark")

# Collect all *.cpp files below src/ and put them into the "src/" source group.
append_recursive_files_add_to_src_group("src/" "src/" "cpp" _FILES_SOURCE)

alpaka_add_executable(
    ${_TARGET_NAME}
    ${_FILES_SOURCE})
target_link_libraries(
    ${_TARGET_NAME}
    PRIVATE common)

set_target_properties(${_TARGET_NAME} PROPERTIES FOLDER "test/benchmark")
# Catch2 only provides the BENCHMARK macros when this definition is set.
target_compile_definitions(${_TARGET_NAME} PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)

# Under CI a single benchmark sample is requested; otherwise Catch2's default sample count is used.
set(_BENCHMARK_TEST_OPTIONS ${_alpaka_TEST_OPTIONS})
if(alpaka_CI)
    list(APPEND _BENCHMARK_TEST_OPTIONS --benchmark-samples 1)
endif()
add_test(NAME ${_TARGET_NAME} COMMAND ${_TARGET_NAME} ${_BENCHMARK_TEST_OPTIONS})
+84
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
/* Copyright 2022 Jiri Vyskocil
2+
*
3+
* This file is part of alpaka.
4+
*
5+
* This Source Code Form is subject to the terms of the Mozilla Public
6+
* License, v. 2.0. If a copy of the MPL was not distributed with this
7+
* file, You can obtain one at http://mozilla.org/MPL/2.0/.
8+
*/
9+
10+
#include <alpaka/example/ExampleDefaultAcc.hpp>
11+
#include <alpaka/rand/Traits.hpp>
12+
#include <alpaka/test/KernelExecutionBenchmarkFixture.hpp>
13+
#include <alpaka/test/acc/TestAccs.hpp>
14+
15+
#include <catch2/catch.hpp>
16+
17+
class RandBenchmarkKernel
18+
{
19+
public:
20+
ALPAKA_NO_HOST_ACC_WARNING
21+
template<typename TAcc, typename TIdx>
22+
ALPAKA_FN_ACC void operator()(TAcc const& acc, float* result, TIdx numPoints) const
23+
{
24+
// Get the global linearized thread idx.
25+
auto const globalThreadIdx = alpaka::getIdx<alpaka::Grid, alpaka::Threads>(acc);
26+
auto const globalThreadExtent = alpaka::getWorkDiv<alpaka::Grid, alpaka::Threads>(acc);
27+
28+
auto const linearizedGlobalThreadIdx
29+
= static_cast<TIdx>(alpaka::mapIdx<1u>(globalThreadIdx, globalThreadExtent)[0]);
30+
31+
// Setup generator engine and distribution.
32+
auto engine = alpaka::rand::engine::createDefault(acc, 42, linearizedGlobalThreadIdx);
33+
auto dist(alpaka::rand::distribution::createUniformReal<float>(acc));
34+
35+
float number = 0;
36+
for(TIdx i = linearizedGlobalThreadIdx; i < numPoints; i += static_cast<TIdx>(globalThreadExtent.prod()))
37+
{
38+
number += dist(engine);
39+
}
40+
41+
alpaka::atomicAdd(
42+
acc,
43+
result,
44+
number); // TODO: we're measuring the atomicAdd time too, this is not what we want
45+
}
46+
};
47+
48+
// TODO: This takes an enormous time to finish and is probably useless anyway:
49+
// TEMPLATE_LIST_TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]", alpaka::test::TestAccs)
50+
// Running the benchmark on a single default accelerator instead
51+
// Benchmarks the default random number generator on a single default accelerator for several problem sizes.
TEST_CASE("defaultRandomGeneratorBenchmark", "[randBenchmark]")
{
    // using Acc = TestType;
    using Acc = alpaka::ExampleDefaultAcc<alpaka::DimInt<1>, std::size_t>;
    using Dim = alpaka::Dim<Acc>;
    using Idx = alpaka::Idx<Acc>;
    using Vec = alpaka::Vec<Dim, Idx>;
    using WorkDiv = alpaka::WorkDivMembers<Dim, Idx>;

    auto const devAcc = alpaka::getDevByIdx<Acc>(0u);

    // std::thread::hardware_concurrency() may return 0 when the value is not computable. Fall back to a single
    // thread so that the requested grid and block extents below never become zero.
    auto const hardwareThreads = std::thread::hardware_concurrency();
    const Idx numThreads = hardwareThreads != 0u ? static_cast<Idx>(hardwareThreads) : Idx{1}; // TODO: GPU?
    std::cout << "Hardware threads: " << numThreads << std::endl;

    // The test case body is run once per generated problem size.
    const unsigned numPoints = GENERATE(100'000u, 1'000'000u, 10'000'000u, 100'000'000u, 1'000'000'000u);

    WorkDiv workdiv{alpaka::getValidWorkDiv<Acc>(
        devAcc,
        Vec::all(numThreads * numThreads),
        Vec::all(numThreads),
        false,
        alpaka::GridBlockExtentSubDivRestrictions::Unrestricted)};

    alpaka::test::KernelExecutionBenchmarkFixture<Acc> fixture(workdiv);

    RandBenchmarkKernel kernel;

    float result = 0.0f;

    REQUIRE(fixture(kernel, "Random sequence N=" + std::to_string(numPoints), result, numPoints));
    // TODO: Actually check the result
    std::cout << "\ntemp debug normalized result = " << result / static_cast<float>(numPoints)
              << " should probably converge to 0.5." << std::flush;
}

test/catch_main/CMakeLists.txt

+4
Original file line numberDiff line numberDiff line change
@@ -27,6 +27,10 @@ set_target_properties(CatchMain PROPERTIES
2727
WINDOWS_EXPORT_ALL_SYMBOLS ON
2828
)
2929

30+
if(alpaka_BUILD_BENCHMARK)
31+
target_compile_definitions(CatchMain PUBLIC CATCH_CONFIG_ENABLE_BENCHMARKING)
32+
endif()
33+
3034
target_compile_definitions(CatchMain PUBLIC "CATCH_CONFIG_FAST_COMPILE")
3135
if (CMAKE_CXX_COMPILER_ID STREQUAL "PGI")
3236
# Workaround for STL atomic issue: https://forums.developer.nvidia.com/t/support-for-atomic-in-libstdc-missing/135403/2

0 commit comments

Comments
 (0)