Skip to content

Commit ca0e795

Browse files
authored
Merge pull request #113 from elstehle/fix/per-device-stream
Fixes cudaErrorInvalidValue when running on nvbench-created cuda stream
1 parent 4879607 commit ca0e795

File tree

9 files changed

+130
-10
lines changed

9 files changed

+130
-10
lines changed

nvbench/cuda_stream.cuh

Lines changed: 26 additions & 5 deletions
Original file line numberDiff line numberDiff line change
@@ -19,10 +19,13 @@
1919
#pragma once
2020

2121
#include <nvbench/cuda_call.cuh>
22+
#include <nvbench/detail/device_scope.cuh>
23+
#include <nvbench/device_info.cuh>
2224

2325
#include <cuda_runtime_api.h>
2426

2527
#include <memory>
28+
#include <optional>
2629

2730
namespace nvbench
2831
{
@@ -39,18 +42,36 @@ namespace nvbench
3942
struct cuda_stream
4043
{
4144
/**
42-
* Constructs a cuda_stream that owns a new stream, created with
43-
* `cudaStreamCreate`.
45+
* Constructs a cuda_stream that owns a new stream, created with `cudaStreamCreate`.
46+
*
47+
* @param device The device that this stream should be associated with. If no device is provided,
48+
* the stream will be associated with the device that is active at the call time.
4449
*/
45-
cuda_stream()
46-
: m_stream{[]() {
50+
explicit cuda_stream(std::optional<nvbench::device_info> device)
51+
: m_stream{[device]() {
4752
cudaStream_t s;
48-
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
53+
if (device.has_value())
54+
{
55+
nvbench::detail::device_scope scope_guard{device.value().get_id()};
56+
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
57+
}
58+
else
59+
{
60+
NVBENCH_CUDA_CALL(cudaStreamCreate(&s));
61+
}
4962
return s;
5063
}(),
5164
stream_deleter{true}}
5265
{}
5366

67+
/**
68+
* @brief Constructs a new cuda_stream tha is associated with the device that is active at the
69+
* call time.
70+
*/
71+
cuda_stream()
72+
: cuda_stream(std::nullopt)
73+
{}
74+
5475
/**
5576
* Constructs a `cuda_stream` from an explicit cudaStream_t.
5677
*

nvbench/detail/measure_cold.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -37,7 +37,7 @@ namespace nvbench::detail
3737

3838
measure_cold_base::measure_cold_base(state &exec_state)
3939
: m_state{exec_state}
40-
, m_launch{m_state.get_cuda_stream()}
40+
, m_launch{exec_state.get_cuda_stream()}
4141
, m_criterion_params{exec_state.get_criterion_params()}
4242
, m_stopping_criterion{nvbench::criterion_manager::get().get_criterion(
4343
exec_state.get_stopping_criterion())}

nvbench/detail/measure_cupti.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -165,7 +165,7 @@ measure_cupti_base::measure_cupti_base(state &exec_state)
165165
// (formatter doesn't handle `try :` very well...)
166166
try
167167
: m_state{exec_state}
168-
, m_launch{m_state.get_cuda_stream()}
168+
, m_launch{exec_state.get_cuda_stream()}
169169
, m_cupti{*m_state.get_device(), add_metrics(m_state)}
170170
{}
171171
// clang-format on

nvbench/detail/measure_hot.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -36,7 +36,7 @@ namespace nvbench::detail
3636

3737
measure_hot_base::measure_hot_base(state &exec_state)
3838
: m_state{exec_state}
39-
, m_launch{m_state.get_cuda_stream()}
39+
, m_launch{exec_state.get_cuda_stream()}
4040
, m_min_samples{exec_state.get_min_samples()}
4141
, m_min_time{exec_state.get_min_time()}
4242
, m_skip_time{exec_state.get_skip_time()}

nvbench/state.cuh

Lines changed: 17 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -64,7 +64,21 @@ struct state
6464
state &operator=(const state &) = delete;
6565
state &operator=(state &&) = default;
6666

67-
[[nodiscard]] const nvbench::cuda_stream &get_cuda_stream() const { return m_cuda_stream; }
67+
/// If a stream exists, return that. Otherwise, create a new stream using the current
68+
/// device (or the current device if none is set), save it, and return it.
69+
/// @sa get_cuda_stream_optional
70+
[[nodiscard]] nvbench::cuda_stream &get_cuda_stream()
71+
{
72+
if (!m_cuda_stream.has_value())
73+
{
74+
m_cuda_stream = nvbench::cuda_stream{m_device};
75+
}
76+
return m_cuda_stream.value();
77+
}
78+
[[nodiscard]] const std::optional<nvbench::cuda_stream> &get_cuda_stream_optional() const
79+
{
80+
return m_cuda_stream;
81+
}
6882
void set_cuda_stream(nvbench::cuda_stream &&stream) { m_cuda_stream = std::move(stream); }
6983

7084
/// The CUDA device associated with with this benchmark state. May be
@@ -313,7 +327,6 @@ private:
313327
std::optional<nvbench::device_info> device,
314328
std::size_t type_config_index);
315329

316-
nvbench::cuda_stream m_cuda_stream;
317330
std::reference_wrapper<const nvbench::benchmark_base> m_benchmark;
318331
nvbench::named_values m_axis_values;
319332
std::optional<nvbench::device_info> m_device;
@@ -334,6 +347,8 @@ private:
334347
nvbench::float32_t m_throttle_threshold; // [% of default SM clock rate]
335348
nvbench::float32_t m_throttle_recovery_delay; // [seconds]
336349

350+
std::optional<nvbench::cuda_stream> m_cuda_stream;
351+
337352
// Deadlock protection. See blocking_kernel's class doc for details.
338353
nvbench::float64_t m_blocking_kernel_timeout{30.0};
339354

nvbench/state.cxx

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -42,6 +42,7 @@ state::state(const benchmark_base &bench)
4242
, m_timeout{bench.get_timeout()}
4343
, m_throttle_threshold{bench.get_throttle_threshold()}
4444
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
45+
, m_cuda_stream{std::nullopt}
4546
{}
4647

4748
state::state(const benchmark_base &bench,
@@ -62,6 +63,7 @@ state::state(const benchmark_base &bench,
6263
, m_timeout{bench.get_timeout()}
6364
, m_throttle_threshold{bench.get_throttle_threshold()}
6465
, m_throttle_recovery_delay{bench.get_throttle_recovery_delay()}
66+
, m_cuda_stream{std::nullopt}
6567
{}
6668

6769
nvbench::int64_t state::get_int64(const std::string &axis_name) const

testing/CMakeLists.txt

Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -3,6 +3,7 @@ set(test_srcs
33
benchmark.cu
44
create.cu
55
cuda_timer.cu
6+
cuda_stream.cu
67
cpu_timer.cu
78
criterion_manager.cu
89
criterion_params.cu

testing/cuda_stream.cu

Lines changed: 77 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,77 @@
1+
/*
2+
* Copyright 2023 NVIDIA Corporation
3+
*
4+
* Licensed under the Apache License, Version 2.0 with the LLVM exception
5+
* (the "License"); you may not use this file except in compliance with
6+
* the License.
7+
*
8+
* You may obtain a copy of the License at
9+
*
10+
* http://llvm.org/foundation/relicensing/LICENSE.txt
11+
*
12+
* Unless required by applicable law or agreed to in writing, software
13+
* distributed under the License is distributed on an "AS IS" BASIS,
14+
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
15+
* See the License for the specific language governing permissions and
16+
* limitations under the License.
17+
*/
18+
19+
#include <nvbench/config.cuh>
20+
#include <nvbench/cuda_stream.cuh>
21+
#include <nvbench/device_manager.cuh>
22+
#include <nvbench/types.cuh>
23+
24+
#include <fmt/format.h>
25+
26+
#include "test_asserts.cuh"
27+
28+
namespace
29+
{
30+
#ifdef NVBENCH_HAS_CUPTI
31+
/**
32+
* @brief Queries and returns the device id that the given \p cuda_stream is associated with
33+
*
34+
* @param cuda_stream The stream to get the device id for
35+
* @return The device id that \p cuda_stream is associated with
36+
*/
37+
int get_device_of_stream(cudaStream_t cuda_stream)
38+
{
39+
CUcontext ctx;
40+
NVBENCH_DRIVER_API_CALL(cuStreamGetCtx(CUstream{cuda_stream}, &ctx));
41+
NVBENCH_DRIVER_API_CALL(cuCtxPushCurrent(ctx));
42+
CUdevice device_id{};
43+
NVBENCH_DRIVER_API_CALL(cuCtxGetDevice(&device_id));
44+
NVBENCH_DRIVER_API_CALL(cuCtxPopCurrent(&ctx));
45+
return static_cast<int>(device_id);
46+
}
47+
#endif
48+
} // namespace
49+
50+
void test_basic()
51+
{
52+
#ifdef NVBENCH_HAS_CUPTI
53+
// Get devices
54+
auto devices = nvbench::device_manager::get().get_devices();
55+
56+
// Iterate over devices
57+
for (auto const &device_info : devices)
58+
{
59+
// Create stream on the device before it becomes the active device
60+
nvbench::cuda_stream device_stream(device_info);
61+
62+
// Verify cuda stream is associated with the correct cuda device
63+
ASSERT(get_device_of_stream(device_stream.get_stream()) == device_info.get_id());
64+
65+
// Set the device as active device
66+
device_info.set_active();
67+
68+
// Create the stream (implicitly) on the device that is currently active
69+
nvbench::cuda_stream current_device_stream{};
70+
71+
// Verify the cuda stream was in fact associated with the currently active device
72+
ASSERT(get_device_of_stream(current_device_stream.get_stream()) == device_info.get_id());
73+
}
74+
#endif
75+
}
76+
77+
int main() { test_basic(); }

testing/state.cu

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -55,9 +55,13 @@ void test_streams()
5555

5656
state_tester state{bench};
5757

58+
// Confirm that the stream hasn't been initialized yet
59+
ASSERT(!state.get_cuda_stream_optional().has_value());
60+
5861
// Test non-owning stream
5962
cudaStream_t default_stream = 0;
6063
state.set_cuda_stream(nvbench::cuda_stream{default_stream, false});
64+
ASSERT(state.get_cuda_stream_optional() == default_stream);
6165
ASSERT(state.get_cuda_stream() == default_stream);
6266

6367
// Test owning stream

0 commit comments

Comments
 (0)