
Commit eb300e6

Enhance executor interfaces (#222)
* Support explicitly setting the executor's stream
* Support data copy between ARK tensors and external CUDA arrays
* Support non-loop execution mode
* Minor fixes & interface updates
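A minimal usage sketch of the new interfaces, distilled from the `ark/api/executor_test.cpp` test added in this commit; the constructor arguments, the loop-mode flag, and the trailing device-pointer flag of `tensor_read`/`tensor_write` are taken from that test, not from separate API documentation:

#include "ark/executor.hpp"

#include "gpu/gpu.hpp"

int main() {
    // Explicit stream: create a non-blocking stream and hand it to the executor.
    ark::gpuStream stream;
    ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking);

    ark::Model model;
    auto tensor = model.tensor({1024}, ark::FP32, {}, {});
    model.noop(tensor);

    // The last constructor argument selects loop mode; `false` exercises the
    // new non-loop execution path.
    ark::DefaultExecutor executor(model, /*device_id=*/0, stream, {}, "example",
                                  /*loop_mode=*/false);
    executor.compile();
    executor.launch();
    executor.run(1);
    executor.wait();

    // Data copy between the ARK tensor and an external CUDA/HIP buffer.
    // The trailing `true` appears to mark `dev_ptr` as a device pointer,
    // mirroring the "GPU array" copies in the test.
    void *dev_ptr;
    ark::gpuMalloc(&dev_ptr, 1024 * sizeof(float));
    executor.tensor_write(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);
    executor.tensor_read(tensor, dev_ptr, 1024 * sizeof(float), nullptr, true);

    executor.stop();
    ark::gpuFree(dev_ptr);
    ark::gpuStreamDestroy(stream);
    return 0;
}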
1 parent ee0895e · commit eb300e6

31 files changed: +697 −325 lines

ark/api/executor.cpp

+268 −145 (large diff not rendered by default)

ark/api/executor_test.cpp

+192 −0 (new file)

// Copyright (c) Microsoft Corporation.
// Licensed under the MIT license.

#include "ark/executor.hpp"

#include "gpu/gpu.hpp"
#include "model/model_json.hpp"
#include "unittest/unittest_utils.h"

template <bool LoopMode>
ark::unittest::State test_executor() {
    ark::gpuStream stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);

    ark::Model empty;
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
        UNITTEST_EQ(executor.device_id(), 0);
        UNITTEST_EQ(executor.stream(), stream);

        executor.compile();
        executor.launch();
        executor.run(1);
        executor.wait();
        executor.stop();
        executor.destroy();

        UNITTEST_TRUE(executor.destroyed());
    }
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
        executor.compile();
        executor.launch();
        executor.run(1);
        executor.wait();
        executor.stop();

        executor.launch();
        executor.run(1);
        executor.wait();
        executor.stop();

        executor.destroy();
    }
    {
        ark::DefaultExecutor executor(empty, 0, stream, {}, "test", LoopMode);
        UNITTEST_THROW(executor.launch(), ark::InvalidUsageError);

        executor.compile();
        executor.launch();
        executor.launch();  // Will be ignored with a warning.
        executor.run(1);
        executor.wait();
        executor.wait();  // nothing to do

        // Stop & destroy automatically.
    }

    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

ark::unittest::State test_executor_loop() { return test_executor<true>(); }

ark::unittest::State test_executor_no_loop() { return test_executor<false>(); }

ark::unittest::State test_executor_tensor_read_write(ark::Dims shape,
                                                     ark::Dims stride,
                                                     ark::Dims offset) {
    // Alloc CPU array
    std::vector<float> host_data(shape.nelems());
    for (size_t i = 0; i < host_data.size(); ++i) {
        host_data[i] = static_cast<float>(i);
    }

    // Alloc GPU array
    void *dev_ptr;
    UNITTEST_EQ(ark::gpuMalloc(&dev_ptr, shape.nelems() * sizeof(float)),
                ark::gpuSuccess);

    // Create an ARK tensor
    ark::Model m;
    auto tensor = m.tensor(shape, ark::FP32, stride, offset);
    m.noop(tensor);

    ark::DefaultExecutor executor(m, 0);
    executor.compile();
    executor.launch();
    UNITTEST_GT(executor.tensor_address(tensor), 0);

    // Copy data from CPU array to ARK tensor
    executor.tensor_write(tensor, host_data.data(),
                          shape.nelems() * sizeof(float));

    // Copy data from ARK tensor to GPU array
    executor.tensor_read(tensor, dev_ptr, shape.nelems() * sizeof(float),
                         nullptr, true);

    // Check the data
    std::vector<float> dev_data(shape.nelems());
    executor.tensor_read(tensor, dev_data.data(),
                         shape.nelems() * sizeof(float));
    for (size_t i = 0; i < dev_data.size(); ++i) {
        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
        dev_data[i] = -1;
    }

    UNITTEST_EQ(
        ark::gpuMemcpy(dev_data.data(), dev_ptr, shape.nelems() * sizeof(float),
                       ark::gpuMemcpyDeviceToHost),
        ark::gpuSuccess);
    for (size_t i = 0; i < dev_data.size(); ++i) {
        UNITTEST_EQ(dev_data[i], static_cast<float>(i));
        dev_data[i] = -1;
    }

    // Copy -1s back to GPU array
    UNITTEST_EQ(
        ark::gpuMemcpy(dev_ptr, dev_data.data(), shape.nelems() * sizeof(float),
                       ark::gpuMemcpyHostToDevice),
        ark::gpuSuccess);

    // Copy data from GPU array to ARK tensor
    executor.tensor_write(tensor, dev_ptr, shape.nelems() * sizeof(float),
                          nullptr, true);

    // Copy data from ARK tensor to CPU array
    executor.tensor_read(tensor, host_data.data(),
                         shape.nelems() * sizeof(float));

    // Check the data
    for (size_t i = 0; i < host_data.size(); ++i) {
        UNITTEST_EQ(host_data[i], -1);
    }

    // Provide a stream
    ark::gpuStream stream;
    UNITTEST_EQ(
        ark::gpuStreamCreateWithFlags(&stream, ark::gpuStreamNonBlocking),
        ark::gpuSuccess);
    executor.tensor_read(tensor, host_data.data(),
                         shape.nelems() * sizeof(float), stream);
    executor.tensor_write(tensor, host_data.data(),
                          shape.nelems() * sizeof(float), stream);
    UNITTEST_EQ(ark::gpuStreamDestroy(stream), ark::gpuSuccess);

    // Invalid copy size
    UNITTEST_THROW(executor.tensor_read(tensor, host_data.data(),
                                        shape.nelems() * sizeof(float) + 1),
                   ark::InvalidUsageError);
    UNITTEST_THROW(executor.tensor_write(tensor, host_data.data(),
                                         shape.nelems() * sizeof(float) + 1),
                   ark::InvalidUsageError);

    executor.stop();

    UNITTEST_EQ(ark::gpuFree(dev_ptr), ark::gpuSuccess);
    return ark::unittest::SUCCESS;
}

ark::unittest::State test_executor_tensor_read_write_no_stride() {
    return test_executor_tensor_read_write({1024}, {}, {});
}

ark::unittest::State test_executor_tensor_read_write_stride_offset() {
    return test_executor_tensor_read_write({4, 512}, {4, 1024}, {0, 512});
}

ark::unittest::State test_executor_invalid() {
    // Invalid device ID.
    UNITTEST_THROW(ark::Executor(-1, nullptr, "test", ""),
                   ark::InvalidUsageError);

    // Invalid rank.
    ark::PlanJson plan;
    plan["Rank"] = 1;
    UNITTEST_THROW(ark::Executor(0, nullptr, "test", plan.dump(), true),
                   ark::InvalidUsageError);

    return ark::unittest::SUCCESS;
}

int main() {
    UNITTEST(test_executor_loop);
    UNITTEST(test_executor_no_loop);
    UNITTEST(test_executor_tensor_read_write_no_stride);
    UNITTEST(test_executor_tensor_read_write_stride_offset);
    UNITTEST(test_executor_invalid);
    return 0;
}

ark/api/planner.cpp

+1 −1

@@ -7,7 +7,7 @@
 #include "context_impl.hpp"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_manager.hpp"
 #include "model/model_json.hpp"
 #include "model/model_node.hpp"
 #include "model/model_op.hpp"

ark/codegen.cpp

+1 −1

@@ -174,7 +174,7 @@ CodeGenerator::Impl::Impl(const PlanJson &plan,
         {"@NUM_WARPS_PER_BLOCK@", std::to_string(num_warps_per_proc_)},
         {"@DEFINITIONS@", definitions_ss.str()},
         {"@BODY@", body_ss.str()},
-        {"@NAME@", name_},
+        {"@NAME@", (name_.empty() ? "" : "_" + name_)},
     };
     code_ = replace(template_code, replacements);
 }

ark/gpu/gpu.h → ark/gpu/gpu.hpp

+4 −3

@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

-#ifndef ARK_GPU_H_
-#define ARK_GPU_H_
+#ifndef ARK_GPU_HPP_
+#define ARK_GPU_HPP_

 #include <functional>

@@ -125,6 +125,7 @@ ARK_GPU_DEFINE_CONSTANT_ALIAS(gpuPointerAttributeSyncMemops,
 // runtime API
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetErrorString, cudaGetErrorString,
                           hipGetErrorString);
+ARK_GPU_DEFINE_FUNC_ALIAS(gpuGetLastError, cudaGetLastError, hipGetLastError);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceGetAttribute, cudaDeviceGetAttribute,
                           hipDeviceGetAttribute);
 ARK_GPU_DEFINE_FUNC_ALIAS(gpuDeviceSynchronize, cudaDeviceSynchronize,
@@ -183,4 +184,4 @@ ARK_GPU_DEFINE_FUNC_ALIAS(gpuPointerSetAttribute, cuPointerSetAttribute,

 } // namespace ark

-#endif // ARK_GPU_H_
+#endif // ARK_GPU_HPP_

ark/gpu/gpu_compile.cpp

+2 −2

@@ -1,7 +1,7 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

-#include "gpu/gpu_compile.h"
+#include "gpu/gpu_compile.hpp"

 #include <sys/types.h>
 #include <sys/wait.h>
@@ -22,7 +22,7 @@
 #include "cpu_timer.h"
 #include "env.h"
 #include "file_io.h"
-#include "gpu/gpu_logging.h"
+#include "gpu/gpu_logging.hpp"
 #include "utils/utils_string.hpp"

 #define ARK_DEBUG_KERNEL 0

ark/gpu/gpu_compile.h → ark/gpu/gpu_compile.hpp

+3 −3

@@ -1,8 +1,8 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

-#ifndef ARK_GPU_COMPILE_H_
-#define ARK_GPU_COMPILE_H_
+#ifndef ARK_GPU_COMPILE_HPP_
+#define ARK_GPU_COMPILE_HPP_

 #include <string>
 #include <vector>
@@ -16,4 +16,4 @@ const std::string gpu_compile(const std::vector<std::string> &codes,

 } // namespace ark

-#endif // ARK_GPU_COMPILE_H_
+#endif // ARK_GPU_COMPILE_HPP_

ark/gpu/gpu_event.cpp

+7 −10

@@ -1,11 +1,10 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

-#include "gpu/gpu_event.h"
+#include "gpu/gpu_event.hpp"

-#include "gpu/gpu.h"
-#include "gpu/gpu_logging.h"
-#include "gpu/gpu_manager.h"
+#include "gpu/gpu_logging.hpp"
+#include "gpu/gpu_manager.hpp"

 namespace ark {
 class GpuEvent::Impl {
@@ -15,7 +14,7 @@ class GpuEvent::Impl {
     Impl(const Impl&) = delete;
     Impl& operator=(const Impl&) = delete;

-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent& other) const;

    private:
@@ -32,8 +31,8 @@ GpuEvent::Impl::Impl(bool disable_timing) {

 GpuEvent::Impl::~Impl() { GLOG(gpuEventDestroy(event_)); }

-void GpuEvent::Impl::record(std::shared_ptr<GpuStream> stream) {
-    GLOG(gpuEventRecord(event_, stream->get()));
+void GpuEvent::Impl::record(gpuStream stream) {
+    GLOG(gpuEventRecord(event_, stream));
 }

 float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
@@ -45,9 +44,7 @@ float GpuEvent::Impl::elapsed_msec(const GpuEvent& other) const {
 GpuEvent::GpuEvent(bool disable_timing)
     : pimpl_(std::make_shared<Impl>(disable_timing)) {}

-void GpuEvent::record(std::shared_ptr<GpuStream> stream) {
-    pimpl_->record(stream);
-}
+void GpuEvent::record(gpuStream stream) { pimpl_->record(stream); }

 float GpuEvent::elapsed_msec(const GpuEvent& other) const {
     return pimpl_->elapsed_msec(other);

ark/gpu/gpu_event.h → ark/gpu/gpu_event.hpp

+6 −4

@@ -1,11 +1,13 @@
 // Copyright (c) Microsoft Corporation.
 // Licensed under the MIT license.

-#ifndef ARK_GPU_EVENT_H_
-#define ARK_GPU_EVENT_H_
+#ifndef ARK_GPU_EVENT_HPP_
+#define ARK_GPU_EVENT_HPP_

 #include <memory>

+#include "gpu/gpu.hpp"
+
 namespace ark {

 class GpuStream;
@@ -17,7 +19,7 @@ class GpuEvent {
     GpuEvent(const GpuEvent &) = delete;
     GpuEvent &operator=(const GpuEvent &) = delete;

-    void record(std::shared_ptr<GpuStream> stream);
+    void record(gpuStream stream);
     float elapsed_msec(const GpuEvent &other) const;

    protected:
@@ -31,4 +33,4 @@ class GpuEvent {
 };
 } // namespace ark

-#endif // ARK_GPU_EVENT_H_
+#endif // ARK_GPU_EVENT_HPP_
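With this header change, `GpuEvent::record()` takes the raw `gpuStream` handle instead of a `std::shared_ptr<GpuStream>`. A hypothetical call site, sketched only to illustrate the new signature (the timing helper below is not part of the commit, and it assumes `gpuDeviceSynchronize` from `gpu.hpp` is used to let both events complete):

#include "gpu/gpu.hpp"
#include "gpu/gpu_event.hpp"

// Hypothetical helper showing the updated GpuEvent::record(gpuStream) call.
float time_between_events(ark::gpuStream stream) {
    ark::GpuEvent start(false);  // timing enabled
    ark::GpuEvent stop(false);
    start.record(stream);  // previously: record(std::shared_ptr<GpuStream>)
    // ... enqueue work on `stream` here ...
    stop.record(stream);
    (void)ark::gpuDeviceSynchronize();  // ensure both events have completed
    return stop.elapsed_msec(start);    // elapsed milliseconds between events
}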
