
Commit 79be470

valentinandrei authored and facebook-github-bot committed
Modify CUDA test to attempt overlapping D2H transfers
Summary: Distributed checkpointing for GenAI requires very expensive memory downloads from the GPU, and these can block the trainer thread whenever it happens to issue a D2H transfer of its own. For example, we want model parameter and optimizer state downloads to overlap with compute; however, if the forward-pass or backward-pass thread issues a D2H transfer for any reason, that transfer has to wait until the checkpoint download completes.

This code is a stress-test program for Kineto that issues CUDA kernels, memory copies, and UVM accesses in a configurable way. This change lets us issue multiple GPU D2H downloads to host memory using multiple streams on multiple threads. Previously the D2H downloads were very short because we downloaded a single 4-byte output value; with this change we download an entire buffer.

Reviewed By: xerothermic

Differential Revision: D62601073

fbshipit-source-id: ed192723403787f37d45bf63d39e1a768df4a1d3
1 parent ca1eedb commit 79be470
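For context, the pattern the test now stresses is a large D2H download issued on a dedicated non-blocking stream so that it can overlap with compute running on another stream. The standalone sketch below is not part of this commit; the buffer size, the dummy kernel, and all names are illustrative assumptions, but the pinned-memory and stream setup mirrors what the stress test does.

// Minimal sketch: overlap a bulk D2H checkpoint download with compute.
// Assumptions: n and busy_kernel are placeholders for illustration only.
#include <cuda_runtime.h>
#include <cstdio>

__global__ void busy_kernel(float* data, int n) {
  int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < n) {
    float v = data[idx];
    for (int i = 0; i < 1000; ++i) {
      v = v * 1.0001f + 0.5f;  // arbitrary work to keep the SMs busy
    }
    data[idx] = v;
  }
}

int main() {
  const int n = 1 << 24;  // illustrative element count
  float *d_compute = nullptr, *d_ckpt = nullptr, *h_ckpt = nullptr;
  cudaMalloc((void**)&d_compute, n * sizeof(float));
  cudaMalloc((void**)&d_ckpt, n * sizeof(float));
  // Pinned host memory so the async copy does not fall back to a blocking path.
  cudaHostAlloc((void**)&h_ckpt, n * sizeof(float), cudaHostAllocDefault);

  cudaStream_t compute_stream, memcpy_stream;
  cudaStreamCreateWithFlags(&compute_stream, cudaStreamNonBlocking);
  cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking);

  // The kernel and the full-buffer D2H download are issued on different
  // non-blocking streams, so the hardware is free to overlap them.
  busy_kernel<<<(n + 255) / 256, 256, 0, compute_stream>>>(d_compute, n);
  cudaMemcpyAsync(h_ckpt, d_ckpt, n * sizeof(float),
                  cudaMemcpyDeviceToHost, memcpy_stream);

  cudaStreamSynchronize(compute_stream);
  cudaStreamSynchronize(memcpy_stream);
  printf("first checkpoint value: %f\n", h_ckpt[0]);

  cudaFreeHost(h_ckpt);
  cudaFree(d_compute);
  cudaFree(d_ckpt);
  cudaStreamDestroy(compute_stream);
  cudaStreamDestroy(memcpy_stream);
  return 0;
}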

File tree

4 files changed: +44 −47 lines

libkineto/stress_test/kineto_stress_test.cpp

Lines changed: 8 additions & 1 deletion
@@ -182,7 +182,14 @@ void create_cuda_streams(stress_test_args& test_args) {
   if (test_args.use_memcpy_stream) {
     test_args.memcpy_streams = (cudaStream_t*)malloc(test_args.num_workers * sizeof(cudaStream_t));
     for (uint32_t i = 0; i < test_args.num_workers; ++i) {
-      checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__);
+      if (i % 2 != 0) {
+        checkCudaStatus(cudaStreamCreateWithFlags(test_args.memcpy_streams + i, cudaStreamNonBlocking), __LINE__);
+      } else {
+        int leastPriority = 0;
+        int greatestPriority = 0;
+        checkCudaStatus(cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority), __LINE__);
+        checkCudaStatus(cudaStreamCreateWithPriority(test_args.memcpy_streams + i, cudaStreamNonBlocking, leastPriority), __LINE__);
+      }
     }
   }
 
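As a side note, the else-branch above relies on the CUDA stream priority API. A hedged standalone sketch of that API follows (the reported range varies by device; on many GPUs the least priority is 0 and the greatest is a small negative number):

#include <cuda_runtime.h>
#include <cstdio>

int main() {
  int leastPriority = 0;    // numerically largest value = lowest priority
  int greatestPriority = 0; // numerically smallest value = highest priority
  cudaDeviceGetStreamPriorityRange(&leastPriority, &greatestPriority);
  printf("stream priority range: least=%d greatest=%d\n",
         leastPriority, greatestPriority);

  cudaStream_t low_prio, high_prio;
  // Low-priority stream, e.g. for background checkpoint downloads.
  cudaStreamCreateWithPriority(&low_prio, cudaStreamNonBlocking, leastPriority);
  // High-priority stream, e.g. for latency-sensitive compute.
  cudaStreamCreateWithPriority(&high_prio, cudaStreamNonBlocking, greatestPriority);

  // ... enqueue work; when both streams have pending blocks the scheduler
  // favors the high-priority stream ...

  cudaStreamDestroy(low_prio);
  cudaStreamDestroy(high_prio);
  return 0;
}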

libkineto/stress_test/random_ops_stress_test.cu

Lines changed: 21 additions & 8 deletions
@@ -14,6 +14,7 @@
 
 namespace kineto_stress_test {
 
+#define CUDA_API_PER_THREAD_DEFAULT_STREAM
 #define RNG_SEED 2049
 
 // NCCL variables buffers
@@ -123,15 +124,15 @@ void run_stress_test(
   } else {
     v_streams = (cudaStream_t*)malloc(test_args.num_cuda_streams * sizeof(cudaStream_t));
     for (uint32_t i = 0; i < test_args.num_cuda_streams; ++i) {
-      checkCudaStatus(cudaStreamCreate(v_streams + i), __LINE__);
+      checkCudaStatus(cudaStreamCreateWithFlags(v_streams + i, cudaStreamNonBlocking), __LINE__);
     }
 
     if (test_args.use_memcpy_stream) {
-      checkCudaStatus(cudaStreamCreate(&memcpy_stream), __LINE__);
+      checkCudaStatus(cudaStreamCreateWithFlags(&memcpy_stream, cudaStreamNonBlocking), __LINE__);
     }
 
     if (test_args.use_uvm_stream) {
-      checkCudaStatus(cudaStreamCreate(&uvm_stream), __LINE__);
+      checkCudaStatus(cudaStreamCreateWithFlags(&uvm_stream, cudaStreamNonBlocking), __LINE__);
     }
   }
 
@@ -268,17 +269,29 @@ void run_stress_test(
             szTransfer, cudaMemcpyDeviceToDevice), __LINE__);
       }
 
-      // Simulate output download
-      if (p_memory_pool[pair_idx].b_copy_d2h) {
+      // Simulate checkpoint download. The odd workers will have higher stream priorities
+      // but lower number of transactions
+      bool enable_d2h_copy = p_memory_pool[pair_idx].b_copy_d2h;
+      if (thread_id % 2 != 0) {
+        if (rand_r(&rng_state) % 100 < 97) {
+          enable_d2h_copy = false;
+        }
+      }
+
+      if (enable_d2h_copy) {
+        // checkCudaStatus(cudaStreamSynchronize(current_stream), __LINE__);
        uint32_t rand_index = rand_r(&rng_state) % p_memory_pool[pair_idx].n_elements;
        checkCudaStatus(
            cudaMemcpyAsync(
-               h_output + i,
-               p_memory_pool[pair_idx].d_C + rand_index,
-               sizeof(float),
+               p_memory_pool[pair_idx].h_C,
+               p_memory_pool[pair_idx].d_C,
+               p_memory_pool[pair_idx].n_elements * sizeof(float),
                cudaMemcpyDeviceToHost,
                current_memcpy_stream),
            __LINE__);
+        uint32_t rand_idx_out = rand_r(&rng_state) % test_args.num_operations;
+        // checkCudaStatus(cudaStreamSynchronize(current_memcpy_stream), __LINE__);
+        h_output[rand_idx_out] = p_memory_pool[pair_idx].h_C[rand_index];
      }
 
      // Get memory during execution
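One detail the new full-buffer copy depends on, and presumably why h_C is allocated with cudaHostAlloc in tensor_cache.cu below: cudaMemcpyAsync can only overlap with other work when the host buffer is pinned, whereas a pageable destination is staged by the driver and behaves largely synchronously. A minimal sketch of the difference (names are hypothetical):

#include <cuda_runtime.h>
#include <cstdlib>

void d2h_copy_examples(const float* d_src, size_t n, cudaStream_t stream) {
  // Pinned destination: the copy can overlap kernels on other streams.
  float* h_pinned = nullptr;
  cudaHostAlloc((void**)&h_pinned, n * sizeof(float), cudaHostAllocDefault);
  cudaMemcpyAsync(h_pinned, d_src, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);

  // Pageable destination: the same call works but typically serializes.
  float* h_pageable = (float*)malloc(n * sizeof(float));
  cudaMemcpyAsync(h_pageable, d_src, n * sizeof(float),
                  cudaMemcpyDeviceToHost, stream);

  cudaStreamSynchronize(stream);
  cudaFreeHost(h_pinned);
  free(h_pageable);
}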

libkineto/stress_test/tensor_cache.cu

Lines changed: 14 additions & 38 deletions
@@ -13,6 +13,7 @@
 
 namespace kineto_stress_test {
 
+#define CUDA_API_PER_THREAD_DEFAULT_STREAM
 #define RNG_SEED 1025
 
 // A kernel that fills a device buffer with random values
@@ -92,8 +93,11 @@ void add_pairs_to_tensor_cache(tensor_cache_args cache_args, uint32_t
   // Simulate output download
   if (((float)(rand() % 32767) / 32767.0) < cache_args.prob_d2h) {
     p_memory_pool[i].b_copy_d2h = true;
+    checkCudaStatus(cudaHostAlloc(&p_memory_pool[i].h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__);
+    simple_lcg_host(p_memory_pool[i].h_C, num_elements);
   } else {
     p_memory_pool[i].b_copy_d2h = false;
+    p_memory_pool[i].h_C = NULL;
   }
 
   // Now we have a new tensor pair
@@ -151,42 +155,6 @@ void re_initialize_buffer_values() {
 }
 
 void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream) {
-  // Older CUDA versions don't know about async malloc and free
-#if defined(CUDA_VERSION) && CUDA_VERSION > 11000 && defined(ASYNC_MALLOC)
-
-  checkCudaStatus(
-      cudaFreeAsync(tensor_pair->d_A, stream),
-      __LINE__);
-  checkCudaStatus(
-      cudaFreeAsync(tensor_pair->d_B, stream),
-      __LINE__);
-  checkCudaStatus(
-      cudaFreeAsync(tensor_pair->d_C, stream),
-      __LINE__);
-
-  // Allocate device buffers
-  uint32_t num_elements = tensor_pair->n_elements;
-  checkCudaStatus(
-      cudaMallocAsync(
-          &tensor_pair->d_A,
-          num_elements * sizeof(float),
-          stream),
-      __LINE__);
-  checkCudaStatus(
-      cudaMallocAsync(
-          &tensor_pair->d_B,
-          num_elements * sizeof(float),
-          stream),
-      __LINE__);
-  checkCudaStatus(
-      cudaMallocAsync(
-          &tensor_pair->d_C,
-          num_elements * sizeof(float),
-          stream),
-      __LINE__);
-
-#else
-
   checkCudaStatus(cudaFree(tensor_pair->d_A), __LINE__);
   checkCudaStatus(cudaFree(tensor_pair->d_B), __LINE__);
   checkCudaStatus(cudaFree(tensor_pair->d_C), __LINE__);
@@ -203,8 +171,6 @@ void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream
           num_elements * sizeof(float)),
       __LINE__);
 
-#endif // CUDA_VERSION >= 11000
-
   if (tensor_pair->b_copy_h2d) {
     checkCudaStatus(cudaFreeHost(tensor_pair->h_A), __LINE__);
     checkCudaStatus(cudaFreeHost(tensor_pair->h_B), __LINE__);
@@ -215,6 +181,12 @@ void free_and_realloc_tensor_pairs(tensor_pair *tensor_pair, cudaStream_t stream
     simple_lcg_host(tensor_pair->h_A, num_elements);
     simple_lcg_host(tensor_pair->h_B, num_elements);
   }
+
+  if (tensor_pair->b_copy_d2h) {
+    checkCudaStatus(cudaFreeHost(tensor_pair->h_C), __LINE__);
+    checkCudaStatus(cudaHostAlloc(&tensor_pair->h_C, num_elements * sizeof(float), cudaHostAllocDefault), __LINE__);
+    simple_lcg_host(tensor_pair->h_C, num_elements);
+  }
 }
 
 void free_tensor_cache() {
@@ -231,6 +203,10 @@ void free_tensor_cache() {
     if (p_memory_pool[i].h_B) {
       checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_B), __LINE__);
     }
+
+    if (p_memory_pool[i].h_C) {
+      checkCudaStatus(cudaFreeHost(p_memory_pool[i].h_C), __LINE__);
+    }
   }
 }
 

libkineto/stress_test/tensor_cache.cuh

Lines changed: 1 addition & 0 deletions
@@ -42,6 +42,7 @@ struct tensor_pair {
   // Host buffers
   float* h_A;
   float* h_B;
+  float* h_C;
 };
 
 // The memory pool object
