Skip to content

Commit a3d2603

Browse files
author
Xi Luo
committed
Support more than one GPU per node
1 parent 03cadcc commit a3d2603

File tree

6 files changed

+133
-35
lines changed

6 files changed

+133
-35
lines changed

core/core.cc

Lines changed: 90 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -80,11 +80,23 @@ void Kernel::execute(long graph_index, long timestep, long point,
8080
assert(timestep >= 0 && point >= 0);
8181
execute_kernel_imbalance(*this, graph_index, timestep, point);
8282
break;
83+
default:
84+
assert(false && "unimplemented kernel type");
85+
};
86+
}
87+
88+
void Kernel::execute(long graph_index, long timestep, long point,
89+
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const
90+
{
91+
switch(type) {
92+
case KernelType::EMPTY:
93+
execute_kernel_empty(*this);
94+
break;
8395
#ifdef ENABLE_CUDA
8496
case KernelType::CUDA_COMPUTE_BOUND:
8597
assert(scratch_ptr != NULL);
8698
assert(scratch_bytes > 0);
87-
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes);
99+
execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id);
88100
break;
89101
#endif
90102
default:
@@ -633,6 +645,81 @@ void TaskGraph::execute_point(long timestep, long point,
633645
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes);
634646
}
635647

648+
void TaskGraph::execute_point(long timestep, long point,
649+
char *output_ptr, size_t output_bytes,
650+
const char **input_ptr, const size_t *input_bytes,
651+
size_t n_inputs,
652+
char *scratch_ptr, size_t scratch_bytes,
653+
int gpu_id) const
654+
{
655+
#ifdef DEBUG_CORE
656+
// Validate graph_index
657+
assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
658+
has_executed_graph |= 1 << graph_index;
659+
#endif
660+
661+
// Validate timestep and point
662+
assert(0 <= timestep && timestep < timesteps);
663+
664+
long offset = offset_at_timestep(timestep);
665+
long width = width_at_timestep(timestep);
666+
assert(offset <= point && point < offset+width);
667+
668+
long last_offset = offset_at_timestep(timestep-1);
669+
long last_width = width_at_timestep(timestep-1);
670+
671+
// Validate input
672+
{
673+
size_t idx = 0;
674+
long dset = dependence_set_at_timestep(timestep);
675+
size_t max_deps = num_dependencies(dset, point);
676+
std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
677+
size_t num_deps = dependencies(dset, point, deps);
678+
for (size_t span = 0; span < num_deps; span++) {
679+
for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
680+
if (last_offset <= dep && dep < last_offset + last_width) {
681+
assert(idx < n_inputs);
682+
683+
assert(input_bytes[idx] == output_bytes_per_task);
684+
assert(input_bytes[idx] >= sizeof(std::pair<long, long>));
685+
686+
const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
687+
for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
688+
assert(input[i].first == timestep - 1);
689+
assert(input[i].second == dep);
690+
}
691+
idx++;
692+
}
693+
}
694+
}
695+
// FIXME (Elliott): Legion is currently passing in uninitialized
696+
// memory for dependencies outside of the last offset/width.
697+
// assert(idx == n_inputs);
698+
}
699+
700+
// Validate output
701+
assert(output_bytes == output_bytes_per_task);
702+
assert(output_bytes >= sizeof(std::pair<long, long>));
703+
704+
// Generate output
705+
std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
706+
for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
707+
output[i].first = timestep;
708+
output[i].second = point;
709+
}
710+
711+
// Validate scratch
712+
assert(scratch_bytes == scratch_bytes_per_task);
713+
if (scratch_bytes > 0) {
714+
uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
715+
assert(*scratch == MAGIC_VALUE);
716+
}
717+
718+
// Execute kernel
719+
Kernel k(kernel);
720+
k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id);
721+
}
722+
636723
void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
637724
{
638725
assert(scratch_bytes % sizeof(uint64_t) == 0);
@@ -654,7 +741,7 @@ static TaskGraph default_graph(long graph_index)
654741
graph.period = -1;
655742
graph.fraction_connected = 0.25;
656743
#ifdef ENABLE_CUDA
657-
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1};
744+
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1};
658745
#else
659746
graph.kernel = {KernelType::EMPTY, 0, 16, 0.0};
660747
#endif
@@ -962,9 +1049,6 @@ App::App(int argc, char **argv)
9621049

9631050
check();
9641051

965-
#ifdef ENABLE_CUDA
966-
init_cuda_support(graphs);
967-
#endif
9681052
}
9691053

9701054
void App::check() const
@@ -1253,8 +1337,5 @@ void App::report_timing(double elapsed_seconds) const
12531337
#ifdef DEBUG_CORE
12541338
printf("Task Graph Execution Mask %llx\n", has_executed_graph.load());
12551339
#endif
1256-
1257-
#ifdef ENABLE_CUDA
1258-
fini_cuda_support();
1259-
#endif
1340+
12601341
}

core/core.h

Lines changed: 8 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -34,6 +34,8 @@ struct Kernel : public kernel_t {
3434
private:
3535
void execute(long graph_index, long timestep, long point,
3636
char *scratch_ptr, size_t scratch_bytes) const;
37+
void execute(long graph_index, long timestep, long point,
38+
char *scratch_ptr, size_t scratch_bytes, int gpu_id) const;
3739
friend struct TaskGraph;
3840
};
3941

@@ -69,6 +71,12 @@ struct TaskGraph : public task_graph_t {
6971
const char **input_ptr, const size_t *input_bytes,
7072
size_t n_inputs,
7173
char *scratch_ptr, size_t scratch_bytes) const;
74+
void execute_point(long timestep, long point,
75+
char *output_ptr, size_t output_bytes,
76+
const char **input_ptr, const size_t *input_bytes,
77+
size_t n_inputs,
78+
char *scratch_ptr, size_t scratch_bytes,
79+
int gpu_id) const;
7280
static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
7381
};
7482

core/core_c.h

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -63,7 +63,6 @@ typedef struct kernel_t {
6363
int nb_blocks;
6464
int threads_per_block;
6565
int memcpy_required;
66-
int gpu_id;
6766
int cuda_unroll;
6867
#endif
6968
} kernel_t;

core/cuda_kernel.cu

Lines changed: 19 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -3,7 +3,6 @@
33
#include "cuda_kernel.h"
44

55
std::vector<char*> local_buffer;
6-
int nb_local_buffer = 0;
76
size_t local_buffer_size;
87
cudaStream_t cuda_stream_array[8];
98

@@ -27,61 +26,60 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A
2726
__global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, double *A);
2827
__global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);
2928

30-
void init_cuda_support(const std::vector<TaskGraph> &graphs)
29+
void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
3130
{
32-
int nb_gpus = 1;
31+
int nb_gpus = local_gpus.size();
3332

34-
nb_local_buffer = nb_gpus;
35-
local_buffer.reserve(nb_local_buffer);
33+
local_buffer.reserve(nb_gpus);
3634
int nb_blocks = graphs[0].kernel.nb_blocks;
3735
int threads_per_block = graphs[0].kernel.threads_per_block;
3836
int cuda_unroll = graphs[0].kernel.cuda_unroll;
3937
printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
4038
local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
4139
for (int i = 0; i < nb_gpus; i++) {
42-
gpuErrchk( cudaSetDevice(0) );
40+
gpuErrchk( cudaSetDevice(local_gpus[i]) );
4341
gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
4442
assert(local_buffer[i] != NULL);
4543
gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
4644
}
4745
}
4846

49-
void fini_cuda_support()
47+
void fini_cuda_support(const std::vector<int> &local_gpus)
5048
{
51-
for (int i = 0; i < nb_local_buffer; i++) {
52-
gpuErrchk( cudaSetDevice(0) );
49+
for (int i = 0; i < local_buffer.size(); i++) {
50+
gpuErrchk( cudaSetDevice(local_gpus[i]) );
5351
gpuErrchk( cudaFree(local_buffer[i]) );
5452
local_buffer[i] = NULL;
5553
gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
5654
}
55+
local_buffer.clear();
5756
}
5857

59-
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
58+
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
6059
{
61-
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
60+
// printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
6261
assert(scratch_bytes <= local_buffer_size);
63-
assert(kernel.gpu_id == 0);
6462

6563
if (kernel.memcpy_required == 1) {
6664
// printf("enable memcpy in\n");
67-
gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
68-
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
65+
gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
66+
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
6967
}
7068
if (kernel.cuda_unroll == 4) {
71-
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
69+
execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
7270
} else if (kernel.cuda_unroll == 8) {
73-
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
71+
execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
7472
} else if (kernel.cuda_unroll == 16) {
75-
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
73+
execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
7674
} else {
77-
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
75+
execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
7876
}
7977
gpuErrchk( cudaPeekAtLastError() );
80-
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
78+
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
8179
if (kernel.memcpy_required == 1) {
8280
// printf("enable memcpy out\n");
83-
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
84-
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
81+
gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
82+
gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
8583
}
8684
}
8785

core/cuda_kernel.h

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -11,10 +11,10 @@ extern std::vector<char*> local_buffer;
1111

1212
extern size_t local_buffer_size;
1313

14-
void init_cuda_support(const std::vector<TaskGraph> &graphs);
14+
void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus);
1515

16-
void fini_cuda_support();
16+
void fini_cuda_support(const std::vector<int> &local_gpus);
1717

18-
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes);
18+
void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id);
1919

2020
#endif

mpi_gpu/nonblock_gpu.cc

Lines changed: 13 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -22,7 +22,9 @@
2222
#include "core.h"
2323

2424
#include "mpi.h"
25+
#include "cuda_kernel.h"
2526

27+
// Only support one gpu per MPI process
2628
int main(int argc, char *argv[])
2729
{
2830
MPI_Init(&argc, &argv);
@@ -31,6 +33,14 @@ int main(int argc, char *argv[])
3133
MPI_Comm_rank(MPI_COMM_WORLD, &rank);
3234

3335
App app(argc, argv);
36+
37+
int local_rank, local_size;
38+
MPI_Comm MPI_COMM_LOCAL;
39+
MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_LOCAL);
40+
MPI_Comm_rank(MPI_COMM_LOCAL, &local_rank);
41+
std::vector<int> local_gpus(1, local_rank);
42+
init_cuda_support(app.graphs, local_gpus);
43+
3444
if (rank == 0) app.display();
3545

3646
double elapsed_time = 0.0;
@@ -177,7 +187,7 @@ int main(int argc, char *argv[])
177187
graph.execute_point(timestep, point,
178188
point_output.data(), point_output.size(),
179189
point_input_ptr.data(), point_input_bytes.data(), point_n_inputs,
180-
scratch_ptr, scratch_bytes);
190+
scratch_ptr, scratch_bytes, 0);
181191
}
182192
}
183193
}
@@ -195,5 +205,7 @@ int main(int argc, char *argv[])
195205
app.report_timing(elapsed_time);
196206
}
197207

208+
fini_cuda_support(local_gpus);
209+
198210
MPI_Finalize();
199211
}

0 commit comments

Comments
 (0)