Support more than one GPU per node

Xi Luo · Xi Luo · commit a3d2603717b9 · 2020-07-20T17:03:45.000-07:00
diff --git a/core/core.cc b/core/core.cc
@@ -80,11 +80,23 @@ void Kernel::execute(long graph_index, long timestep, long point,
     assert(timestep >= 0 && point >= 0);
     execute_kernel_imbalance(*this, graph_index, timestep, point);
     break;
+  default:
+    assert(false && "unimplemented kernel type");
+  };
+}
+
+void Kernel::execute(long graph_index, long timestep, long point,
+                     char *scratch_ptr, size_t scratch_bytes, int gpu_id) const
+{
+  switch(type) {
+  case KernelType::EMPTY:
+    execute_kernel_empty(*this);
+    break;
 #ifdef ENABLE_CUDA
   case KernelType::CUDA_COMPUTE_BOUND:
     assert(scratch_ptr != NULL);
     assert(scratch_bytes > 0);
-    execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes);
+    execute_kernel_compute_cuda(*this, scratch_ptr, scratch_bytes, gpu_id);
     break; 
 #endif
   default:
@@ -633,6 +645,81 @@ void TaskGraph::execute_point(long timestep, long point,
   k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes);
 }
 
+// GPU-aware overload of execute_point: checks the inputs, output and scratch
+// of task (timestep, point), fills the output with (timestep, point) pairs,
+// then runs the kernel, forwarding gpu_id to Kernel::execute.
+void TaskGraph::execute_point(long timestep, long point,
+                              char *output_ptr, size_t output_bytes,
+                              const char **input_ptr, const size_t *input_bytes,
+                              size_t n_inputs,
+                              char *scratch_ptr, size_t scratch_bytes,
+                              int gpu_id) const
+{
+#ifdef DEBUG_CORE
+  // Validate graph_index; shift a mask-typed 1 so high bits are not shifted as int
+  assert(graph_index >= 0 && graph_index < sizeof(TaskGraphMask)*8);
+  has_executed_graph |= TaskGraphMask(1) << graph_index;
+#endif
+
+  // Validate timestep and point
+  assert(0 <= timestep && timestep < timesteps);
+  long offset = offset_at_timestep(timestep);
+  long width = width_at_timestep(timestep);
+  assert(offset <= point && point < offset+width);
+
+  long last_offset = offset_at_timestep(timestep-1);
+  long last_width = width_at_timestep(timestep-1);
+
+  // Validate input
+  {
+    size_t idx = 0;
+    long dset = dependence_set_at_timestep(timestep);
+    size_t max_deps = num_dependencies(dset, point);
+    std::pair<long, long> *deps = reinterpret_cast<std::pair<long, long> *>(alloca(sizeof(std::pair<long, long>) * max_deps));
+    size_t num_deps = dependencies(dset, point, deps);
+    for (size_t span = 0; span < num_deps; span++) {
+      for (long dep = deps[span].first; dep <= deps[span].second; dep++) {
+        if (last_offset <= dep && dep < last_offset + last_width) {
+          assert(idx < n_inputs);
+          assert(input_bytes[idx] == output_bytes_per_task);
+          assert(input_bytes[idx] >= sizeof(std::pair<long, long>));
+          const std::pair<long, long> *input = reinterpret_cast<const std::pair<long, long> *>(input_ptr[idx]);
+          for (size_t i = 0; i < input_bytes[idx]/sizeof(std::pair<long, long>); ++i) {
+            assert(input[i].first == timestep - 1);
+            assert(input[i].second == dep);
+          }
+          idx++;
+        }
+      }
+    }
+    // FIXME (Elliott): Legion is currently passing in uninitialized
+    // memory for dependencies outside of the last offset/width.
+    // assert(idx == n_inputs);
+  }
+
+  // Validate output
+  assert(output_bytes == output_bytes_per_task);
+  assert(output_bytes >= sizeof(std::pair<long, long>));
+
+  // Generate output
+  std::pair<long, long> *output = reinterpret_cast<std::pair<long, long> *>(output_ptr);
+  for (size_t i = 0; i < output_bytes/sizeof(std::pair<long, long>); ++i) {
+    output[i].first = timestep;
+    output[i].second = point;
+  }
+
+  // Validate scratch
+  assert(scratch_bytes == scratch_bytes_per_task);
+  if (scratch_bytes > 0) {
+    uint64_t *scratch = reinterpret_cast<uint64_t *>(scratch_ptr);
+    assert(*scratch == MAGIC_VALUE);
+  }
+
+  // Execute kernel
+  Kernel k(kernel);
+  k.execute(graph_index, timestep, point, scratch_ptr, scratch_bytes, gpu_id);
+}
+
 void TaskGraph::prepare_scratch(char *scratch_ptr, size_t scratch_bytes)
 {
   assert(scratch_bytes % sizeof(uint64_t) == 0);
@@ -654,7 +741,7 @@ static TaskGraph default_graph(long graph_index)
   graph.period = -1;
   graph.fraction_connected = 0.25;
 #ifdef ENABLE_CUDA
-  graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 0, 1};
+  graph.kernel = {KernelType::EMPTY, 0, 16, 0.0, 1, 32, 0, 1};
 #else
   graph.kernel = {KernelType::EMPTY, 0, 16, 0.0};
 #endif
@@ -962,9 +1049,6 @@ App::App(int argc, char **argv)
   
   check();
   
-#ifdef ENABLE_CUDA
-  init_cuda_support(graphs);
-#endif
 }
 
 void App::check() const
@@ -1253,8 +1337,5 @@ void App::report_timing(double elapsed_seconds) const
 #ifdef DEBUG_CORE
   printf("Task Graph Execution Mask %llx\n", has_executed_graph.load());
 #endif
-  
-#ifdef ENABLE_CUDA
-  fini_cuda_support();
-#endif
+
 }
diff --git a/core/core.h b/core/core.h
@@ -34,6 +34,8 @@ struct Kernel : public kernel_t {
 private:
   void execute(long graph_index, long timestep, long point,
                char *scratch_ptr, size_t scratch_bytes) const;
+  void execute(long graph_index, long timestep, long point,
+               char *scratch_ptr, size_t scratch_bytes, int gpu_id) const;
   friend struct TaskGraph;
 };
 
@@ -69,6 +71,12 @@ struct TaskGraph : public task_graph_t {
                      const char **input_ptr, const size_t *input_bytes,
                      size_t n_inputs,
                      char *scratch_ptr, size_t scratch_bytes) const;
+  void execute_point(long timestep, long point,
+                     char *output_ptr, size_t output_bytes,
+                     const char **input_ptr, const size_t *input_bytes,
+                     size_t n_inputs,
+                     char *scratch_ptr, size_t scratch_bytes,
+                     int gpu_id) const;
   static void prepare_scratch(char *scratch_ptr, size_t scratch_bytes);
 };
 
diff --git a/core/core_c.h b/core/core_c.h
@@ -63,7 +63,6 @@ typedef struct kernel_t {
   int nb_blocks;
   int threads_per_block;
   int memcpy_required;
-  int gpu_id;
   int cuda_unroll;
 #endif
 } kernel_t;
diff --git a/core/cuda_kernel.cu b/core/cuda_kernel.cu
@@ -3,7 +3,6 @@
 #include "cuda_kernel.h"
 
 std::vector<char*> local_buffer;
-int nb_local_buffer = 0;
 size_t local_buffer_size;
 cudaStream_t cuda_stream_array[8];
 
@@ -27,61 +26,75 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A
 __global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, double *A);
 __global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);
 
+// Set up CUDA state for this process: for each entry i of local_gpus, select
+// device local_gpus[i], allocate local_buffer[i] on it and create
+// cuda_stream_array[i]. Buffer dimensions come from graphs[0].kernel only.
-void init_cuda_support(const std::vector<TaskGraph> &graphs)
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
 {
-  int nb_gpus = 1;
+  int nb_gpus = local_gpus.size();
+  // cuda_stream_array is a fixed-size array; never index past its end.
+  assert((size_t)nb_gpus <= sizeof(cuda_stream_array) / sizeof(cuda_stream_array[0]));
   
-  nb_local_buffer = nb_gpus;
-  local_buffer.reserve(nb_local_buffer);
+  // resize, not reserve: the elements must exist before local_buffer[i] is
+  // written below, and fini_cuda_support() loops up to local_buffer.size().
+  local_buffer.resize(nb_gpus);
   int nb_blocks = graphs[0].kernel.nb_blocks;
   int threads_per_block = graphs[0].kernel.threads_per_block;
   int cuda_unroll = graphs[0].kernel.cuda_unroll;
   printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
   local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
   for (int i = 0; i < nb_gpus; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaMalloc((void**)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
     assert(local_buffer[i] != NULL);
     gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
   }
 }
 
-void fini_cuda_support()
+// Release the per-GPU buffers and streams created by init_cuda_support().
+// local_gpus must be the same list that was passed to init_cuda_support().
+void fini_cuda_support(const std::vector<int> &local_gpus)
 {
-  for (int i = 0; i < nb_local_buffer; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+  assert(local_gpus.size() >= local_buffer.size());
+  for (size_t i = 0; i < local_buffer.size(); i++) {
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
     gpuErrchk( cudaFree(local_buffer[i]) );
     local_buffer[i] = NULL;
     gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
   }
+  local_buffer.clear();
 }
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
+// Run the compute-bound CUDA kernel in slot gpu_id. gpu_id indexes
+// local_buffer / cuda_stream_array (i.e. a position in the local_gpus list
+// given to init_cuda_support()); it is not passed to cudaSetDevice() here.
+// NOTE(review): presumably the caller has already made that slot's device current - confirm.
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
 {
-//  printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
+  // printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
   assert(scratch_bytes <= local_buffer_size);
-  assert(kernel.gpu_id == 0);
+  assert(gpu_id >= 0 && (size_t)gpu_id < local_buffer.size());
     
   if (kernel.memcpy_required == 1) {
  //   printf("enable memcpy in\n");
-    gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) ); 
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) ); 
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
   }
   if (kernel.cuda_unroll == 4) {
-    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
   } else if (kernel.cuda_unroll == 8) {
-    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
   } else if (kernel.cuda_unroll == 16) {
-    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
   } else {
-    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
   }
   gpuErrchk( cudaPeekAtLastError() );
-  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
   if (kernel.memcpy_required == 1) {
  //   printf("enable memcpy out\n");
-    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
   }
 }
 
diff --git a/core/cuda_kernel.h b/core/cuda_kernel.h
@@ -11,10 +11,10 @@ extern std::vector<char*> local_buffer;
 
 extern size_t local_buffer_size;
 
-void init_cuda_support(const std::vector<TaskGraph> &graphs);
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus);
 
-void fini_cuda_support();
+void fini_cuda_support(const std::vector<int> &local_gpus);
 
-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes);
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id);
 
 #endif
diff --git a/mpi_gpu/nonblock_gpu.cc b/mpi_gpu/nonblock_gpu.cc
@@ -22,7 +22,9 @@
 #include "core.h"
 
 #include "mpi.h"
+#include "cuda_kernel.h"
 
+// Only support one gpu per MPI process
 int main(int argc, char *argv[])
 {
   MPI_Init(&argc, &argv);
@@ -31,6 +33,14 @@ int main(int argc, char *argv[])
   MPI_Comm_rank(MPI_COMM_WORLD, &rank);
 
   App app(argc, argv);
+
+  int local_rank, local_size;
+  MPI_Comm MPI_COMM_LOCAL;
+  MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_LOCAL);
+  MPI_Comm_rank(MPI_COMM_LOCAL, &local_rank);
+  std::vector<int> local_gpus(1, local_rank);
+  init_cuda_support(app.graphs, local_gpus);
+
   if (rank == 0) app.display();
 
   double elapsed_time = 0.0;
@@ -177,7 +187,7 @@ int main(int argc, char *argv[])
             graph.execute_point(timestep, point,
                                 point_output.data(), point_output.size(),
                                 point_input_ptr.data(), point_input_bytes.data(), point_n_inputs,
-                                scratch_ptr, scratch_bytes);
+                                scratch_ptr, scratch_bytes, 0);
           }
         }
       }
@@ -195,5 +205,7 @@ int main(int argc, char *argv[])
     app.report_timing(elapsed_time);
   }
 
+  fini_cuda_support(local_gpus);
+
   MPI_Finalize();
 }

Original file line number	Diff line number	Diff line change
`@@ -22,7 +22,9 @@`
`22`	`22`	`#include "core.h"`
`23`	`23`
`24`	`24`	`#include "mpi.h"`
	`25`	`+#include "cuda_kernel.h"`
`25`	`26`
	`27`	`+// Only support one gpu per MPI process`
`26`	`28`	`int main(int argc, char *argv[])`
`27`	`29`	`{`
`28`	`30`	`MPI_Init(&argc, &argv);`
`@@ -31,6 +33,14 @@ int main(int argc, char *argv[])`
`31`	`33`	`MPI_Comm_rank(MPI_COMM_WORLD, &rank);`
`32`	`34`
`33`	`35`	`App app(argc, argv);`
	`36`	`+`
	`37`	`+ int local_rank, local_size;`
	`38`	`+ MPI_Comm MPI_COMM_LOCAL;`
	`39`	`+ MPI_Comm_split_type(MPI_COMM_WORLD, MPI_COMM_TYPE_SHARED, 0 /* key */, MPI_INFO_NULL, &MPI_COMM_LOCAL);`
	`40`	`+ MPI_Comm_rank(MPI_COMM_LOCAL, &local_rank);`
	`41`	`+ std::vector<int> local_gpus(1, local_rank);`
	`42`	`+ init_cuda_support(app.graphs, local_gpus);`
	`43`	`+`
`34`	`44`	`if (rank == 0) app.display();`
`35`	`45`
`36`	`46`	`double elapsed_time = 0.0;`
`@@ -177,7 +187,7 @@ int main(int argc, char *argv[])`
`177`	`187`	`graph.execute_point(timestep, point,`
`178`	`188`	`point_output.data(), point_output.size(),`
`179`	`189`	`point_input_ptr.data(), point_input_bytes.data(), point_n_inputs,`
`180`		`- scratch_ptr, scratch_bytes);`
	`190`	`+ scratch_ptr, scratch_bytes, 0);`
`181`	`191`	`}`
`182`	`192`	`}`
`183`	`193`	`}`
`@@ -195,5 +205,7 @@ int main(int argc, char *argv[])`
`195`	`205`	`app.report_timing(elapsed_time);`
`196`	`206`	`}`
`197`	`207`
	`208`	`+ fini_cuda_support(local_gpus);`
	`209`	`+`
`198`	`210`	`MPI_Finalize();`
`199`	`211`	`}`