#include "cuda_kernel.h"

std::vector<char *> local_buffer;
-int nb_local_buffer = 0;
size_t local_buffer_size;
cudaStream_t cuda_stream_array[8];

@@ -27,61 +26,60 @@ __global__ void execute_kernel_compute_cuda_kernel_unroll_4(long iter, double *A
__global__ void execute_kernel_compute_cuda_kernel_unroll_8(long iter, double *A);
__global__ void execute_kernel_compute_cuda_kernel_unroll_16(long iter, double *A);

-void init_cuda_support(const std::vector<TaskGraph> &graphs)
+void init_cuda_support(const std::vector<TaskGraph> &graphs, const std::vector<int> &local_gpus)
{
-  int nb_gpus = 1;
+  int nb_gpus = local_gpus.size();

-  nb_local_buffer = nb_gpus;
-  local_buffer.reserve(nb_local_buffer);
+  local_buffer.resize(nb_gpus);  // resize (not reserve) so local_buffer[i] below is a valid element
  int nb_blocks = graphs[0].kernel.nb_blocks;
  int threads_per_block = graphs[0].kernel.threads_per_block;
  int cuda_unroll = graphs[0].kernel.cuda_unroll;
  printf("init cuda support nb_blocks %d, threads_per_block %d, cuda_unroll %d\n", nb_blocks, threads_per_block, cuda_unroll);
  local_buffer_size = nb_blocks * threads_per_block * sizeof(double);
  for (int i = 0; i < nb_gpus; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
    gpuErrchk( cudaMalloc((void **)&(local_buffer[i]), sizeof(double) * nb_blocks * threads_per_block * cuda_unroll) );
    assert(local_buffer[i] != NULL);
    gpuErrchk( cudaStreamCreate(&(cuda_stream_array[i])) );
  }
}

-void fini_cuda_support()
+void fini_cuda_support(const std::vector<int> &local_gpus)
{
-  for (int i = 0; i < nb_local_buffer; i++) {
-    gpuErrchk( cudaSetDevice(0) );
+  for (int i = 0; i < local_buffer.size(); i++) {
+    gpuErrchk( cudaSetDevice(local_gpus[i]) );
    gpuErrchk( cudaFree(local_buffer[i]) );
    local_buffer[i] = NULL;
    gpuErrchk( cudaStreamDestroy(cuda_stream_array[i]) );
  }
+  local_buffer.clear();
}

-void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes)
+void execute_kernel_compute_cuda(const Kernel &kernel, char *scratch_ptr, size_t scratch_bytes, int gpu_id)
{
-  // printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
+  //printf("CUDA COMPUTE KERNEL buffer %p, size %lld, nb_blocks %d, threads_per_block %d\n", scratch_ptr, scratch_bytes, kernel.nb_blocks, kernel.threads_per_block);
  assert(scratch_bytes <= local_buffer_size);
-  assert(kernel.gpu_id == 0);

  if (kernel.memcpy_required == 1) {
    // printf("enable memcpy in\n");
-    gpuErrchk( cudaMemcpyAsync(local_buffer[kernel.gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(local_buffer[gpu_id], scratch_ptr, scratch_bytes, cudaMemcpyHostToDevice, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
  }
  if (kernel.cuda_unroll == 4) {
-    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_4<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
  } else if (kernel.cuda_unroll == 8) {
-    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_8<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
  } else if (kernel.cuda_unroll == 16) {
-    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_16<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
  } else {
-    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[kernel.gpu_id]>>>(kernel.iterations, (double *)local_buffer[kernel.gpu_id]);
+    execute_kernel_compute_cuda_kernel_unroll_1<<<kernel.nb_blocks, kernel.threads_per_block, 0, cuda_stream_array[gpu_id]>>>(kernel.iterations, (double *)local_buffer[gpu_id]);
  }
  gpuErrchk( cudaPeekAtLastError() );
-  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+  gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
  if (kernel.memcpy_required == 1) {
    // printf("enable memcpy out\n");
-    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[kernel.gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[kernel.gpu_id]) );
-    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[kernel.gpu_id]) );
+    gpuErrchk( cudaMemcpyAsync(scratch_ptr, local_buffer[gpu_id], scratch_bytes, cudaMemcpyDeviceToHost, cuda_stream_array[gpu_id]) );
+    gpuErrchk( cudaStreamSynchronize(cuda_stream_array[gpu_id]) );
  }
}

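For context on how the reworked API is meant to be driven: init_cuda_support() now receives the list of CUDA devices owned by the calling rank and sets up one device buffer and one stream per device, while execute_kernel_compute_cuda() addresses them through a dense gpu_id index (the position within local_gpus, not the raw device number). Below is a minimal caller-side sketch, not part of the commit; the two-GPU mapping, the graph/scratch setup, and the explicit cudaSetDevice() before each launch are assumptions for illustration, with TaskGraph, Kernel, and gpuErrchk taken to be visible via cuda_kernel.h.

#include <vector>
#include "cuda_kernel.h"

// Hypothetical driver (illustration only, not from this commit).
void run_compute_on_local_gpus(const std::vector<TaskGraph> &graphs)
{
  std::vector<int> local_gpus = {0, 1};   // assumed: this rank owns CUDA devices 0 and 1
  init_cuda_support(graphs, local_gpus);  // one device buffer + one stream per local GPU

  // Host scratch sized to match local_buffer_size computed in init_cuda_support().
  size_t scratch_bytes = graphs[0].kernel.nb_blocks *
                         graphs[0].kernel.threads_per_block * sizeof(double);
  std::vector<char> scratch(scratch_bytes);

  for (size_t g = 0; g < local_gpus.size(); g++) {
    int gpu_id = (int)g;                       // index into local_buffer / cuda_stream_array
    gpuErrchk( cudaSetDevice(local_gpus[g]) ); // assumed: caller makes the matching device current
    execute_kernel_compute_cuda(graphs[0].kernel, scratch.data(), scratch_bytes, gpu_id);
  }

  fini_cuda_support(local_gpus);
}

Keeping gpu_id as the local index rather than the physical device id lets local_buffer and cuda_stream_array stay densely packed regardless of which devices a rank happens to be assigned.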