Skip to content

Commit cf32477

Browse files
authored
Merge pull request #1803 from danielpeter/devel
adding GPU support for LTS
2 parents e0cfa3b + b197760 commit cf32477

File tree

49 files changed

+4404
-1545
lines changed

Some content is hidden

Large Commits have some content hidden by default. Use the searchbox below for content that may be hidden.

49 files changed

+4404
-1545
lines changed

src/gpu/assemble_MPI_vector_cuda.cu

Lines changed: 4 additions & 15 deletions
Original file line numberDiff line numberDiff line change
@@ -118,8 +118,7 @@ TRACE("transfer_boun_accel_from_device");
118118

119119
extern EXTERN_LANG
120120
void FC_FUNC_(transfer_boundary_from_device_a,
121-
TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
122-
const int* nspec_outer_elastic) {
121+
TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer) {
123122

124123
// asynchronous transfer from device to host
125124

@@ -173,9 +172,7 @@ void FC_FUNC_(transfer_boundary_from_device_a,
173172
extern EXTERN_LANG
174173
void FC_FUNC_(transfer_boundary_to_device_a,
175174
TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
176-
realw* buffer_recv_vector_ext_mesh,
177-
const int* num_interfaces_ext_mesh,
178-
const int* max_nibool_interfaces_ext_mesh) {
175+
realw* buffer_recv_vector_ext_mesh) {
179176

180177
// asynchronous transfer from host to device
181178

@@ -205,10 +202,6 @@ extern EXTERN_LANG
205202
void FC_FUNC_(transfer_asmbl_accel_to_device,
206203
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
207204
realw* buffer_recv_vector_ext_mesh,
208-
const int* num_interfaces_ext_mesh,
209-
const int* max_nibool_interfaces_ext_mesh,
210-
const int* nibool_interfaces_ext_mesh,
211-
const int* ibool_interfaces_ext_mesh,
212205
const int* FORWARD_OR_ADJOINT) {
213206
TRACE("transfer_asmbl_accel_to_device");
214207

@@ -305,10 +298,6 @@ extern EXTERN_LANG
305298
void FC_FUNC_(transfer_sync_accel_to_device,
306299
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
307300
realw* buffer_recv_vector_ext_mesh,
308-
const int* num_interfaces_ext_mesh,
309-
const int* max_nibool_interfaces_ext_mesh,
310-
const int* nibool_interfaces_ext_mesh,
311-
const int* ibool_interfaces_ext_mesh,
312301
const int* FORWARD_OR_ADJOINT) {
313302
TRACE("transfer_sync_accel_to_device");
314303

@@ -422,7 +411,7 @@ TRACE("transfer_sync_accel_to_device");
422411
//
423412
// // ***************************************************************************
424413
// // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
425-
// cudaStreamSynchronize(mp->copy_stream);
414+
// gpuStreamSynchronize(mp->copy_stream);
426415
//
427416
// // Assembling on the copy_stream breaks the solution and it "blows up"
428417
// if (*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
@@ -464,7 +453,7 @@ void FC_FUNC_(sync_copy_from_device,
464453

465454
// There have been problems using the pinned-memory with MPI, so
466455
// we copy the buffer into a non-pinned region.
467-
memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(float));
456+
memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(realw));
468457
}
469458
// memory copy is now finished, so non-blocking MPI send can proceed
470459
}

src/gpu/compute_add_sources_acoustic_cuda.cu

Lines changed: 4 additions & 7 deletions
Original file line numberDiff line numberDiff line change
@@ -199,7 +199,6 @@ extern EXTERN_LANG
199199
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
200200
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
201201
realw* h_source_adjoint,
202-
int* nrec,
203202
int* nadj_rec_local,
204203
int* NTSTEP_BETWEEN_READ_ADJSRC,
205204
int* it) {
@@ -285,32 +284,30 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
285284
#ifdef USE_CUDA
286285
if (run_cuda){
287286
add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
288-
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
287+
it_index,
289288
mp->d_source_adjoint,
290289
mp->d_hxir_adj,
291290
mp->d_hetar_adj,
292291
mp->d_hgammar_adj,
293292
mp->d_ibool,
294293
mp->d_ispec_is_acoustic,
295294
mp->d_ispec_selected_adjrec_loc,
296-
mp->nadj_rec_local,
297-
mp->d_kappastore);
295+
mp->nadj_rec_local);
298296
}
299297
#endif
300298
#ifdef USE_HIP
301299
if (run_hip){
302300
hipLaunchKernelGGL(add_sources_ac_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
303301
mp->d_potential_dot_dot_acoustic,
304-
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
302+
it_index,
305303
mp->d_source_adjoint,
306304
mp->d_hxir_adj,
307305
mp->d_hetar_adj,
308306
mp->d_hgammar_adj,
309307
mp->d_ibool,
310308
mp->d_ispec_is_acoustic,
311309
mp->d_ispec_selected_adjrec_loc,
312-
mp->nadj_rec_local,
313-
mp->d_kappastore);
310+
mp->nadj_rec_local);
314311
}
315312
#endif
316313

src/gpu/compute_add_sources_viscoelastic_cuda.cu

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -234,7 +234,6 @@ extern EXTERN_LANG
234234
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
235235
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
236236
realw* h_source_adjoint,
237-
int* nrec,
238237
int* nadj_rec_local,
239238
int* NTSTEP_BETWEEN_READ_ADJSRC,
240239
int* it) {
@@ -276,7 +275,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
276275
#ifdef USE_CUDA
277276
if (run_cuda){
278277
add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
279-
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
278+
it_index,
280279
mp->d_source_adjoint,
281280
mp->d_hxir_adj,
282281
mp->d_hetar_adj,
@@ -291,7 +290,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
291290
if (run_hip){
292291
hipLaunchKernelGGL(add_sources_el_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
293292
mp->d_accel,
294-
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
293+
it_index,
295294
mp->d_source_adjoint,
296295
mp->d_hxir_adj,
297296
mp->d_hetar_adj,

src/gpu/compute_forces_viscoelastic_cuda.cu

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -173,7 +173,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
173173
epsilondev_trace,
174174
epsilon_trace_over_3,
175175
mp->simulation_type,
176-
mp->NSPEC_AB,
177176
d_factor_common,
178177
R_xx,R_yy,R_xy,R_xz,R_yz,
179178
R_trace,
@@ -222,7 +221,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
222221
epsilondev_trace,
223222
epsilon_trace_over_3,
224223
mp->simulation_type,
225-
mp->NSPEC_AB,
226224
d_factor_common,
227225
R_xx,R_yy,R_xy,R_xz,R_yz,
228226
R_trace,
@@ -273,7 +271,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
273271
d_b_epsilondev_trace,
274272
d_b_epsilon_trace_over_3,
275273
mp->simulation_type,
276-
mp->NSPEC_AB,
277274
d_factor_common,
278275
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
279276
d_b_R_trace,
@@ -322,7 +319,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
322319
d_b_epsilondev_trace,
323320
d_b_epsilon_trace_over_3,
324321
mp->simulation_type,
325-
mp->NSPEC_AB,
326322
d_factor_common,
327323
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
328324
d_b_R_trace,

src/gpu/compute_kernels_cuda.cu

Lines changed: 0 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -303,7 +303,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
303303
mp->d_gammax,mp->d_gammay,mp->d_gammaz,
304304
mp->xix_regular,
305305
mp->d_potential_acoustic,
306-
mp->d_potential_dot_dot_acoustic,
307306
mp->d_b_potential_acoustic,
308307
mp->d_b_potential_dot_dot_acoustic,
309308
mp->d_rho_ac_kl,
@@ -326,7 +325,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
326325
mp->d_gammax,mp->d_gammay,mp->d_gammaz,
327326
mp->xix_regular,
328327
mp->d_potential_acoustic,
329-
mp->d_potential_dot_dot_acoustic,
330328
mp->d_b_potential_acoustic,
331329
mp->d_b_potential_dot_dot_acoustic,
332330
mp->d_rho_ac_kl,

src/gpu/compute_stacey_acoustic_cuda.cu

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -282,11 +282,13 @@ void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
282282
GPU_ERROR_CHECKING("compute_stacey_acoustic_undoatt_cuda");
283283
}
284284

285+
/* ----------------------------------------------------------------------------------------------- */
286+
285287
extern EXTERN_LANG
286288
void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
287-
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE) (long *Mesh_pointer,
288-
realw* b_boundary_injection_potential,
289-
int* SAVE_STACEY_f) {
289+
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE)(long *Mesh_pointer,
290+
realw* b_boundary_injection_potential,
291+
int* SAVE_STACEY_f) {
290292

291293
TRACE("compute_coupled_injection_contribution_ac_device");
292294

@@ -342,4 +344,4 @@ void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
342344
gpuMemcpy_tohost_realw(b_boundary_injection_potential,mp->d_b_boundary_injection_potential,size);
343345
// writing is done in fortran routine
344346
}
345-
}
347+
}

src/gpu/compute_stacey_viscoelastic_cuda.cu

Lines changed: 27 additions & 14 deletions
Original file line numberDiff line numberDiff line change
@@ -85,10 +85,17 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
8585

8686
if (FORWARD_OR_ADJOINT == 0){
8787
// combined forward/backward fields
88+
realw* veloc = mp->d_veloc;
89+
realw* accel = mp->d_accel;
90+
// LTS simulation uses veloc_p(:,:,num_p_level) array
91+
if (mp->lts_mode) {
92+
veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
93+
}
94+
8895
#ifdef USE_CUDA
8996
if (run_cuda){
90-
compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc,
91-
mp->d_accel,
97+
compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(veloc,
98+
accel,
9299
mp->d_abs_boundary_ispec,
93100
mp->d_abs_boundary_ijk,
94101
mp->d_abs_boundary_normal,
@@ -106,8 +113,8 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
106113
#ifdef USE_HIP
107114
if (run_hip){
108115
hipLaunchKernelGGL(compute_stacey_elastic_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
109-
mp->d_veloc,
110-
mp->d_accel,
116+
veloc,
117+
accel,
111118
mp->d_abs_boundary_ispec,
112119
mp->d_abs_boundary_ijk,
113120
mp->d_abs_boundary_normal,
@@ -150,17 +157,23 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
150157
#endif
151158

152159
}
153-
}else{
160+
} else {
161+
// single FORWARD_OR_ADJOINT==1/3
154162
// sets gpu arrays
155-
realw *veloc, *accel;
156-
if (FORWARD_OR_ADJOINT == 1) {
157-
veloc = mp->d_veloc;
158-
accel = mp->d_accel;
159-
} else {
163+
realw* veloc = mp->d_veloc; // FORWARD_OR_ADJOINT == 1
164+
realw* accel = mp->d_accel;
165+
// LTS simulation uses veloc_p(:,:,num_p_level) array
166+
if (mp->lts_mode) {
167+
veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
168+
}
169+
if (FORWARD_OR_ADJOINT != 1) {
160170
// for backward/reconstructed fields
161171
veloc = mp->d_b_veloc;
162172
accel = mp->d_b_accel;
173+
// safety stop - kernel simulations w/ LTS mode not fully implemented yet
174+
if (mp->lts_mode) exit_on_error("error LTS mode for kernel simulations not fully implemented yet");
163175
}
176+
164177
// single forward or backward fields
165178
#ifdef USE_CUDA
166179
if (run_cuda){
@@ -304,13 +317,13 @@ void FC_FUNC_(compute_stacey_viscoelastic_undoatt_cuda,
304317
}
305318

306319

320+
/* ----------------------------------------------------------------------------------------------- */
307321

308322
extern EXTERN_LANG
309323
void FC_FUNC_(compute_coupled_injection_contribution_el_device,
310-
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)
311-
(long* Mesh_pointer,
312-
realw* b_boundary_injection_field,
313-
int *SAVE_STACEY_f) {
324+
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)(long* Mesh_pointer,
325+
realw* b_boundary_injection_field,
326+
int *SAVE_STACEY_f) {
314327

315328
TRACE("compute_coupled_injection_contribution_el_device");
316329

src/gpu/fault_solver_dynamics.cu

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -163,7 +163,6 @@ extern EXTERN_LANG
163163
void FC_FUNC_(transfer_fault_data_to_host,
164164
TRANSFER_FAULT_DATA_TO_HOST)(long* Fault_pointer,
165165
int* fault_index,
166-
int* NSPEC_FLT,
167166
int* NGLOB_FLT,
168167
realw* D,
169168
realw* V,

src/gpu/helper_functions.cu

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -207,7 +207,7 @@ void start_timing_gpu(gpu_event* start,gpu_event* stop){
207207
/* ----------------------------------------------------------------------------------------------- */
208208

209209
void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
210-
realw time;
210+
realw time = 0.0;
211211

212212
#ifdef USE_CUDA
213213
// stops events
@@ -233,7 +233,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
233233
/* ----------------------------------------------------------------------------------------------- */
234234

235235
void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
236-
realw time;
236+
realw time = 0.0;
237237

238238
#ifdef USE_CUDA
239239
// stops events

src/gpu/kernels/Kernel_2_acoustic_impl.cu

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -573,7 +573,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
573573
realw_const_p minus_g,
574574
realw* d_kappastore,
575575
realw_const_p wgll_cube,
576-
const int FORWAR_OR_ADJOINT){
576+
const int FORWARD_OR_ADJOINT){
577577

578578
// block-id == number of local element id in phase_ispec array
579579
int bx = blockIdx.y*gridDim.x+blockIdx.x;

0 commit comments

Comments
 (0)