Skip to content
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
19 changes: 4 additions & 15 deletions src/gpu/assemble_MPI_vector_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -118,8 +118,7 @@ TRACE("transfer_boun_accel_from_device");

extern EXTERN_LANG
void FC_FUNC_(transfer_boundary_from_device_a,
TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
const int* nspec_outer_elastic) {
TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer) {

// asynchronous transfer from device to host

Expand Down Expand Up @@ -173,9 +172,7 @@ void FC_FUNC_(transfer_boundary_from_device_a,
extern EXTERN_LANG
void FC_FUNC_(transfer_boundary_to_device_a,
TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
realw* buffer_recv_vector_ext_mesh,
const int* num_interfaces_ext_mesh,
const int* max_nibool_interfaces_ext_mesh) {
realw* buffer_recv_vector_ext_mesh) {

// asynchronous transfer from host to device

Expand Down Expand Up @@ -205,10 +202,6 @@ extern EXTERN_LANG
void FC_FUNC_(transfer_asmbl_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_vector_ext_mesh,
const int* num_interfaces_ext_mesh,
const int* max_nibool_interfaces_ext_mesh,
const int* nibool_interfaces_ext_mesh,
const int* ibool_interfaces_ext_mesh,
const int* FORWARD_OR_ADJOINT) {
TRACE("transfer_asmbl_accel_to_device");

Expand Down Expand Up @@ -305,10 +298,6 @@ extern EXTERN_LANG
void FC_FUNC_(transfer_sync_accel_to_device,
TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
realw* buffer_recv_vector_ext_mesh,
const int* num_interfaces_ext_mesh,
const int* max_nibool_interfaces_ext_mesh,
const int* nibool_interfaces_ext_mesh,
const int* ibool_interfaces_ext_mesh,
const int* FORWARD_OR_ADJOINT) {
TRACE("transfer_sync_accel_to_device");

Expand Down Expand Up @@ -422,7 +411,7 @@ TRACE("transfer_sync_accel_to_device");
//
// // ***************************************************************************
// // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
// cudaStreamSynchronize(mp->copy_stream);
// gpuStreamSynchronize(mp->copy_stream);
//
// // Assembling on the copy_stream breaks the solution and it "blows up"
// if (*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
Expand Down Expand Up @@ -464,7 +453,7 @@ void FC_FUNC_(sync_copy_from_device,

// There have been problems using the pinned-memory with MPI, so
// we copy the buffer into a non-pinned region.
memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(float));
memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(realw));
}
// memory copy is now finished, so non-blocking MPI send can proceed
}
Expand Down
11 changes: 4 additions & 7 deletions src/gpu/compute_add_sources_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -199,7 +199,6 @@ extern EXTERN_LANG
void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
realw* h_source_adjoint,
int* nrec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC,
int* it) {
Expand Down Expand Up @@ -285,32 +284,30 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
#ifdef USE_CUDA
if (run_cuda){
add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
it_index,
mp->d_source_adjoint,
mp->d_hxir_adj,
mp->d_hetar_adj,
mp->d_hgammar_adj,
mp->d_ibool,
mp->d_ispec_is_acoustic,
mp->d_ispec_selected_adjrec_loc,
mp->nadj_rec_local,
mp->d_kappastore);
mp->nadj_rec_local);
}
#endif
#ifdef USE_HIP
if (run_hip){
hipLaunchKernelGGL(add_sources_ac_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
mp->d_potential_dot_dot_acoustic,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
it_index,
mp->d_source_adjoint,
mp->d_hxir_adj,
mp->d_hetar_adj,
mp->d_hgammar_adj,
mp->d_ibool,
mp->d_ispec_is_acoustic,
mp->d_ispec_selected_adjrec_loc,
mp->nadj_rec_local,
mp->d_kappastore);
mp->nadj_rec_local);
}
#endif

Expand Down
5 changes: 2 additions & 3 deletions src/gpu/compute_add_sources_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -234,7 +234,6 @@ extern EXTERN_LANG
void FC_FUNC_(add_sources_el_sim_type_2_or_3,
ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
realw* h_source_adjoint,
int* nrec,
int* nadj_rec_local,
int* NTSTEP_BETWEEN_READ_ADJSRC,
int* it) {
Expand Down Expand Up @@ -276,7 +275,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
#ifdef USE_CUDA
if (run_cuda){
add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
it_index,
mp->d_source_adjoint,
mp->d_hxir_adj,
mp->d_hetar_adj,
Expand All @@ -291,7 +290,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
if (run_hip){
hipLaunchKernelGGL(add_sources_el_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
mp->d_accel,
*nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
it_index,
mp->d_source_adjoint,
mp->d_hxir_adj,
mp->d_hetar_adj,
Expand Down
4 changes: 0 additions & 4 deletions src/gpu/compute_forces_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -173,7 +173,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
epsilondev_trace,
epsilon_trace_over_3,
mp->simulation_type,
mp->NSPEC_AB,
d_factor_common,
R_xx,R_yy,R_xy,R_xz,R_yz,
R_trace,
Expand Down Expand Up @@ -222,7 +221,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
epsilondev_trace,
epsilon_trace_over_3,
mp->simulation_type,
mp->NSPEC_AB,
d_factor_common,
R_xx,R_yy,R_xy,R_xz,R_yz,
R_trace,
Expand Down Expand Up @@ -273,7 +271,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
d_b_epsilondev_trace,
d_b_epsilon_trace_over_3,
mp->simulation_type,
mp->NSPEC_AB,
d_factor_common,
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
d_b_R_trace,
Expand Down Expand Up @@ -322,7 +319,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
d_b_epsilondev_trace,
d_b_epsilon_trace_over_3,
mp->simulation_type,
mp->NSPEC_AB,
d_factor_common,
d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
d_b_R_trace,
Expand Down
2 changes: 0 additions & 2 deletions src/gpu/compute_kernels_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -303,7 +303,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->xix_regular,
mp->d_potential_acoustic,
mp->d_potential_dot_dot_acoustic,
mp->d_b_potential_acoustic,
mp->d_b_potential_dot_dot_acoustic,
mp->d_rho_ac_kl,
Expand All @@ -326,7 +325,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
mp->d_gammax,mp->d_gammay,mp->d_gammaz,
mp->xix_regular,
mp->d_potential_acoustic,
mp->d_potential_dot_dot_acoustic,
mp->d_b_potential_acoustic,
mp->d_b_potential_dot_dot_acoustic,
mp->d_rho_ac_kl,
Expand Down
10 changes: 6 additions & 4 deletions src/gpu/compute_stacey_acoustic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -282,11 +282,13 @@ void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
GPU_ERROR_CHECKING("compute_stacey_acoustic_undoatt_cuda");
}

/* ----------------------------------------------------------------------------------------------- */

extern EXTERN_LANG
void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE) (long *Mesh_pointer,
realw* b_boundary_injection_potential,
int* SAVE_STACEY_f) {
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE)(long *Mesh_pointer,
realw* b_boundary_injection_potential,
int* SAVE_STACEY_f) {

TRACE("compute_coupled_injection_contribution_ac_device");

Expand Down Expand Up @@ -342,4 +344,4 @@ void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
gpuMemcpy_tohost_realw(b_boundary_injection_potential,mp->d_b_boundary_injection_potential,size);
// writing is done in fortran routine
}
}
}
41 changes: 27 additions & 14 deletions src/gpu/compute_stacey_viscoelastic_cuda.cu
Original file line number Diff line number Diff line change
Expand Up @@ -85,10 +85,17 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,

if (FORWARD_OR_ADJOINT == 0){
// combined forward/backward fields
realw* veloc = mp->d_veloc;
realw* accel = mp->d_accel;
// LTS simulation uses veloc_p(:,:,num_p_level) array
if (mp->lts_mode) {
veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
}

#ifdef USE_CUDA
if (run_cuda){
compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc,
mp->d_accel,
compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(veloc,
accel,
mp->d_abs_boundary_ispec,
mp->d_abs_boundary_ijk,
mp->d_abs_boundary_normal,
Expand All @@ -106,8 +113,8 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
#ifdef USE_HIP
if (run_hip){
hipLaunchKernelGGL(compute_stacey_elastic_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
mp->d_veloc,
mp->d_accel,
veloc,
accel,
mp->d_abs_boundary_ispec,
mp->d_abs_boundary_ijk,
mp->d_abs_boundary_normal,
Expand Down Expand Up @@ -150,17 +157,23 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
#endif

}
}else{
} else {
// single FORWARD_OR_ADJOINT==1/3
// sets gpu arrays
realw *veloc, *accel;
if (FORWARD_OR_ADJOINT == 1) {
veloc = mp->d_veloc;
accel = mp->d_accel;
} else {
realw* veloc = mp->d_veloc; // FORWARD_OR_ADJOINT == 1
realw* accel = mp->d_accel;
// LTS simulation uses veloc_p(:,:,num_p_level) array
if (mp->lts_mode) {
veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
}
if (FORWARD_OR_ADJOINT != 1) {
// for backward/reconstructed fields
veloc = mp->d_b_veloc;
accel = mp->d_b_accel;
// safety stop - kernel simulations w/ LTS mode not fully implemented yet
if (mp->lts_mode) exit_on_error("error LTS mode for kernel simulations not fully implemented yet");
}

// single forward or backward fields
#ifdef USE_CUDA
if (run_cuda){
Expand Down Expand Up @@ -304,13 +317,13 @@ void FC_FUNC_(compute_stacey_viscoelastic_undoatt_cuda,
}


/* ----------------------------------------------------------------------------------------------- */

extern EXTERN_LANG
void FC_FUNC_(compute_coupled_injection_contribution_el_device,
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)
(long* Mesh_pointer,
realw* b_boundary_injection_field,
int *SAVE_STACEY_f) {
COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)(long* Mesh_pointer,
realw* b_boundary_injection_field,
int *SAVE_STACEY_f) {

TRACE("compute_coupled_injection_contribution_el_device");

Expand Down
1 change: 0 additions & 1 deletion src/gpu/fault_solver_dynamics.cu
Original file line number Diff line number Diff line change
Expand Up @@ -163,7 +163,6 @@ extern EXTERN_LANG
void FC_FUNC_(transfer_fault_data_to_host,
TRANSFER_FAULT_DATA_TO_HOST)(long* Fault_pointer,
int* fault_index,
int* NSPEC_FLT,
int* NGLOB_FLT,
realw* D,
realw* V,
Expand Down
4 changes: 2 additions & 2 deletions src/gpu/helper_functions.cu
Original file line number Diff line number Diff line change
Expand Up @@ -207,7 +207,7 @@ void start_timing_gpu(gpu_event* start,gpu_event* stop){
/* ----------------------------------------------------------------------------------------------- */

void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
realw time;
realw time = 0.0;

#ifdef USE_CUDA
// stops events
Expand All @@ -233,7 +233,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
/* ----------------------------------------------------------------------------------------------- */

void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
realw time;
realw time = 0.0;

#ifdef USE_CUDA
// stops events
Expand Down
2 changes: 1 addition & 1 deletion src/gpu/kernels/Kernel_2_acoustic_impl.cu
Original file line number Diff line number Diff line change
Expand Up @@ -573,7 +573,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
realw_const_p minus_g,
realw* d_kappastore,
realw_const_p wgll_cube,
const int FORWAR_OR_ADJOINT){
const int FORWARD_OR_ADJOINT){

// block-id == number of local element id in phase_ispec array
int bx = blockIdx.y*gridDim.x+blockIdx.x;
Expand Down
Loading
Loading