SPECFEM · danielpeter · Apr 12, 2025 · Apr 2, 2025 · Apr 10, 2025 · Apr 12, 2025
diff --git a/src/gpu/assemble_MPI_vector_cuda.cu b/src/gpu/assemble_MPI_vector_cuda.cu
@@ -118,8 +118,7 @@ TRACE("transfer_boun_accel_from_device");
 
 extern EXTERN_LANG
 void FC_FUNC_(transfer_boundary_from_device_a,
-              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer,
-                                               const int* nspec_outer_elastic) {
+              TRANSFER_BOUNDARY_FROM_DEVICE_A)(long* Mesh_pointer) {
 
 // asynchronous transfer from device to host
 
@@ -173,9 +172,7 @@ void FC_FUNC_(transfer_boundary_from_device_a,
 extern EXTERN_LANG
 void FC_FUNC_(transfer_boundary_to_device_a,
               TRANSFER_BOUNDARY_TO_DEVICE_A)(long* Mesh_pointer,
-                                             realw* buffer_recv_vector_ext_mesh,
-                                             const int* num_interfaces_ext_mesh,
-                                             const int* max_nibool_interfaces_ext_mesh) {
+                                             realw* buffer_recv_vector_ext_mesh) {
 
 // asynchronous transfer from host to device
 
@@ -205,10 +202,6 @@ extern EXTERN_LANG
 void FC_FUNC_(transfer_asmbl_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* buffer_recv_vector_ext_mesh,
-                                              const int* num_interfaces_ext_mesh,
-                                              const int* max_nibool_interfaces_ext_mesh,
-                                              const int* nibool_interfaces_ext_mesh,
-                                              const int* ibool_interfaces_ext_mesh,
                                               const int* FORWARD_OR_ADJOINT) {
 TRACE("transfer_asmbl_accel_to_device");
 
@@ -305,10 +298,6 @@ extern EXTERN_LANG
 void FC_FUNC_(transfer_sync_accel_to_device,
               TRANSFER_ASMBL_ACCEL_TO_DEVICE)(long* Mesh_pointer,
                                               realw* buffer_recv_vector_ext_mesh,
-                                              const int* num_interfaces_ext_mesh,
-                                              const int* max_nibool_interfaces_ext_mesh,
-                                              const int* nibool_interfaces_ext_mesh,
-                                              const int* ibool_interfaces_ext_mesh,
                                               const int* FORWARD_OR_ADJOINT) {
 TRACE("transfer_sync_accel_to_device");
 
@@ -422,7 +411,7 @@ TRACE("transfer_sync_accel_to_device");
 //
 //  // ***************************************************************************
 //  // Wait until previous copy stream finishes. We assemble while other compute kernels execute.
-//  cudaStreamSynchronize(mp->copy_stream);
+//  gpuStreamSynchronize(mp->copy_stream);
 //
 //  // Assembling on the copy_stream breaks the solution and it "blows up"
 //  if (*FORWARD_OR_ADJOINT == 1) { //assemble forward accel
@@ -464,7 +453,7 @@ void FC_FUNC_(sync_copy_from_device,
 
     // There have been problems using the pinned-memory with MPI, so
     // we copy the buffer into a non-pinned region.
-    memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(float));
+    memcpy(send_buffer,mp->h_send_accel_buffer,mp->size_mpi_buffer*sizeof(realw));
   }
   // memory copy is now finished, so non-blocking MPI send can proceed
 }

diff --git a/src/gpu/compute_add_sources_acoustic_cuda.cu b/src/gpu/compute_add_sources_acoustic_cuda.cu
@@ -199,7 +199,6 @@ extern EXTERN_LANG
 void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
               ADD_SOURCES_AC_SIM_2_OR_3_CUDA)(long* Mesh_pointer,
                                               realw* h_source_adjoint,
-                                              int* nrec,
                                               int* nadj_rec_local,
                                               int* NTSTEP_BETWEEN_READ_ADJSRC,
                                               int* it) {
@@ -285,32 +284,30 @@ void FC_FUNC_(add_sources_ac_sim_2_or_3_cuda,
 #ifdef USE_CUDA
   if (run_cuda){
     add_sources_ac_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_potential_dot_dot_acoustic,
-                                                                                  *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                                                  it_index,
                                                                                   mp->d_source_adjoint,
                                                                                   mp->d_hxir_adj,
                                                                                   mp->d_hetar_adj,
                                                                                   mp->d_hgammar_adj,
                                                                                   mp->d_ibool,
                                                                                   mp->d_ispec_is_acoustic,
                                                                                   mp->d_ispec_selected_adjrec_loc,
-                                                                                  mp->nadj_rec_local,
-                                                                                  mp->d_kappastore);
+                                                                                  mp->nadj_rec_local);
   }
 #endif
 #ifdef USE_HIP
   if (run_hip){
     hipLaunchKernelGGL(add_sources_ac_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
                                                               mp->d_potential_dot_dot_acoustic,
-                                                              *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                              it_index,
                                                               mp->d_source_adjoint,
                                                               mp->d_hxir_adj,
                                                               mp->d_hetar_adj,
                                                               mp->d_hgammar_adj,
                                                               mp->d_ibool,
                                                               mp->d_ispec_is_acoustic,
                                                               mp->d_ispec_selected_adjrec_loc,
-                                                              mp->nadj_rec_local,
-                                                              mp->d_kappastore);
+                                                              mp->nadj_rec_local);
   }
 #endif
 

diff --git a/src/gpu/compute_add_sources_viscoelastic_cuda.cu b/src/gpu/compute_add_sources_viscoelastic_cuda.cu
@@ -234,7 +234,6 @@ extern EXTERN_LANG
 void FC_FUNC_(add_sources_el_sim_type_2_or_3,
               ADD_SOURCES_EL_SIM_TYPE_2_OR_3)(long* Mesh_pointer,
                                               realw* h_source_adjoint,
-                                              int* nrec,
                                               int* nadj_rec_local,
                                               int* NTSTEP_BETWEEN_READ_ADJSRC,
                                               int* it) {
@@ -276,7 +275,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
 #ifdef USE_CUDA
   if (run_cuda){
     add_sources_el_SIM_TYPE_2_OR_3_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_accel,
-                                                                                 *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                                                 it_index,
                                                                                  mp->d_source_adjoint,
                                                                                  mp->d_hxir_adj,
                                                                                  mp->d_hetar_adj,
@@ -291,7 +290,7 @@ void FC_FUNC_(add_sources_el_sim_type_2_or_3,
   if (run_hip){
     hipLaunchKernelGGL(add_sources_el_SIM_TYPE_2_OR_3_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
                                                               mp->d_accel,
-                                                              *nrec,it_index,*NTSTEP_BETWEEN_READ_ADJSRC,
+                                                              it_index,
                                                               mp->d_source_adjoint,
                                                               mp->d_hxir_adj,
                                                               mp->d_hetar_adj,

diff --git a/src/gpu/compute_forces_viscoelastic_cuda.cu b/src/gpu/compute_forces_viscoelastic_cuda.cu
@@ -173,7 +173,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
                                                                 epsilondev_trace,
                                                                 epsilon_trace_over_3,
                                                                 mp->simulation_type,
-                                                                mp->NSPEC_AB,
                                                                 d_factor_common,
                                                                 R_xx,R_yy,R_xy,R_xz,R_yz,
                                                                 R_trace,
@@ -222,7 +221,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
                                             epsilondev_trace,
                                             epsilon_trace_over_3,
                                             mp->simulation_type,
-                                            mp->NSPEC_AB,
                                             d_factor_common,
                                             R_xx,R_yy,R_xy,R_xz,R_yz,
                                             R_trace,
@@ -273,7 +271,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
                                                                  d_b_epsilondev_trace,
                                                                  d_b_epsilon_trace_over_3,
                                                                  mp->simulation_type,
-                                                                 mp->NSPEC_AB,
                                                                  d_factor_common,
                                                                  d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
                                                                  d_b_R_trace,
@@ -322,7 +319,6 @@ void Kernel_2(int nb_blocks_to_compute,Mesh* mp,int d_iphase,realw d_deltat,
                                               d_b_epsilondev_trace,
                                               d_b_epsilon_trace_over_3,
                                               mp->simulation_type,
-                                              mp->NSPEC_AB,
                                               d_factor_common,
                                               d_b_R_xx,d_b_R_yy,d_b_R_xy,d_b_R_xz,d_b_R_yz,
                                               d_b_R_trace,

diff --git a/src/gpu/compute_kernels_cuda.cu b/src/gpu/compute_kernels_cuda.cu
@@ -303,7 +303,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
                                                       mp->d_gammax,mp->d_gammay,mp->d_gammaz,
                                                       mp->xix_regular,
                                                       mp->d_potential_acoustic,
-                                                      mp->d_potential_dot_dot_acoustic,
                                                       mp->d_b_potential_acoustic,
                                                       mp->d_b_potential_dot_dot_acoustic,
                                                       mp->d_rho_ac_kl,
@@ -326,7 +325,6 @@ void FC_FUNC_(compute_kernels_acoustic_cuda,
                                                       mp->d_gammax,mp->d_gammay,mp->d_gammaz,
                                                       mp->xix_regular,
                                                       mp->d_potential_acoustic,
-                                                      mp->d_potential_dot_dot_acoustic,
                                                       mp->d_b_potential_acoustic,
                                                       mp->d_b_potential_dot_dot_acoustic,
                                                       mp->d_rho_ac_kl,

diff --git a/src/gpu/compute_stacey_acoustic_cuda.cu b/src/gpu/compute_stacey_acoustic_cuda.cu
@@ -282,11 +282,13 @@ void FC_FUNC_(compute_stacey_acoustic_undoatt_cuda,
   GPU_ERROR_CHECKING("compute_stacey_acoustic_undoatt_cuda");
 }
 
+/* ----------------------------------------------------------------------------------------------- */
+
 extern EXTERN_LANG
 void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
-              COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE) (long *Mesh_pointer,
-                                                      realw* b_boundary_injection_potential,
-                                                       int* SAVE_STACEY_f) {
+              COMPUTE_COUPLED_INJECTION_CONTRIBUTION_AC_DEVICE)(long *Mesh_pointer,
+                                                                realw* b_boundary_injection_potential,
+                                                                int* SAVE_STACEY_f) {
 
   TRACE("compute_coupled_injection_contribution_ac_device");
 
@@ -342,4 +344,4 @@ void FC_FUNC_(compute_coupled_injection_contribution_ac_device,
     gpuMemcpy_tohost_realw(b_boundary_injection_potential,mp->d_b_boundary_injection_potential,size);
     // writing is done in fortran routine
   }
-}
+}
diff --git a/src/gpu/compute_stacey_viscoelastic_cuda.cu b/src/gpu/compute_stacey_viscoelastic_cuda.cu
@@ -85,10 +85,17 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
 
   if (FORWARD_OR_ADJOINT == 0){
     // combined forward/backward fields
+    realw* veloc = mp->d_veloc;
+    realw* accel = mp->d_accel;
+    // LTS simulation uses veloc_p(:,:,num_p_level) array
+    if (mp->lts_mode) {
+      veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
+    }
+
 #ifdef USE_CUDA
     if (run_cuda){
-      compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(mp->d_veloc,
-                                                                           mp->d_accel,
+      compute_stacey_elastic_kernel<<<grid,threads,0,mp->compute_stream>>>(veloc,
+                                                                           accel,
                                                                            mp->d_abs_boundary_ispec,
                                                                            mp->d_abs_boundary_ijk,
                                                                            mp->d_abs_boundary_normal,
@@ -106,8 +113,8 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
 #ifdef USE_HIP
     if (run_hip){
       hipLaunchKernelGGL(compute_stacey_elastic_kernel, dim3(grid), dim3(threads), 0, mp->compute_stream,
-                                                        mp->d_veloc,
-                                                        mp->d_accel,
+                                                        veloc,
+                                                        accel,
                                                         mp->d_abs_boundary_ispec,
                                                         mp->d_abs_boundary_ijk,
                                                         mp->d_abs_boundary_normal,
@@ -150,17 +157,23 @@ void FC_FUNC_(compute_stacey_viscoelastic_cuda,
 #endif
 
     }
-  }else{
+  } else {
+    // single FORWARD_OR_ADJOINT==1/3
     // sets gpu arrays
-    realw *veloc, *accel;
-    if (FORWARD_OR_ADJOINT == 1) {
-      veloc = mp->d_veloc;
-      accel = mp->d_accel;
-    } else {
+    realw* veloc = mp->d_veloc;  // FORWARD_OR_ADJOINT == 1
+    realw* accel = mp->d_accel;
+    // LTS simulation uses veloc_p(:,:,num_p_level) array
+    if (mp->lts_mode) {
+      veloc = &(mp->d_lts_veloc_p[NDIM * mp->NGLOB_AB * (mp->lts_num_p_level-1)]); // -1 for 0-indexing
+    }
+    if (FORWARD_OR_ADJOINT != 1) {
       // for backward/reconstructed fields
       veloc = mp->d_b_veloc;
       accel = mp->d_b_accel;
+      // safety stop - kernel simulations w/ LTS mode not fully implemented yet
+      if (mp->lts_mode) exit_on_error("error LTS mode for kernel simulations not fully implemented yet");
     }
+
     // single forward or backward fields
 #ifdef USE_CUDA
     if (run_cuda){
@@ -304,13 +317,13 @@ void FC_FUNC_(compute_stacey_viscoelastic_undoatt_cuda,
 }
 
 
+/* ----------------------------------------------------------------------------------------------- */
 
 extern EXTERN_LANG
 void FC_FUNC_(compute_coupled_injection_contribution_el_device,
-              COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)
-              (long* Mesh_pointer,
-               realw* b_boundary_injection_field,
-               int *SAVE_STACEY_f) {
+              COMPUTE_COUPLED_INJECTION_CONTRIBUTION_EL_DEVICE)(long* Mesh_pointer,
+                                                                realw* b_boundary_injection_field,
+                                                                int *SAVE_STACEY_f) {
 
   TRACE("compute_coupled_injection_contribution_el_device");
 

diff --git a/src/gpu/fault_solver_dynamics.cu b/src/gpu/fault_solver_dynamics.cu
@@ -163,7 +163,6 @@ extern EXTERN_LANG
 void FC_FUNC_(transfer_fault_data_to_host,
               TRANSFER_FAULT_DATA_TO_HOST)(long* Fault_pointer,
                                            int* fault_index,
-                                           int* NSPEC_FLT,
                                            int* NGLOB_FLT,
                                            realw* D,
                                            realw* V,

diff --git a/src/gpu/helper_functions.cu b/src/gpu/helper_functions.cu
@@ -207,7 +207,7 @@ void start_timing_gpu(gpu_event* start,gpu_event* stop){
 /* ----------------------------------------------------------------------------------------------- */
 
 void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
-  realw time;
+  realw time = 0.0;
 
 #ifdef USE_CUDA
   // stops events
@@ -233,7 +233,7 @@ void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str){
 /* ----------------------------------------------------------------------------------------------- */
 
 void stop_timing_gpu(gpu_event* start,gpu_event* stop, const char* info_str, realw* t){
-  realw time;
+  realw time = 0.0;
 
 #ifdef USE_CUDA
   // stops events

diff --git a/src/gpu/kernels/Kernel_2_acoustic_impl.cu b/src/gpu/kernels/Kernel_2_acoustic_impl.cu
@@ -573,7 +573,7 @@ Kernel_2_acoustic_single_impl(const int nb_blocks_to_compute,
                               realw_const_p minus_g,
                               realw* d_kappastore,
                               realw_const_p wgll_cube,
-                              const int FORWAR_OR_ADJOINT){
+                              const int FORWARD_OR_ADJOINT){
 
   // block-id == number of local element id in phase_ispec array
   int bx = blockIdx.y*gridDim.x+blockIdx.x;