From c06d032b6e45fdb84cc562c904c204deeee12215 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:31:38 +0100 Subject: [PATCH 1/5] Combine `interact!` kernels to reduce GPU latency --- src/general/abstract_system.jl | 11 ++- src/general/semidiscretization.jl | 95 ++++++++++++++++--- .../boundary/open_boundary/mirroring.jl | 12 +-- src/schemes/boundary/wall_boundary/rhs.jl | 2 +- .../fluid/weakly_compressible_sph/rhs.jl | 15 ++- .../structure/total_lagrangian_sph/rhs.jl | 31 +++--- 6 files changed, 117 insertions(+), 49 deletions(-) diff --git a/src/general/abstract_system.jl b/src/general/abstract_system.jl index d1413fd067..58bb44879a 100644 --- a/src/general/abstract_system.jl +++ b/src/general/abstract_system.jl @@ -26,13 +26,20 @@ vtkname(system::AbstractBoundarySystem) = "boundary" # Number of particles in the system @inline nparticles(system) = length(system.mass) -# Number of particles in the system whose positions are to be integrated (corresponds to the size of u and du) +# Number of particles in the system whose positions are to be integrated +# (corresponds to the size of u and du). +# See comment below for `each_integrated_particle` for more details. @inline n_integrated_particles(system) = nparticles(system) +# Iterator over all particle indices in the system @inline eachparticle(system::AbstractSystem) = each_active_particle(system) @inline eachparticle(initial_condition) = Base.OneTo(nparticles(initial_condition)) -# Wrapper for systems with `SystemBuffer` +# Iterator over all particle indices in the system whose positions are to be integrated +# (corresponds to the size of u and du). +# For most systems, this is the same as `eachparticle`, but for the +# `TotalLagrangianSPHSystem`, the clamped particles are included in `eachparticle` +# but not in `each_integrated_particle`. @inline each_integrated_particle(system) = each_integrated_particle(system, buffer(system)) @inline function each_integrated_particle(system, ::Nothing) return Base.OneTo(n_integrated_particles(system)) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index 38531b37ff..c2e23dec7c 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -55,6 +55,7 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT} ranges_v :: RV neighborhood_searches :: NS parallelization_backend :: BACKEND + interaction_timers :: TIMERS update_callback_used :: UCU integrate_tlsph :: IT # `false` if TLSPH integration is decoupled @@ -64,19 +65,20 @@ struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT} # and by Adapt.jl. function Semidiscretization(systems::Tuple, ranges_u, ranges_v, neighborhood_searches, parallelization_backend::PointNeighbors.ParallelizationBackend, - update_callback_used, integrate_tlsph) - new{typeof(parallelization_backend), typeof(systems), typeof(ranges_u), + interaction_timers, update_callback_used, integrate_tlsph) + new{typeof(interaction_timers), typeof(systems), typeof(ranges_u), typeof(ranges_v), typeof(neighborhood_searches), - typeof(update_callback_used), - typeof(integrate_tlsph)}(systems, ranges_u, ranges_v, - neighborhood_searches, parallelization_backend, + typeof(parallelization_backend), typeof(update_callback_used), + typeof(integrate_tlsph)}(systems, ranges_u, ranges_v, neighborhood_searches, + parallelization_backend, interaction_timers, update_callback_used, integrate_tlsph) end end function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; neighborhood_search=GridNeighborhoodSearch{ndims(first(systems))}(), - parallelization_backend=PolyesterBackend()) + parallelization_backend=PolyesterBackend(), + interaction_timers=IndividualTimers()) systems = filter(system -> !isnothing(system), systems) if isempty(systems) @@ -120,10 +122,13 @@ function Semidiscretization(systems::Union{AbstractSystem, Nothing}...; integrate_tlsph = Ref(true) return Semidiscretization(systems, ranges_u, ranges_v, searches, - parallelization_backend, update_callback_used, - integrate_tlsph) + parallelization_backend, interaction_timers, + update_callback_used, integrate_tlsph) end +struct IndividualTimers end +struct CombinedTimers end + # Inline show function e.g. Semidiscretization(neighborhood_search=...) function Base.show(io::IO, semi::Semidiscretization) @nospecialize semi # reduce precompilation time @@ -649,8 +654,8 @@ end return -damping_coefficient * velocity end -function system_interaction!(dv_ode, v_ode, u_ode, semi) - # Call `interact!` for each pair of systems +function system_interaction!(dv_ode, v_ode, u_ode, + semi::Semidiscretization{IndividualTimers}) foreach_system(semi) do system foreach_system(semi) do neighbor # Construct string for the interactions timer. @@ -663,7 +668,34 @@ function system_interaction!(dv_ode, v_ode, u_ode, semi) timer_str = "" end - interact!(dv_ode, v_ode, u_ode, system, neighbor, semi, timer_str=timer_str) + # Call individual `interact!` for this pair of systems. + # On GPUs, this is fully synchronized, so we get a separate timer for each + # pair of systems. + @trixi_timeit timer() timer_str begin + interact!(dv_ode, v_ode, u_ode, system, neighbor, semi) + end + end + end + + return dv_ode +end + +function system_interaction!(dv_ode, v_ode, u_ode, + semi::Semidiscretization{CombinedTimers}) + foreach_system(semi) do system + # Construct string for the interactions timer. + # Avoid allocations from string construction when no timers are used. + if timeit_debug_enabled() + system_index = system_indices(system, semi) + timer_str = "$(timer_name(system))$system_index-*" + else + timer_str = "" + end + + # Call a combined `interact!` for all interactions of this system with other systems. + # On GPUs, this is fully synchronized, so we get a separate timer for each system. + @trixi_timeit timer() timer_str begin + interact_combined!(dv_ode, v_ode, u_ode, system, semi) end end @@ -674,7 +706,7 @@ end # One can benchmark, e.g. the fluid-fluid interaction, with: # dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; # @btime TrixiParticles.interact!($dv_ode, $v_ode, $u_ode, $fluid_system, $fluid_system, $semi); -@inline function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi; timer_str="") +function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi) dv = wrap_v(dv_ode, system, semi) v_system = wrap_v(v_ode, system, semi) u_system = wrap_u(u_ode, system, semi) @@ -682,8 +714,43 @@ end v_neighbor = wrap_v(v_ode, neighbor, semi) u_neighbor = wrap_u(u_ode, neighbor, semi) - @trixi_timeit timer() timer_str begin - interact!(dv, v_system, u_system, v_neighbor, u_neighbor, system, neighbor, semi) + nhs = get_neighborhood_search(system, neighbor, semi) + + # Loop over all particles that are integrated for this system, i.e., all particles + # for which `dv` has entries. + @threaded semi for particle in each_integrated_particle(system) + interact!(dv, v_system, u_system, v_neighbor, u_neighbor, + system, neighbor, semi, nhs, particle) + end +end + +# Benchmark the combined interaction for a system with: +# dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; +# @btime TrixiParticles.interact_combined!($dv_ode, $v_ode, $u_ode, $system, $semi); +function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true) + dv = wrap_v(dv_ode, system, semi) + v_system = wrap_v(v_ode, system, semi) + u_system = wrap_u(u_ode, system, semi) + + # Create an iterator combining systems with their wrapped arrays and NHS. + # Note that we cannot use `get_neighborhood_search` because this will return + # a NHS of a different type for TLSPH-TLSPH interactions, making the iterator tuple + # type-unstable. The different self-interaction NHS for TLSPH is handled + # by the `interact!` function for TLSPH. + system_index = system_indices(system, semi) + f(neighbor) = (neighbor, wrap_v(v_ode, neighbor, semi), wrap_u(u_ode, neighbor, semi), + semi.neighborhood_searches[system_index][system_indices(neighbor, semi)]) + iterator = map(f, semi.systems) + + # Loop over all particles that are integrated for this system, i.e., all particles + # for which `dv` has entries. + @threaded semi for particle in each_integrated_particle(system) + # Now loop over all neighbor systems to avoid separate loops/kernels + # for each pair of systems. + foreach_noalloc(iterator) do (neighbor, v_neighbor, u_neighbor, nhs) + interact!(dv, v_system, u_system, v_neighbor, u_neighbor, + system, neighbor, semi, nhs, particle) + end end end diff --git a/src/schemes/boundary/open_boundary/mirroring.jl b/src/schemes/boundary/open_boundary/mirroring.jl index 75b5029e25..2592a24c14 100644 --- a/src/schemes/boundary/open_boundary/mirroring.jl +++ b/src/schemes/boundary/open_boundary/mirroring.jl @@ -165,10 +165,8 @@ function extrapolate_values!(system, ndims(system) + 1, eltype(system)})) - # TODO: Not public API - PointNeighbors.foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position, - nhs.search_radius) do particle, neighbor, pos_diff, - distance + foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position, + nhs.search_radius) do particle, neighbor, pos_diff, distance m_b = hydrodynamic_mass(fluid_system, neighbor) rho_b = current_density(v_fluid, fluid_system, neighbor) pressure_b = current_pressure(v_fluid, fluid_system, neighbor) @@ -305,10 +303,8 @@ function extrapolate_values!(system, mirror_method::ZerothOrderMirroring, interpolated_pressure = Ref(zero(eltype(system))) interpolated_velocity = Ref(zero(particle_coords)) - # TODO: Not public API - PointNeighbors.foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position, - nhs.search_radius) do particle, neighbor, pos_diff, - distance + foreach_neighbor(fluid_coords, nhs, particle, ghost_node_position, + nhs.search_radius) do particle, neighbor, pos_diff, distance m_b = hydrodynamic_mass(fluid_system, neighbor) rho_b = current_density(v_fluid, fluid_system, neighbor) volume_b = m_b / rho_b diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl index e0553764ef..847810dc42 100644 --- a/src/schemes/boundary/wall_boundary/rhs.jl +++ b/src/schemes/boundary/wall_boundary/rhs.jl @@ -2,7 +2,7 @@ function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, u_neighbor_system, particle_system::Union{AbstractBoundarySystem, OpenBoundarySystem}, - neighbor_system, semi) + neighbor_system, semi, nhs, particle) # TODO Solids and moving boundaries should be considered in the continuity equation return dv end diff --git a/src/schemes/fluid/weakly_compressible_sph/rhs.jl b/src/schemes/fluid/weakly_compressible_sph/rhs.jl index acb0f06894..6a1bcef905 100644 --- a/src/schemes/fluid/weakly_compressible_sph/rhs.jl +++ b/src/schemes/fluid/weakly_compressible_sph/rhs.jl @@ -2,9 +2,10 @@ # in `neighbor_system` and updates `dv` accordingly. # It takes into account pressure forces, viscosity, and for `ContinuityDensity` updates # the density using the continuity equation. -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::WeaklyCompressibleSPHSystem, neighbor_system, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WeaklyCompressibleSPHSystem, neighbor_system, + semi, nhs, particle) (; density_calculator, correction) = particle_system sound_speed = system_sound_speed(particle_system) @@ -20,12 +21,8 @@ function interact!(dv, v_particle_system, u_particle_system, # debug_array = zeros(ndims(particle_system), nparticles(particle_system)) # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_system_coords, semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_system_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index c2b63c0eab..76b4d28bc3 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -2,7 +2,8 @@ function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, u_neighbor_system, particle_system::TotalLagrangianSPHSystem, - neighbor_system::TotalLagrangianSPHSystem, semi; + neighbor_system::TotalLagrangianSPHSystem, + semi, nhs, particle; integrate_tlsph=semi.integrate_tlsph[]) # Different structures do not interact with each other (yet) particle_system === neighbor_system || return dv @@ -10,22 +11,25 @@ function interact!(dv, v_particle_system, u_particle_system, # Skip interaction if TLSPH systems are integrated separately integrate_tlsph || return dv - interact_structure_structure!(dv, v_particle_system, particle_system, semi) + interact_structure_structure!(dv, v_particle_system, particle_system, semi, particle) end # Function barrier without dispatch for unit testing -@inline function interact_structure_structure!(dv, v_system, system, semi) +@inline function interact_structure_structure!(dv, v_system, system, semi, particle) (; penalty_force) = system # Everything here is done in the initial coordinates system_coords = initial_coordinates(system) + # Specialized neighborhood search for structure-structure interaction that is + # optimized for finding neighbors in the initial configuration. + nhs = system.self_interaction_nhs + # Loop over all pairs of particles and neighbors within the kernel cutoff. # For structure-structure interaction, this has to happen in the initial coordinates. - foreach_point_neighbor(system, system, system_coords, system_coords, semi; - points=each_integrated_particle(system)) do particle, neighbor, - initial_pos_diff, - initial_distance + foreach_neighbor(system_coords, system_coords, nhs, + particle) do particle, neighbor, + initial_pos_diff, initial_distance # Only consider particles with a distance > 0. # See `src/general/smoothing_kernels.jl` for more details. initial_distance^2 < eps(initial_smoothing_length(system)^2) && return @@ -76,7 +80,7 @@ end function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, u_neighbor_system, particle_system::TotalLagrangianSPHSystem, - neighbor_system::AbstractFluidSystem, semi; + neighbor_system::AbstractFluidSystem, semi, nhs, particle; integrate_tlsph=semi.integrate_tlsph[]) # Skip interaction if TLSPH systems are integrated separately integrate_tlsph || return dv @@ -87,12 +91,8 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, - semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # Only consider particles with a distance > 0. # See `src/general/smoothing_kernels.jl` for more details. distance^2 < eps(initial_smoothing_length(particle_system)^2) && return @@ -183,7 +183,8 @@ end function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, u_neighbor_system, particle_system::TotalLagrangianSPHSystem, - neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem}, semi; + neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem}, + semi, nhs, particle; integrate_tlsph=semi.integrate_tlsph[]) # TODO continuity equation? return dv From 89676db27ecc080baef6dc41394da7b43778f437 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:17:47 +0100 Subject: [PATCH 2/5] Inline functions --- src/general/semidiscretization.jl | 4 +++- src/schemes/boundary/wall_boundary/rhs.jl | 8 ++++---- src/util.jl | 2 +- 3 files changed, 8 insertions(+), 6 deletions(-) diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index c2e23dec7c..deb014c8ec 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -705,7 +705,8 @@ end # Function barrier to make benchmarking interactions easier. # One can benchmark, e.g. the fluid-fluid interaction, with: # dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; -# @btime TrixiParticles.interact!($dv_ode, $v_ode, $u_ode, $fluid_system, $fluid_system, $semi); +# semi = ode.p; system = semi.systems[1]; +# @btime TrixiParticles.interact!($dv_ode, $v_ode, $u_ode, $system, $system, $semi); function interact!(dv_ode, v_ode, u_ode, system, neighbor, semi) dv = wrap_v(dv_ode, system, semi) v_system = wrap_v(v_ode, system, semi) @@ -726,6 +727,7 @@ end # Benchmark the combined interaction for a system with: # dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; +# semi = ode.p; system = semi.systems[1]; # @btime TrixiParticles.interact_combined!($dv_ode, $v_ode, $u_ode, $system, $semi); function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true) dv = wrap_v(dv_ode, system, semi) diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl index 847810dc42..e35b05b7a3 100644 --- a/src/schemes/boundary/wall_boundary/rhs.jl +++ b/src/schemes/boundary/wall_boundary/rhs.jl @@ -1,8 +1,8 @@ # Interaction of boundary with other systems -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::Union{AbstractBoundarySystem, OpenBoundarySystem}, - neighbor_system, semi, nhs, particle) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::Union{AbstractBoundarySystem, OpenBoundarySystem}, + neighbor_system, semi, nhs, particle) # TODO Solids and moving boundaries should be considered in the continuity equation return dv end diff --git a/src/util.jl b/src/util.jl index c1c5a8aa41..5822ecc526 100644 --- a/src/util.jl +++ b/src/util.jl @@ -3,7 +3,7 @@ element = first(collection) remaining_collection = Base.tail(collection) - func(element) + @inline func(element) # Process remaining collection foreach_noalloc(func, remaining_collection) From 1645afa183873369f2eda44326075b88eca54c55 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:33:25 +0100 Subject: [PATCH 3/5] Fix n-body and other `interact!` calls --- examples/n_body/n_body_benchmark_trixi.jl | 72 +++++++++---------- examples/n_body/n_body_system.jl | 26 +++---- src/general/semidiscretization.jl | 2 +- src/preprocessing/particle_packing/rhs.jl | 23 +++--- src/schemes/boundary/open_boundary/rhs.jl | 32 ++++----- src/schemes/boundary/wall_boundary/rhs.jl | 24 +++---- .../fluid/entropically_damped_sph/rhs.jl | 16 ++--- .../fluid/implicit_incompressible_sph/rhs.jl | 16 ++--- .../structure/discrete_element_method/rhs.jl | 15 ++-- .../structure/total_lagrangian_sph/rhs.jl | 35 ++++----- 10 files changed, 123 insertions(+), 138 deletions(-) diff --git a/examples/n_body/n_body_benchmark_trixi.jl b/examples/n_body/n_body_benchmark_trixi.jl index 828f046bf2..3a824da00d 100644 --- a/examples/n_body/n_body_benchmark_trixi.jl +++ b/examples/n_body/n_body_benchmark_trixi.jl @@ -6,41 +6,38 @@ using TrixiParticles using Printf -using Polyester include("n_body_system.jl") # Redefine interact in a more optimized way -function TrixiParticles.interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - neighborhood_search, - particle_system::NBodySystem, - neighbor_system::NBodySystem) +@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::NBodySystem, + neighbor_system::NBodySystem, + semi, nhs, particle) (; mass, G) = neighbor_system - for particle in TrixiParticles.each_integrated_particle(particle_system) - particle_coords = TrixiParticles.current_coords(u_particle_system, - particle_system, particle) - - # This makes `interact!` about 20% faster than looping over all particles - # and checking for `particle < neighbor`. - # Note that this doesn't work if we have multiple systems. - for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system) - neighbor_coords = TrixiParticles.current_coords(u_neighbor_system, - neighbor_system, neighbor) - pos_diff = particle_coords - neighbor_coords - - # Multiplying by pos_diff later makes this slightly faster. - # Multiplying by (1 / norm) is also faster than dividing by norm. - tmp = -G * (1 / norm(pos_diff)^3) - tmp1 = mass[neighbor] * tmp - tmp2 = mass[particle] * tmp - - @inbounds for i in 1:ndims(particle_system) - # This is slightly faster than dv[...] += tmp1 * pos_diff[i] - dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle]) - dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor]) - end + particle_coords = @inbounds TrixiParticles.current_coords(u_particle_system, + particle_system, particle) + + # This makes `interact!` about 20% faster than looping over all particles + # and checking for `particle < neighbor`. + # Note that this doesn't work if we have multiple systems. + for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system) + neighbor_coords = @inbounds TrixiParticles.current_coords(u_neighbor_system, + neighbor_system, neighbor) + pos_diff = particle_coords - neighbor_coords + + # Multiplying by pos_diff later makes this slightly faster. + # Multiplying by (1 / norm) is also faster than dividing by norm. + tmp = -G * (1 / norm(pos_diff)^3) + tmp1 = @inbounds mass[neighbor] * tmp + tmp2 = @inbounds mass[particle] * tmp + + for i in 1:ndims(particle_system) + # This is slightly faster than dv[...] += tmp1 * pos_diff[i] + @inbounds dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle]) + @inbounds dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor]) end end @@ -77,7 +74,9 @@ particle_system = NBodySystem(initial_condition, G) # ========================================================================================== # ==== Simulation -semi = Semidiscretization(particle_system, neighborhood_search=nothing) +# Disable multithreading, since it adds a significant overhead for this small problem +semi = Semidiscretization(particle_system, neighborhood_search=nothing, + parallelization_backend=SerialBackend()) # This is significantly faster than using OrdinaryDiffEq. function symplectic_euler!(velocity, coordinates, semi) @@ -89,14 +88,14 @@ function symplectic_euler!(velocity, coordinates, semi) @time for _ in 1:50_000_000 TrixiParticles.kick!(dv, v, u, semi, 0.0) - @inbounds for i in eachindex(v) - v[i] += 0.01 * dv[i] + for i in eachindex(v) + @inbounds v[i] += 0.01 * dv[i] end TrixiParticles.drift!(du, v, u, semi, 0.0) - @inbounds for i in eachindex(u) - u[i] += 0.01 * du[i] + for i in eachindex(u) + @inbounds u[i] += 0.01 * du[i] end end @@ -115,10 +114,7 @@ end @printf("%.9f\n", energy(velocity, coordinates, particle_system, semi)) -# Disable multithreading, since it adds a significant overhead for this small problem. -disable_polyester_threads() do - symplectic_euler!(velocity, coordinates, semi) -end +symplectic_euler!(velocity, coordinates, semi) @printf("%.9f\n", energy(velocity, coordinates, particle_system, semi)) diff --git a/examples/n_body/n_body_system.jl b/examples/n_body/n_body_system.jl index 8ff21c950c..ee733904ce 100644 --- a/examples/n_body/n_body_system.jl +++ b/examples/n_body/n_body_system.jl @@ -15,7 +15,7 @@ struct NBodySystem{NDIMS, ELTYPE <: Real, IC} <: TrixiParticles.AbstractSystem{N end end -TrixiParticles.timer_name(::NBodySystem) = "nbody" +@inline TrixiParticles.timer_name(::NBodySystem) = "nbody" @inline Base.eltype(system::NBodySystem{NDIMS, ELTYPE}) where {NDIMS, ELTYPE} = ELTYPE @@ -40,25 +40,25 @@ function TrixiParticles.update_nhs!(neighborhood_search, points_moving=(true, true)) end -function TrixiParticles.compact_support(system::NBodySystem, - neighbor::NBodySystem) +@inline function TrixiParticles.compact_support(system::NBodySystem, + neighbor::NBodySystem) # There is no cutoff. All particles interact with each other. return Inf end -function TrixiParticles.interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::NBodySystem, - neighbor_system::NBodySystem, semi) +@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::NBodySystem, + neighbor_system::NBodySystem, + semi, nhs, particle) (; mass, G) = neighbor_system system_coords = TrixiParticles.current_coordinates(u_particle_system, particle_system) neighbor_coords = TrixiParticles.current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff. - TrixiParticles.foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_coords, - semi) do particle, neighbor, pos_diff, distance + TrixiParticles.foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # No interaction of a particle with itself particle_system === neighbor_system && particle === neighbor && return @@ -77,7 +77,7 @@ function TrixiParticles.interact!(dv, v_particle_system, u_particle_system, return dv end -function energy(v_ode, u_ode, system, semi) +@inline function energy(v_ode, u_ode, system, semi) (; mass) = system e = zero(eltype(system)) @@ -103,9 +103,9 @@ function energy(v_ode, u_ode, system, semi) return e end -TrixiParticles.vtkname(system::NBodySystem) = "n-body" +@inline TrixiParticles.vtkname(system::NBodySystem) = "n-body" -function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem) +@inline function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem) (; mass) = system vtk["velocity"] = v diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index deb014c8ec..ea5380d6aa 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -49,7 +49,7 @@ semi = Semidiscretization(fluid_system, boundary_system, └──────────────────────────────────────────────────────────────────────────────────────────────────┘ ``` """ -struct Semidiscretization{BACKEND, S, RU, RV, NS, UCU, IT} +struct Semidiscretization{TIMERS, S, RU, RV, NS, BACKEND, UCU, IT} systems :: S ranges_u :: RU ranges_v :: RV diff --git a/src/preprocessing/particle_packing/rhs.jl b/src/preprocessing/particle_packing/rhs.jl index 77cf460f5b..d5aed3c0c0 100644 --- a/src/preprocessing/particle_packing/rhs.jl +++ b/src/preprocessing/particle_packing/rhs.jl @@ -1,14 +1,16 @@ -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - system::ParticlePackingSystem{<:Any, false}, - neighbor_system::ParticlePackingSystem, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + system::ParticlePackingSystem{<:Any, false}, + neighbor_system::ParticlePackingSystem, + semi, nhs, particle) system_coords = current_coordinates(u_particle_system, system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(system, neighbor_system, system_coords, neighbor_coords, - semi) do particle, neighbor, pos_diff, distance - # Only consider particles with a distance > 0. See `src/general/smoothing_kernels.jl` for more details. + foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance + # Only consider particles with a distance > 0. + # See `src/general/smoothing_kernels.jl` for more details. distance^2 < eps(initial_smoothing_length(system)^2) && return rho_a = system.initial_condition.density[particle] @@ -36,8 +38,9 @@ function interact!(dv, v_particle_system, u_particle_system, end # Skip for fixed systems -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - system::ParticlePackingSystem{<:Any, true}, neighbor_system, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + system::ParticlePackingSystem{<:Any, true}, + neighbor_system, semi, nhs, particle) return dv end diff --git a/src/schemes/boundary/open_boundary/rhs.jl b/src/schemes/boundary/open_boundary/rhs.jl index 186a9e7b7d..5f66d3bab1 100644 --- a/src/schemes/boundary/open_boundary/rhs.jl +++ b/src/schemes/boundary/open_boundary/rhs.jl @@ -1,8 +1,8 @@ # Interaction for open boundaries only with `BoundaryModelDynamicalPressureZhang` -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}, - neighbor_system, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}, + neighbor_system, semi, nhs, particle) (; fluid_system, cache, boundary_model) = particle_system sound_speed = system_sound_speed(fluid_system) @@ -11,12 +11,8 @@ function interact!(dv, v_particle_system, u_particle_system, neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_system_coords, semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_system_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. @@ -89,17 +85,17 @@ function interact!(dv, v_particle_system, u_particle_system, return dv end -function pressure_evolution!(dv, particle_system, neighbor_system, v_diff, grad_kernel, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, p_a, p_b, rho_a, rho_b, - fluid_system::WeaklyCompressibleSPHSystem) +@inline function pressure_evolution!(dv, particle_system, neighbor_system, v_diff, + grad_kernel, particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, p_a, p_b, rho_a, rho_b, + fluid_system::WeaklyCompressibleSPHSystem) return dv end -function pressure_evolution!(dv, particle_system, neighbor_system, v_diff, grad_kernel, - particle, neighbor, pos_diff, distance, - sound_speed, m_a, m_b, p_a, p_b, rho_a, rho_b, - fluid_system::EntropicallyDampedSPHSystem) +@inline function pressure_evolution!(dv, particle_system, neighbor_system, v_diff, + grad_kernel, particle, neighbor, pos_diff, distance, + sound_speed, m_a, m_b, p_a, p_b, rho_a, rho_b, + fluid_system::EntropicallyDampedSPHSystem) pressure_evolution!(dv, particle_system, neighbor_system, v_diff, grad_kernel, particle, neighbor, pos_diff, distance, sound_speed, m_a, m_b, p_a, p_b, rho_a, rho_b, fluid_system.nu_edac) diff --git a/src/schemes/boundary/wall_boundary/rhs.jl b/src/schemes/boundary/wall_boundary/rhs.jl index e35b05b7a3..8c294cb19d 100644 --- a/src/schemes/boundary/wall_boundary/rhs.jl +++ b/src/schemes/boundary/wall_boundary/rhs.jl @@ -8,20 +8,20 @@ end # For dummy particles with `ContinuityDensity`, solve the continuity equation -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::WallBoundarySystem{<:BoundaryModelDummyParticles{ContinuityDensity}}, - neighbor_system::Union{AbstractFluidSystem, - OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}}, - semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::WallBoundarySystem{<:BoundaryModelDummyParticles{ContinuityDensity}}, + neighbor_system::Union{AbstractFluidSystem, + OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang}}, + semi, nhs, particle) (; boundary_model) = particle_system system_coords = current_coordinates(u_particle_system, particle_system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff. - foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, - semi) do particle, neighbor, pos_diff, distance + foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance m_b = hydrodynamic_mass(neighbor_system, neighbor) rho_a = current_density(v_particle_system, particle_system, particle) @@ -42,13 +42,13 @@ end # This is the derivative of the density summation, which is compatible with the # `SummationDensity` pressure acceleration. # Energy preservation tests will fail with the other formulation. -function continuity_equation!(dv, fluid_density_calculator::SummationDensity, - m_b, rho_a, rho_b, v_diff, grad_kernel, particle) +@inline function continuity_equation!(dv, fluid_density_calculator::SummationDensity, + m_b, rho_a, rho_b, v_diff, grad_kernel, particle) dv[end, particle] += m_b * dot(v_diff, grad_kernel) end # This is identical to the continuity equation of the fluid -function continuity_equation!(dv, fluid_density_calculator::ContinuityDensity, - m_b, rho_a, rho_b, v_diff, grad_kernel, particle) +@inline function continuity_equation!(dv, fluid_density_calculator::ContinuityDensity, + m_b, rho_a, rho_b, v_diff, grad_kernel, particle) dv[end, particle] += rho_a / rho_b * m_b * dot(v_diff, grad_kernel) end diff --git a/src/schemes/fluid/entropically_damped_sph/rhs.jl b/src/schemes/fluid/entropically_damped_sph/rhs.jl index cea18484d4..654438e881 100644 --- a/src/schemes/fluid/entropically_damped_sph/rhs.jl +++ b/src/schemes/fluid/entropically_damped_sph/rhs.jl @@ -1,8 +1,8 @@ # Fluid-fluid and fluid-boundary interaction -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::EntropicallyDampedSPHSystem, - neighbor_system, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::EntropicallyDampedSPHSystem, + neighbor_system, semi, nhs, particle) (; sound_speed, density_calculator, correction, nu_edac) = particle_system system_coords = current_coordinates(u_particle_system, particle_system) @@ -12,12 +12,8 @@ function interact!(dv, v_particle_system, u_particle_system, surface_tension_b = surface_tension_model(neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff - foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_coords, semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. diff --git a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl index ca7c4b2c35..d5b32968e6 100644 --- a/src/schemes/fluid/implicit_incompressible_sph/rhs.jl +++ b/src/schemes/fluid/implicit_incompressible_sph/rhs.jl @@ -2,21 +2,17 @@ # in `neighbor_system` and updates `dv` accordingly. # It takes into account pressure forces, viscosity, and for `ContinuityDensity` updates the density # using the continuity equation. -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::ImplicitIncompressibleSPHSystem, - neighbor_system, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::ImplicitIncompressibleSPHSystem, + neighbor_system, semi, nhs, particle) sound_speed = system_sound_speed(particle_system) #TODO system_coords = current_coordinates(u_particle_system, particle_system) neighbor_system_coords = current_coordinates(u_neighbor_system, neighbor_system) # Loop over all pairs of particles and neighbors within the kernel cutoff. - foreach_point_neighbor(particle_system, neighbor_system, - system_coords, neighbor_system_coords, semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_system_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # `foreach_point_neighbor` makes sure that `particle` and `neighbor` are # in bounds of the respective system. For performance reasons, we use `@inbounds` # in this hot loop to avoid bounds checking when extracting particle quantities. diff --git a/src/schemes/structure/discrete_element_method/rhs.jl b/src/schemes/structure/discrete_element_method/rhs.jl index 0fd327b10c..8bae10d5b5 100644 --- a/src/schemes/structure/discrete_element_method/rhs.jl +++ b/src/schemes/structure/discrete_element_method/rhs.jl @@ -1,17 +1,14 @@ -function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, - u_neighbor_system, particle_system::DEMSystem, - neighbor_system::Union{BoundaryDEMSystem, DEMSystem}, semi) +@inline function interact!(dv, v_particle_system, u_particle_system, v_neighbor_system, + u_neighbor_system, particle_system::DEMSystem, + neighbor_system::Union{BoundaryDEMSystem, DEMSystem}, + semi, nhs, particle) damping_coefficient = particle_system.damping_coefficient system_coords = current_coordinates(u_particle_system, particle_system) neighbor_coords = current_coordinates(u_neighbor_system, neighbor_system) - foreach_point_neighbor(particle_system, neighbor_system, system_coords, neighbor_coords, - semi; - points=each_integrated_particle(particle_system)) do particle, - neighbor, - pos_diff, - distance + foreach_neighbor(system_coords, neighbor_coords, + nhs, particle) do particle, neighbor, pos_diff, distance # See `src/general/smoothing_kernels.jl` for more details distance^2 < eps(first(particle_system.radius)^2) && return diff --git a/src/schemes/structure/total_lagrangian_sph/rhs.jl b/src/schemes/structure/total_lagrangian_sph/rhs.jl index 76b4d28bc3..cdeb911953 100644 --- a/src/schemes/structure/total_lagrangian_sph/rhs.jl +++ b/src/schemes/structure/total_lagrangian_sph/rhs.jl @@ -1,10 +1,10 @@ # Structure-structure interaction -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::TotalLagrangianSPHSystem, - neighbor_system::TotalLagrangianSPHSystem, - semi, nhs, particle; - integrate_tlsph=semi.integrate_tlsph[]) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::TotalLagrangianSPHSystem, + neighbor_system::TotalLagrangianSPHSystem, + semi, nhs, particle; + integrate_tlsph=semi.integrate_tlsph[]) # Different structures do not interact with each other (yet) particle_system === neighbor_system || return dv @@ -77,11 +77,12 @@ end end # Structure-fluid interaction -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::TotalLagrangianSPHSystem, - neighbor_system::AbstractFluidSystem, semi, nhs, particle; - integrate_tlsph=semi.integrate_tlsph[]) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::TotalLagrangianSPHSystem, + neighbor_system::AbstractFluidSystem, + semi, nhs, particle; + integrate_tlsph=semi.integrate_tlsph[]) # Skip interaction if TLSPH systems are integrated separately integrate_tlsph || return dv @@ -180,12 +181,12 @@ end end # Structure-boundary interaction -function interact!(dv, v_particle_system, u_particle_system, - v_neighbor_system, u_neighbor_system, - particle_system::TotalLagrangianSPHSystem, - neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem}, - semi, nhs, particle; - integrate_tlsph=semi.integrate_tlsph[]) +@inline function interact!(dv, v_particle_system, u_particle_system, + v_neighbor_system, u_neighbor_system, + particle_system::TotalLagrangianSPHSystem, + neighbor_system::Union{WallBoundarySystem, OpenBoundarySystem}, + semi, nhs, particle; + integrate_tlsph=semi.integrate_tlsph[]) # TODO continuity equation? return dv end From d94562e724aac7a1bb93173d73fd289a9cb95f00 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Wed, 4 Mar 2026 17:32:56 +0100 Subject: [PATCH 4/5] Add option to remove synchronization between `interact!` kernels on GPUs --- src/TrixiParticles.jl | 4 ++-- src/general/gpu.jl | 23 ++++++++++++++--------- src/general/semidiscretization.jl | 26 +++++++++++++++++++++++++- 3 files changed, 41 insertions(+), 12 deletions(-) diff --git a/src/TrixiParticles.jl b/src/TrixiParticles.jl index bc3cbca3a2..d7ebc4c0b2 100644 --- a/src/TrixiParticles.jl +++ b/src/TrixiParticles.jl @@ -41,8 +41,8 @@ using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled, FullGridCellList, DictionaryCellList, SerialBackend, PolyesterBackend, ThreadsStaticBackend, ThreadsDynamicBackend, default_backend -using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search, - @threaded +using PointNeighbors: PointNeighbors, foreach_point_neighbor, foreach_neighbor, + copy_neighborhood_search, @threaded, @threaded_nosync using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save # `util.jl` needs to be first because of the macros `@trixi_timeit` and `@threaded` diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 6b07775b4f..37e54f5f3a 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -24,20 +24,17 @@ Adapt.@adapt_structure DEMSystem Adapt.@adapt_structure BoundaryDEMSystem Adapt.@adapt_structure RCRWindkesselModel -KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU() -function KernelAbstractions.get_backend(system::AbstractSystem) - KernelAbstractions.get_backend(system.mass) -end - -function KernelAbstractions.get_backend(system::WallBoundarySystem) - KernelAbstractions.get_backend(system.coordinates) -end - # This makes `@threaded semi for ...` use `semi.parallelization_backend` for parallelization @inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization) PointNeighbors.parallel_foreach(f, iterator, semi.parallelization_backend) end +# Same with `@threaded_nosync` +@inline function PointNeighbors.parallel_foreach_nosync(f, iterator, + semi::Semidiscretization) + PointNeighbors.parallel_foreach_nosync(f, iterator, semi.parallelization_backend) +end + function allocate(backend::KernelAbstractions.GPU, ELTYPE, size) return KernelAbstractions.allocate(backend, ELTYPE, size) end @@ -45,3 +42,11 @@ end function allocate(backend, ELTYPE, size) return Array{ELTYPE, length(size)}(undef, size) end + +@inline function synchronize_backend(backend::KernelAbstractions.GPU) + return KernelAbstractions.synchronize(backend) +end + +@inline function synchronize_backend(backend) + return nothing +end diff --git a/src/general/semidiscretization.jl b/src/general/semidiscretization.jl index ea5380d6aa..a5bb1c7b0f 100644 --- a/src/general/semidiscretization.jl +++ b/src/general/semidiscretization.jl @@ -128,6 +128,11 @@ end struct IndividualTimers end struct CombinedTimers end +struct NoTimers end + +# Individual timers are usually not worth the overhead on GPUs +default_timers(parallelization_backend) = IndividualTimers() +default_timers(::KernelAbstractions.GPU) = CombinedTimers() # Inline show function e.g. Semidiscretization(neighborhood_search=...) function Base.show(io::IO, semi::Semidiscretization) @@ -702,6 +707,21 @@ function system_interaction!(dv_ode, v_ode, u_ode, return dv_ode end +function system_interaction!(dv_ode, v_ode, u_ode, + semi::Semidiscretization{NoTimers}) + foreach_system(semi) do system + # Call a combined `interact!` for all interactions of this system with other systems. + # Since no timers are used, we can avoid synchronization, as each combined + # interaction will only write to the part of `dv_ode` corresponding its system. + interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=false) + end + + # Now manually synchronize, since we disabled synchronization above + synchronize_backend(semi.parallelization_backend) + + return dv_ode +end + # Function barrier to make benchmarking interactions easier. # One can benchmark, e.g. the fluid-fluid interaction, with: # dv_ode, du_ode = copy(sol.u[end]).x; v_ode, u_ode = copy(sol.u[end]).x; @@ -746,7 +766,9 @@ function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true # Loop over all particles that are integrated for this system, i.e., all particles # for which `dv` has entries. - @threaded semi for particle in each_integrated_particle(system) + # `@threaded_nosync` is the same as `@threaded` but without synchronization on GPUs. + # Manual synchronization is done below. + @threaded_nosync semi for particle in each_integrated_particle(system) # Now loop over all neighbor systems to avoid separate loops/kernels # for each pair of systems. foreach_noalloc(iterator) do (neighbor, v_neighbor, u_neighbor, nhs) @@ -754,6 +776,8 @@ function interact_combined!(dv_ode, v_ode, u_ode, system, semi; synchronize=true system, neighbor, semi, nhs, particle) end end + + synchronize && synchronize_backend(semi.parallelization_backend) end function check_update_callback(semi) From e3c2be5b74e3e36c3e460214e73f5580a6312c50 Mon Sep 17 00:00:00 2001 From: Erik Faulhaber <44124897+efaulhaber@users.noreply.github.com> Date: Thu, 5 Mar 2026 16:35:53 +0100 Subject: [PATCH 5/5] Unify return type of `synchronize_backend` --- src/general/gpu.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/general/gpu.jl b/src/general/gpu.jl index 37e54f5f3a..68947e2041 100644 --- a/src/general/gpu.jl +++ b/src/general/gpu.jl @@ -44,7 +44,8 @@ function allocate(backend, ELTYPE, size) end @inline function synchronize_backend(backend::KernelAbstractions.GPU) - return KernelAbstractions.synchronize(backend) + KernelAbstractions.synchronize(backend) + return nothing end @inline function synchronize_backend(backend)