Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 34 additions & 38 deletions examples/n_body/n_body_benchmark_trixi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,38 @@

using TrixiParticles
using Printf
using Polyester

include("n_body_system.jl")

# Redefine interact in a more optimized way
function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
neighborhood_search,
particle_system::NBodySystem,
neighbor_system::NBodySystem)
@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem,
semi, nhs, particle)
(; mass, G) = neighbor_system

for particle in TrixiParticles.each_integrated_particle(particle_system)
particle_coords = TrixiParticles.current_coords(u_particle_system,
particle_system, particle)

# This makes `interact!` about 20% faster than looping over all particles
# and checking for `particle < neighbor`.
# Note that this doesn't work if we have multiple systems.
for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system)
neighbor_coords = TrixiParticles.current_coords(u_neighbor_system,
neighbor_system, neighbor)
pos_diff = particle_coords - neighbor_coords

# Multiplying by pos_diff later makes this slightly faster.
# Multiplying by (1 / norm) is also faster than dividing by norm.
tmp = -G * (1 / norm(pos_diff)^3)
tmp1 = mass[neighbor] * tmp
tmp2 = mass[particle] * tmp

@inbounds for i in 1:ndims(particle_system)
# This is slightly faster than dv[...] += tmp1 * pos_diff[i]
dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle])
dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor])
end
particle_coords = @inbounds TrixiParticles.current_coords(u_particle_system,
particle_system, particle)

# This makes `interact!` about 20% faster than looping over all particles
# and checking for `particle < neighbor`.
# Note that this doesn't work if we have multiple systems.
for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system)
neighbor_coords = @inbounds TrixiParticles.current_coords(u_neighbor_system,
neighbor_system, neighbor)
pos_diff = particle_coords - neighbor_coords

# Multiplying by pos_diff later makes this slightly faster.
# Multiplying by (1 / norm) is also faster than dividing by norm.
tmp = -G * (1 / norm(pos_diff)^3)
tmp1 = @inbounds mass[neighbor] * tmp
tmp2 = @inbounds mass[particle] * tmp

for i in 1:ndims(particle_system)
# This is slightly faster than dv[...] += tmp1 * pos_diff[i]
@inbounds dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle])
@inbounds dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor])
end
end

Expand Down Expand Up @@ -77,7 +74,9 @@ particle_system = NBodySystem(initial_condition, G)
# ==========================================================================================
# ==== Simulation

semi = Semidiscretization(particle_system, neighborhood_search=nothing)
# Disable multithreading, since it adds a significant overhead for this small problem
semi = Semidiscretization(particle_system, neighborhood_search=nothing,
parallelization_backend=SerialBackend())

# This is significantly faster than using OrdinaryDiffEq.
function symplectic_euler!(velocity, coordinates, semi)
Expand All @@ -89,14 +88,14 @@ function symplectic_euler!(velocity, coordinates, semi)
@time for _ in 1:50_000_000
TrixiParticles.kick!(dv, v, u, semi, 0.0)

@inbounds for i in eachindex(v)
v[i] += 0.01 * dv[i]
for i in eachindex(v)
@inbounds v[i] += 0.01 * dv[i]
end

TrixiParticles.drift!(du, v, u, semi, 0.0)

@inbounds for i in eachindex(u)
u[i] += 0.01 * du[i]
for i in eachindex(u)
@inbounds u[i] += 0.01 * du[i]
end
end

Expand All @@ -115,10 +114,7 @@ end

@printf("%.9f\n", energy(velocity, coordinates, particle_system, semi))

# Disable multithreading, since it adds a significant overhead for this small problem.
disable_polyester_threads() do
symplectic_euler!(velocity, coordinates, semi)
end
symplectic_euler!(velocity, coordinates, semi)

@printf("%.9f\n", energy(velocity, coordinates, particle_system, semi))

Expand Down
26 changes: 13 additions & 13 deletions examples/n_body/n_body_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ struct NBodySystem{NDIMS, ELTYPE <: Real, IC} <: TrixiParticles.AbstractSystem{N
end
end

TrixiParticles.timer_name(::NBodySystem) = "nbody"
# Label under which this system appears in the timer output.
@inline function TrixiParticles.timer_name(::NBodySystem)
    return "nbody"
end

# Numeric element type of the system, extracted from its type parameter.
@inline function Base.eltype(::NBodySystem{NDIMS, ELTYPE}) where {NDIMS, ELTYPE}
    return ELTYPE
end

Expand All @@ -40,25 +40,25 @@ function TrixiParticles.update_nhs!(neighborhood_search,
points_moving=(true, true))
end

function TrixiParticles.compact_support(system::NBodySystem,
neighbor::NBodySystem)
# Gravitational interaction has no cutoff radius, so every particle pair
# interacts and the compact support is unbounded.
@inline TrixiParticles.compact_support(::NBodySystem, ::NBodySystem) = Inf

function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem, semi)
@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem,
semi, nhs, particle)
(; mass, G) = neighbor_system

system_coords = TrixiParticles.current_coordinates(u_particle_system, particle_system)
neighbor_coords = TrixiParticles.current_coordinates(u_neighbor_system, neighbor_system)

# Loop over all pairs of particles and neighbors within the kernel cutoff.
TrixiParticles.foreach_point_neighbor(particle_system, neighbor_system,
system_coords, neighbor_coords,
semi) do particle, neighbor, pos_diff, distance
TrixiParticles.foreach_neighbor(system_coords, neighbor_coords,
nhs, particle) do particle, neighbor, pos_diff, distance
# No interaction of a particle with itself
particle_system === neighbor_system && particle === neighbor && return

Expand All @@ -77,7 +77,7 @@ function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
return dv
end

function energy(v_ode, u_ode, system, semi)
@inline function energy(v_ode, u_ode, system, semi)
(; mass) = system

e = zero(eltype(system))
Expand All @@ -103,9 +103,9 @@ function energy(v_ode, u_ode, system, semi)
return e
end

TrixiParticles.vtkname(system::NBodySystem) = "n-body"
# Name identifying this system in VTK output.
@inline function TrixiParticles.vtkname(system::NBodySystem)
    return "n-body"
end

function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem)
@inline function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem)
(; mass) = system

vtk["velocity"] = v
Expand Down
4 changes: 2 additions & 2 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled,
FullGridCellList, DictionaryCellList,
SerialBackend, PolyesterBackend, ThreadsStaticBackend,
ThreadsDynamicBackend, default_backend
using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search,
@threaded
using PointNeighbors: PointNeighbors, foreach_point_neighbor, foreach_neighbor,
copy_neighborhood_search, @threaded, @threaded_nosync
using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save

# `util.jl` needs to be first because of the macros `@trixi_timeit` and `@threaded`
Expand Down
11 changes: 9 additions & 2 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,20 @@ vtkname(system::AbstractBoundarySystem) = "boundary"
# Number of particles in the system
@inline nparticles(system) = length(system.mass)

# Number of particles in the system whose positions are to be integrated (corresponds to the size of u and du)
# Number of particles in the system whose positions are to be integrated
# (corresponds to the size of u and du).
# See comment below for `each_integrated_particle` for more details.
@inline n_integrated_particles(system) = nparticles(system)

# Iterator over all particle indices in the system
@inline eachparticle(system::AbstractSystem) = each_active_particle(system)
@inline eachparticle(initial_condition) = Base.OneTo(nparticles(initial_condition))

# Wrapper for systems with `SystemBuffer`
# Iterator over all particle indices in the system whose positions are to be integrated
# (corresponds to the size of u and du).
# For most systems, this is the same as `eachparticle`, but for the
# `TotalLagrangianSPHSystem`, the clamped particles are included in `eachparticle`
# but not in `each_integrated_particle`.
@inline each_integrated_particle(system) = each_integrated_particle(system, buffer(system))
@inline function each_integrated_particle(system, ::Nothing)
return Base.OneTo(n_integrated_particles(system))
Expand Down
24 changes: 15 additions & 9 deletions src/general/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,30 @@ Adapt.@adapt_structure DEMSystem
Adapt.@adapt_structure BoundaryDEMSystem
Adapt.@adapt_structure RCRWindkesselModel

KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU()
# The compute backend of a system is determined by where its `mass` array lives.
KernelAbstractions.get_backend(system::AbstractSystem) =
    KernelAbstractions.get_backend(system.mass)

# For wall boundary systems, the backend is read from `coordinates`
# instead of `mass`.
KernelAbstractions.get_backend(system::WallBoundarySystem) =
    KernelAbstractions.get_backend(system.coordinates)

# This makes `@threaded semi for ...` use `semi.parallelization_backend` for parallelization
# Dispatch `@threaded semi for ...` loops to the parallelization backend
# stored in the semidiscretization.
@inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
    backend = semi.parallelization_backend
    return PointNeighbors.parallel_foreach(f, iterator, backend)
end

# Same with `@threaded_nosync`
# Same dispatch for `@threaded_nosync`: forward to the backend stored in
# the semidiscretization.
@inline function PointNeighbors.parallel_foreach_nosync(f, iterator,
                                                        semi::Semidiscretization)
    backend = semi.parallelization_backend
    return PointNeighbors.parallel_foreach_nosync(f, iterator, backend)
end

# Allocate an uninitialized array of element type `ELTYPE` and dimensions
# `size` on the given GPU backend via KernelAbstractions.
allocate(backend::KernelAbstractions.GPU, ELTYPE, size) =
    KernelAbstractions.allocate(backend, ELTYPE, size)

# Fallback for non-GPU backends: allocate a plain uninitialized `Array`.
# The dimensionality is taken from the length of the `size` tuple.
allocate(backend, ELTYPE, size) = Array{ELTYPE, length(size)}(undef, size)

@inline function synchronize_backend(gpu_backend::KernelAbstractions.GPU)
    # Block until all kernels queued on this GPU backend have completed.
    KernelAbstractions.synchronize(gpu_backend)
    return nothing
end

# Non-GPU backends run synchronously, so there is nothing to wait for.
@inline synchronize_backend(backend) = nothing
Loading
Loading