Skip to content
Draft
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
72 changes: 34 additions & 38 deletions examples/n_body/n_body_benchmark_trixi.jl
Original file line number Diff line number Diff line change
Expand Up @@ -6,41 +6,38 @@

using TrixiParticles
using Printf
using Polyester

include("n_body_system.jl")

# Redefine interact in a more optimized way
function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
neighborhood_search,
particle_system::NBodySystem,
neighbor_system::NBodySystem)
@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem,
semi, nhs, particle)
(; mass, G) = neighbor_system

for particle in TrixiParticles.each_integrated_particle(particle_system)
particle_coords = TrixiParticles.current_coords(u_particle_system,
particle_system, particle)

# This makes `interact!` about 20% faster than looping over all particles
# and checking for `particle < neighbor`.
# Note that this doesn't work if we have multiple systems.
for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system)
neighbor_coords = TrixiParticles.current_coords(u_neighbor_system,
neighbor_system, neighbor)
pos_diff = particle_coords - neighbor_coords

# Multiplying by pos_diff later makes this slightly faster.
# Multiplying by (1 / norm) is also faster than dividing by norm.
tmp = -G * (1 / norm(pos_diff)^3)
tmp1 = mass[neighbor] * tmp
tmp2 = mass[particle] * tmp

@inbounds for i in 1:ndims(particle_system)
# This is slightly faster than dv[...] += tmp1 * pos_diff[i]
dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle])
dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor])
end
particle_coords = @inbounds TrixiParticles.current_coords(u_particle_system,
particle_system, particle)

# This makes `interact!` about 20% faster than looping over all particles
# and checking for `particle < neighbor`.
# Note that this doesn't work if we have multiple systems.
for neighbor in (particle + 1):TrixiParticles.nparticles(neighbor_system)
neighbor_coords = @inbounds TrixiParticles.current_coords(u_neighbor_system,
neighbor_system, neighbor)
pos_diff = particle_coords - neighbor_coords

# Multiplying by pos_diff later makes this slightly faster.
# Multiplying by (1 / norm) is also faster than dividing by norm.
tmp = -G * (1 / norm(pos_diff)^3)
tmp1 = @inbounds mass[neighbor] * tmp
tmp2 = @inbounds mass[particle] * tmp

for i in 1:ndims(particle_system)
# This is slightly faster than dv[...] += tmp1 * pos_diff[i]
@inbounds dv[i, particle] = muladd(tmp1, pos_diff[i], dv[i, particle])
@inbounds dv[i, neighbor] = muladd(tmp2, -pos_diff[i], dv[i, neighbor])
end
end

Expand Down Expand Up @@ -77,7 +74,9 @@ particle_system = NBodySystem(initial_condition, G)
# ==========================================================================================
# ==== Simulation

semi = Semidiscretization(particle_system, neighborhood_search=nothing)
# Disable multithreading, since it adds a significant overhead for this small problem
semi = Semidiscretization(particle_system, neighborhood_search=nothing,
parallelization_backend=SerialBackend())

# This is significantly faster than using OrdinaryDiffEq.
function symplectic_euler!(velocity, coordinates, semi)
Expand All @@ -89,14 +88,14 @@ function symplectic_euler!(velocity, coordinates, semi)
@time for _ in 1:50_000_000
TrixiParticles.kick!(dv, v, u, semi, 0.0)

@inbounds for i in eachindex(v)
v[i] += 0.01 * dv[i]
for i in eachindex(v)
@inbounds v[i] += 0.01 * dv[i]
end

TrixiParticles.drift!(du, v, u, semi, 0.0)

@inbounds for i in eachindex(u)
u[i] += 0.01 * du[i]
for i in eachindex(u)
@inbounds u[i] += 0.01 * du[i]
end
end

Expand All @@ -115,10 +114,7 @@ end

@printf("%.9f\n", energy(velocity, coordinates, particle_system, semi))

# Disable multithreading, since it adds a significant overhead for this small problem.
disable_polyester_threads() do
symplectic_euler!(velocity, coordinates, semi)
end
symplectic_euler!(velocity, coordinates, semi)

@printf("%.9f\n", energy(velocity, coordinates, particle_system, semi))

Expand Down
26 changes: 13 additions & 13 deletions examples/n_body/n_body_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -15,7 +15,7 @@ struct NBodySystem{NDIMS, ELTYPE <: Real, IC} <: TrixiParticles.AbstractSystem{N
end
end

TrixiParticles.timer_name(::NBodySystem) = "nbody"
# Label under which this system appears in the timer output.
@inline function TrixiParticles.timer_name(::NBodySystem)
    return "nbody"
end

# Numeric element type of the system, extracted from its type parameter.
@inline function Base.eltype(::NBodySystem{NDIMS, ELTYPE}) where {NDIMS, ELTYPE}
    return ELTYPE
end

Expand All @@ -40,25 +40,25 @@ function TrixiParticles.update_nhs!(neighborhood_search,
points_moving=(true, true))
end

function TrixiParticles.compact_support(system::NBodySystem,
neighbor::NBodySystem)
# Gravitational interaction has no cutoff radius, so every particle pair
# interacts and the compact support is unbounded.
@inline TrixiParticles.compact_support(::NBodySystem, ::NBodySystem) = Inf

function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem, semi)
@inline function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
v_neighbor_system, u_neighbor_system,
particle_system::NBodySystem,
neighbor_system::NBodySystem,
semi, nhs, particle)
(; mass, G) = neighbor_system

system_coords = TrixiParticles.current_coordinates(u_particle_system, particle_system)
neighbor_coords = TrixiParticles.current_coordinates(u_neighbor_system, neighbor_system)

# Loop over all pairs of particles and neighbors within the kernel cutoff.
TrixiParticles.foreach_point_neighbor(particle_system, neighbor_system,
system_coords, neighbor_coords,
semi) do particle, neighbor, pos_diff, distance
TrixiParticles.foreach_neighbor(system_coords, neighbor_coords,
nhs, particle) do particle, neighbor, pos_diff, distance
# No interaction of a particle with itself
particle_system === neighbor_system && particle === neighbor && return

Expand All @@ -77,7 +77,7 @@ function TrixiParticles.interact!(dv, v_particle_system, u_particle_system,
return dv
end

function energy(v_ode, u_ode, system, semi)
@inline function energy(v_ode, u_ode, system, semi)
(; mass) = system

e = zero(eltype(system))
Expand All @@ -103,9 +103,9 @@ function energy(v_ode, u_ode, system, semi)
return e
end

TrixiParticles.vtkname(system::NBodySystem) = "n-body"
# Name identifying this system in VTK output.
@inline function TrixiParticles.vtkname(system::NBodySystem)
    return "n-body"
end

function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem)
@inline function TrixiParticles.write2vtk!(vtk, v, u, t, system::NBodySystem)
(; mass) = system

vtk["velocity"] = v
Expand Down
4 changes: 2 additions & 2 deletions src/TrixiParticles.jl
Original file line number Diff line number Diff line change
Expand Up @@ -41,8 +41,8 @@ using TrixiBase: @trixi_timeit, timer, timeit_debug_enabled,
FullGridCellList, DictionaryCellList,
SerialBackend, PolyesterBackend, ThreadsStaticBackend,
ThreadsDynamicBackend, default_backend
using PointNeighbors: PointNeighbors, foreach_point_neighbor, copy_neighborhood_search,
@threaded
using PointNeighbors: PointNeighbors, foreach_point_neighbor, foreach_neighbor,
copy_neighborhood_search, @threaded, @threaded_nosync
using WriteVTK: vtk_grid, MeshCell, VTKCellTypes, paraview_collection, vtk_save

# `util.jl` needs to be first because of the macros `@trixi_timeit` and `@threaded`
Expand Down
11 changes: 9 additions & 2 deletions src/general/abstract_system.jl
Original file line number Diff line number Diff line change
Expand Up @@ -26,13 +26,20 @@ vtkname(system::AbstractBoundarySystem) = "boundary"
# Number of particles in the system
@inline nparticles(system) = length(system.mass)

# Number of particles in the system whose positions are to be integrated (corresponds to the size of u and du)
# Number of particles in the system whose positions are to be integrated
# (corresponds to the size of u and du).
# See comment below for `each_integrated_particle` for more details.
@inline n_integrated_particles(system) = nparticles(system)

# Iterator over all particle indices in the system
@inline eachparticle(system::AbstractSystem) = each_active_particle(system)
@inline eachparticle(initial_condition) = Base.OneTo(nparticles(initial_condition))

# Wrapper for systems with `SystemBuffer`
# Iterator over all particle indices in the system whose positions are to be integrated
# (corresponds to the size of u and du).
# For most systems, this is the same as `eachparticle`, but for the
# `TotalLagrangianSPHSystem`, the clamped particles are included in `eachparticle`
# but not in `each_integrated_particle`.
@inline each_integrated_particle(system) = each_integrated_particle(system, buffer(system))
@inline function each_integrated_particle(system, ::Nothing)
return Base.OneTo(n_integrated_particles(system))
Expand Down
24 changes: 15 additions & 9 deletions src/general/gpu.jl
Original file line number Diff line number Diff line change
Expand Up @@ -24,24 +24,30 @@ Adapt.@adapt_structure DEMSystem
Adapt.@adapt_structure BoundaryDEMSystem
Adapt.@adapt_structure RCRWindkesselModel

KernelAbstractions.get_backend(::PtrArray) = KernelAbstractions.CPU()
# The compute backend of a system is determined by where its `mass` array lives.
KernelAbstractions.get_backend(system::AbstractSystem) =
    KernelAbstractions.get_backend(system.mass)

# For wall boundary systems, the backend is read from `coordinates`
# instead of `mass`.
KernelAbstractions.get_backend(system::WallBoundarySystem) =
    KernelAbstractions.get_backend(system.coordinates)

# This makes `@threaded semi for ...` use `semi.parallelization_backend` for parallelization
# Dispatch `@threaded semi for ...` loops to the parallelization backend
# stored in the semidiscretization.
@inline function PointNeighbors.parallel_foreach(f, iterator, semi::Semidiscretization)
    backend = semi.parallelization_backend
    return PointNeighbors.parallel_foreach(f, iterator, backend)
end

# Same with `@threaded_nosync`
# Same dispatch for `@threaded_nosync`: forward to the backend stored in
# the semidiscretization.
@inline function PointNeighbors.parallel_foreach_nosync(f, iterator,
                                                        semi::Semidiscretization)
    backend = semi.parallelization_backend
    return PointNeighbors.parallel_foreach_nosync(f, iterator, backend)
end

# Allocate an uninitialized array of element type `ELTYPE` and dimensions
# `size` on the given GPU backend via KernelAbstractions.
allocate(backend::KernelAbstractions.GPU, ELTYPE, size) =
    KernelAbstractions.allocate(backend, ELTYPE, size)

# Fallback for non-GPU backends: allocate a plain uninitialized `Array`.
# The dimensionality is taken from the length of the `size` tuple.
allocate(backend, ELTYPE, size) = Array{ELTYPE, length(size)}(undef, size)

@inline function synchronize_backend(gpu_backend::KernelAbstractions.GPU)
    # Block until all kernels queued on this GPU backend have completed.
    KernelAbstractions.synchronize(gpu_backend)
    return nothing
end

# Non-GPU backends run synchronously, so there is nothing to wait for.
@inline synchronize_backend(backend) = nothing
Loading
Loading