Commit 0ea8756

Merge branch 'main' into hagen_poiseuille_flow
2 parents d8d52db + 3a5900a
24 files changed: +461 −221 lines changed

Project.toml

Lines changed: 3 additions & 0 deletions

```diff
@@ -37,14 +37,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 [weakdeps]
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"

 [extensions]
 TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
+TrixiParticlesCUDAExt = "CUDA"

 [compat]
 Accessors = "0.1.43"
 Adapt = "4"
 CSV = "0.10"
+CUDA = "5.9.1"
 DataFrames = "1.6"
 DelimitedFiles = "1"
 DiffEqCallbacks = "4"
```

docs/src/development.md

Lines changed: 42 additions & 0 deletions

```diff
@@ -66,3 +66,45 @@ To create a new release for TrixiParticles.jl, perform the following steps:
    version should be `v0.3.1-dev`. If you just released `v0.2.4`, the new development
    version should be `v0.2.5-dev`.

+## [Writing GPU-compatible code](@id writing_gpu_code)
+
+When implementing new functionality that should run on both CPUs and GPUs,
+follow these rules:
+
+1. Data structures must be generic and parametric.
+   Do not hardcode concrete CPU array types like `Vector` or `Matrix` in fields.
+   Use type parameters, so the same structure can store CPU arrays and GPU arrays.
+2. Add an Adapt.jl rule in `src/general/gpu.jl`.
+   Register the new type with `Adapt.@adapt_structure ...`, so `adapt` can recursively
+   convert all arrays inside the structure to GPU arrays.
+   This conversion is then applied automatically inside `semidiscretize`.
+3. Use `@threaded` for all loops.
+   Accessing GPU arrays inside regular loops is not allowed.
+   With a GPU backend, `@threaded` loops are compiled to GPU kernels.
+4. Write type-stable code and do not allocate inside `@threaded` loops.
+   This is required for GPU kernels and is also essential for fast multithreaded CPU code.
+
+## [Writing fast GPU code](@id writing_fast_gpu_code)
+
+The following rules improve kernel performance and avoid common GPU pitfalls:
+
+1. Avoid exceptions and bounds errors inside kernels.
+   Perform all required checks before entering `@threaded` loops (that is, before GPU kernels).
+   Then use `@inbounds` directly at the loop where bounds are guaranteed.
+   In TrixiParticles.jl, we do not place `@inbounds` inside inner helper functions.
+   Instead, mark helper functions with `@propagate_inbounds` so the loop-level
+   `@inbounds` is propagated.
+2. Avoid implicit `Float64` literals in arithmetic.
+   For example, prefer `x / 2` over `0.5 * x` so `Float32` simulations stay in `Float32`.
+   Verify this with `@device_code`, or by confirming the kernel runs on an Apple GPU
+   (most Apple GPUs do not support `Float64`).
+3. Use `div_fast` in performance-critical divisions, but only after benchmarking (!).
+   It can significantly speed up kernels, but should not be applied indiscriminately.
+   When introducing `div_fast` in code, add a reference to [this section](@ref writing_fast_gpu_code)
+   to document the rationale and benchmarking context, e.g., like so:
+   ```julia
+   # Since this is one of the most performance critical functions, using fast divisions
+   # here gives a significant speedup on GPUs.
+   # See the docs page "Development" for more details on `div_fast`.
+   result = div_fast(dividend, divisor)
+   ```
```
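The first two rules of the "Writing GPU-compatible code" section can be sketched in a few lines. Everything below is a hypothetical illustration, not actual TrixiParticles code; only `Adapt.@adapt_structure` is the real Adapt.jl macro, shown here in a comment so the sketch stays self-contained:

```julia
# A parametric container: the array type is a type parameter instead of a
# hardcoded `Vector`, so the same struct can later hold GPU arrays
# (hypothetical example, not a TrixiParticles type).
struct ParticleContainer{ARRAY <: AbstractArray}
    mass    :: ARRAY
    density :: ARRAY
end

# With Adapt.jl loaded, the rule from step 2 would be a one-liner:
#     Adapt.@adapt_structure ParticleContainer
# so that `adapt` recursively converts all fields to GPU arrays.

# Generic, non-allocating, type-stable loop body. In TrixiParticles itself,
# such a loop would use `@threaded` so it compiles to a GPU kernel.
function total_mass(container)
    result = zero(eltype(container.mass))
    for i in eachindex(container.mass)
        result += container.mass[i]
    end
    return result
end

container = ParticleContainer([1.0, 2.0, 3.0], fill(1000.0, 3))
total_mass(container)  # 6.0
```

With a GPU backend loaded, `adapt` would return a `ParticleContainer` whose fields are device arrays, and the same generic `total_mass` would work unchanged.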
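The literal-promotion pitfall from rule 2 of the "Writing fast GPU code" section can be checked without a GPU, directly in a Julia REPL:

```julia
x = 1.0f0  # Float32 value, as in a Float32 simulation

typeof(0.5 * x)    # Float64: the Float64 literal 0.5 promotes the result
typeof(x / 2)      # Float32: dividing by an integer literal preserves Float32
typeof(0.5f0 * x)  # Float32: an explicit Float32 literal also works
```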

docs/src/gpu.md

Lines changed: 5 additions & 0 deletions

```diff
@@ -137,3 +137,8 @@ On GPUs that do not support `Float64`, such as most Apple GPUs, we also need to
 the coordinates to `Float32` by passing `coordinates_eltype=Float32` to
 the setup functions that create [`InitialCondition`](@ref)s, such as
 [`RectangularTank`](@ref), [`RectangularShape`](@ref), and [`SphereShape`](@ref).
+
+## Writing GPU-compatible code
+
+Please see the [development documentation](@ref writing_gpu_code) for guidelines on
+how to write GPU-compatible code.
```

ext/TrixiParticlesCUDAExt.jl

Lines changed: 34 additions & 0 deletions (new file)

```julia
# TODO this might be integrated into CUDA.jl at some point, see
# https://github.com/JuliaGPU/CUDA.jl/pull/3077
module TrixiParticlesCUDAExt

using CUDA: CUDA
using TrixiParticles: TrixiParticles

# Use faster version of `div_fast` for `Float64` on CUDA.
# By default, `div_fast` translates to `Base.FastMath.div_fast`, but there is
# no fast division for `Float64` on CUDA, so we need to redefine it here to use the
# improved fast reciprocal defined below.
CUDA.@device_override TrixiParticles.div_fast(x, y::Float64) = x * fast_inv_cuda(y)

# Improved fast reciprocal for `Float64` by @Mikolaj-A-Kowalski, which is significantly
# more accurate than just calling "llvm.nvvm.rcp.approx.ftz.d" without the cubic iteration,
# while still being much faster than a full division.
# This is copied from Oceananigans.jl, see https://github.com/CliMA/Oceananigans.jl/pull/5140.
@inline function fast_inv_cuda(a::Float64)
    # Get the approximate reciprocal
    # https://docs.nvidia.com/cuda/parallel-thread-execution/#floating-point-instructions-rcp-approx-ftz-f64
    # This instruction chops off the last 32 bits of the mantissa and computes the inverse
    # while treating all subnormal numbers as 0.0.
    # If the reciprocal would be subnormal, it underflows to 0.0.
    # The 32 least significant bits of the result are filled with 0s.
    inv_a = ccall("llvm.nvvm.rcp.approx.ftz.d", llvmcall, Float64, (Float64,), a)

    # Approximate the missing 32 bits of the mantissa with a single cubic iteration
    e = fma(inv_a, -a, 1.0)
    e = fma(e, e, e)
    inv_a = fma(e, inv_a, inv_a)

    return inv_a
end

end # module
```
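The accuracy of the cubic iteration above can be checked on the CPU by emulating the intrinsic. The sketch below assumes `rcp.approx.ftz.d` is well modeled by zeroing the 32 least significant mantissa bits of an exact reciprocal (a simplification; the real instruction also flushes subnormals to zero):

```julia
# CPU emulation of the approximate reciprocal: compute 1/a exactly,
# then zero the 32 least significant mantissa bits, as the PTX
# instruction documentation describes.
function rcp_approx_emulated(a::Float64)
    return reinterpret(Float64, reinterpret(UInt64, 1.0 / a) & ~UInt64(0xffffffff))
end

function fast_inv_emulated(a::Float64)
    inv_a = rcp_approx_emulated(a)  # only ~20 correct mantissa bits remain

    # The single cubic iteration from the extension above: with relative
    # error e ~ 2^-21 in `inv_a`, the update reduces the error to ~e^3,
    # which is below Float64 machine epsilon.
    e = fma(inv_a, -a, 1.0)
    e = fma(e, e, e)
    return fma(e, inv_a, inv_a)
end

fast_inv_emulated(3.0)  # ≈ 1/3 to nearly full Float64 precision
```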

src/general/buffer.jl

Lines changed: 11 additions & 3 deletions

```diff
@@ -40,10 +40,10 @@ end
 # Dispatch by system type to handle systems that provide a buffer.
 @inline buffer(system) = nothing

-@inline update_system_buffer!(buffer::Nothing, semi) = buffer
+@inline update_system_buffer!(buffer::Nothing) = buffer

 # TODO `resize` allocates. Find a non-allocating version
-@inline function update_system_buffer!(buffer::SystemBuffer, semi)
+@inline function update_system_buffer!(buffer::SystemBuffer)
     (; active_particle) = buffer

     # TODO: Parallelize (see https://github.com/trixi-framework/TrixiParticles.jl/issues/810)
@@ -64,7 +64,7 @@ end
     return view(buffer.eachparticle, 1:buffer.active_particle_count[])
 end

-@inline function deactivate_particle!(system, particle, u)
+@inline function deactivate_particle!(system, particle, v, u)
     (; active_particle) = system.buffer

     # Set particle far away from simulation domain
@@ -73,6 +73,14 @@
         u[dim, particle] = eltype(system)(1e16)
     end

+    # To ensure that the velocity of an inactive particle does not dominate the time step
+    # in adaptive time integrators, set this velocity to zero.
+    # Additionally, this enables map-reduce operations for `v_max` computation
+    # without having to distinguish inactive particles.
+    for dim in 1:ndims(system)
+        v[dim, particle] = 0
+    end
+
     # `deactivate_particle!` and `active_particle[particle] = true`
     # are never called on the same buffer inside a kernel,
     # so we don't have any race conditions on this `active_particle` vector.
```
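The map-reduce rationale in the added comment can be illustrated standalone (made-up velocity data, not the actual `v_max` implementation):

```julia
using LinearAlgebra: norm

# Velocities of three 2D particles; column 2 belongs to a deactivated
# particle and has been zeroed, as `deactivate_particle!` now does.
v = [1.0 0.0 3.0;
     2.0 0.0 4.0]

# Because inactive columns are zero, they can never dominate the maximum,
# so the reduction needs no `active_particle` mask.
v_max = maximum(norm(view(v, :, particle)) for particle in axes(v, 2))  # 5.0
```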

src/general/custom_quantities.jl

Lines changed: 0 additions & 2 deletions

```diff
@@ -6,8 +6,6 @@ Returns the total kinetic energy of all particles in a system.
 function kinetic_energy(system, dv_ode, du_ode, v_ode, u_ode, semi, t)
     v = wrap_v(v_ode, system, semi)

-    # TODO: `current_velocity` should only contain active particles
-    # (see https://github.com/trixi-framework/TrixiParticles.jl/issues/850)
     velocity = reinterpret(reshape, SVector{ndims(system), eltype(v)},
                            view(current_velocity(v, system), :,
                                 each_active_particle(system)))
```

src/io/write_vtk.jl

Lines changed: 7 additions & 12 deletions

```diff
@@ -350,18 +350,13 @@ function write2vtk!(vtk, v, u, t, system::AbstractFluidSystem)
         rho_b = current_density(v, system, neighbor)
         grad_kernel = smoothing_kernel_grad(system, pos_diff, distance, particle)

-        surface_tension[1:ndims(system),
-                        particle] .+= surface_tension_force(surface_tension_a,
-                                                            surface_tension_b,
-                                                            system,
-                                                            system,
-                                                            particle,
-                                                            neighbor,
-                                                            pos_diff,
-                                                            distance,
-                                                            rho_a,
-                                                            rho_b,
-                                                            grad_kernel)
+        dv_surface_tension = Ref(zero(pos_diff))
+        surface_tension_force!(dv_surface_tension,
+                               surface_tension_a, surface_tension_b,
+                               system, system, particle, neighbor,
+                               pos_diff, distance, rho_a, rho_b, grad_kernel, 1)
+
+        surface_tension[1:ndims(system), particle] .+= dv_surface_tension[]
     end
     vtk["surface_tension"] = surface_tension
```

src/schemes/boundary/open_boundary/dynamical_pressure.jl

Lines changed: 1 addition & 1 deletion

```diff
@@ -54,7 +54,7 @@ end
 end

 @inline function density_calculator(system::OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang})
-    return system.cache.density_calculator
+    return ContinuityDensity()
 end

 @inline impose_rest_density!(v, system, particle, boundary_model) = v
```

src/schemes/boundary/open_boundary/rhs.jl

Lines changed: 6 additions & 8 deletions

```diff
@@ -3,7 +3,7 @@ function interact!(dv, v_particle_system, u_particle_system,
                    v_neighbor_system, u_neighbor_system,
                    particle_system::OpenBoundarySystem{<:BoundaryModelDynamicalPressureZhang},
                    neighbor_system, semi)
-    (; fluid_system, cache, boundary_model) = particle_system
+    (; fluid_system, cache) = particle_system

     sound_speed = system_sound_speed(fluid_system)

@@ -59,13 +59,11 @@
     v_diff = current_velocity(v_particle_system, particle_system, particle) -
              current_velocity(v_neighbor_system, neighbor_system, neighbor)

-    # Continuity equation
-    @inbounds dv[end, particle] += rho_a / rho_b * m_b * dot(v_diff, grad_kernel)
-
-    density_diffusion!(dv, density_diffusion(particle_system),
-                       v_particle_system, particle, neighbor,
-                       pos_diff, distance, m_b, rho_a, rho_b,
-                       particle_system, grad_kernel)
+    # Propagate `@inbounds` to the continuity equation, which accesses particle data
+    @inbounds continuity_equation!(dv, particle_system, neighbor_system,
+                                   v_particle_system, v_neighbor_system, particle,
+                                   neighbor, pos_diff, distance, m_b, rho_a, rho_b,
+                                   grad_kernel)

     # Open boundary pressure evolution matches the corresponding fluid system:
     # - EDAC: Compute pressure evolution like the fluid system
```

src/schemes/boundary/open_boundary/system.jl

Lines changed: 26 additions & 16 deletions

```diff
@@ -75,7 +75,10 @@ function OpenBoundarySystem(boundary_zones::Union{BoundaryZone, Nothing}...;
                            pressure_acceleration=fluid_system.pressure_acceleration_formulation,
                            shifting_technique=boundary_model isa
                                               BoundaryModelDynamicalPressureZhang ?
-                                              shifting_technique(fluid_system) : nothing)
+                                              shifting_technique(fluid_system) : nothing,
+                           density_diffusion=boundary_model isa
+                                             BoundaryModelDynamicalPressureZhang ?
+                                             density_diffusion(fluid_system) : nothing)
     boundary_zones_ = filter(bz -> !isnothing(bz), boundary_zones)

     initial_conditions = union((bz.initial_condition for bz in boundary_zones_)...)
@@ -90,7 +93,8 @@ function OpenBoundarySystem(boundary_zones::Union{BoundaryZone, Nothing}...;
     cache = (;
              create_cache_shifting(initial_conditions, shifting_technique)...,
              create_cache_open_boundary(boundary_model, fluid_system, initial_conditions,
-                                        calculate_flow_rate, boundary_zones_)...)
+                                        density_diffusion, calculate_flow_rate,
+                                        boundary_zones_)...)

     if any(pr -> isa(pr, RCRWindkesselModel), cache.pressure_reference_values)
         calculate_flow_rate = true
@@ -130,7 +134,7 @@ function initialize!(system::OpenBoundarySystem, semi)
 end

 function create_cache_open_boundary(boundary_model, fluid_system, initial_condition,
-                                    calculate_flow_rate, boundary_zones)
+                                    density_diffusion, calculate_flow_rate, boundary_zones)
     reference_values = map(bz -> bz.reference_values, boundary_zones)
     ELTYPE = eltype(initial_condition)

@@ -174,15 +178,14 @@ function create_cache_open_boundary(boundary_model, fluid_system, initial_condit
     # as it was already verified in `allocate_buffer` that the density array is constant.
     density_rest = first(initial_condition.density)

-    dd = density_diffusion(fluid_system)
-    if dd isa DensityDiffusionAntuono
-        density_diffusion_ = DensityDiffusionAntuono(initial_condition; delta=dd.delta)
+    if density_diffusion isa DensityDiffusionAntuono
+        density_diffusion_ = DensityDiffusionAntuono(initial_condition;
+                                                     delta=density_diffusion.delta)
     else
-        density_diffusion_ = dd
+        density_diffusion_ = density_diffusion
     end

-    cache = (; density_calculator=ContinuityDensity(),
-             density_diffusion=density_diffusion_,
+    cache = (; density_diffusion=density_diffusion_,
              pressure_boundary=pressure_boundary,
              density_rest=density_rest, cache...)

@@ -261,6 +264,10 @@ system_sound_speed(system::OpenBoundarySystem) = system_sound_speed(system.fluid

 @inline hydrodynamic_mass(system::OpenBoundarySystem, particle) = system.mass[particle]

+@propagate_inbounds function current_velocity(v, system::OpenBoundarySystem)
+    return view(v, 1:ndims(system), :)
+end
+
 @inline function current_density(v, system::OpenBoundarySystem)
     return system.cache.density
 end
@@ -306,6 +313,9 @@ function update_boundary_interpolation!(system::OpenBoundarySystem, v, u, v_ode,
                                         semi, t)
     update_boundary_model!(system, system.boundary_model, v, u, v_ode, u_ode, semi, t)
     update_shifting!(system, shifting_technique(system), v, u, v_ode, u_ode, semi)
+
+    @trixi_timeit timer() "update density diffusion" update!(density_diffusion(system),
+                                                             v, u, system, semi)
 end

 # This function is called by the `UpdateCallback`, as the integrator array might be modified
@@ -397,8 +407,8 @@ function check_domain!(system, v, u, v_ode, u_ode, semi)
                           v, u, v_fluid, u_fluid)
     end

-    update_system_buffer!(system.buffer, semi)
-    update_system_buffer!(fluid_system.buffer, semi)
+    update_system_buffer!(system.buffer)
+    update_system_buffer!(fluid_system.buffer)

     fluid_candidates .= false
@@ -428,8 +438,8 @@ function check_domain!(system, v, u, v_ode, u_ode, semi)
                           v, u, v_fluid, u_fluid)
     end

-    update_system_buffer!(system.buffer, semi)
-    update_system_buffer!(fluid_system.buffer, semi)
+    update_system_buffer!(system.buffer)
+    update_system_buffer!(fluid_system.buffer)

     # Since particles have been transferred, the neighborhood searches must be updated
     update_nhs!(semi, u_ode)
@@ -453,7 +463,7 @@
     # to determine if it exited the boundary zone through the free surface (outflow).
     if dot(relative_position, boundary_zone.face_normal) < 0
         # Particle is outside the fluid domain
-        deactivate_particle!(system, particle, u)
+        deactivate_particle!(system, particle, v, u)

         return system
     end
@@ -475,7 +485,7 @@
     # Verify the particle remains inside the boundary zone after the reset; deactivate it if not.
     particle_coords = current_coords(u, system, particle)
     if !is_in_boundary_zone(boundary_zone, particle_coords)
-        deactivate_particle!(system, particle, u)
+        deactivate_particle!(system, particle, v, u)

         return system
     end
@@ -494,7 +504,7 @@
     transfer_particle!(system, fluid_system, particle, particle_new, v, u, v_fluid, u_fluid)

     # Deactivate particle in interior domain
-    deactivate_particle!(fluid_system, particle, u_fluid)
+    deactivate_particle!(fluid_system, particle, v_fluid, u_fluid)

     return fluid_system
 end
```
