From 610c7e44b0c64785a47444abe0a4d3b41ddc67c5 Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Fri, 19 Jun 2026 00:19:30 +0200 Subject: [PATCH 01/39] add first working version --- src/Trixi.jl | 2 +- src/auxiliary/math.jl | 1 + src/solvers/dgsem_structured/dg.jl | 1 + src/solvers/dgsem_structured/dg_3d_turbo.jl | 389 ++++++++++++++++++++ 4 files changed, 392 insertions(+), 1 deletion(-) create mode 100644 src/solvers/dgsem_structured/dg_3d_turbo.jl diff --git a/src/Trixi.jl b/src/Trixi.jl index f2fce689d1e..2154e40c961 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -310,7 +310,7 @@ export DG, FDSBP, BlockFV, UniformFiniteVolumeBasis, VolumeIntegralFiniteVolume, VolumeIntegralWeakForm, VolumeIntegralStrongForm, - VolumeIntegralFluxDifferencing, + VolumeIntegralFluxDifferencing, FluxVolumeTurbo, VolumeIntegralPureLGLFiniteVolume, VolumeIntegralPureLGLFiniteVolumeO2, VolumeIntegralShockCapturingHG, VolumeIntegralShockCapturingRRG, VolumeIntegralShockCapturingHGType, diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 8ee1ff5f2fa..1c4f3bc385b 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -135,6 +135,7 @@ end See also [`Trixi.set_log_type!`](@ref). """ @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x) + @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x)) @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl index 5ac27049151..99b672f4fa5 100644 --- a/src/solvers/dgsem_structured/dg.jl +++ b/src/solvers/dgsem_structured/dg.jl @@ -234,4 +234,5 @@ include("dg_2d_subcell_limiters.jl") # Specialized implementations used to improve performance include("dg_2d_compressible_euler.jl") include("dg_3d_compressible_euler.jl") +include("dg_3d_turbo.jl") end # @muladd diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl new file mode 100644 index 00000000000..afd0e08ca83 --- /dev/null +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -0,0 +1,389 @@ +struct FluxVolumeTurbo{VolumeFlux} + volume_flux::VolumeFlux + function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux} + return new{VolumeFlux}(volume_flux) + end +end + +function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative) + turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) + return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) +end + +function FluxVolumeTurbo(volume_flux) + turbo_flux = combined_turbo_flux(volume_flux) + return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) +end + +combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = True() + +@inline combined_turbo_flux(volume_flux) = volume_flux + + +@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) + +@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = + Base.front(conserved_and_equations) + +@inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...) + equations = last(aux_and_normals_and_equations) + n = nvariables(equations) + u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) + u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) + normal = SVector(aux_and_normals_and_equations[end - 3], + aux_and_normals_and_equations[end - 2], + aux_and_normals_and_equations[end - 1]) + return Tuple(volume_flux(u_ll, u_rr, normal, equations)) +end + +@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::False, + equations, + volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, + cache, alpha) + @unpack volume_flux = volume_flux_turbo + flux_differencing_kernel_turbo!(_du, u_cons, element, equations, + volume_flux, dg, cache, alpha, + n_turbo_flux_aux_node_vars(volume_flux, equations), + Val(nvariables(equations))) +end + +@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::True, + combine_conservative_and_nonconservative_fluxes::True, + equations, + volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, + cache, alpha) + @unpack volume_flux = volume_flux_turbo + flux_differencing_kernel_turbo!(_du, u_cons, element, equations, + volume_flux, dg, cache, alpha, + n_turbo_flux_aux_node_vars(volume_flux, equations), + Val(nvariables(equations))) +end + +@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray, + element, equations, + volume_turbo_flux_function, + dg, cache, alpha, ::Val{NAUX}, + ::Val{NVARS}) where {NAUX, NVARS} + # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`, + # `flux_1 = f1, etc. + u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX] + u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX] + flux = [Symbol(:flux_, v) for v in 1:NVARS] + + # Convert conserved to auxiliary variables node-wise, e.g. + # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) + # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... + cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS] + cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), + :(cons2fluxauxiliary(volume_turbo_flux_function, $(cons_reads...), + equations))) + cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] + + # Evaluate the two-point volume flux + flux_call = Expr(:(=), Expr(:tuple, flux...), + :(turbo_volume_flux(volume_turbo_flux_function, + $(u_prim_ll...), $(u_prim_rr...), + normal_direction_1, normal_direction_2, normal_direction_3, + equations))) + + # Build the inner `@turbo` loop body. For the x direction `loads` gives + # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on + # rho_rr = u_prim_permuted[jk, ii, 1] # right node + loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v]) + for v in 1:NAUX], + [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v]) + for v in 1:NAUX]) + + # Compute the average of the normal_direction + # e.g. the x direction `normals` gives + # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + + # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 + normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * + ($cv[$(idx_ll...), $m] + + $cv[$(idx_rr...), $m])) for m in 1:3] + + # Store the fluxes in the permuted du array + # e.g. the x direction `stores` gives + # du_permuted[jk, i, 1] += factor_l * f1 # left node, and so on + # du_permuted[jk, ii, 1] += factor_r * f1 # right node + stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l * + $(flux[v])) + for v in 1:NVARS], + [:($du_arr[$(idx_rr...), $v] += factor_r * + $(flux[v])) + for v in 1:NVARS]) + + loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii)) + normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii)) + stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii)) + + loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k)) + normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k)) + stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k)) + + loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk)) + normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk)) + stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk)) + + quote + @unpack derivative_split = dg.basis + @unpack contravariant_vectors = cache.elements + + # Create a temporary array that will be used to store the RHS with permuted + # indices `[i, j, k, v]` to allow using SIMD instructions. + # `StrideArray`s with purely static dimensions do not allocate on the heap. + du = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), StaticInt(NVARS))) + + # Convert conserved to auxiliary variables on the given `element`. + u_prim = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), StaticInt(NAUX))) + + # Convert conserved to auxiliary variables and store them in `u_prim`. + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + $cons2aux + $(cons2aux_writes...) + end + + # x direction + # At first, we create new temporary arrays with permuted memory layout to + # allow using SIMD instructions along the first dimension (which is contiguous + # in memory). + du_permuted = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(NVARS))) + + u_prim_permuted = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(NAUX))) + + @turbo for v in indices(u_prim, 4), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + jk = j + nnodes(dg) * (k - 1) + u_prim_permuted[jk, i, v] = u_prim[i, j, k, v] + end + fill!(du_permuted, zero(eltype(du_permuted))) + + # We must also permute the contravariant vectors. + contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + jk = j + nnodes(dg) * (k - 1) + contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, + element] + contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, + element] + contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, + element] + end + + # Next, we basically inline the volume flux. To allow SIMD vectorization and + # still use the symmetry of the volume flux and the derivative matrix, we + # loop over the triangular part in an outer loop and use a plain inner loop. + for i in eachnode(dg), ii in (i + 1):nnodes(dg) + @turbo for jk in Base.OneTo(nnodes(dg)^2) + $(loads_x...) + $(normals_x...) + $flux_call + factor_l = alpha * derivative_split[i, ii] + factor_r = alpha * derivative_split[ii, i] + $(stores_x...) + end + end + + @turbo for v in eachvariable(equations), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + jk = j + nnodes(dg) * (k - 1) + du[i, j, k, v] = du_permuted[jk, i, v] + end + + # y direction + # We must also permute the contravariant vectors. + contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, + element] + contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, + element] + contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, + element] + end + + # A possible permutation of array dimensions with improved opportunities for + # SIMD vectorization appeared to be slower than the direct version used here + # in preliminary numerical experiments on an AVX2 system. + for j in eachnode(dg), jj in (j + 1):nnodes(dg) + @turbo for k in eachnode(dg), i in eachnode(dg) + $(loads_y...) + $(normals_y...) + $flux_call + factor_l = alpha * derivative_split[j, jj] + factor_r = alpha * derivative_split[jj, j] + $(stores_y...) + end + end + + # z direction + # The memory layout is already optimal for SIMD vectorization in this loop. + # We just squeeze the first two dimensions to make the code slightly faster. + GC.@preserve u_prim begin + u_prim_reshaped = PtrArray(pointer(u_prim), + (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), + StaticInt(NAUX))) + + du_reshaped = PtrArray(pointer(du), + (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), + StaticInt(NVARS))) + + # We must also permute the contravariant vectors. + contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + ij = i + nnodes(dg) * (j - 1) + contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k, + element] + contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k, + element] + contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k, + element] + end + + for k in eachnode(dg), kk in (k + 1):nnodes(dg) + @turbo for ij in Base.OneTo(nnodes(dg)^2) + $(loads_z...) + $(normals_z...) + $flux_call + factor_l = alpha * derivative_split[k, kk] + factor_r = alpha * derivative_split[kk, k] + $(stores_z...) + end + end + end # GC.@preserve u_prim begin + + # Finally, we add the temporary RHS computed here to the global RHS in the + # given `element`. + @turbo for v in eachvariable(equations), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + _du[v, i, j, k, element] += du[i, j, k, v] + end + + return nothing + end +end + +LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true +LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true + +@inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo + +@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) + return Val(7) +end + +@inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo), + rho, rho_v1, rho_v2, rho_v3, rho_e, + equations::CompressibleEulerEquations3D) + v1 = rho_v1 / rho + v2 = rho_v2 / rho + v3 = rho_v3 / rho + p = (equations.gamma - 1) * + (rho_e - 0.5f0 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3)) + return rho, v1, v2, v3, p, log(rho), log(p) +end + +@inline function turbo_volume_flux(volume_flux::typeof(flux_ranocha_turbo), + rho_ll, v1_ll, v2_ll, v3_ll, + p_ll, log_rho_ll, log_p_ll, + rho_rr, v1_rr, v2_rr, v3_rr, + p_rr, log_rho_rr, log_p_rr, + normal_direction_1, + normal_direction_2, + normal_direction_3, + equations::CompressibleEulerEquations3D) + + v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 + + v3_ll * normal_direction_3 + v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 + + v3_rr * normal_direction_3 + + # Compute required mean values. + # We inline the logarithmic mean to allow LoopVectorization.jl to optimize it + # efficiently: `log` is precomputed once per node in `cons2fluxauxiliary` + # (`log_rho`, `log_p`) and reused here for the "regular" branch, while the + # "special" branch uses the series expansion to avoid cancellation when x ≈ y. + # This is equivalent to + # rho_mean = ln_mean(rho_ll, rho_rr) + x1 = rho_ll + log_x1 = log_rho_ll + y1 = rho_rr + log_y1 = log_rho_rr + x1_plus_y1 = x1 + y1 + y1_minus_x1 = y1 - x1 + z1 = y1_minus_x1^2 / x1_plus_y1^2 + special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1))) + regular_path1 = y1_minus_x1 / (log_y1 - log_x1) + rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1) + + # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)` + # in exact arithmetic since + # log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ) + # = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ) + # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll) + x2 = rho_ll * p_rr + log_x2 = log_rho_ll + log_p_rr + y2 = rho_rr * p_ll + log_y2 = log_rho_rr + log_p_ll + x2_plus_y2 = x2 + y2 + y2_minus_x2 = y2 - x2 + z2 = y2_minus_x2^2 / x2_plus_y2^2 + special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2 + regular_path2 = (log_y2 - log_x2) / y2_minus_x2 + inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2) + + v1_avg = 0.5 * (v1_ll + v1_rr) + v2_avg = 0.5 * (v2_ll + v2_rr) + v3_avg = 0.5 * (v3_ll + v3_rr) + p_avg = 0.5 * (p_ll + p_rr) + velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr) + + # Calculate fluxes depending on normal_direction + f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr) + f2 = f1 * v1_avg + p_avg * normal_direction_1 + f3 = f1 * v2_avg + p_avg * normal_direction_2 + f4 = f1 * v3_avg + p_avg * normal_direction_3 + f5 = (f1 * (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) + + + 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll)) + + return (f1, f2, f3, f4, f5) +end From 855525004518c9ecff58e98ab5e794ba7bda231b Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Fri, 19 Jun 2026 01:11:33 +0200 Subject: [PATCH 02/39] add tests --- src/auxiliary/math.jl | 6 +- test/test_performance_specializations_3d.jl | 76 +++++++++++++++++++++ 2 files changed, 81 insertions(+), 1 deletion(-) diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 1c4f3bc385b..334f011fdd3 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -135,7 +135,11 @@ end See also [`Trixi.set_log_type!`](@ref). """ @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x) - @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x)) + @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < + zero(x), + oftype(x, + NaN), + Base.log(x)) @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 967b0f9cf3e..0f560612df6 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -173,6 +173,82 @@ end end end +@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_ranocha)" begin + trixi_include(@__MODULE__, + joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), + cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, + volume_flux = Trixi.FluxVolumeTurbo(flux_ranocha), + surface_flux = flux_ranocha) + u_ode = copy(sol.u[end]) + du_ode = zero(u_ode) + + # Preserve original memory since it will be `unsafe_wrap`ped and might + # thus otherwise be garbage collected + GC.@preserve u_ode du_ode begin + u = Trixi.wrap_array(u_ode, semi) + du = Trixi.wrap_array(du_ode, semi) + have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) + + # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches + # to the generated `@turbo` kernel + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + semi.solver.volume_integral.volume_flux, + semi.solver, semi.cache, true) + du_specialized = du[:, :, :, :, 1] + + # Call the plain version using the underlying `flux_ranocha` directly, which + # dispatches to the generic flux differencing kernel + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + flux_ranocha, semi.solver, semi.cache, true) + du_baseline = du[:, :, :, :, 1] + + @test du_specialized ≈ du_baseline + end +end + +@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin + # `flux_shima_etal` has no dedicated turbo specialization, so `FluxVolumeTurbo` + # exercises the generic default path (conserved auxiliary variables + canonical flux). + trixi_include(@__MODULE__, + joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), + cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, + volume_flux = Trixi.FluxVolumeTurbo(flux_shima_etal), + surface_flux = flux_shima_etal) + u_ode = copy(sol.u[end]) + du_ode = zero(u_ode) + + # Preserve original memory since it will be `unsafe_wrap`ped and might + # thus otherwise be garbage collected + GC.@preserve u_ode du_ode begin + u = Trixi.wrap_array(u_ode, semi) + du = Trixi.wrap_array(du_ode, semi) + have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) + + # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches + # to the generated `@turbo` kernel + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + semi.solver.volume_integral.volume_flux, + semi.solver, semi.cache, true) + du_specialized = du[:, :, :, :, 1] + + # Call the plain version using the underlying `flux_shima_etal` directly, which + # dispatches to the generic flux differencing kernel + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + flux_shima_etal, semi.solver, semi.cache, true) + du_baseline = du[:, :, :, :, 1] + + @test du_specialized ≈ du_baseline + end +end + @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", From eb860cf8465ea0ce048dfd51af5aa2f0458ba8c4 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 01:12:46 +0200 Subject: [PATCH 03/39] format --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 52 +++++++++++---------- 1 file changed, 27 insertions(+), 25 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index afd0e08ca83..36a3e87d2bd 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -19,11 +19,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = @inline combined_turbo_flux(volume_flux) = volume_flux - @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) -@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = - Base.front(conserved_and_equations) +@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) @inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...) equations = last(aux_and_normals_and_equations) @@ -37,12 +35,12 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - MeshT::Type{<:Union{StructuredMesh{3}, - P4estMesh{3}}}, - have_nonconservative_terms::False, - equations, - volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, - cache, alpha) + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::False, + equations, + volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, + cache, alpha) @unpack volume_flux = volume_flux_turbo flux_differencing_kernel_turbo!(_du, u_cons, element, equations, volume_flux, dg, cache, alpha, @@ -51,13 +49,13 @@ end end @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - MeshT::Type{<:Union{StructuredMesh{3}, - P4estMesh{3}}}, - have_nonconservative_terms::True, - combine_conservative_and_nonconservative_fluxes::True, - equations, - volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, - cache, alpha) + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::True, + combine_conservative_and_nonconservative_fluxes::True, + equations, + volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, + cache, alpha) @unpack volume_flux = volume_flux_turbo flux_differencing_kernel_turbo!(_du, u_cons, element, equations, volume_flux, dg, cache, alpha, @@ -89,24 +87,28 @@ end flux_call = Expr(:(=), Expr(:tuple, flux...), :(turbo_volume_flux(volume_turbo_flux_function, $(u_prim_ll...), $(u_prim_rr...), - normal_direction_1, normal_direction_2, normal_direction_3, + normal_direction_1, normal_direction_2, + normal_direction_3, equations))) # Build the inner `@turbo` loop body. For the x direction `loads` gives # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on # rho_rr = u_prim_permuted[jk, ii, 1] # right node loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v]) - for v in 1:NAUX], - [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v]) - for v in 1:NAUX]) + for v in 1:NAUX], + [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v]) + for v in 1:NAUX]) # Compute the average of the normal_direction # e.g. the x direction `normals` gives # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * - ($cv[$(idx_ll...), $m] + - $cv[$(idx_rr...), $m])) for m in 1:3] + ($cv[$(idx_ll...), + $m] + + $cv[$(idx_rr...), + $m])) + for m in 1:3] # Store the fluxes in the permuted du array # e.g. the x direction `stores` gives @@ -116,7 +118,7 @@ end $(flux[v])) for v in 1:NVARS], [:($du_arr[$(idx_rr...), $v] += factor_r * - $(flux[v])) + $(flux[v])) for v in 1:NVARS]) loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii)) @@ -306,7 +308,8 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo -@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) +@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), + equations::CompressibleEulerEquations3D) return Val(7) end @@ -330,7 +333,6 @@ end normal_direction_2, normal_direction_3, equations::CompressibleEulerEquations3D) - v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 + v3_ll * normal_direction_3 v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 + From ab3063a8fd3aa1a598e0e3e9461a49c7e08628b6 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 01:22:16 +0200 Subject: [PATCH 04/39] minor fix --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 7 ++++--- test/test_performance_specializations_3d.jl | 14 ++++---------- 2 files changed, 8 insertions(+), 13 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 36a3e87d2bd..5881d73978e 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -34,6 +34,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = return Tuple(volume_flux(u_ll, u_rr, normal, equations)) end +LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true +LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true + @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, MeshT::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, @@ -48,6 +51,7 @@ end Val(nvariables(equations))) end +# TODO: Fix the dispatch @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, MeshT::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, @@ -303,9 +307,6 @@ end end end -LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true -LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true - @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo @inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 0f560612df6..9a9ee7d24fe 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -189,8 +189,7 @@ end du = Trixi.wrap_array(du_ode, semi) have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) - # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches - # to the generated `@turbo` kernel + # Call the optimized version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -198,8 +197,7 @@ end semi.solver, semi.cache, true) du_specialized = du[:, :, :, :, 1] - # Call the plain version using the underlying `flux_ranocha` directly, which - # dispatches to the generic flux differencing kernel + # Call the plain version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -211,8 +209,6 @@ end end @timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin - # `flux_shima_etal` has no dedicated turbo specialization, so `FluxVolumeTurbo` - # exercises the generic default path (conserved auxiliary variables + canonical flux). trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, @@ -228,8 +224,7 @@ end du = Trixi.wrap_array(du_ode, semi) have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) - # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches - # to the generated `@turbo` kernel + # Call the optimized version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -237,8 +232,7 @@ end semi.solver, semi.cache, true) du_specialized = du[:, :, :, :, 1] - # Call the plain version using the underlying `flux_shima_etal` directly, which - # dispatches to the generic flux differencing kernel + # Call the plain version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, From f3bc84d05c648bfcf8a456f62da4276c5c860ea6 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 11:11:21 +0200 Subject: [PATCH 05/39] nonconservative unstable kernel --- src/auxiliary/math.jl | 29 +- src/solvers/dgsem_structured/dg_3d_turbo.jl | 321 ++++++++++++++++++-- 2 files changed, 324 insertions(+), 26 deletions(-) diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 334f011fdd3..107ea127d57 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -140,7 +140,6 @@ end oftype(x, NaN), Base.log(x)) - @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x) @@ -210,6 +209,20 @@ Given ε = 1.0e-4, we use the following algorithm. end end +@inline function ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, + y::LoopVectorization.VectorizationBase.AbstractSIMD) + RealT = eltype(x) + epsilon_f2 = convert(RealT, 1.0e-4) + f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2 + return ifelse(f2 < epsilon_f2, + (x + y) / @evalpoly(f2, + convert(RealT, 2), + convert(RealT, 2 / 3), + convert(RealT, 2 / 5), + convert(RealT, 2 / 7)), + (y - x) / log(y / x)) +end + """ Trixi.inv_ln_mean(x::Real, y::Real) @@ -236,6 +249,20 @@ multiplication. end end + @inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, + y::LoopVectorization.VectorizationBase.AbstractSIMD) + RealT = eltype(x) + epsilon_f2 = convert(RealT, 1.0e-4) + f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2 + return ifelse(f2 < epsilon_f2, + @evalpoly(f2, + convert(RealT, 2), + convert(RealT, 2 / 3), + convert(RealT, 2 / 5), + convert(RealT, 2 / 7)) / (x + y), + log(y / x) / (y - x)) +end + # `Base.max` and `Base.min` perform additional checks for signed zeros and `NaN`s # which are not present in comparable functions in Fortran/C++. For example, # ```julia diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 5881d73978e..7193cfefa7a 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -19,11 +19,14 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = @inline combined_turbo_flux(volume_flux) = volume_flux +@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; volume_flux_conservative, volume_flux_nonconservative) + @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) -@inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...) +@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, + aux_and_normals_and_equations...) equations = last(aux_and_normals_and_equations) n = nvariables(equations) u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) @@ -34,7 +37,25 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = return Tuple(volume_flux(u_ll, u_rr, normal, equations)) end -LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true +@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True, + aux_and_normals_and_equations...) + @unpack volume_flux_conservative, volume_flux_nonconservative = volume_flux + equations = last(aux_and_normals_and_equations) + n = nvariables(equations) + u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) + u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) + normal = SVector(aux_and_normals_and_equations[end - 3], + aux_and_normals_and_equations[end - 2], + aux_and_normals_and_equations[end - 1]) + flux = volume_flux_conservative(u_ll, u_rr, normal, equations) + noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal, equations) + noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations) + flux_left = flux + 0.5f0 * noncons_left + flux_right = flux + 0.5f0 * noncons_right + return (Tuple(flux_left)..., Tuple(flux_right)...) +end + +LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, @@ -45,55 +66,305 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, cache, alpha) @unpack volume_flux = volume_flux_turbo - flux_differencing_kernel_turbo!(_du, u_cons, element, equations, + flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms, + equations, volume_flux, dg, cache, alpha, n_turbo_flux_aux_node_vars(volume_flux, equations), Val(nvariables(equations))) end -# TODO: Fix the dispatch +@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray, + element, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::False, + equations, + volume_flux, + dg, cache, alpha, ::Val{NAUX}, + ::Val{NVARS}) where {NAUX, NVARS} + # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`, + # `flux_1 = f1, etc. + u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX] + u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX] + flux = [Symbol(:flux_, v) for v in 1:NVARS] + + # Convert conserved to auxiliary variables node-wise, e.g. + # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) + # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... + cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS] + cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), + :(cons2fluxauxiliary(volume_flux, $(cons_reads...), + equations))) + cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] + + # Evaluate the two-point volume flux + flux_call = Expr(:(=), Expr(:tuple, flux...), + :(volume_flux_turbo(volume_flux, + have_nonconservative_terms, + $(u_prim_ll...), $(u_prim_rr...), + normal_direction_1, normal_direction_2, + normal_direction_3, + equations))) + + # Build the inner `@turbo` loop body. For the x direction `loads` gives + # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on + # rho_rr = u_prim_permuted[jk, ii, 1] # right node + loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v]) + for v in 1:NAUX], + [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v]) + for v in 1:NAUX]) + + # Compute the average of the normal_direction + # e.g. the x direction `normals` gives + # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + + # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 + normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * + ($cv[$(idx_ll...), + $m] + + $cv[$(idx_rr...), + $m])) + for m in 1:3] + + # Store the fluxes in the permuted du array + # e.g. the x direction `stores` gives + # du_permuted[jk, i, 1] += factor_l * f1 # left node, and so on + # du_permuted[jk, ii, 1] += factor_r * f1 # right node + stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l * + $(flux[v])) + for v in 1:NVARS], + [:($du_arr[$(idx_rr...), $v] += factor_r * + $(flux[v])) + for v in 1:NVARS]) + + loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii)) + normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii)) + stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii)) + + loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k)) + normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k)) + stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k)) + + loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk)) + normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk)) + stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk)) + + quote + @unpack derivative_split = dg.basis + @unpack contravariant_vectors = cache.elements + + # Create a temporary array that will be used to store the RHS with permuted + # indices `[i, j, k, v]` to allow using SIMD instructions. + # `StrideArray`s with purely static dimensions do not allocate on the heap. + du = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), StaticInt(NVARS))) + + # Convert conserved to auxiliary variables on the given `element`. + u_prim = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), StaticInt(NAUX))) + + # Convert conserved to auxiliary variables and store them in `u_prim`. + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + $cons2aux + $(cons2aux_writes...) + end + + # x direction + # At first, we create new temporary arrays with permuted memory layout to + # allow using SIMD instructions along the first dimension (which is contiguous + # in memory). + du_permuted = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(NVARS))) + + u_prim_permuted = StrideArray{eltype(u_cons)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(NAUX))) + + @turbo for v in indices(u_prim, 4), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + jk = j + nnodes(dg) * (k - 1) + u_prim_permuted[jk, i, v] = u_prim[i, j, k, v] + end + fill!(du_permuted, zero(eltype(du_permuted))) + + # We must also permute the contravariant vectors. + contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + jk = j + nnodes(dg) * (k - 1) + contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, + element] + contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, + element] + contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, + element] + end + + # Next, we basically inline the volume flux. To allow SIMD vectorization and + # still use the symmetry of the volume flux and the derivative matrix, we + # loop over the triangular part in an outer loop and use a plain inner loop. + for i in eachnode(dg), ii in (i + 1):nnodes(dg) + @turbo for jk in Base.OneTo(nnodes(dg)^2) + $(loads_x...) + $(normals_x...) + $flux_call + factor_l = alpha * derivative_split[i, ii] + factor_r = alpha * derivative_split[ii, i] + $(stores_x...) + end + end + + @turbo for v in eachvariable(equations), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + jk = j + nnodes(dg) * (k - 1) + du[i, j, k, v] = du_permuted[jk, i, v] + end + + # y direction + # We must also permute the contravariant vectors. + contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, + element] + contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, + element] + contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, + element] + end + + # A possible permutation of array dimensions with improved opportunities for + # SIMD vectorization appeared to be slower than the direct version used here + # in preliminary numerical experiments on an AVX2 system. + for j in eachnode(dg), jj in (j + 1):nnodes(dg) + @turbo for k in eachnode(dg), i in eachnode(dg) + $(loads_y...) + $(normals_y...) + $flux_call + factor_l = alpha * derivative_split[j, jj] + factor_r = alpha * derivative_split[jj, j] + $(stores_y...) + end + end + + # z direction + # The memory layout is already optimal for SIMD vectorization in this loop. + # We just squeeze the first two dimensions to make the code slightly faster. + GC.@preserve u_prim begin + u_prim_reshaped = PtrArray(pointer(u_prim), + (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), + StaticInt(NAUX))) + + du_reshaped = PtrArray(pointer(du), + (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), + StaticInt(NVARS))) + + # We must also permute the contravariant vectors. + contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, + (StaticInt(nnodes(dg)^2), + StaticInt(nnodes(dg)), + StaticInt(3))) + + @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) + ij = i + nnodes(dg) * (j - 1) + contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k, + element] + contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k, + element] + contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k, + element] + end + + for k in eachnode(dg), kk in (k + 1):nnodes(dg) + @turbo for ij in Base.OneTo(nnodes(dg)^2) + $(loads_z...) + $(normals_z...) + $flux_call + factor_l = alpha * derivative_split[k, kk] + factor_r = alpha * derivative_split[kk, k] + $(stores_z...) + end + end + end # GC.@preserve u_prim begin + + # Finally, we add the temporary RHS computed here to the global RHS in the + # given `element`. + @turbo for v in eachvariable(equations), + k in eachnode(dg), + j in eachnode(dg), + i in eachnode(dg) + + _du[v, i, j, k, element] += du[i, j, k, v] + end + + return nothing + end +end + @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, MeshT::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, have_nonconservative_terms::True, - combine_conservative_and_nonconservative_fluxes::True, equations, volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, cache, alpha) @unpack volume_flux = volume_flux_turbo - flux_differencing_kernel_turbo!(_du, u_cons, element, equations, + flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms, + equations, volume_flux, dg, cache, alpha, n_turbo_flux_aux_node_vars(volume_flux, equations), Val(nvariables(equations))) end @generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray, - element, equations, - volume_turbo_flux_function, + element, + MeshT::Type{<:Union{StructuredMesh{3}, + P4estMesh{3}}}, + have_nonconservative_terms::True, + equations, + volume_flux, dg, cache, alpha, ::Val{NAUX}, ::Val{NVARS}) where {NAUX, NVARS} # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`, # `flux_1 = f1, etc. u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX] u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX] - flux = [Symbol(:flux_, v) for v in 1:NVARS] + flux_left = [Symbol(:flux_left_, v) for v in 1:NVARS] + flux_right = [Symbol(:flux_right_, v) for v in 1:NVARS] # Convert conserved to auxiliary variables node-wise, e.g. # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS] cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), - :(cons2fluxauxiliary(volume_turbo_flux_function, $(cons_reads...), + :(cons2fluxauxiliary(volume_flux, $(cons_reads...), equations))) cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] # Evaluate the two-point volume flux - flux_call = Expr(:(=), Expr(:tuple, flux...), - :(turbo_volume_flux(volume_turbo_flux_function, - $(u_prim_ll...), $(u_prim_rr...), + flux_call = Expr(:(=), Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)), + :(volume_flux_turbo(volume_flux, + have_nonconservative_terms, $(u_prim_ll...), + $(u_prim_rr...), normal_direction_1, normal_direction_2, - normal_direction_3, - equations))) + normal_direction_3, equations))) # Build the inner `@turbo` loop body. For the x direction `loads` gives # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on @@ -119,10 +390,10 @@ end # du_permuted[jk, i, 1] += factor_l * f1 # left node, and so on # du_permuted[jk, ii, 1] += factor_r * f1 # right node stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l * - $(flux[v])) + $(flux_left[v])) for v in 1:NVARS], [:($du_arr[$(idx_rr...), $v] += factor_r * - $(flux[v])) + $(flux_right[v])) for v in 1:NVARS]) loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii)) @@ -325,7 +596,7 @@ end return rho, v1, v2, v3, p, log(rho), log(p) end -@inline function turbo_volume_flux(volume_flux::typeof(flux_ranocha_turbo), +@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo), rho_ll, v1_ll, v2_ll, v3_ll, p_ll, log_rho_ll, log_p_ll, rho_rr, v1_rr, v2_rr, v3_rr, @@ -373,20 +644,20 @@ end regular_path2 = (log_y2 - log_x2) / y2_minus_x2 inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2) - v1_avg = 0.5 * (v1_ll + v1_rr) - v2_avg = 0.5 * (v2_ll + v2_rr) - v3_avg = 0.5 * (v3_ll + v3_rr) - p_avg = 0.5 * (p_ll + p_rr) - velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr) + v1_avg = 0.5f0 * (v1_ll + v1_rr) + v2_avg = 0.5f0 * (v2_ll + v2_rr) + v3_avg = 0.5f0 * (v3_ll + v3_rr) + p_avg = 0.5f0 * (p_ll + p_rr) + velocity_square_avg = 0.5f0 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr) # Calculate fluxes depending on normal_direction - f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr) + f1 = rho_mean * 0.5f0 * (v_dot_n_ll + v_dot_n_rr) f2 = f1 * v1_avg + p_avg * normal_direction_1 f3 = f1 * v2_avg + p_avg * normal_direction_2 f4 = f1 * v3_avg + p_avg * normal_direction_3 f5 = (f1 * (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one) + - 0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll)) + 0.5f0 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll)) return (f1, f2, f3, f4, f5) end From 65670429ff3069e75eeac4770a1aa7433505783f Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 11:13:13 +0200 Subject: [PATCH 06/39] format --- src/auxiliary/math.jl | 2 +- src/solvers/dgsem_structured/dg_3d_turbo.jl | 7 +++++-- 2 files changed, 6 insertions(+), 3 deletions(-) diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 107ea127d57..61b2e7ff21a 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -249,7 +249,7 @@ multiplication. end end - @inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, +@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, y::LoopVectorization.VectorizationBase.AbstractSIMD) RealT = eltype(x) epsilon_f2 = convert(RealT, 1.0e-4) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 7193cfefa7a..4a7a94e56c4 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -19,7 +19,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = @inline combined_turbo_flux(volume_flux) = volume_flux -@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; volume_flux_conservative, volume_flux_nonconservative) +@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; + volume_flux_conservative, + volume_flux_nonconservative) @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) @@ -359,7 +361,8 @@ end cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] # Evaluate the two-point volume flux - flux_call = Expr(:(=), Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)), + flux_call = Expr(:(=), + Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)), :(volume_flux_turbo(volume_flux, have_nonconservative_terms, $(u_prim_ll...), $(u_prim_rr...), From 3d0685567f417e72b59df2bb72a3cffdf6421910 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 11:41:39 +0200 Subject: [PATCH 07/39] fix nonconservative kernel + tests --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 3 +- test/test_performance_specializations_3d.jl | 37 +++++++++++++++++++++ 2 files changed, 39 insertions(+), 1 deletion(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 4a7a94e56c4..003250e4190 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -54,7 +54,7 @@ end noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations) flux_left = flux + 0.5f0 * noncons_left flux_right = flux + 0.5f0 * noncons_right - return (Tuple(flux_left)..., Tuple(flux_right)...) + return flux_left, flux_right end LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true @@ -600,6 +600,7 @@ end end @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo), + have_nonconservative_terms::False, rho_ll, v1_ll, v2_ll, v3_ll, p_ll, log_rho_ll, log_p_ll, rho_rr, v1_rr, v2_rr, v3_rr, diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 9a9ee7d24fe..9114f748978 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -243,6 +243,43 @@ end end end +@timed_testset "P4estMesh3D, FluxVolumeTurbo(flux_hindenlang_gassner, flux_nonconservative_powell)" begin + trixi_include(@__MODULE__, + joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", + "elixir_mhd_alfven_wave_nonperiodic.jl"), + volume_flux = Trixi.FluxVolumeTurbo(flux_hindenlang_gassner, + flux_nonconservative_powell)) + u_ode = copy(sol.u[end]) + du_ode = zero(u_ode) + + # Preserve original memory since it will be `unsafe_wrap`ped and might + # thus otherwise be garbage collected + GC.@preserve u_ode du_ode begin + u = Trixi.wrap_array(u_ode, semi) + du = Trixi.wrap_array(du_ode, semi) + have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) + + # Call the optimized version + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + semi.solver.volume_integral.volume_flux, + semi.solver, semi.cache, true) + du_specialized = du[:, :, :, :, 1] + + # Call the plain version + du .= 0 + Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), + have_nonconservative_terms, semi.equations, + (; flux_hindenlang_gassner, + flux_nonconservative_powell), semi.solver, + semi.cache, true) + du_baseline = du[:, :, :, :, 1] + + @test du_specialized ≈ du_baseline + end +end + @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", From 690bfd4eb3a3e1dd742d476d583b262c55639fe2 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 11:57:46 +0200 Subject: [PATCH 08/39] increase readability --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 28 ++++++++++----------- 1 file changed, 14 insertions(+), 14 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 003250e4190..3db2b4aa1b9 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -93,7 +93,7 @@ end # Convert conserved to auxiliary variables node-wise, e.g. # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... - cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS] + cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS] cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), :(cons2fluxauxiliary(volume_flux, $(cons_reads...), equations))) @@ -120,12 +120,12 @@ end # e.g. the x direction `normals` gives # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 - normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * - ($cv[$(idx_ll...), - $m] + - $cv[$(idx_rr...), - $m])) - for m in 1:3] + normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * + ($controvariant_vector[$(idx_ll...), + $m] + + $controvariant_vector[$(idx_rr...), + $m])) + for m in 1:3] # Store the fluxes in the permuted du array # e.g. the x direction `stores` gives @@ -354,7 +354,7 @@ end # Convert conserved to auxiliary variables node-wise, e.g. # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... - cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS] + cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS] cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), :(cons2fluxauxiliary(volume_flux, $(cons_reads...), equations))) @@ -381,12 +381,12 @@ end # e.g. the x direction `normals` gives # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 - normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * - ($cv[$(idx_ll...), - $m] + - $cv[$(idx_rr...), - $m])) - for m in 1:3] + normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * + ($controvariant_vector[$(idx_ll...), + $m] + + $controvariant_vector[$(idx_rr...), + $m])) + for m in 1:3] # Store the fluxes in the permuted du array # e.g. the x direction `stores` gives From 8bba09793ef3f79c87a04c3e35f7dcab888e601c Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 12:03:00 +0200 Subject: [PATCH 09/39] fix tests --- test/test_performance_specializations_3d.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 9114f748978..c52fb37a094 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -269,10 +269,12 @@ end # Call the plain version du .= 0 + symmetric_flux = flux_hindenlang_gassner + nonconservative_flux = flux_nonconservative_powell Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, - (; flux_hindenlang_gassner, - flux_nonconservative_powell), semi.solver, + (; symmetric_flux, + nonconservative_flux), semi.solver, semi.cache, true) du_baseline = du[:, :, :, :, 1] From 8256ff1be5cb53025e698ce760355ae9d2875202 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 12:06:14 +0200 Subject: [PATCH 10/39] rename normal to normal direction --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 20 ++++++++++---------- 1 file changed, 10 insertions(+), 10 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 3db2b4aa1b9..39061ec1b42 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -33,10 +33,10 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = n = nvariables(equations) u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) - normal = SVector(aux_and_normals_and_equations[end - 3], - aux_and_normals_and_equations[end - 2], - aux_and_normals_and_equations[end - 1]) - return Tuple(volume_flux(u_ll, u_rr, normal, equations)) + normal_direction = SVector(aux_and_normals_and_equations[end - 3], + aux_and_normals_and_equations[end - 2], + aux_and_normals_and_equations[end - 1]) + return volume_flux(u_ll, u_rr, normal_direction, equations) end @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True, @@ -46,12 +46,12 @@ end n = nvariables(equations) u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) - normal = SVector(aux_and_normals_and_equations[end - 3], - aux_and_normals_and_equations[end - 2], - aux_and_normals_and_equations[end - 1]) - flux = volume_flux_conservative(u_ll, u_rr, normal, equations) - noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal, equations) - noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations) + normal_direction = SVector(aux_and_normals_and_equations[end - 3], + aux_and_normals_and_equations[end - 2], + aux_and_normals_and_equations[end - 1]) + flux = volume_flux_conservative(u_ll, u_rr, normal_direction, equations) + noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal_direction, equations) + noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal_direction, equations) flux_left = flux + 0.5f0 * noncons_left flux_right = flux + 0.5f0 * noncons_right return flux_left, flux_right From 160023ba12d4cf9ac62080594e56c531e7603ecf Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 12:17:11 +0200 Subject: [PATCH 11/39] explicit AbstractSIMD --- src/auxiliary/math.jl | 13 ++++--------- 1 file changed, 4 insertions(+), 9 deletions(-) diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 61b2e7ff21a..18b8db8568c 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -1,3 +1,4 @@ +using LoopVectorization: AbstractSIMD # By default, Julia/LLVM does not use fused multiply-add operations (FMAs). # Since these FMAs can increase the performance of many numerical algorithms, # we need to opt-in explicitly. @@ -135,11 +136,7 @@ end See also [`Trixi.set_log_type!`](@ref). """ @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x) - @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < - zero(x), - oftype(x, - NaN), - Base.log(x)) + @inline log(x::AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x)) @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x) @@ -209,8 +206,7 @@ Given ε = 1.0e-4, we use the following algorithm. end end -@inline function ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, - y::LoopVectorization.VectorizationBase.AbstractSIMD) +@inline function ln_mean(x::AbstractSIMD, y::AbstractSIMD) RealT = eltype(x) epsilon_f2 = convert(RealT, 1.0e-4) f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2 @@ -249,8 +245,7 @@ multiplication. end end -@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD, - y::LoopVectorization.VectorizationBase.AbstractSIMD) +@inline function inv_ln_mean(x::AbstractSIMD, y::AbstractSIMD) RealT = eltype(x) epsilon_f2 = convert(RealT, 1.0e-4) f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2 From 61e96fa5c165905fd5639e3a5fb613fc7acf32d0 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Fri, 19 Jun 2026 12:34:30 +0200 Subject: [PATCH 12/39] Update src/solvers/dgsem_structured/dg_3d_turbo.jl --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 39061ec1b42..08a58b7b943 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -15,7 +15,6 @@ function FluxVolumeTurbo(volume_flux) return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) end -combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = True() @inline combined_turbo_flux(volume_flux) = volume_flux From ac3a2d8095d49d72a0fa5b0994e7eef98012a616 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Fri, 19 Jun 2026 12:36:09 +0200 Subject: [PATCH 13/39] Update src/solvers/dgsem_structured/dg_3d_turbo.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 1 - 1 file changed, 1 deletion(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 08a58b7b943..d8f7f961c7d 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -15,7 +15,6 @@ function FluxVolumeTurbo(volume_flux) return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) end - @inline combined_turbo_flux(volume_flux) = volume_flux @inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; From 80a9813f339445639f0ca767c130e3b75eabb10f Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Fri, 19 Jun 2026 13:23:51 +0200 Subject: [PATCH 14/39] simplify flux args --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 10 +++++++--- 1 file changed, 7 insertions(+), 3 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index d8f7f961c7d..1aea1ddbb67 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -25,6 +25,12 @@ end @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) +@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) + equations = last(aux_and_normals_and_equations) + volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), + aux_and_normals_and_equations...) +end + @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, aux_and_normals_and_equations...) equations = last(aux_and_normals_and_equations) @@ -100,7 +106,6 @@ end # Evaluate the two-point volume flux flux_call = Expr(:(=), Expr(:tuple, flux...), :(volume_flux_turbo(volume_flux, - have_nonconservative_terms, $(u_prim_ll...), $(u_prim_rr...), normal_direction_1, normal_direction_2, normal_direction_3, @@ -362,7 +367,7 @@ end flux_call = Expr(:(=), Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)), :(volume_flux_turbo(volume_flux, - have_nonconservative_terms, $(u_prim_ll...), + $(u_prim_ll...), $(u_prim_rr...), normal_direction_1, normal_direction_2, normal_direction_3, equations))) @@ -598,7 +603,6 @@ end end @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo), - have_nonconservative_terms::False, rho_ll, v1_ll, v2_ll, v3_ll, p_ll, log_rho_ll, log_p_ll, rho_rr, v1_rr, v2_rr, v3_rr, From 4a0a3c147bdad7e4b229a5f71f8080b0604e0b2f Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Sat, 20 Jun 2026 09:59:34 +0200 Subject: [PATCH 15/39] add comments --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 22 ++++++++++++++++++++- 1 file changed, 21 insertions(+), 1 deletion(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 1aea1ddbb67..3814fe75b03 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -1,3 +1,8 @@ +""" + FluxVolumeTurbo(volume_flux) + +Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl +""" struct FluxVolumeTurbo{VolumeFlux} volume_flux::VolumeFlux function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux} @@ -5,26 +10,34 @@ struct FluxVolumeTurbo{VolumeFlux} end end +# Helper function for nonconservative systems. function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative) turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) end +# Helper function for conservative systems. function FluxVolumeTurbo(volume_flux) turbo_flux = combined_turbo_flux(volume_flux) return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) end +# By default the turbo flux has no specialization and re-uses +# the numerical flux in terms of conservative variables. @inline combined_turbo_flux(volume_flux) = volume_flux @inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; volume_flux_conservative, volume_flux_nonconservative) - +# By default the turbo flux has the same number of precomputed variables +# as the number of variables. @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) +# Transform the conserved variables in precomputed auxiliary variables to speed up the computation +# of the numerical flux. When no specialization is given, this gives cons2cons. @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) +# Numerical volume flux that recalls the plain volume flux when no specialization is given. @inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) equations = last(aux_and_normals_and_equations) volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), @@ -61,6 +74,7 @@ end return flux_left, flux_right end +# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true @@ -79,6 +93,8 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true Val(nvariables(equations))) end +# Generated function that writes the equivalent hand-written code as in dg_compressible_euler_3d.jl, +# but generalizes for each number of variables and precomputed variables. @generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray, element, MeshT::Type{<:Union{StructuredMesh{3}, @@ -584,13 +600,16 @@ end end end +# Specialize the name of the turbo flux for flux_ranocha. @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo +# Number of precomputed variables for the specialization flux_ranocha @inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) return Val(7) end +# Transformation from conserved to precomputed variables for flux_ranocha @inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo), rho, rho_v1, rho_v2, rho_v3, rho_e, equations::CompressibleEulerEquations3D) @@ -602,6 +621,7 @@ end return rho, v1, v2, v3, p, log(rho), log(p) end +# Computation of the numerical flux_ranocha with respect to precomputed variables @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo), rho_ll, v1_ll, v2_ll, v3_ll, p_ll, log_rho_ll, log_p_ll, From 2a8c4366c4ab21de34a17a5e2972743967756ef1 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Sat, 20 Jun 2026 10:01:07 +0200 Subject: [PATCH 16/39] remove nonconservative part --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 288 -------------------- test/test_performance_specializations_3d.jl | 39 --- 2 files changed, 327 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 3814fe75b03..80a7235ae67 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -10,12 +10,6 @@ struct FluxVolumeTurbo{VolumeFlux} end end -# Helper function for nonconservative systems. -function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative) - turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) - return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) -end - # Helper function for conservative systems. function FluxVolumeTurbo(volume_flux) turbo_flux = combined_turbo_flux(volume_flux) @@ -26,9 +20,6 @@ end # the numerical flux in terms of conservative variables. @inline combined_turbo_flux(volume_flux) = volume_flux -@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; - volume_flux_conservative, - volume_flux_nonconservative) # By default the turbo flux has the same number of precomputed variables # as the number of variables. @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) @@ -56,24 +47,6 @@ end return volume_flux(u_ll, u_rr, normal_direction, equations) end -@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True, - aux_and_normals_and_equations...) - @unpack volume_flux_conservative, volume_flux_nonconservative = volume_flux - equations = last(aux_and_normals_and_equations) - n = nvariables(equations) - u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) - u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) - normal_direction = SVector(aux_and_normals_and_equations[end - 3], - aux_and_normals_and_equations[end - 2], - aux_and_normals_and_equations[end - 1]) - flux = volume_flux_conservative(u_ll, u_rr, normal_direction, equations) - noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal_direction, equations) - noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal_direction, equations) - flux_left = flux + 0.5f0 * noncons_left - flux_right = flux + 0.5f0 * noncons_right - return flux_left, flux_right -end - # Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true @@ -339,267 +312,6 @@ end end end -@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, - MeshT::Type{<:Union{StructuredMesh{3}, - P4estMesh{3}}}, - have_nonconservative_terms::True, - equations, - volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, - cache, alpha) - @unpack volume_flux = volume_flux_turbo - flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms, - equations, - volume_flux, dg, cache, alpha, - n_turbo_flux_aux_node_vars(volume_flux, equations), - Val(nvariables(equations))) -end - -@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray, - element, - MeshT::Type{<:Union{StructuredMesh{3}, - P4estMesh{3}}}, - have_nonconservative_terms::True, - equations, - volume_flux, - dg, cache, alpha, ::Val{NAUX}, - ::Val{NVARS}) where {NAUX, NVARS} - # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`, - # `flux_1 = f1, etc. - u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX] - u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX] - flux_left = [Symbol(:flux_left_, v) for v in 1:NVARS] - flux_right = [Symbol(:flux_right_, v) for v in 1:NVARS] - - # Convert conserved to auxiliary variables node-wise, e.g. - # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) - # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... - cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS] - cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), - :(cons2fluxauxiliary(volume_flux, $(cons_reads...), - equations))) - cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] - - # Evaluate the two-point volume flux - flux_call = Expr(:(=), - Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)), - :(volume_flux_turbo(volume_flux, - $(u_prim_ll...), - $(u_prim_rr...), - normal_direction_1, normal_direction_2, - normal_direction_3, equations))) - - # Build the inner `@turbo` loop body. For the x direction `loads` gives - # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on - # rho_rr = u_prim_permuted[jk, ii, 1] # right node - loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v]) - for v in 1:NAUX], - [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v]) - for v in 1:NAUX]) - - # Compute the average of the normal_direction - # e.g. the x direction `normals` gives - # normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] + - # contravariant_vectors_x[jk, ii, 1]) # and 2, 3 - normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 * - ($controvariant_vector[$(idx_ll...), - $m] + - $controvariant_vector[$(idx_rr...), - $m])) - for m in 1:3] - - # Store the fluxes in the permuted du array - # e.g. the x direction `stores` gives - # du_permuted[jk, i, 1] += factor_l * f1 # left node, and so on - # du_permuted[jk, ii, 1] += factor_r * f1 # right node - stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l * - $(flux_left[v])) - for v in 1:NVARS], - [:($du_arr[$(idx_rr...), $v] += factor_r * - $(flux_right[v])) - for v in 1:NVARS]) - - loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii)) - normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii)) - stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii)) - - loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k)) - normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k)) - stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k)) - - loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk)) - normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk)) - stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk)) - - quote - @unpack derivative_split = dg.basis - @unpack contravariant_vectors = cache.elements - - # Create a temporary array that will be used to store the RHS with permuted - # indices `[i, j, k, v]` to allow using SIMD instructions. - # `StrideArray`s with purely static dimensions do not allocate on the heap. - du = StrideArray{eltype(u_cons)}(undef, - (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(nnodes(dg)), StaticInt(NVARS))) - - # Convert conserved to auxiliary variables on the given `element`. - u_prim = StrideArray{eltype(u_cons)}(undef, - (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)), - StaticInt(nnodes(dg)), StaticInt(NAUX))) - - # Convert conserved to auxiliary variables and store them in `u_prim`. - @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - $cons2aux - $(cons2aux_writes...) - end - - # x direction - # At first, we create new temporary arrays with permuted memory layout to - # allow using SIMD instructions along the first dimension (which is contiguous - # in memory). - du_permuted = StrideArray{eltype(u_cons)}(undef, - (StaticInt(nnodes(dg)^2), - StaticInt(nnodes(dg)), - StaticInt(NVARS))) - - u_prim_permuted = StrideArray{eltype(u_cons)}(undef, - (StaticInt(nnodes(dg)^2), - StaticInt(nnodes(dg)), - StaticInt(NAUX))) - - @turbo for v in indices(u_prim, 4), - k in eachnode(dg), - j in eachnode(dg), - i in eachnode(dg) - - jk = j + nnodes(dg) * (k - 1) - u_prim_permuted[jk, i, v] = u_prim[i, j, k, v] - end - fill!(du_permuted, zero(eltype(du_permuted))) - - # We must also permute the contravariant vectors. - contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef, - (StaticInt(nnodes(dg)^2), - StaticInt(nnodes(dg)), - StaticInt(3))) - - @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - jk = j + nnodes(dg) * (k - 1) - contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k, - element] - contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k, - element] - contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k, - element] - end - - # Next, we basically inline the volume flux. To allow SIMD vectorization and - # still use the symmetry of the volume flux and the derivative matrix, we - # loop over the triangular part in an outer loop and use a plain inner loop. - for i in eachnode(dg), ii in (i + 1):nnodes(dg) - @turbo for jk in Base.OneTo(nnodes(dg)^2) - $(loads_x...) - $(normals_x...) - $flux_call - factor_l = alpha * derivative_split[i, ii] - factor_r = alpha * derivative_split[ii, i] - $(stores_x...) - end - end - - @turbo for v in eachvariable(equations), - k in eachnode(dg), - j in eachnode(dg), - i in eachnode(dg) - - jk = j + nnodes(dg) * (k - 1) - du[i, j, k, v] = du_permuted[jk, i, v] - end - - # y direction - # We must also permute the contravariant vectors. - contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef, - (StaticInt(nnodes(dg)), - StaticInt(nnodes(dg)), - StaticInt(nnodes(dg)), - StaticInt(3))) - - @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k, - element] - contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k, - element] - contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k, - element] - end - - # A possible permutation of array dimensions with improved opportunities for - # SIMD vectorization appeared to be slower than the direct version used here - # in preliminary numerical experiments on an AVX2 system. - for j in eachnode(dg), jj in (j + 1):nnodes(dg) - @turbo for k in eachnode(dg), i in eachnode(dg) - $(loads_y...) - $(normals_y...) - $flux_call - factor_l = alpha * derivative_split[j, jj] - factor_r = alpha * derivative_split[jj, j] - $(stores_y...) - end - end - - # z direction - # The memory layout is already optimal for SIMD vectorization in this loop. - # We just squeeze the first two dimensions to make the code slightly faster. - GC.@preserve u_prim begin - u_prim_reshaped = PtrArray(pointer(u_prim), - (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(NAUX))) - - du_reshaped = PtrArray(pointer(du), - (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)), - StaticInt(NVARS))) - - # We must also permute the contravariant vectors. - contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef, - (StaticInt(nnodes(dg)^2), - StaticInt(nnodes(dg)), - StaticInt(3))) - - @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - ij = i + nnodes(dg) * (j - 1) - contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k, - element] - contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k, - element] - contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k, - element] - end - - for k in eachnode(dg), kk in (k + 1):nnodes(dg) - @turbo for ij in Base.OneTo(nnodes(dg)^2) - $(loads_z...) - $(normals_z...) - $flux_call - factor_l = alpha * derivative_split[k, kk] - factor_r = alpha * derivative_split[kk, k] - $(stores_z...) - end - end - end # GC.@preserve u_prim begin - - # Finally, we add the temporary RHS computed here to the global RHS in the - # given `element`. - @turbo for v in eachvariable(equations), - k in eachnode(dg), - j in eachnode(dg), - i in eachnode(dg) - - _du[v, i, j, k, element] += du[i, j, k, v] - end - - return nothing - end -end - # Specialize the name of the turbo flux for flux_ranocha. @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index c52fb37a094..9a9ee7d24fe 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -243,45 +243,6 @@ end end end -@timed_testset "P4estMesh3D, FluxVolumeTurbo(flux_hindenlang_gassner, flux_nonconservative_powell)" begin - trixi_include(@__MODULE__, - joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", - "elixir_mhd_alfven_wave_nonperiodic.jl"), - volume_flux = Trixi.FluxVolumeTurbo(flux_hindenlang_gassner, - flux_nonconservative_powell)) - u_ode = copy(sol.u[end]) - du_ode = zero(u_ode) - - # Preserve original memory since it will be `unsafe_wrap`ped and might - # thus otherwise be garbage collected - GC.@preserve u_ode du_ode begin - u = Trixi.wrap_array(u_ode, semi) - du = Trixi.wrap_array(du_ode, semi) - have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) - - # Call the optimized version - du .= 0 - Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), - have_nonconservative_terms, semi.equations, - semi.solver.volume_integral.volume_flux, - semi.solver, semi.cache, true) - du_specialized = du[:, :, :, :, 1] - - # Call the plain version - du .= 0 - symmetric_flux = flux_hindenlang_gassner - nonconservative_flux = flux_nonconservative_powell - Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), - have_nonconservative_terms, semi.equations, - (; symmetric_flux, - nonconservative_flux), semi.solver, - semi.cache, true) - du_baseline = du[:, :, :, :, 1] - - @test du_specialized ≈ du_baseline - end -end - @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", From 86ffed2e7f2abcfe7f3671a9d643af8ef140bb39 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Sat, 20 Jun 2026 10:02:45 +0200 Subject: [PATCH 17/39] minor change --- src/solvers/dgsem_structured/dg_3d_turbo.jl | 5 +---- 1 file changed, 1 insertion(+), 4 deletions(-) diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 80a7235ae67..78fedfef2d6 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -316,10 +316,7 @@ end @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo # Number of precomputed variables for the specialization flux_ranocha -@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), - equations::CompressibleEulerEquations3D) - return Val(7) -end +@inline n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) = Val(7) # Transformation from conserved to precomputed variables for flux_ranocha @inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo), From 8c718d4e17c4312e3fcbb70fb3efc4981371d4d3 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Sat, 20 Jun 2026 10:22:22 +0200 Subject: [PATCH 18/39] move fluxes to numerical fluxes --- src/equations/numerical_fluxes.jl | 55 +++++++++++++++++++++ src/solvers/dgsem_structured/dg_3d_turbo.jl | 53 -------------------- 2 files changed, 55 insertions(+), 53 deletions(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 545155b4889..0af82ed68a4 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -578,4 +578,59 @@ end end Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")") + +""" + FluxVolumeTurbo(volume_flux) + +Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl +""" +struct FluxVolumeTurbo{VolumeFlux} + volume_flux::VolumeFlux + function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux} + return new{VolumeFlux}(volume_flux) + end +end + +# Helper function for conservative systems. +function FluxVolumeTurbo(volume_flux) + turbo_flux = combined_turbo_flux(volume_flux) + return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) +end + +# By default the turbo flux has no specialization and re-uses +# the numerical flux in terms of conservative variables. +@inline combined_turbo_flux(volume_flux) = volume_flux + +# By default the turbo flux has the same number of precomputed variables +# as the number of variables. +@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) + +# Transform the conserved variables in precomputed auxiliary variables to speed up the computation +# of the numerical flux. When no specialization is given, this gives cons2cons. +@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) + +# Numerical volume flux that recalls the plain volume flux when no specialization is given. +@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) + equations = last(aux_and_normals_and_equations) + volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), + aux_and_normals_and_equations...) +end + +@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, + aux_and_normals_and_equations...) + equations = last(aux_and_normals_and_equations) + n = nvariables(equations) + u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) + u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) + normal_direction = SVector(aux_and_normals_and_equations[end - 3], + aux_and_normals_and_equations[end - 2], + aux_and_normals_and_equations[end - 1]) + return volume_flux(u_ll, u_rr, normal_direction, equations) +end + +# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary +LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true +LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true + +Base.show(io::IO, f::FluxUpwind) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")") end # @muladd diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 78fedfef2d6..facc4adabf2 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -1,56 +1,3 @@ -""" - FluxVolumeTurbo(volume_flux) - -Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl -""" -struct FluxVolumeTurbo{VolumeFlux} - volume_flux::VolumeFlux - function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux} - return new{VolumeFlux}(volume_flux) - end -end - -# Helper function for conservative systems. -function FluxVolumeTurbo(volume_flux) - turbo_flux = combined_turbo_flux(volume_flux) - return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) -end - -# By default the turbo flux has no specialization and re-uses -# the numerical flux in terms of conservative variables. -@inline combined_turbo_flux(volume_flux) = volume_flux - -# By default the turbo flux has the same number of precomputed variables -# as the number of variables. -@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) - -# Transform the conserved variables in precomputed auxiliary variables to speed up the computation -# of the numerical flux. When no specialization is given, this gives cons2cons. -@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) - -# Numerical volume flux that recalls the plain volume flux when no specialization is given. -@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) - equations = last(aux_and_normals_and_equations) - volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), - aux_and_normals_and_equations...) -end - -@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, - aux_and_normals_and_equations...) - equations = last(aux_and_normals_and_equations) - n = nvariables(equations) - u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) - u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) - normal_direction = SVector(aux_and_normals_and_equations[end - 3], - aux_and_normals_and_equations[end - 2], - aux_and_normals_and_equations[end - 1]) - return volume_flux(u_ll, u_rr, normal_direction, equations) -end - -# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary -LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true -LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true - @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element, MeshT::Type{<:Union{StructuredMesh{3}, P4estMesh{3}}}, From 11e35510cc3f85bfb2252f0d51b96fbe8834e909 Mon Sep 17 00:00:00 2001 From: MarcoArtiano Date: Sat, 20 Jun 2026 10:23:44 +0200 Subject: [PATCH 19/39] change export position --- src/Trixi.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 2154e40c961..0edb895a7cf 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -255,7 +255,7 @@ export flux, flux_central, flux_lax_friedrichs, flux_hll, flux_hllc, flux_hlle, FluxRotated, flux_shima_etal_turbo, flux_ranocha_turbo, FluxUpwind, - FluxTracerEquationsCentral + FluxTracerEquationsCentral, FluxVolumeTurbo export splitting_steger_warming, splitting_vanleer_haenel, splitting_coirier_vanleer, splitting_lax_friedrichs, @@ -310,7 +310,7 @@ export DG, FDSBP, BlockFV, UniformFiniteVolumeBasis, VolumeIntegralFiniteVolume, VolumeIntegralWeakForm, VolumeIntegralStrongForm, - VolumeIntegralFluxDifferencing, FluxVolumeTurbo, + VolumeIntegralFluxDifferencing, VolumeIntegralPureLGLFiniteVolume, VolumeIntegralPureLGLFiniteVolumeO2, VolumeIntegralShockCapturingHG, VolumeIntegralShockCapturingRRG, VolumeIntegralShockCapturingHGType, From d34c047efd419270fd0da657a6c9f160e2ccd06c Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Sat, 20 Jun 2026 10:43:17 +0200 Subject: [PATCH 20/39] fix typo Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> --- src/equations/numerical_fluxes.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 0af82ed68a4..183ebf03f15 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -632,5 +632,5 @@ end LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true -Base.show(io::IO, f::FluxUpwind) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")") +Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")") end # @muladd From 93f0373c304cea827e8b8e0b4336455ef707d002 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Sat, 20 Jun 2026 10:50:03 +0200 Subject: [PATCH 21/39] Update src/equations/numerical_fluxes.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- src/equations/numerical_fluxes.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 183ebf03f15..47fd58ba4e8 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -632,5 +632,6 @@ end LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true -Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")") +Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, + ")") end # @muladd From 3659c1e5290d8b6cf577ee662c40aa84556c7200 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Tue, 23 Jun 2026 17:15:13 +0200 Subject: [PATCH 22/39] Apply suggestions from code review Co-authored-by: Hendrik Ranocha --- src/auxiliary/math.jl | 8 +++++++- src/equations/numerical_fluxes.jl | 11 ++++++++--- src/solvers/dgsem_structured/dg_3d_turbo.jl | 2 +- test/test_performance_specializations_3d.jl | 4 ++-- 4 files changed, 18 insertions(+), 7 deletions(-) diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 18b8db8568c..2a173798578 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -136,10 +136,14 @@ end See also [`Trixi.set_log_type!`](@ref). """ @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x) - @inline log(x::AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x)) @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x) + + # This is required for performance specializations like `FluxTurbo`. + # The LoopVectorization.jl ecosystem does not throw an error but + # already creates `NaN`s. + @inline log(x::AbstractSIMD) = Base.log(x) end """ @@ -206,6 +210,7 @@ Given ε = 1.0e-4, we use the following algorithm. end end +# This is required for performance specializations like `FluxTurbo`. @inline function ln_mean(x::AbstractSIMD, y::AbstractSIMD) RealT = eltype(x) epsilon_f2 = convert(RealT, 1.0e-4) @@ -245,6 +250,7 @@ multiplication. end end +# This is required for performance specializations like `FluxTurbo`. @inline function inv_ln_mean(x::AbstractSIMD, y::AbstractSIMD) RealT = eltype(x) epsilon_f2 = convert(RealT, 1.0e-4) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 47fd58ba4e8..8634da90bc6 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -580,9 +580,14 @@ end Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")") """ - FluxVolumeTurbo(volume_flux) - -Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl + FluxTurbo(numerical_flux) + +Create a numerical flux equivalent to the given `numerical_flux` +except that it may use specialized methods, e.g., when used with +[`VolumeIntegralFluxDifferencing`](@ref). These specialized methods +may enable better use of SIMD instructions to increase runtime efficiency +on modern hardware, e.g., using +[LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl). """ struct FluxVolumeTurbo{VolumeFlux} volume_flux::VolumeFlux diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index facc4adabf2..1f9b0e73372 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -25,7 +25,7 @@ end dg, cache, alpha, ::Val{NAUX}, ::Val{NVARS}) where {NAUX, NVARS} # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`, - # `flux_1 = f1, etc. + # `flux_1 = f1`, etc. u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX] u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX] flux = [Symbol(:flux_, v) for v in 1:NVARS] diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 9a9ee7d24fe..27c4918cf9a 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -177,7 +177,7 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, - volume_flux = Trixi.FluxVolumeTurbo(flux_ranocha), + volume_flux = FluxVolumeTurbo(flux_ranocha), surface_flux = flux_ranocha) u_ode = copy(sol.u[end]) du_ode = zero(u_ode) @@ -195,7 +195,7 @@ end have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) - du_specialized = du[:, :, :, :, 1] + du_specialized = copy(du[:, :, :, :, 1]) # Call the plain version du .= 0 From 4c9dedbf5ff229aa5b3bc2dc54c139b35944fee4 Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Tue, 23 Jun 2026 18:14:31 +0200 Subject: [PATCH 23/39] apply some code review suggestions --- src/Trixi.jl | 4 +- src/auxiliary/math.jl | 3 +- src/equations/numerical_fluxes.jl | 29 ++++++-------- src/solvers/dgsem_structured/dg_3d_turbo.jl | 44 +++++++++------------ 4 files changed, 34 insertions(+), 46 deletions(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index 0edb895a7cf..b49ca6ba4bb 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -72,7 +72,7 @@ using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Back using AcceleratedKernels: AcceleratedKernels using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION - using LoopVectorization: LoopVectorization, @turbo, indices + using LoopVectorization: LoopVectorization, @turbo, indices, AbstractSIMD else using LoopVectorization: LoopVectorization, indices include("auxiliary/mock_turbo.jl") @@ -255,7 +255,7 @@ export flux, flux_central, flux_lax_friedrichs, flux_hll, flux_hllc, flux_hlle, FluxRotated, flux_shima_etal_turbo, flux_ranocha_turbo, FluxUpwind, - FluxTracerEquationsCentral, FluxVolumeTurbo + FluxTracerEquationsCentral, FluxTurbo export splitting_steger_warming, splitting_vanleer_haenel, splitting_coirier_vanleer, splitting_lax_friedrichs, diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl index 2a173798578..7bda5efea7c 100644 --- a/src/auxiliary/math.jl +++ b/src/auxiliary/math.jl @@ -1,4 +1,3 @@ -using LoopVectorization: AbstractSIMD # By default, Julia/LLVM does not use fused multiply-add operations (FMAs). # Since these FMAs can increase the performance of many numerical algorithms, # we need to opt-in explicitly. @@ -139,7 +138,7 @@ end @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x) @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x) @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x) - + # This is required for performance specializations like `FluxTurbo`. # The LoopVectorization.jl ecosystem does not throw an error but # already creates `NaN`s. diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 8634da90bc6..69f3a4f11aa 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -579,6 +579,7 @@ end Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")") +#TODO: Expand the docstring and explain what is required for this to work """ FluxTurbo(numerical_flux) @@ -589,30 +590,25 @@ may enable better use of SIMD instructions to increase runtime efficiency on modern hardware, e.g., using [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl). """ -struct FluxVolumeTurbo{VolumeFlux} - volume_flux::VolumeFlux - function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux} - return new{VolumeFlux}(volume_flux) +struct FluxTurbo{NumericalFlux} + numerical_flux::NumericalFlux + function FluxTurbo{NumericalFlux}(numerical_flux) where {NumericalFlux} + return new{NumericalFlux}(numerical_flux) end end # Helper function for conservative systems. -function FluxVolumeTurbo(volume_flux) - turbo_flux = combined_turbo_flux(volume_flux) - return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux) +function FluxTurbo(numerical_flux) + return FluxTurbo{typeof(numerical_flux)}(numerical_flux) end -# By default the turbo flux has no specialization and re-uses -# the numerical flux in terms of conservative variables. -@inline combined_turbo_flux(volume_flux) = volume_flux - # By default the turbo flux has the same number of precomputed variables # as the number of variables. -@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations)) +@inline nturbovars(numerical_flux, equations) = Val(nvariables(equations)) # Transform the conserved variables in precomputed auxiliary variables to speed up the computation # of the numerical flux. When no specialization is given, this gives cons2cons. -@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations) +@inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations) # Numerical volume flux that recalls the plain volume flux when no specialization is given. @inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) @@ -633,10 +629,9 @@ end return volume_flux(u_ll, u_rr, normal_direction, equations) end -# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary +# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2turbo LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true -LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true +LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true -Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, - ")") +Base.show(io::IO, f::FluxTurbo) = print(io, "FluxTurbo(", f.numerical_flux, ")") end # @muladd diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 1f9b0e73372..1b63fb125e0 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -3,13 +3,13 @@ P4estMesh{3}}}, have_nonconservative_terms::False, equations, - volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM, + volume_flux::FluxTurbo, dg::DGSEM, cache, alpha) - @unpack volume_flux = volume_flux_turbo + @unpack numerical_flux = volume_flux flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms, equations, - volume_flux, dg, cache, alpha, - n_turbo_flux_aux_node_vars(volume_flux, equations), + numerical_flux, dg, cache, alpha, + nturbovars(numerical_flux, equations), Val(nvariables(equations))) end @@ -34,10 +34,10 @@ end # rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations) # u_prim[i, j, k, 1] = rho # and so on for v2, v3, p, ... cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS] - cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...), - :(cons2fluxauxiliary(volume_flux, $(cons_reads...), - equations))) - cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX] + cons2turbo_ = Expr(:(=), Expr(:tuple, [Symbol(:u_prim_, v) for v in 1:NAUX]...), + :(cons2turbo(volume_flux, $(cons_reads...), + equations))) + cons2turbo_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:u_prim_, v))) for v in 1:NAUX] # Evaluate the two-point volume flux flux_call = Expr(:(=), Expr(:tuple, flux...), @@ -107,8 +107,8 @@ end # Convert conserved to auxiliary variables and store them in `u_prim`. @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg) - $cons2aux - $(cons2aux_writes...) + $cons2turbo_ + $(cons2turbo_writes...) end # x direction @@ -259,16 +259,13 @@ end end end -# Specialize the name of the turbo flux for flux_ranocha. -@inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo - # Number of precomputed variables for the specialization flux_ranocha -@inline n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) = Val(7) +@inline nturbovars(volume_flux::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7) # Transformation from conserved to precomputed variables for flux_ranocha -@inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo), - rho, rho_v1, rho_v2, rho_v3, rho_e, - equations::CompressibleEulerEquations3D) +@inline function cons2turbo(volume_flux::typeof(flux_ranocha), + rho, rho_v1, rho_v2, rho_v3, rho_e, + equations::CompressibleEulerEquations3D) v1 = rho_v1 / rho v2 = rho_v2 / rho v3 = rho_v3 / rho @@ -278,7 +275,7 @@ end end # Computation of the numerical flux_ranocha with respect to precomputed variables -@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo), +@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha), rho_ll, v1_ll, v2_ll, v3_ll, p_ll, log_rho_ll, log_p_ll, rho_rr, v1_rr, v2_rr, v3_rr, @@ -292,13 +289,10 @@ end v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 + v3_rr * normal_direction_3 - # Compute required mean values. - # We inline the logarithmic mean to allow LoopVectorization.jl to optimize it - # efficiently: `log` is precomputed once per node in `cons2fluxauxiliary` - # (`log_rho`, `log_p`) and reused here for the "regular" branch, while the - # "special" branch uses the series expansion to avoid cancellation when x ≈ y. - # This is equivalent to - # rho_mean = ln_mean(rho_ll, rho_rr) + # Compute required mean values + # We inline the logarithmic mean to allow LoopVectorization.jl to optimize + # it efficiently. This is equivalent to + # rho_mean = ln_mean(rho_ll, rho_rr) x1 = rho_ll log_x1 = log_rho_ll y1 = rho_rr From 7a51c08d3265117668b153c784d3d3812d5b2cac Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Tue, 23 Jun 2026 18:22:40 +0200 Subject: [PATCH 24/39] more robust tests --- test/test_performance_specializations_3d.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 27c4918cf9a..27c41c67a95 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -230,7 +230,7 @@ end have_nonconservative_terms, semi.equations, semi.solver.volume_integral.volume_flux, semi.solver, semi.cache, true) - du_specialized = du[:, :, :, :, 1] + du_specialized = copy(du[:, :, :, :, 1]) # Call the plain version du .= 0 From 2a1f47a5708020a4ad77454593c134d2953fe765 Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Tue, 23 Jun 2026 18:28:10 +0200 Subject: [PATCH 25/39] remove inner constructor --- src/equations/numerical_fluxes.jl | 8 -------- 1 file changed, 8 deletions(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 69f3a4f11aa..e5a95835bca 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -592,14 +592,6 @@ on modern hardware, e.g., using """ struct FluxTurbo{NumericalFlux} numerical_flux::NumericalFlux - function FluxTurbo{NumericalFlux}(numerical_flux) where {NumericalFlux} - return new{NumericalFlux}(numerical_flux) - end -end - -# Helper function for conservative systems. -function FluxTurbo(numerical_flux) - return FluxTurbo{typeof(numerical_flux)}(numerical_flux) end # By default the turbo flux has the same number of precomputed variables From c2fe52dee93d4d54d243f8d55f68d763d8f31d28 Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Tue, 23 Jun 2026 18:48:46 +0200 Subject: [PATCH 26/39] add news and minor changes --- NEWS.md | 1 + src/equations/numerical_fluxes.jl | 9 ++++++++- 2 files changed, 9 insertions(+), 1 deletion(-) diff --git a/NEWS.md b/NEWS.md index 00634fce30f..177bbabc623 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,6 +20,7 @@ The new equation types `LinearDiffusionEquation1D` and `LinearDiffusionEquation2 - Support for 3D subcell limiting was extended by local limiting for nonperiodic `TreeMesh`es ([#2878]). - Support for user-defined RHS splitting for IMEX methods via SemidiscretizationHyperbolicSplit ([#2518]). The splitting follows the form `y_t = f_1(y) + f_2(y)`, allowing users to define separate solvers for the stiff (`f_1`) and non-stiff (`f_2`) parts of the right-hand side. Boundary conditions and source terms can be specified independently for the stiff and non-stiff parts. - Added postprocessing for kinetic energy spectral analysis via `compute_kinetic_energy_spectrum` for `AbstractCompressibleEulerEquations` on `TreeMesh`/`DGSEM` and on `DGMultiMesh`/`DGMultiSBP` in 2D and 3D; the routine returns matching integer wavenumber shells and the isotropic 1D spectrum `E(k)`. +- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_turbo_flux`, which computes the numerical flux in terms of the precomputed variables. ([#3090]) #### Changed - For performance, `LaplaceDiffusionEntropyVariables` parabolic fluxes for `CompressibleEulerEquations1D`, `CompressibleEulerEquations2D`, and `CompressibleEulerEquations3D` now use explicit Jacobian formulas from Barth 1999 instead of AD ([#3028]). Other equation types continue to use an automatic differentiation fallback. diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index e5a95835bca..7b960e8934b 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -579,7 +579,6 @@ end Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")") -#TODO: Expand the docstring and explain what is required for this to work """ FluxTurbo(numerical_flux) @@ -589,6 +588,14 @@ except that it may use specialized methods, e.g., when used with may enable better use of SIMD instructions to increase runtime efficiency on modern hardware, e.g., using [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl). +The following specialization works by default for all numerical fluxes, +enabling SIMD instructions. However, for some systems, it is cheaper to precompute a set variables +(e.g., primitive variables and or logarithm of certain variables) and then compute the numerical fluxes. +This optimization can be enabled by defining three ingredients: the number of precomputed variables `nturbovars`; +the transformation from conservative to precomputed variables `cons2turbo`; and the +`volume_turbo_flux(volume_flux::typeof(numercal_flux), ...)`, that computes the numerical flux +in terms of the precomputed variables. +See also [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl) """ struct FluxTurbo{NumericalFlux} numerical_flux::NumericalFlux From 5eac91d86b36660b34645cdb14060ada367b24b8 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Tue, 23 Jun 2026 20:57:48 +0200 Subject: [PATCH 27/39] improve docstring --- src/equations/numerical_fluxes.jl | 22 +++++++++++++--------- 1 file changed, 13 insertions(+), 9 deletions(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 7b960e8934b..2c038a3bd92 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -586,16 +586,20 @@ Create a numerical flux equivalent to the given `numerical_flux` except that it may use specialized methods, e.g., when used with [`VolumeIntegralFluxDifferencing`](@ref). These specialized methods may enable better use of SIMD instructions to increase runtime efficiency -on modern hardware, e.g., using +on modern hardware, e.g., using [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl). + The following specialization works by default for all numerical fluxes, -enabling SIMD instructions. However, for some systems, it is cheaper to precompute a set variables -(e.g., primitive variables and or logarithm of certain variables) and then compute the numerical fluxes. -This optimization can be enabled by defining three ingredients: the number of precomputed variables `nturbovars`; -the transformation from conservative to precomputed variables `cons2turbo`; and the -`volume_turbo_flux(volume_flux::typeof(numercal_flux), ...)`, that computes the numerical flux -in terms of the precomputed variables. -See also [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl) +enabling SIMD instructions. However, for some systems, it is even better +to precompute some variables (e.g., primitive variables or logarithms of certain variables) +and then compute the numerical fluxes. +This optimization can be enabled by defining three ingredients: +- the number of precomputed variables `nturbovars`; +- the transformation from conservative to precomputed variables `cons2turbo` +- and the `volume_turbo_flux(volume_flux::typeof(numerical_flux), ...)`, + that computes the numerical flux in terms of the precomputed variables. + +See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl) """ struct FluxTurbo{NumericalFlux} numerical_flux::NumericalFlux @@ -628,7 +632,7 @@ end return volume_flux(u_ll, u_rr, normal_direction, equations) end -# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2turbo +# Allow LoopVectorization.jl to use SIMD instructions on volume_flux_turbo and cons2turbo LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true From 8446f34ff6bea6b57314cdcabcf65c08b673b027 Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Tue, 23 Jun 2026 20:57:53 +0200 Subject: [PATCH 28/39] fix tests --- test/test_performance_specializations_3d.jl | 16 ++++++++-------- 1 file changed, 8 insertions(+), 8 deletions(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 27c41c67a95..c523e2fc772 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -173,11 +173,11 @@ end end end -@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_ranocha)" begin +@timed_testset "StructuredMesh3D, FluxTurbo(flux_ranocha)" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, - volume_flux = FluxVolumeTurbo(flux_ranocha), + volume_flux = FluxTurbo(flux_ranocha), surface_flux = flux_ranocha) u_ode = copy(sol.u[end]) du_ode = zero(u_ode) @@ -189,7 +189,7 @@ end du = Trixi.wrap_array(du_ode, semi) have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) - # Call the optimized version + # Call the optimized version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -197,7 +197,7 @@ end semi.solver, semi.cache, true) du_specialized = copy(du[:, :, :, :, 1]) - # Call the plain version + # Call the plain version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -208,11 +208,11 @@ end end end -@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin +@timed_testset "StructuredMesh3D, FluxTurbo(flux_shima_etal)" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, - volume_flux = Trixi.FluxVolumeTurbo(flux_shima_etal), + volume_flux = Trixi.FluxTurbo(flux_shima_etal), surface_flux = flux_shima_etal) u_ode = copy(sol.u[end]) du_ode = zero(u_ode) @@ -224,7 +224,7 @@ end du = Trixi.wrap_array(du_ode, semi) have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations) - # Call the optimized version + # Call the optimized version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, @@ -232,7 +232,7 @@ end semi.solver, semi.cache, true) du_specialized = copy(du[:, :, :, :, 1]) - # Call the plain version + # Call the plain version du .= 0 Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh), have_nonconservative_terms, semi.equations, From d70ceab26c222c68aad9fc173f34fb2618ce663e Mon Sep 17 00:00:00 2001 From: Hendrik Ranocha Date: Tue, 23 Jun 2026 21:04:27 +0200 Subject: [PATCH 29/39] fix name --- NEWS.md | 2 +- src/equations/numerical_fluxes.jl | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/NEWS.md b/NEWS.md index 177bbabc623..d8ee307dbef 100644 --- a/NEWS.md +++ b/NEWS.md @@ -20,7 +20,7 @@ The new equation types `LinearDiffusionEquation1D` and `LinearDiffusionEquation2 - Support for 3D subcell limiting was extended by local limiting for nonperiodic `TreeMesh`es ([#2878]). - Support for user-defined RHS splitting for IMEX methods via SemidiscretizationHyperbolicSplit ([#2518]). The splitting follows the form `y_t = f_1(y) + f_2(y)`, allowing users to define separate solvers for the stiff (`f_1`) and non-stiff (`f_2`) parts of the right-hand side. Boundary conditions and source terms can be specified independently for the stiff and non-stiff parts. - Added postprocessing for kinetic energy spectral analysis via `compute_kinetic_energy_spectrum` for `AbstractCompressibleEulerEquations` on `TreeMesh`/`DGSEM` and on `DGMultiMesh`/`DGMultiSBP` in 2D and 3D; the routine returns matching integer wavenumber shells and the isotropic 1D spectrum `E(k)`. -- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_turbo_flux`, which computes the numerical flux in terms of the precomputed variables. ([#3090]) +- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_flux_turbo`, which computes the numerical flux in terms of the precomputed variables. ([#3090]) #### Changed - For performance, `LaplaceDiffusionEntropyVariables` parabolic fluxes for `CompressibleEulerEquations1D`, `CompressibleEulerEquations2D`, and `CompressibleEulerEquations3D` now use explicit Jacobian formulas from Barth 1999 instead of AD ([#3028]). Other equation types continue to use an automatic differentiation fallback. diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 2c038a3bd92..8d820b6c926 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -596,7 +596,7 @@ and then compute the numerical fluxes. This optimization can be enabled by defining three ingredients: - the number of precomputed variables `nturbovars`; - the transformation from conservative to precomputed variables `cons2turbo` -- and the `volume_turbo_flux(volume_flux::typeof(numerical_flux), ...)`, +- and the `volume_flux_turbo(volume_flux::typeof(numerical_flux), ...)`, that computes the numerical flux in terms of the precomputed variables. See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl) From 8a61bb762a13b64e11611ff69c24108388cf7646 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Tue, 23 Jun 2026 22:48:07 +0200 Subject: [PATCH 30/39] Apply suggestions from code review Co-authored-by: Hendrik Ranocha --- src/Trixi.jl | 2 +- src/equations/numerical_fluxes.jl | 9 +++++++++ 2 files changed, 10 insertions(+), 1 deletion(-) diff --git a/src/Trixi.jl b/src/Trixi.jl index b7c011e9aa5..0b6e5f7c22b 100644 --- a/src/Trixi.jl +++ b/src/Trixi.jl @@ -74,7 +74,7 @@ using LinearMaps: LinearMap if _PREFERENCE_LOOPVECTORIZATION using LoopVectorization: LoopVectorization, @turbo, indices, AbstractSIMD else - using LoopVectorization: LoopVectorization, indices + using LoopVectorization: LoopVectorization, indices, AbstractSIMD include("auxiliary/mock_turbo.jl") end diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 8d820b6c926..98823fb7183 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -605,12 +605,21 @@ struct FluxTurbo{NumericalFlux} numerical_flux::NumericalFlux end +# As a fallback method, the wrapped flux is called. +@inline (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction, + equations) + return f.numerical_flux(u_ll, u_rr, orientation_or_normal_direction, equations) +end + # By default the turbo flux has the same number of precomputed variables # as the number of variables. @inline nturbovars(numerical_flux, equations) = Val(nvariables(equations)) # Transform the conserved variables in precomputed auxiliary variables to speed up the computation # of the numerical flux. When no specialization is given, this gives cons2cons. +# Note that the conserved variables are passed as scalars instead of a vector +# to enable optimizations by LoopVectorization.jl. Thus, we use +# `conserved_and_equations...` instead of `cons, equations`. @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations) # Numerical volume flux that recalls the plain volume flux when no specialization is given. From 4967a129088401aca13166da063d2aa2428f309a Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Wed, 24 Jun 2026 08:50:49 +0200 Subject: [PATCH 31/39] add fallback tests, fix name --- .../elixir_euler_modified_sod.jl | 18 +++++++------- src/equations/numerical_fluxes.jl | 24 +++++++++---------- test/test_performance_specializations_3d.jl | 17 ++++++++++++- 3 files changed, 38 insertions(+), 21 deletions(-) diff --git a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl index fd030fc37bd..e1499c62783 100644 --- a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl +++ b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl @@ -1,7 +1,8 @@ using OrdinaryDiffEqSSPRK using Trixi -equations = CompressibleEulerEquations1D(1.4) +RealT = Float64 +equations = CompressibleEulerEquations1D(RealT(1.4)) """ initial_condition_modified_sod(x, t, equations::CompressibleEulerEquations1D) @@ -21,10 +22,11 @@ An entropy-satisfying solution should produce a smooth(!) rarefaction wave. [DOI: 10.1016/j.jcp.2023.112677](https://doi.org/10.1016/j.jcp.2023.112677) """ function initial_condition_modified_sod(x, t, ::CompressibleEulerEquations1D) - if x[1] < 0.3 - return prim2cons(SVector(1, 0.75, 1), equations) + RealT = eltype(x) + if x[1] < RealT(0.3) + return prim2cons(SVector(one(RealT), 0.75f0, one(RealT)), equations) else - return prim2cons(SVector(0.125, 0.0, 0.1), equations) + return prim2cons(SVector(0.125f0, zero(eltype(x)), RealT(0.1)), equations) end end initial_condition = initial_condition_modified_sod @@ -38,11 +40,11 @@ volume_integral = VolumeIntegralFluxDifferencing(flux_ranocha) solver = DGSEM(polydeg = 3, surface_flux = flux_hllc, volume_integral = volume_integral) -coordinates_min = 0.0 -coordinates_max = 1.0 +coordinates_min = zero(RealT) +coordinates_max = one(RealT) mesh = TreeMesh(coordinates_min, coordinates_max, - initial_refinement_level = 6, + initial_refinement_level = 6, RealT = RealT, n_cells_max = 30_000, periodicity = false) @@ -54,7 +56,7 @@ boundary_conditions = (; x_neg = BoundaryConditionDirichlet(initial_condition), semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver, boundary_conditions = boundary_conditions) -tspan = (0.0, 0.2) +tspan = (zero(RealT), RealT(0.2)) ode = semidiscretize(semi, tspan) summary_callback = SummaryCallback() diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index 98823fb7183..befa2851514 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -606,8 +606,8 @@ struct FluxTurbo{NumericalFlux} end # As a fallback method, the wrapped flux is called. -@inline (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction, - equations) +@inline function (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction, + equations) return f.numerical_flux(u_ll, u_rr, orientation_or_normal_direction, equations) end @@ -623,21 +623,21 @@ end @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations) # Numerical volume flux that recalls the plain volume flux when no specialization is given. -@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...) - equations = last(aux_and_normals_and_equations) +@inline function volume_flux_turbo(volume_flux, turbovars_and_normals_and_equations...) + equations = last(turbovars_and_normals_and_equations) volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), - aux_and_normals_and_equations...) + turbovars_and_normals_and_equations...) end @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, - aux_and_normals_and_equations...) - equations = last(aux_and_normals_and_equations) + turbovars_and_normals_and_equations...) + equations = last(turbovars_and_normals_and_equations) n = nvariables(equations) - u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n))) - u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n))) - normal_direction = SVector(aux_and_normals_and_equations[end - 3], - aux_and_normals_and_equations[end - 2], - aux_and_normals_and_equations[end - 1]) + u_ll = SVector(ntuple(v -> turbovars_and_normals_and_equations[v], Val(n))) + u_rr = SVector(ntuple(v -> turbovars_and_normals_and_equations[n + v], Val(n))) + normal_direction = SVector(turbovars_and_normals_and_equations[end - 3], + turbovars_and_normals_and_equations[end - 2], + turbovars_and_normals_and_equations[end - 1]) return volume_flux(u_ll, u_rr, normal_direction, equations) end diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index c523e2fc772..4d840e15efb 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -212,7 +212,7 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"), cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3, - volume_flux = Trixi.FluxTurbo(flux_shima_etal), + volume_flux = FluxTurbo(flux_shima_etal), surface_flux = flux_shima_etal) u_ode = copy(sol.u[end]) du_ode = zero(u_ode) @@ -243,6 +243,21 @@ end end end +# Test fallback method for mesh and type that are not supported +@timed_testset "TreeMesh1D, fallback call FluxTurbo(flux_chandrashekar)" begin + trixi_include(@__MODULE__, + joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), + volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar)) + u_ode = copy(sol.u[end]) + + trixi_include(@__MODULE__, + joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), + volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar))) + u_ode_specialized = copy(sol.u[end]) + + @test u_ode_specialized ≈ u_ode +end + @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "p4est_3d_dgsem", From 9d064d864457fab386ba185abeffe88073a33866 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Wed, 24 Jun 2026 09:02:18 +0200 Subject: [PATCH 32/39] Apply suggestions from code review Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 6 ++++-- 1 file changed, 4 insertions(+), 2 deletions(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 4d840e15efb..02404c47ec4 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -246,12 +246,14 @@ end # Test fallback method for mesh and type that are not supported @timed_testset "TreeMesh1D, fallback call FluxTurbo(flux_chandrashekar)" begin trixi_include(@__MODULE__, - joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), + joinpath(EXAMPLES_DIR, "tree_1d_dgsem", + "elixir_euler_modified_sod.jl"), volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar)) u_ode = copy(sol.u[end]) trixi_include(@__MODULE__, - joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), + joinpath(EXAMPLES_DIR, "tree_1d_dgsem", + "elixir_euler_modified_sod.jl"), volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar))) u_ode_specialized = copy(sol.u[end]) From cd374594c421910f1bbfea25e4c6317551dd8c77 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:02:39 +0200 Subject: [PATCH 33/39] Apply suggestions from code review Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 02404c47ec4..80a16b01d25 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -248,13 +248,13 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), - volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar)) + volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar), RealT = BigFloat) u_ode = copy(sol.u[end]) trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), - volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar))) + volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), RealT = BigFloat)) u_ode_specialized = copy(sol.u[end]) @test u_ode_specialized ≈ u_ode From 59fa6434c0fec2836e75ddbc23dea73bdf896666 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:11:08 +0200 Subject: [PATCH 34/39] Update test/test_performance_specializations_3d.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 80a16b01d25..8b79dc5c8d8 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -254,7 +254,8 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), - volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), RealT = BigFloat)) + volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), + RealT = BigFloat)) u_ode_specialized = copy(sol.u[end]) @test u_ode_specialized ≈ u_ode From 1d45aeb9cd25a3602e2aae50e1a7a7fa29dc6b83 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Wed, 24 Jun 2026 20:11:16 +0200 Subject: [PATCH 35/39] Update test/test_performance_specializations_3d.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index 8b79dc5c8d8..e34ef4f684e 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -248,7 +248,8 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), - volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar), RealT = BigFloat) + volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar), + RealT = BigFloat) u_ode = copy(sol.u[end]) trixi_include(@__MODULE__, From 47cc8260a8af4ded83431e5a26b20c2dd100b115 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Thu, 25 Jun 2026 08:43:56 +0200 Subject: [PATCH 36/39] Apply suggestions from code review Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index e34ef4f684e..b5db6749d94 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -255,8 +255,8 @@ end trixi_include(@__MODULE__, joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), - volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), - RealT = BigFloat)) + volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)), + RealT = BigFloat) u_ode_specialized = copy(sol.u[end]) @test u_ode_specialized ≈ u_ode From cc92253e2887caa166b41ba882179ac29287b68e Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Thu, 25 Jun 2026 09:01:21 +0200 Subject: [PATCH 37/39] Update test/test_performance_specializations_3d.jl Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com> --- test/test_performance_specializations_3d.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl index b5db6749d94..7b953c06c02 100644 --- a/test/test_performance_specializations_3d.jl +++ b/test/test_performance_specializations_3d.jl @@ -256,7 +256,7 @@ end joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"), volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)), - RealT = BigFloat) + RealT = BigFloat) u_ode_specialized = copy(sol.u[end]) @test u_ode_specialized ≈ u_ode From 19885c9bf41e5aa9cb24fc775947a02b5953b899 Mon Sep 17 00:00:00 2001 From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com> Date: Fri, 26 Jun 2026 08:36:02 +0200 Subject: [PATCH 38/39] Update examples/tree_1d_dgsem/elixir_euler_modified_sod.jl Co-authored-by: Hendrik Ranocha --- examples/tree_1d_dgsem/elixir_euler_modified_sod.jl | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl index e1499c62783..875e8da05e1 100644 --- a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl +++ b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl @@ -26,7 +26,7 @@ function initial_condition_modified_sod(x, t, ::CompressibleEulerEquations1D) if x[1] < RealT(0.3) return prim2cons(SVector(one(RealT), 0.75f0, one(RealT)), equations) else - return prim2cons(SVector(0.125f0, zero(eltype(x)), RealT(0.1)), equations) + return prim2cons(SVector(0.125f0, zero(RealT), RealT(0.1)), equations) end end initial_condition = initial_condition_modified_sod From 050943ee89ca603c76f1307e9e46bdc29d6c5c57 Mon Sep 17 00:00:00 2001 From: Marco Artiano Date: Fri, 26 Jun 2026 08:40:50 +0200 Subject: [PATCH 39/39] rename volume flux turbo in flux turbo --- src/equations/numerical_fluxes.jl | 16 +++++------ src/solvers/dgsem_structured/dg_3d_turbo.jl | 32 ++++++++++----------- 2 files changed, 24 insertions(+), 24 deletions(-) diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl index befa2851514..6d8893740a7 100644 --- a/src/equations/numerical_fluxes.jl +++ b/src/equations/numerical_fluxes.jl @@ -596,7 +596,7 @@ and then compute the numerical fluxes. This optimization can be enabled by defining three ingredients: - the number of precomputed variables `nturbovars`; - the transformation from conservative to precomputed variables `cons2turbo` -- and the `volume_flux_turbo(volume_flux::typeof(numerical_flux), ...)`, +- and the `flux_turbo(flux_turbo::typeof(numerical_flux), ...)`, that computes the numerical flux in terms of the precomputed variables. See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl) @@ -623,14 +623,14 @@ end @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations) # Numerical volume flux that recalls the plain volume flux when no specialization is given. -@inline function volume_flux_turbo(volume_flux, turbovars_and_normals_and_equations...) +@inline function flux_turbo(numerical_flux, turbovars_and_normals_and_equations...) equations = last(turbovars_and_normals_and_equations) - volume_flux_turbo(volume_flux, have_nonconservative_terms(equations), - turbovars_and_normals_and_equations...) + flux_turbo(numerical_flux, have_nonconservative_terms(equations), + turbovars_and_normals_and_equations...) end -@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False, - turbovars_and_normals_and_equations...) +@inline function flux_turbo(numerical_flux, have_nonconservative_terms::False, + turbovars_and_normals_and_equations...) equations = last(turbovars_and_normals_and_equations) n = nvariables(equations) u_ll = SVector(ntuple(v -> turbovars_and_normals_and_equations[v], Val(n))) @@ -638,11 +638,11 @@ end normal_direction = SVector(turbovars_and_normals_and_equations[end - 3], turbovars_and_normals_and_equations[end - 2], turbovars_and_normals_and_equations[end - 1]) - return volume_flux(u_ll, u_rr, normal_direction, equations) + return numerical_flux(u_ll, u_rr, normal_direction, equations) end # Allow LoopVectorization.jl to use SIMD instructions on volume_flux_turbo and cons2turbo -LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true +LoopVectorization.can_turbo(::typeof(flux_turbo), ::Val) = true LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true Base.show(io::IO, f::FluxTurbo) = print(io, "FluxTurbo(", f.numerical_flux, ")") diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl index 1b63fb125e0..d4054ac4085 100644 --- a/src/solvers/dgsem_structured/dg_3d_turbo.jl +++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl @@ -41,11 +41,11 @@ end # Evaluate the two-point volume flux flux_call = Expr(:(=), Expr(:tuple, flux...), - :(volume_flux_turbo(volume_flux, - $(u_prim_ll...), $(u_prim_rr...), - normal_direction_1, normal_direction_2, - normal_direction_3, - equations))) + :(flux_turbo(volume_flux, + $(u_prim_ll...), $(u_prim_rr...), + normal_direction_1, normal_direction_2, + normal_direction_3, + equations))) # Build the inner `@turbo` loop body. For the x direction `loads` gives # rho_ll = u_prim_permuted[jk, i, 1] # left node, and so on @@ -260,10 +260,10 @@ end end # Number of precomputed variables for the specialization flux_ranocha -@inline nturbovars(volume_flux::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7) +@inline nturbovars(flux_turbo::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7) # Transformation from conserved to precomputed variables for flux_ranocha -@inline function cons2turbo(volume_flux::typeof(flux_ranocha), +@inline function cons2turbo(flux_turbo::typeof(flux_ranocha), rho, rho_v1, rho_v2, rho_v3, rho_e, equations::CompressibleEulerEquations3D) v1 = rho_v1 / rho @@ -275,15 +275,15 @@ end end # Computation of the numerical flux_ranocha with respect to precomputed variables -@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha), - rho_ll, v1_ll, v2_ll, v3_ll, - p_ll, log_rho_ll, log_p_ll, - rho_rr, v1_rr, v2_rr, v3_rr, - p_rr, log_rho_rr, log_p_rr, - normal_direction_1, - normal_direction_2, - normal_direction_3, - equations::CompressibleEulerEquations3D) +@inline function flux_turbo(flux_turbo::typeof(flux_ranocha), + rho_ll, v1_ll, v2_ll, v3_ll, + p_ll, log_rho_ll, log_p_ll, + rho_rr, v1_rr, v2_rr, v3_rr, + p_rr, log_rho_rr, log_p_rr, + normal_direction_1, + normal_direction_2, + normal_direction_3, + equations::CompressibleEulerEquations3D) v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 + v3_ll * normal_direction_3 v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +