From 610c7e44b0c64785a47444abe0a4d3b41ddc67c5 Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 00:19:30 +0200
Subject: [PATCH 01/39] add first working version

---
 src/Trixi.jl                                |   2 +-
 src/auxiliary/math.jl                       |   1 +
 src/solvers/dgsem_structured/dg.jl          |   1 +
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 389 ++++++++++++++++++++
 4 files changed, 392 insertions(+), 1 deletion(-)
 create mode 100644 src/solvers/dgsem_structured/dg_3d_turbo.jl

diff --git a/src/Trixi.jl b/src/Trixi.jl
index f2fce689d1e..2154e40c961 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -310,7 +310,7 @@ export DG,
        FDSBP,
        BlockFV, UniformFiniteVolumeBasis, VolumeIntegralFiniteVolume,
        VolumeIntegralWeakForm, VolumeIntegralStrongForm,
-       VolumeIntegralFluxDifferencing,
+       VolumeIntegralFluxDifferencing, FluxVolumeTurbo,
        VolumeIntegralPureLGLFiniteVolume, VolumeIntegralPureLGLFiniteVolumeO2,
        VolumeIntegralShockCapturingHG, VolumeIntegralShockCapturingRRG,
        VolumeIntegralShockCapturingHGType,
diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 8ee1ff5f2fa..1c4f3bc385b 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -135,6 +135,7 @@ end
     See also [`Trixi.set_log_type!`](@ref).
     """
     @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)
+    @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x))
 
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl
index 5ac27049151..99b672f4fa5 100644
--- a/src/solvers/dgsem_structured/dg.jl
+++ b/src/solvers/dgsem_structured/dg.jl
@@ -234,4 +234,5 @@ include("dg_2d_subcell_limiters.jl")
 # Specialized implementations used to improve performance
 include("dg_2d_compressible_euler.jl")
 include("dg_3d_compressible_euler.jl")
+include("dg_3d_turbo.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
new file mode 100644
index 00000000000..afd0e08ca83
--- /dev/null
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -0,0 +1,389 @@
+struct FluxVolumeTurbo{VolumeFlux}
+    volume_flux::VolumeFlux
+    function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux}
+        return new{VolumeFlux}(volume_flux)
+    end
+end
+
+function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative)
+    turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative)
+    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
+end
+
+function FluxVolumeTurbo(volume_flux)
+    turbo_flux = combined_turbo_flux(volume_flux)
+    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
+end
+
+combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = True()
+
+@inline combined_turbo_flux(volume_flux) = volume_flux
+
+
+@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
+
+@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) =
+    Base.front(conserved_and_equations)
+
+@inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...)
+    equations = last(aux_and_normals_and_equations)
+    n = nvariables(equations)
+    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
+    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
+    normal = SVector(aux_and_normals_and_equations[end - 3],
+                     aux_and_normals_and_equations[end - 2],
+                     aux_and_normals_and_equations[end - 1])
+    return Tuple(volume_flux(u_ll, u_rr, normal, equations))
+end
+
+@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
+                                                 MeshT::Type{<:Union{StructuredMesh{3},
+                                                                     P4estMesh{3}}},
+                                                 have_nonconservative_terms::False,
+                                                 equations,
+                                                 volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
+                                                 cache, alpha)
+    @unpack volume_flux = volume_flux_turbo
+    flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
+                                    volume_flux, dg, cache, alpha,
+                                    n_turbo_flux_aux_node_vars(volume_flux, equations),
+                                    Val(nvariables(equations)))
+end
+
+@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
+                                                 MeshT::Type{<:Union{StructuredMesh{3},
+                                                                     P4estMesh{3}}},
+                                                 have_nonconservative_terms::True,
+                                                 combine_conservative_and_nonconservative_fluxes::True,
+                                                 equations,
+                                                 volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
+                                                 cache, alpha)
+    @unpack volume_flux = volume_flux_turbo
+    flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
+                                    volume_flux, dg, cache, alpha,
+                                    n_turbo_flux_aux_node_vars(volume_flux, equations),
+                                    Val(nvariables(equations)))
+end
+
+@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
+                                                    element, equations,
+                                                    volume_turbo_flux_function,
+                                                    dg, cache, alpha, ::Val{NAUX},
+                                                    ::Val{NVARS}) where {NAUX, NVARS}
+    # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
+    # `flux_1 = f1, etc.  
+    u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
+    u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
+    flux = [Symbol(:flux_, v) for v in 1:NVARS]
+
+    # Convert conserved to auxiliary variables node-wise, e.g. 
+    #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
+    #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
+    cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS]
+    cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
+                    :(cons2fluxauxiliary(volume_turbo_flux_function, $(cons_reads...),
+                                         equations)))
+    cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
+
+    # Evaluate the two-point volume flux
+    flux_call = Expr(:(=), Expr(:tuple, flux...),
+                     :(turbo_volume_flux(volume_turbo_flux_function,
+                                         $(u_prim_ll...), $(u_prim_rr...),
+                                         normal_direction_1, normal_direction_2, normal_direction_3,
+                                         equations)))
+
+    # Build the inner `@turbo` loop body. For the x direction `loads` gives
+    #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
+    #   rho_rr = u_prim_permuted[jk, ii, 1]  # right node
+    loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v])
+                                        for v in 1:NAUX],
+                                       [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
+                                        for v in 1:NAUX])
+
+    # Compute the average of the normal_direction
+    # e.g. the x direction `normals` gives
+    #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
+    #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
+    normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
+                                                           ($cv[$(idx_ll...), $m] +
+                                                            $cv[$(idx_rr...), $m])) for m in 1:3]
+
+    # Store the fluxes in the permuted du array
+    # e.g. the x direction `stores` gives
+    #   du_permuted[jk, i, 1]  += factor_l * f1   # left node, and so on
+    #   du_permuted[jk, ii, 1] += factor_r * f1   # right node
+    stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l *
+                                                                          $(flux[v]))
+                                           for v in 1:NVARS],
+                                          [:($du_arr[$(idx_rr...), $v] += factor_r *
+                                                                         $(flux[v]))
+                                           for v in 1:NVARS])
+
+    loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
+    normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii))
+    stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii))
+
+    loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k))
+    normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k))
+    stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k))
+
+    loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk))
+    normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk))
+    stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk))
+
+    quote
+        @unpack derivative_split = dg.basis
+        @unpack contravariant_vectors = cache.elements
+
+        # Create a temporary array that will be used to store the RHS with permuted
+        # indices `[i, j, k, v]` to allow using SIMD instructions.
+        # `StrideArray`s with purely static dimensions do not allocate on the heap.
+        du = StrideArray{eltype(u_cons)}(undef,
+                                         (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                          StaticInt(nnodes(dg)), StaticInt(NVARS)))
+
+        # Convert conserved to auxiliary variables on the given `element`.
+        u_prim = StrideArray{eltype(u_cons)}(undef,
+                                             (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                              StaticInt(nnodes(dg)), StaticInt(NAUX)))
+
+        # Convert conserved to auxiliary variables and store them in `u_prim`.
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            $cons2aux
+            $(cons2aux_writes...)
+        end
+
+        # x direction
+        # At first, we create new temporary arrays with permuted memory layout to
+        # allow using SIMD instructions along the first dimension (which is contiguous
+        # in memory).
+        du_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                  (StaticInt(nnodes(dg)^2),
+                                                   StaticInt(nnodes(dg)),
+                                                   StaticInt(NVARS)))
+
+        u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                      (StaticInt(nnodes(dg)^2),
+                                                       StaticInt(nnodes(dg)),
+                                                       StaticInt(NAUX)))
+
+        @turbo for v in indices(u_prim, 4),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
+        end
+        fill!(du_permuted, zero(eltype(du_permuted)))
+
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)^2),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            jk = j + nnodes(dg) * (k - 1)
+            contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k,
+                                                                      element]
+        end
+
+        # Next, we basically inline the volume flux. To allow SIMD vectorization and
+        # still use the symmetry of the volume flux and the derivative matrix, we
+        # loop over the triangular part in an outer loop and use a plain inner loop.
+        for i in eachnode(dg), ii in (i + 1):nnodes(dg)
+            @turbo for jk in Base.OneTo(nnodes(dg)^2)
+                $(loads_x...)
+                $(normals_x...)
+                $flux_call
+                factor_l = alpha * derivative_split[i, ii]
+                factor_r = alpha * derivative_split[ii, i]
+                $(stores_x...)
+            end
+        end
+
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            du[i, j, k, v] = du_permuted[jk, i, v]
+        end
+
+        # y direction
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k,
+                                                                        element]
+        end
+
+        # A possible permutation of array dimensions with improved opportunities for
+        # SIMD vectorization appeared to be slower than the direct version used here
+        # in preliminary numerical experiments on an AVX2 system.
+        for j in eachnode(dg), jj in (j + 1):nnodes(dg)
+            @turbo for k in eachnode(dg), i in eachnode(dg)
+                $(loads_y...)
+                $(normals_y...)
+                $flux_call
+                factor_l = alpha * derivative_split[j, jj]
+                factor_r = alpha * derivative_split[jj, j]
+                $(stores_y...)
+            end
+        end
+
+        # z direction
+        # The memory layout is already optimal for SIMD vectorization in this loop.
+        # We just squeeze the first two dimensions to make the code slightly faster.
+        GC.@preserve u_prim begin
+            u_prim_reshaped = PtrArray(pointer(u_prim),
+                                       (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                        StaticInt(NAUX)))
+
+            du_reshaped = PtrArray(pointer(du),
+                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                    StaticInt(NVARS)))
+
+            # We must also permute the contravariant vectors.
+            contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                                 (StaticInt(nnodes(dg)^2),
+                                                                                  StaticInt(nnodes(dg)),
+                                                                                  StaticInt(3)))
+
+            @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+                ij = i + nnodes(dg) * (j - 1)
+                contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
+                                                                          element]
+            end
+
+            for k in eachnode(dg), kk in (k + 1):nnodes(dg)
+                @turbo for ij in Base.OneTo(nnodes(dg)^2)
+                    $(loads_z...)
+                    $(normals_z...)
+                    $flux_call
+                    factor_l = alpha * derivative_split[k, kk]
+                    factor_r = alpha * derivative_split[kk, k]
+                    $(stores_z...)
+                end
+            end
+        end # GC.@preserve u_prim begin
+
+        # Finally, we add the temporary RHS computed here to the global RHS in the
+        # given `element`.
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            _du[v, i, j, k, element] += du[i, j, k, v]
+        end
+
+        return nothing
+    end
+end
+
+LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true
+LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
+
+@inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
+
+@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D)
+    return Val(7)
+end
+
+@inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo),
+                                    rho, rho_v1, rho_v2, rho_v3, rho_e,
+                                    equations::CompressibleEulerEquations3D)
+    v1 = rho_v1 / rho
+    v2 = rho_v2 / rho
+    v3 = rho_v3 / rho
+    p = (equations.gamma - 1) *
+        (rho_e - 0.5f0 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
+    return rho, v1, v2, v3, p, log(rho), log(p)
+end
+
+@inline function turbo_volume_flux(volume_flux::typeof(flux_ranocha_turbo),
+                                   rho_ll, v1_ll, v2_ll, v3_ll,
+                                   p_ll, log_rho_ll, log_p_ll,
+                                   rho_rr, v1_rr, v2_rr, v3_rr,
+                                   p_rr, log_rho_rr, log_p_rr,
+                                   normal_direction_1,
+                                   normal_direction_2,
+                                   normal_direction_3,
+                                   equations::CompressibleEulerEquations3D)
+
+    v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
+                 v3_ll * normal_direction_3
+    v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
+                 v3_rr * normal_direction_3
+
+    # Compute required mean values.
+    # We inline the logarithmic mean to allow LoopVectorization.jl to optimize it
+    # efficiently: `log` is precomputed once per node in `cons2fluxauxiliary`
+    # (`log_rho`, `log_p`) and reused here for the "regular" branch, while the
+    # "special" branch uses the series expansion to avoid cancellation when x ≈ y.
+    # This is equivalent to
+    #   rho_mean = ln_mean(rho_ll, rho_rr)
+    x1 = rho_ll
+    log_x1 = log_rho_ll
+    y1 = rho_rr
+    log_y1 = log_rho_rr
+    x1_plus_y1 = x1 + y1
+    y1_minus_x1 = y1 - x1
+    z1 = y1_minus_x1^2 / x1_plus_y1^2
+    special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
+    regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
+    rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
+
+    # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
+    # in exact arithmetic since
+    #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
+    #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
+    # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
+    x2 = rho_ll * p_rr
+    log_x2 = log_rho_ll + log_p_rr
+    y2 = rho_rr * p_ll
+    log_y2 = log_rho_rr + log_p_ll
+    x2_plus_y2 = x2 + y2
+    y2_minus_x2 = y2 - x2
+    z2 = y2_minus_x2^2 / x2_plus_y2^2
+    special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
+    regular_path2 = (log_y2 - log_x2) / y2_minus_x2
+    inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
+
+    v1_avg = 0.5 * (v1_ll + v1_rr)
+    v2_avg = 0.5 * (v2_ll + v2_rr)
+    v3_avg = 0.5 * (v3_ll + v3_rr)
+    p_avg = 0.5 * (p_ll + p_rr)
+    velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
+
+    # Calculate fluxes depending on normal_direction
+    f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
+    f2 = f1 * v1_avg + p_avg * normal_direction_1
+    f3 = f1 * v2_avg + p_avg * normal_direction_2
+    f4 = f1 * v3_avg + p_avg * normal_direction_3
+    f5 = (f1 * (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
+          +
+          0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
+
+    return (f1, f2, f3, f4, f5)
+end

From 855525004518c9ecff58e98ab5e794ba7bda231b Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 01:11:33 +0200
Subject: [PATCH 02/39] add tests

---
 src/auxiliary/math.jl                       |  6 +-
 test/test_performance_specializations_3d.jl | 76 +++++++++++++++++++++
 2 files changed, 81 insertions(+), 1 deletion(-)

diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 1c4f3bc385b..334f011fdd3 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -135,7 +135,11 @@ end
     See also [`Trixi.set_log_type!`](@ref).
     """
     @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)
-    @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x))
+    @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x <
+                                                                              zero(x),
+                                                                              oftype(x,
+                                                                                     NaN),
+                                                                              Base.log(x))
 
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 967b0f9cf3e..0f560612df6 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -173,6 +173,82 @@ end
     end
 end
 
+@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_ranocha)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
+                  cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
+                  volume_flux = Trixi.FluxVolumeTurbo(flux_ranocha),
+                  surface_flux = flux_ranocha)
+    u_ode = copy(sol.u[end])
+    du_ode = zero(u_ode)
+
+    # Preserve original memory since it will be `unsafe_wrap`ped and might
+    # thus otherwise be garbage collected
+    GC.@preserve u_ode du_ode begin
+        u = Trixi.wrap_array(u_ode, semi)
+        du = Trixi.wrap_array(du_ode, semi)
+        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
+
+        # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches
+        # to the generated `@turbo` kernel
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        semi.solver.volume_integral.volume_flux,
+                                        semi.solver, semi.cache, true)
+        du_specialized = du[:, :, :, :, 1]
+
+        # Call the plain version using the underlying `flux_ranocha` directly, which
+        # dispatches to the generic flux differencing kernel
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        flux_ranocha, semi.solver, semi.cache, true)
+        du_baseline = du[:, :, :, :, 1]
+
+        @test du_specialized ≈ du_baseline
+    end
+end
+
+@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin
+    # `flux_shima_etal` has no dedicated turbo specialization, so `FluxVolumeTurbo`
+    # exercises the generic default path (conserved auxiliary variables + canonical flux).
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
+                  cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
+                  volume_flux = Trixi.FluxVolumeTurbo(flux_shima_etal),
+                  surface_flux = flux_shima_etal)
+    u_ode = copy(sol.u[end])
+    du_ode = zero(u_ode)
+
+    # Preserve original memory since it will be `unsafe_wrap`ped and might
+    # thus otherwise be garbage collected
+    GC.@preserve u_ode du_ode begin
+        u = Trixi.wrap_array(u_ode, semi)
+        du = Trixi.wrap_array(du_ode, semi)
+        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
+
+        # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches
+        # to the generated `@turbo` kernel
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        semi.solver.volume_integral.volume_flux,
+                                        semi.solver, semi.cache, true)
+        du_specialized = du[:, :, :, :, 1]
+
+        # Call the plain version using the underlying `flux_shima_etal` directly, which
+        # dispatches to the generic flux differencing kernel
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        flux_shima_etal, semi.solver, semi.cache, true)
+        du_baseline = du[:, :, :, :, 1]
+
+        @test du_specialized ≈ du_baseline
+    end
+end
+
 @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",

From eb860cf8465ea0ce048dfd51af5aa2f0458ba8c4 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 01:12:46 +0200
Subject: [PATCH 03/39] format

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 52 +++++++++++----------
 1 file changed, 27 insertions(+), 25 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index afd0e08ca83..36a3e87d2bd 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -19,11 +19,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
 
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
-
 @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
 
-@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) =
-    Base.front(conserved_and_equations)
+@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
 @inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...)
     equations = last(aux_and_normals_and_equations)
@@ -37,12 +35,12 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
 end
 
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
-                                                 MeshT::Type{<:Union{StructuredMesh{3},
-                                                                     P4estMesh{3}}},
-                                                 have_nonconservative_terms::False,
-                                                 equations,
-                                                 volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
-                                                 cache, alpha)
+                                           MeshT::Type{<:Union{StructuredMesh{3},
+                                                               P4estMesh{3}}},
+                                           have_nonconservative_terms::False,
+                                           equations,
+                                           volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
+                                           cache, alpha)
     @unpack volume_flux = volume_flux_turbo
     flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
                                     volume_flux, dg, cache, alpha,
@@ -51,13 +49,13 @@ end
 end
 
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
-                                                 MeshT::Type{<:Union{StructuredMesh{3},
-                                                                     P4estMesh{3}}},
-                                                 have_nonconservative_terms::True,
-                                                 combine_conservative_and_nonconservative_fluxes::True,
-                                                 equations,
-                                                 volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
-                                                 cache, alpha)
+                                           MeshT::Type{<:Union{StructuredMesh{3},
+                                                               P4estMesh{3}}},
+                                           have_nonconservative_terms::True,
+                                           combine_conservative_and_nonconservative_fluxes::True,
+                                           equations,
+                                           volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
+                                           cache, alpha)
     @unpack volume_flux = volume_flux_turbo
     flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
                                     volume_flux, dg, cache, alpha,
@@ -89,24 +87,28 @@ end
     flux_call = Expr(:(=), Expr(:tuple, flux...),
                      :(turbo_volume_flux(volume_turbo_flux_function,
                                          $(u_prim_ll...), $(u_prim_rr...),
-                                         normal_direction_1, normal_direction_2, normal_direction_3,
+                                         normal_direction_1, normal_direction_2,
+                                         normal_direction_3,
                                          equations)))
 
     # Build the inner `@turbo` loop body. For the x direction `loads` gives
     #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
     #   rho_rr = u_prim_permuted[jk, ii, 1]  # right node
     loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v])
-                                        for v in 1:NAUX],
-                                       [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
-                                        for v in 1:NAUX])
+                                       for v in 1:NAUX],
+                                      [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
+                                       for v in 1:NAUX])
 
     # Compute the average of the normal_direction
     # e.g. the x direction `normals` gives
     #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
     #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
     normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
-                                                           ($cv[$(idx_ll...), $m] +
-                                                            $cv[$(idx_rr...), $m])) for m in 1:3]
+                                                                        ($cv[$(idx_ll...),
+                                                                             $m] +
+                                                                         $cv[$(idx_rr...),
+                                                                             $m]))
+                                   for m in 1:3]
 
     # Store the fluxes in the permuted du array
     # e.g. the x direction `stores` gives
@@ -116,7 +118,7 @@ end
                                                                           $(flux[v]))
                                            for v in 1:NVARS],
                                           [:($du_arr[$(idx_rr...), $v] += factor_r *
-                                                                         $(flux[v]))
+                                                                          $(flux[v]))
                                            for v in 1:NVARS])
 
     loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
@@ -306,7 +308,8 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
 
 @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
 
-@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D)
+@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo),
+                                            equations::CompressibleEulerEquations3D)
     return Val(7)
 end
 
@@ -330,7 +333,6 @@ end
                                    normal_direction_2,
                                    normal_direction_3,
                                    equations::CompressibleEulerEquations3D)
-
     v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
                  v3_ll * normal_direction_3
     v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +

From ab3063a8fd3aa1a598e0e3e9461a49c7e08628b6 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 01:22:16 +0200
Subject: [PATCH 04/39] minor fix

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl |  7 ++++---
 test/test_performance_specializations_3d.jl | 14 ++++----------
 2 files changed, 8 insertions(+), 13 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 36a3e87d2bd..5881d73978e 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -34,6 +34,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
     return Tuple(volume_flux(u_ll, u_rr, normal, equations))
 end
 
+LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true
+LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
+
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
                                            MeshT::Type{<:Union{StructuredMesh{3},
                                                                P4estMesh{3}}},
@@ -48,6 +51,7 @@ end
                                     Val(nvariables(equations)))
 end
 
+# TODO: Fix the dispatch
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
                                            MeshT::Type{<:Union{StructuredMesh{3},
                                                                P4estMesh{3}}},
@@ -303,9 +307,6 @@ end
     end
 end
 
-LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true
-LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
-
 @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
 
 @inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo),
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 0f560612df6..9a9ee7d24fe 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -189,8 +189,7 @@ end
         du = Trixi.wrap_array(du_ode, semi)
         have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
 
-        # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches
-        # to the generated `@turbo` kernel
+        # Call the optimized version 
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -198,8 +197,7 @@ end
                                         semi.solver, semi.cache, true)
         du_specialized = du[:, :, :, :, 1]
 
-        # Call the plain version using the underlying `flux_ranocha` directly, which
-        # dispatches to the generic flux differencing kernel
+        # Call the plain version 
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -211,8 +209,6 @@ end
 end
 
 @timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin
-    # `flux_shima_etal` has no dedicated turbo specialization, so `FluxVolumeTurbo`
-    # exercises the generic default path (conserved auxiliary variables + canonical flux).
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
                   cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
@@ -228,8 +224,7 @@ end
         du = Trixi.wrap_array(du_ode, semi)
         have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
 
-        # Call the optimized version specialized on `FluxVolumeTurbo`, which dispatches
-        # to the generated `@turbo` kernel
+        # Call the optimized version 
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -237,8 +232,7 @@ end
                                         semi.solver, semi.cache, true)
         du_specialized = du[:, :, :, :, 1]
 
-        # Call the plain version using the underlying `flux_shima_etal` directly, which
-        # dispatches to the generic flux differencing kernel
+        # Call the plain version 
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,

From f3bc84d05c648bfcf8a456f62da4276c5c860ea6 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 11:11:21 +0200
Subject: [PATCH 05/39] nonconservative unstable kernel

---
 src/auxiliary/math.jl                       |  29 +-
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 321 ++++++++++++++++++--
 2 files changed, 324 insertions(+), 26 deletions(-)

diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 334f011fdd3..107ea127d57 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -140,7 +140,6 @@ end
                                                                               oftype(x,
                                                                                      NaN),
                                                                               Base.log(x))
-
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
     @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
@@ -210,6 +209,20 @@ Given ε = 1.0e-4, we use the following algorithm.
     end
 end
 
+@inline function ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
+                         y::LoopVectorization.VectorizationBase.AbstractSIMD)
+    RealT = eltype(x)
+    epsilon_f2 = convert(RealT, 1.0e-4)
+    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
+    return ifelse(f2 < epsilon_f2,
+                  (x + y) / @evalpoly(f2,
+                            convert(RealT, 2),
+                            convert(RealT, 2 / 3),
+                            convert(RealT, 2 / 5),
+                            convert(RealT, 2 / 7)),
+                  (y - x) / log(y / x))
+end
+
 """
     Trixi.inv_ln_mean(x::Real, y::Real)
 
@@ -236,6 +249,20 @@ multiplication.
     end
 end
 
+	@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
+                             y::LoopVectorization.VectorizationBase.AbstractSIMD)
+    RealT = eltype(x)
+    epsilon_f2 = convert(RealT, 1.0e-4)
+    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
+    return ifelse(f2 < epsilon_f2,
+                  @evalpoly(f2,
+                            convert(RealT, 2),
+                            convert(RealT, 2 / 3),
+                            convert(RealT, 2 / 5),
+                            convert(RealT, 2 / 7)) / (x + y),
+                  log(y / x) / (y - x))
+end
+
 # `Base.max` and `Base.min` perform additional checks for signed zeros and `NaN`s
 # which are not present in comparable functions in Fortran/C++. For example,
 # ```julia
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 5881d73978e..7193cfefa7a 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -19,11 +19,14 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
 
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
+@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; volume_flux_conservative, volume_flux_nonconservative)
+
 @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
 
 @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
-@inline function turbo_volume_flux(volume_flux, aux_and_normals_and_equations...)
+@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
+                                   aux_and_normals_and_equations...)
     equations = last(aux_and_normals_and_equations)
     n = nvariables(equations)
     u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
@@ -34,7 +37,25 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
     return Tuple(volume_flux(u_ll, u_rr, normal, equations))
 end
 
-LoopVectorization.can_turbo(::typeof(turbo_volume_flux), ::Val) = true
+@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True,
+                                   aux_and_normals_and_equations...)
+    @unpack volume_flux_conservative, volume_flux_nonconservative = volume_flux
+    equations = last(aux_and_normals_and_equations)
+    n = nvariables(equations)
+    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
+    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
+    normal = SVector(aux_and_normals_and_equations[end - 3],
+                     aux_and_normals_and_equations[end - 2],
+                     aux_and_normals_and_equations[end - 1])
+    flux = volume_flux_conservative(u_ll, u_rr, normal, equations)
+    noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal, equations)
+    noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations)
+    flux_left = flux + 0.5f0 * noncons_left
+    flux_right = flux + 0.5f0 * noncons_right
+    return (Tuple(flux_left)..., Tuple(flux_right)...)
+end
+
+LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
 
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
@@ -45,55 +66,305 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
                                            volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
                                            cache, alpha)
     @unpack volume_flux = volume_flux_turbo
-    flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
+    flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms,
+                                    equations,
                                     volume_flux, dg, cache, alpha,
                                     n_turbo_flux_aux_node_vars(volume_flux, equations),
                                     Val(nvariables(equations)))
 end
 
-# TODO: Fix the dispatch
+@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
+                                                    element,
+                                                    MeshT::Type{<:Union{StructuredMesh{3},
+                                                                        P4estMesh{3}}},
+                                                    have_nonconservative_terms::False,
+                                                    equations,
+                                                    volume_flux,
+                                                    dg, cache, alpha, ::Val{NAUX},
+                                                    ::Val{NVARS}) where {NAUX, NVARS}
+    # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
+    # `flux_1 = f1, etc.  
+    u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
+    u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
+    flux = [Symbol(:flux_, v) for v in 1:NVARS]
+
+    # Convert conserved to auxiliary variables node-wise, e.g. 
+    #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
+    #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
+    cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS]
+    cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
+                    :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
+                                         equations)))
+    cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
+
+    # Evaluate the two-point volume flux
+    flux_call = Expr(:(=), Expr(:tuple, flux...),
+                     :(volume_flux_turbo(volume_flux,
+                                         have_nonconservative_terms,
+                                         $(u_prim_ll...), $(u_prim_rr...),
+                                         normal_direction_1, normal_direction_2,
+                                         normal_direction_3,
+                                         equations)))
+
+    # Build the inner `@turbo` loop body. For the x direction `loads` gives
+    #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
+    #   rho_rr = u_prim_permuted[jk, ii, 1]  # right node
+    loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v])
+                                       for v in 1:NAUX],
+                                      [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
+                                       for v in 1:NAUX])
+
+    # Compute the average of the normal_direction
+    # e.g. the x direction `normals` gives
+    #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
+    #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
+    normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
+                                                                        ($cv[$(idx_ll...),
+                                                                             $m] +
+                                                                         $cv[$(idx_rr...),
+                                                                             $m]))
+                                   for m in 1:3]
+
+    # Store the fluxes in the permuted du array
+    # e.g. the x direction `stores` gives
+    #   du_permuted[jk, i, 1]  += factor_l * f1   # left node, and so on
+    #   du_permuted[jk, ii, 1] += factor_r * f1   # right node
+    stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l *
+                                                                          $(flux[v]))
+                                           for v in 1:NVARS],
+                                          [:($du_arr[$(idx_rr...), $v] += factor_r *
+                                                                          $(flux[v]))
+                                           for v in 1:NVARS])
+
+    loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
+    normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii))
+    stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii))
+
+    loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k))
+    normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k))
+    stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k))
+
+    loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk))
+    normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk))
+    stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk))
+
+    quote
+        @unpack derivative_split = dg.basis
+        @unpack contravariant_vectors = cache.elements
+
+        # Create a temporary array that will be used to store the RHS with permuted
+        # indices `[i, j, k, v]` to allow using SIMD instructions.
+        # `StrideArray`s with purely static dimensions do not allocate on the heap.
+        du = StrideArray{eltype(u_cons)}(undef,
+                                         (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                          StaticInt(nnodes(dg)), StaticInt(NVARS)))
+
+        # Convert conserved to auxiliary variables on the given `element`.
+        u_prim = StrideArray{eltype(u_cons)}(undef,
+                                             (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                              StaticInt(nnodes(dg)), StaticInt(NAUX)))
+
+        # Convert conserved to auxiliary variables and store them in `u_prim`.
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            $cons2aux
+            $(cons2aux_writes...)
+        end
+
+        # x direction
+        # At first, we create new temporary arrays with permuted memory layout to
+        # allow using SIMD instructions along the first dimension (which is contiguous
+        # in memory).
+        du_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                  (StaticInt(nnodes(dg)^2),
+                                                   StaticInt(nnodes(dg)),
+                                                   StaticInt(NVARS)))
+
+        u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                      (StaticInt(nnodes(dg)^2),
+                                                       StaticInt(nnodes(dg)),
+                                                       StaticInt(NAUX)))
+
+        @turbo for v in indices(u_prim, 4),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
+        end
+        fill!(du_permuted, zero(eltype(du_permuted)))
+
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)^2),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            jk = j + nnodes(dg) * (k - 1)
+            contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k,
+                                                                      element]
+        end
+
+        # Next, we basically inline the volume flux. To allow SIMD vectorization and
+        # still use the symmetry of the volume flux and the derivative matrix, we
+        # loop over the triangular part in an outer loop and use a plain inner loop.
+        for i in eachnode(dg), ii in (i + 1):nnodes(dg)
+            @turbo for jk in Base.OneTo(nnodes(dg)^2)
+                $(loads_x...)
+                $(normals_x...)
+                $flux_call
+                factor_l = alpha * derivative_split[i, ii]
+                factor_r = alpha * derivative_split[ii, i]
+                $(stores_x...)
+            end
+        end
+
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            du[i, j, k, v] = du_permuted[jk, i, v]
+        end
+
+        # y direction
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k,
+                                                                        element]
+        end
+
+        # A possible permutation of array dimensions with improved opportunities for
+        # SIMD vectorization appeared to be slower than the direct version used here
+        # in preliminary numerical experiments on an AVX2 system.
+        for j in eachnode(dg), jj in (j + 1):nnodes(dg)
+            @turbo for k in eachnode(dg), i in eachnode(dg)
+                $(loads_y...)
+                $(normals_y...)
+                $flux_call
+                factor_l = alpha * derivative_split[j, jj]
+                factor_r = alpha * derivative_split[jj, j]
+                $(stores_y...)
+            end
+        end
+
+        # z direction
+        # The memory layout is already optimal for SIMD vectorization in this loop.
+        # We just squeeze the first two dimensions to make the code slightly faster.
+        GC.@preserve u_prim begin
+            u_prim_reshaped = PtrArray(pointer(u_prim),
+                                       (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                        StaticInt(NAUX)))
+
+            du_reshaped = PtrArray(pointer(du),
+                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                    StaticInt(NVARS)))
+
+            # We must also permute the contravariant vectors.
+            contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                                 (StaticInt(nnodes(dg)^2),
+                                                                                  StaticInt(nnodes(dg)),
+                                                                                  StaticInt(3)))
+
+            @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+                ij = i + nnodes(dg) * (j - 1)
+                contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
+                                                                          element]
+            end
+
+            for k in eachnode(dg), kk in (k + 1):nnodes(dg)
+                @turbo for ij in Base.OneTo(nnodes(dg)^2)
+                    $(loads_z...)
+                    $(normals_z...)
+                    $flux_call
+                    factor_l = alpha * derivative_split[k, kk]
+                    factor_r = alpha * derivative_split[kk, k]
+                    $(stores_z...)
+                end
+            end
+        end # GC.@preserve u_prim begin
+
+        # Finally, we add the temporary RHS computed here to the global RHS in the
+        # given `element`.
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            _du[v, i, j, k, element] += du[i, j, k, v]
+        end
+
+        return nothing
+    end
+end
+
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
                                            MeshT::Type{<:Union{StructuredMesh{3},
                                                                P4estMesh{3}}},
                                            have_nonconservative_terms::True,
-                                           combine_conservative_and_nonconservative_fluxes::True,
                                            equations,
                                            volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
                                            cache, alpha)
     @unpack volume_flux = volume_flux_turbo
-    flux_differencing_kernel_turbo!(_du, u_cons, element, equations,
+    flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms,
+                                    equations,
                                     volume_flux, dg, cache, alpha,
                                     n_turbo_flux_aux_node_vars(volume_flux, equations),
                                     Val(nvariables(equations)))
 end
 
 @generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
-                                                    element, equations,
-                                                    volume_turbo_flux_function,
+                                                    element,
+                                                    MeshT::Type{<:Union{StructuredMesh{3},
+                                                                        P4estMesh{3}}},
+                                                    have_nonconservative_terms::True,
+                                                    equations,
+                                                    volume_flux,
                                                     dg, cache, alpha, ::Val{NAUX},
                                                     ::Val{NVARS}) where {NAUX, NVARS}
     # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
     # `flux_1 = f1, etc.  
     u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
     u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
-    flux = [Symbol(:flux_, v) for v in 1:NVARS]
+    flux_left = [Symbol(:flux_left_, v) for v in 1:NVARS]
+    flux_right = [Symbol(:flux_right_, v) for v in 1:NVARS]
 
     # Convert conserved to auxiliary variables node-wise, e.g. 
     #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
     #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
     cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS]
     cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
-                    :(cons2fluxauxiliary(volume_turbo_flux_function, $(cons_reads...),
+                    :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
                                          equations)))
     cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
 
     # Evaluate the two-point volume flux
-    flux_call = Expr(:(=), Expr(:tuple, flux...),
-                     :(turbo_volume_flux(volume_turbo_flux_function,
-                                         $(u_prim_ll...), $(u_prim_rr...),
+    flux_call = Expr(:(=), Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)),
+                     :(volume_flux_turbo(volume_flux,
+                                         have_nonconservative_terms, $(u_prim_ll...),
+                                         $(u_prim_rr...),
                                          normal_direction_1, normal_direction_2,
-                                         normal_direction_3,
-                                         equations)))
+                                         normal_direction_3, equations)))
 
     # Build the inner `@turbo` loop body. For the x direction `loads` gives
     #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
@@ -119,10 +390,10 @@ end
     #   du_permuted[jk, i, 1]  += factor_l * f1   # left node, and so on
     #   du_permuted[jk, ii, 1] += factor_r * f1   # right node
     stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l *
-                                                                          $(flux[v]))
+                                                                          $(flux_left[v]))
                                            for v in 1:NVARS],
                                           [:($du_arr[$(idx_rr...), $v] += factor_r *
-                                                                          $(flux[v]))
+                                                                          $(flux_right[v]))
                                            for v in 1:NVARS])
 
     loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
@@ -325,7 +596,7 @@ end
     return rho, v1, v2, v3, p, log(rho), log(p)
 end
 
-@inline function turbo_volume_flux(volume_flux::typeof(flux_ranocha_turbo),
+@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo),
                                    rho_ll, v1_ll, v2_ll, v3_ll,
                                    p_ll, log_rho_ll, log_p_ll,
                                    rho_rr, v1_rr, v2_rr, v3_rr,
@@ -373,20 +644,20 @@ end
     regular_path2 = (log_y2 - log_x2) / y2_minus_x2
     inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
 
-    v1_avg = 0.5 * (v1_ll + v1_rr)
-    v2_avg = 0.5 * (v2_ll + v2_rr)
-    v3_avg = 0.5 * (v3_ll + v3_rr)
-    p_avg = 0.5 * (p_ll + p_rr)
-    velocity_square_avg = 0.5 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
+    v1_avg = 0.5f0 * (v1_ll + v1_rr)
+    v2_avg = 0.5f0 * (v2_ll + v2_rr)
+    v3_avg = 0.5f0 * (v3_ll + v3_rr)
+    p_avg = 0.5f0 * (p_ll + p_rr)
+    velocity_square_avg = 0.5f0 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
 
     # Calculate fluxes depending on normal_direction
-    f1 = rho_mean * 0.5 * (v_dot_n_ll + v_dot_n_rr)
+    f1 = rho_mean * 0.5f0 * (v_dot_n_ll + v_dot_n_rr)
     f2 = f1 * v1_avg + p_avg * normal_direction_1
     f3 = f1 * v2_avg + p_avg * normal_direction_2
     f4 = f1 * v3_avg + p_avg * normal_direction_3
     f5 = (f1 * (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
           +
-          0.5 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
+          0.5f0 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
 
     return (f1, f2, f3, f4, f5)
 end

From 65670429ff3069e75eeac4770a1aa7433505783f Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 11:13:13 +0200
Subject: [PATCH 06/39] format

---
 src/auxiliary/math.jl                       | 2 +-
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 7 +++++--
 2 files changed, 6 insertions(+), 3 deletions(-)

diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 107ea127d57..61b2e7ff21a 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -249,7 +249,7 @@ multiplication.
     end
 end
 
-	@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
+@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
                              y::LoopVectorization.VectorizationBase.AbstractSIMD)
     RealT = eltype(x)
     epsilon_f2 = convert(RealT, 1.0e-4)
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 7193cfefa7a..4a7a94e56c4 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -19,7 +19,9 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
 
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
-@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (; volume_flux_conservative, volume_flux_nonconservative)
+@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (;
+                                                                                      volume_flux_conservative,
+                                                                                      volume_flux_nonconservative)
 
 @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
 
@@ -359,7 +361,8 @@ end
     cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
 
     # Evaluate the two-point volume flux
-    flux_call = Expr(:(=), Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)),
+    flux_call = Expr(:(=),
+                     Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)),
                      :(volume_flux_turbo(volume_flux,
                                          have_nonconservative_terms, $(u_prim_ll...),
                                          $(u_prim_rr...),

From 3d0685567f417e72b59df2bb72a3cffdf6421910 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 11:41:39 +0200
Subject: [PATCH 07/39] fix nonconservative kernel + tests

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl |  3 +-
 test/test_performance_specializations_3d.jl | 37 +++++++++++++++++++++
 2 files changed, 39 insertions(+), 1 deletion(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 4a7a94e56c4..003250e4190 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -54,7 +54,7 @@ end
     noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations)
     flux_left = flux + 0.5f0 * noncons_left
     flux_right = flux + 0.5f0 * noncons_right
-    return (Tuple(flux_left)..., Tuple(flux_right)...)
+    return flux_left, flux_right
 end
 
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
@@ -600,6 +600,7 @@ end
 end
 
 @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo),
+                                   have_nonconservative_terms::False,
                                    rho_ll, v1_ll, v2_ll, v3_ll,
                                    p_ll, log_rho_ll, log_p_ll,
                                    rho_rr, v1_rr, v2_rr, v3_rr,
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 9a9ee7d24fe..9114f748978 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -243,6 +243,43 @@ end
     end
 end
 
+@timed_testset "P4estMesh3D, FluxVolumeTurbo(flux_hindenlang_gassner, flux_nonconservative_powell)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",
+                           "elixir_mhd_alfven_wave_nonperiodic.jl"),
+                  volume_flux = Trixi.FluxVolumeTurbo(flux_hindenlang_gassner,
+                                                      flux_nonconservative_powell))
+    u_ode = copy(sol.u[end])
+    du_ode = zero(u_ode)
+
+    # Preserve original memory since it will be `unsafe_wrap`ped and might
+    # thus otherwise be garbage collected
+    GC.@preserve u_ode du_ode begin
+        u = Trixi.wrap_array(u_ode, semi)
+        du = Trixi.wrap_array(du_ode, semi)
+        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
+
+        # Call the optimized version 
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        semi.solver.volume_integral.volume_flux,
+                                        semi.solver, semi.cache, true)
+        du_specialized = du[:, :, :, :, 1]
+
+        # Call the plain version 
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        (; flux_hindenlang_gassner,
+                                         flux_nonconservative_powell), semi.solver,
+                                        semi.cache, true)
+        du_baseline = du[:, :, :, :, 1]
+
+        @test du_specialized ≈ du_baseline
+    end
+end
+
 @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",

From 690bfd4eb3a3e1dd742d476d583b262c55639fe2 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 11:57:46 +0200
Subject: [PATCH 08/39] increase readability

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 28 ++++++++++-----------
 1 file changed, 14 insertions(+), 14 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 003250e4190..3db2b4aa1b9 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -93,7 +93,7 @@ end
     # Convert conserved to auxiliary variables node-wise, e.g. 
     #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
     #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
-    cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS]
+    cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS]
     cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
                     :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
                                          equations)))
@@ -120,12 +120,12 @@ end
     # e.g. the x direction `normals` gives
     #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
     #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
-    normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
-                                                                        ($cv[$(idx_ll...),
-                                                                             $m] +
-                                                                         $cv[$(idx_rr...),
-                                                                             $m]))
-                                   for m in 1:3]
+    normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
+                                                                                          ($controvariant_vector[$(idx_ll...),
+                                                                                                                 $m] +
+                                                                                           $controvariant_vector[$(idx_rr...),
+                                                                                                                 $m]))
+                                                     for m in 1:3]
 
     # Store the fluxes in the permuted du array
     # e.g. the x direction `stores` gives
@@ -354,7 +354,7 @@ end
     # Convert conserved to auxiliary variables node-wise, e.g. 
     #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
     #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
-    cons_reads = [:(u_cons[$c, i, j, k, element]) for c in 1:NVARS]
+    cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS]
     cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
                     :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
                                          equations)))
@@ -381,12 +381,12 @@ end
     # e.g. the x direction `normals` gives
     #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
     #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
-    normals(cv, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
-                                                                        ($cv[$(idx_ll...),
-                                                                             $m] +
-                                                                         $cv[$(idx_rr...),
-                                                                             $m]))
-                                   for m in 1:3]
+    normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
+                                                                                          ($controvariant_vector[$(idx_ll...),
+                                                                                                                 $m] +
+                                                                                           $controvariant_vector[$(idx_rr...),
+                                                                                                                 $m]))
+                                                     for m in 1:3]
 
     # Store the fluxes in the permuted du array
     # e.g. the x direction `stores` gives

From 8bba09793ef3f79c87a04c3e35f7dcab888e601c Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 12:03:00 +0200
Subject: [PATCH 09/39] fix tests

---
 test/test_performance_specializations_3d.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 9114f748978..c52fb37a094 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -269,10 +269,12 @@ end
 
         # Call the plain version 
         du .= 0
+        symmetric_flux = flux_hindenlang_gassner
+        nonconservative_flux = flux_nonconservative_powell
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
-                                        (; flux_hindenlang_gassner,
-                                         flux_nonconservative_powell), semi.solver,
+                                        (; symmetric_flux,
+                                         nonconservative_flux), semi.solver,
                                         semi.cache, true)
         du_baseline = du[:, :, :, :, 1]
 

From 8256ff1be5cb53025e698ce760355ae9d2875202 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 12:06:14 +0200
Subject: [PATCH 10/39] rename normal to normal direction

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 20 ++++++++++----------
 1 file changed, 10 insertions(+), 10 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 3db2b4aa1b9..39061ec1b42 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -33,10 +33,10 @@ combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) =
     n = nvariables(equations)
     u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
     u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
-    normal = SVector(aux_and_normals_and_equations[end - 3],
-                     aux_and_normals_and_equations[end - 2],
-                     aux_and_normals_and_equations[end - 1])
-    return Tuple(volume_flux(u_ll, u_rr, normal, equations))
+    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
+                               aux_and_normals_and_equations[end - 2],
+                               aux_and_normals_and_equations[end - 1])
+    return volume_flux(u_ll, u_rr, normal_direction, equations)
 end
 
 @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True,
@@ -46,12 +46,12 @@ end
     n = nvariables(equations)
     u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
     u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
-    normal = SVector(aux_and_normals_and_equations[end - 3],
-                     aux_and_normals_and_equations[end - 2],
-                     aux_and_normals_and_equations[end - 1])
-    flux = volume_flux_conservative(u_ll, u_rr, normal, equations)
-    noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal, equations)
-    noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal, equations)
+    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
+                               aux_and_normals_and_equations[end - 2],
+                               aux_and_normals_and_equations[end - 1])
+    flux = volume_flux_conservative(u_ll, u_rr, normal_direction, equations)
+    noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal_direction, equations)
+    noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal_direction, equations)
     flux_left = flux + 0.5f0 * noncons_left
     flux_right = flux + 0.5f0 * noncons_right
     return flux_left, flux_right

From 160023ba12d4cf9ac62080594e56c531e7603ecf Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 12:17:11 +0200
Subject: [PATCH 11/39] explicit AbstractSIMD

---
 src/auxiliary/math.jl | 13 ++++---------
 1 file changed, 4 insertions(+), 9 deletions(-)

diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 61b2e7ff21a..18b8db8568c 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -1,3 +1,4 @@
+using LoopVectorization: AbstractSIMD
 # By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
 # Since these FMAs can increase the performance of many numerical algorithms,
 # we need to opt-in explicitly.
@@ -135,11 +136,7 @@ end
     See also [`Trixi.set_log_type!`](@ref).
     """
     @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)
-    @inline log(x::LoopVectorization.VectorizationBase.AbstractSIMD) = ifelse(x <
-                                                                              zero(x),
-                                                                              oftype(x,
-                                                                                     NaN),
-                                                                              Base.log(x))
+    @inline log(x::AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x))
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
     @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
@@ -209,8 +206,7 @@ Given ε = 1.0e-4, we use the following algorithm.
     end
 end
 
-@inline function ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
-                         y::LoopVectorization.VectorizationBase.AbstractSIMD)
+@inline function ln_mean(x::AbstractSIMD, y::AbstractSIMD)
     RealT = eltype(x)
     epsilon_f2 = convert(RealT, 1.0e-4)
     f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
@@ -249,8 +245,7 @@ multiplication.
     end
 end
 
-@inline function inv_ln_mean(x::LoopVectorization.VectorizationBase.AbstractSIMD,
-                             y::LoopVectorization.VectorizationBase.AbstractSIMD)
+@inline function inv_ln_mean(x::AbstractSIMD, y::AbstractSIMD)
     RealT = eltype(x)
     epsilon_f2 = convert(RealT, 1.0e-4)
     f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2

From 61e96fa5c165905fd5639e3a5fb613fc7acf32d0 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Fri, 19 Jun 2026 12:34:30 +0200
Subject: [PATCH 12/39] Update src/solvers/dgsem_structured/dg_3d_turbo.jl

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 39061ec1b42..08a58b7b943 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -15,7 +15,6 @@ function FluxVolumeTurbo(volume_flux)
     return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
 end
 
-combine_conservative_and_nonconservative_fluxes(::FluxVolumeTurbo, equations) = True()
 
 @inline combined_turbo_flux(volume_flux) = volume_flux
 

From ac3a2d8095d49d72a0fa5b0994e7eef98012a616 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Fri, 19 Jun 2026 12:36:09 +0200
Subject: [PATCH 13/39] Update src/solvers/dgsem_structured/dg_3d_turbo.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 1 -
 1 file changed, 1 deletion(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 08a58b7b943..d8f7f961c7d 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -15,7 +15,6 @@ function FluxVolumeTurbo(volume_flux)
     return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
 end
 
-
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
 @inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (;

From 80a9813f339445639f0ca767c130e3b75eabb10f Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Fri, 19 Jun 2026 13:23:51 +0200
Subject: [PATCH 14/39] simplify flux args

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 10 +++++++---
 1 file changed, 7 insertions(+), 3 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index d8f7f961c7d..1aea1ddbb67 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -25,6 +25,12 @@ end
 
 @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
+@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
+    equations = last(aux_and_normals_and_equations)
+    volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
+                      aux_and_normals_and_equations...)
+end
+
 @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
                                    aux_and_normals_and_equations...)
     equations = last(aux_and_normals_and_equations)
@@ -100,7 +106,6 @@ end
     # Evaluate the two-point volume flux
     flux_call = Expr(:(=), Expr(:tuple, flux...),
                      :(volume_flux_turbo(volume_flux,
-                                         have_nonconservative_terms,
                                          $(u_prim_ll...), $(u_prim_rr...),
                                          normal_direction_1, normal_direction_2,
                                          normal_direction_3,
@@ -362,7 +367,7 @@ end
     flux_call = Expr(:(=),
                      Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)),
                      :(volume_flux_turbo(volume_flux,
-                                         have_nonconservative_terms, $(u_prim_ll...),
+                                         $(u_prim_ll...),
                                          $(u_prim_rr...),
                                          normal_direction_1, normal_direction_2,
                                          normal_direction_3, equations)))
@@ -598,7 +603,6 @@ end
 end
 
 @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo),
-                                   have_nonconservative_terms::False,
                                    rho_ll, v1_ll, v2_ll, v3_ll,
                                    p_ll, log_rho_ll, log_p_ll,
                                    rho_rr, v1_rr, v2_rr, v3_rr,

From 4a0a3c147bdad7e4b229a5f71f8080b0604e0b2f Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Sat, 20 Jun 2026 09:59:34 +0200
Subject: [PATCH 15/39] add comments

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 22 ++++++++++++++++++++-
 1 file changed, 21 insertions(+), 1 deletion(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 1aea1ddbb67..3814fe75b03 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -1,3 +1,8 @@
+"""
+    FluxVolumeTurbo(volume_flux)
+
+Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl
+"""
 struct FluxVolumeTurbo{VolumeFlux}
     volume_flux::VolumeFlux
     function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux}
@@ -5,26 +10,34 @@ struct FluxVolumeTurbo{VolumeFlux}
     end
 end
 
+# Helper function for nonconservative systems.
 function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative)
     turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative)
     return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
 end
 
+# Helper function for conservative systems.
 function FluxVolumeTurbo(volume_flux)
     turbo_flux = combined_turbo_flux(volume_flux)
     return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
 end
 
+# By default the turbo flux has no specialization and re-uses
+# the numerical flux in terms of conservative variables.
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
 @inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (;
                                                                                       volume_flux_conservative,
                                                                                       volume_flux_nonconservative)
-
+# By default the turbo flux has the same number of precomputed variables
+# as the number of variables.
 @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
 
+# Transform the conserved variables in precomputed auxiliary variables to speed up the computation
+# of the numerical flux. When no specialization is given, this gives cons2cons.
 @inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
+# Numerical volume flux that recalls the plain volume flux when no specialization is given.
 @inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
     equations = last(aux_and_normals_and_equations)
     volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
@@ -61,6 +74,7 @@ end
     return flux_left, flux_right
 end
 
+# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
 
@@ -79,6 +93,8 @@ LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
                                     Val(nvariables(equations)))
 end
 
+# Generated function that writes the equivalent hand-written code as in dg_compressible_euler_3d.jl,
+# but generalizes for each number of variables and precomputed variables.
 @generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
                                                     element,
                                                     MeshT::Type{<:Union{StructuredMesh{3},
@@ -584,13 +600,16 @@ end
     end
 end
 
+# Specialize the name of the turbo flux for flux_ranocha.
 @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
 
+# Number of precomputed variables for the specialization flux_ranocha
 @inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo),
                                             equations::CompressibleEulerEquations3D)
     return Val(7)
 end
 
+# Transformation from conserved to precomputed variables for flux_ranocha
 @inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo),
                                     rho, rho_v1, rho_v2, rho_v3, rho_e,
                                     equations::CompressibleEulerEquations3D)
@@ -602,6 +621,7 @@ end
     return rho, v1, v2, v3, p, log(rho), log(p)
 end
 
+# Computation of the numerical flux_ranocha with respect to precomputed variables
 @inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo),
                                    rho_ll, v1_ll, v2_ll, v3_ll,
                                    p_ll, log_rho_ll, log_p_ll,

From 2a8c4366c4ab21de34a17a5e2972743967756ef1 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Sat, 20 Jun 2026 10:01:07 +0200
Subject: [PATCH 16/39] remove nonconservative part

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 288 --------------------
 test/test_performance_specializations_3d.jl |  39 ---
 2 files changed, 327 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 3814fe75b03..80a7235ae67 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -10,12 +10,6 @@ struct FluxVolumeTurbo{VolumeFlux}
     end
 end
 
-# Helper function for nonconservative systems.
-function FluxVolumeTurbo(volume_flux_conservative, volume_flux_nonconservative)
-    turbo_flux = combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative)
-    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
-end
-
 # Helper function for conservative systems.
 function FluxVolumeTurbo(volume_flux)
     turbo_flux = combined_turbo_flux(volume_flux)
@@ -26,9 +20,6 @@ end
 # the numerical flux in terms of conservative variables.
 @inline combined_turbo_flux(volume_flux) = volume_flux
 
-@inline combined_turbo_flux(volume_flux_conservative, volume_flux_nonconservative) = (;
-                                                                                      volume_flux_conservative,
-                                                                                      volume_flux_nonconservative)
 # By default the turbo flux has the same number of precomputed variables
 # as the number of variables.
 @inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
@@ -56,24 +47,6 @@ end
     return volume_flux(u_ll, u_rr, normal_direction, equations)
 end
 
-@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::True,
-                                   aux_and_normals_and_equations...)
-    @unpack volume_flux_conservative, volume_flux_nonconservative = volume_flux
-    equations = last(aux_and_normals_and_equations)
-    n = nvariables(equations)
-    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
-    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
-    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
-                               aux_and_normals_and_equations[end - 2],
-                               aux_and_normals_and_equations[end - 1])
-    flux = volume_flux_conservative(u_ll, u_rr, normal_direction, equations)
-    noncons_left = volume_flux_nonconservative(u_ll, u_rr, normal_direction, equations)
-    noncons_right = volume_flux_nonconservative(u_rr, u_ll, normal_direction, equations)
-    flux_left = flux + 0.5f0 * noncons_left
-    flux_right = flux + 0.5f0 * noncons_right
-    return flux_left, flux_right
-end
-
 # Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
@@ -339,267 +312,6 @@ end
     end
 end
 
-@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
-                                           MeshT::Type{<:Union{StructuredMesh{3},
-                                                               P4estMesh{3}}},
-                                           have_nonconservative_terms::True,
-                                           equations,
-                                           volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
-                                           cache, alpha)
-    @unpack volume_flux = volume_flux_turbo
-    flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms,
-                                    equations,
-                                    volume_flux, dg, cache, alpha,
-                                    n_turbo_flux_aux_node_vars(volume_flux, equations),
-                                    Val(nvariables(equations)))
-end
-
-@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
-                                                    element,
-                                                    MeshT::Type{<:Union{StructuredMesh{3},
-                                                                        P4estMesh{3}}},
-                                                    have_nonconservative_terms::True,
-                                                    equations,
-                                                    volume_flux,
-                                                    dg, cache, alpha, ::Val{NAUX},
-                                                    ::Val{NVARS}) where {NAUX, NVARS}
-    # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
-    # `flux_1 = f1, etc.  
-    u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
-    u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
-    flux_left = [Symbol(:flux_left_, v) for v in 1:NVARS]
-    flux_right = [Symbol(:flux_right_, v) for v in 1:NVARS]
-
-    # Convert conserved to auxiliary variables node-wise, e.g. 
-    #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
-    #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
-    cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS]
-    cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
-                    :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
-                                         equations)))
-    cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
-
-    # Evaluate the two-point volume flux
-    flux_call = Expr(:(=),
-                     Expr(:tuple, Expr(:tuple, flux_left...), Expr(:tuple, flux_right...)),
-                     :(volume_flux_turbo(volume_flux,
-                                         $(u_prim_ll...),
-                                         $(u_prim_rr...),
-                                         normal_direction_1, normal_direction_2,
-                                         normal_direction_3, equations)))
-
-    # Build the inner `@turbo` loop body. For the x direction `loads` gives
-    #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
-    #   rho_rr = u_prim_permuted[jk, ii, 1]  # right node
-    loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v])
-                                       for v in 1:NAUX],
-                                      [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
-                                       for v in 1:NAUX])
-
-    # Compute the average of the normal_direction
-    # e.g. the x direction `normals` gives
-    #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
-    #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
-    normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
-                                                                                          ($controvariant_vector[$(idx_ll...),
-                                                                                                                 $m] +
-                                                                                           $controvariant_vector[$(idx_rr...),
-                                                                                                                 $m]))
-                                                     for m in 1:3]
-
-    # Store the fluxes in the permuted du array
-    # e.g. the x direction `stores` gives
-    #   du_permuted[jk, i, 1]  += factor_l * f1   # left node, and so on
-    #   du_permuted[jk, ii, 1] += factor_r * f1   # right node
-    stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l *
-                                                                          $(flux_left[v]))
-                                           for v in 1:NVARS],
-                                          [:($du_arr[$(idx_rr...), $v] += factor_r *
-                                                                          $(flux_right[v]))
-                                           for v in 1:NVARS])
-
-    loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
-    normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii))
-    stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii))
-
-    loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k))
-    normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k))
-    stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k))
-
-    loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk))
-    normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk))
-    stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk))
-
-    quote
-        @unpack derivative_split = dg.basis
-        @unpack contravariant_vectors = cache.elements
-
-        # Create a temporary array that will be used to store the RHS with permuted
-        # indices `[i, j, k, v]` to allow using SIMD instructions.
-        # `StrideArray`s with purely static dimensions do not allocate on the heap.
-        du = StrideArray{eltype(u_cons)}(undef,
-                                         (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
-                                          StaticInt(nnodes(dg)), StaticInt(NVARS)))
-
-        # Convert conserved to auxiliary variables on the given `element`.
-        u_prim = StrideArray{eltype(u_cons)}(undef,
-                                             (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
-                                              StaticInt(nnodes(dg)), StaticInt(NAUX)))
-
-        # Convert conserved to auxiliary variables and store them in `u_prim`.
-        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-            $cons2aux
-            $(cons2aux_writes...)
-        end
-
-        # x direction
-        # At first, we create new temporary arrays with permuted memory layout to
-        # allow using SIMD instructions along the first dimension (which is contiguous
-        # in memory).
-        du_permuted = StrideArray{eltype(u_cons)}(undef,
-                                                  (StaticInt(nnodes(dg)^2),
-                                                   StaticInt(nnodes(dg)),
-                                                   StaticInt(NVARS)))
-
-        u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
-                                                      (StaticInt(nnodes(dg)^2),
-                                                       StaticInt(nnodes(dg)),
-                                                       StaticInt(NAUX)))
-
-        @turbo for v in indices(u_prim, 4),
-                   k in eachnode(dg),
-                   j in eachnode(dg),
-                   i in eachnode(dg)
-
-            jk = j + nnodes(dg) * (k - 1)
-            u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
-        end
-        fill!(du_permuted, zero(eltype(du_permuted)))
-
-        # We must also permute the contravariant vectors.
-        contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
-                                                                             (StaticInt(nnodes(dg)^2),
-                                                                              StaticInt(nnodes(dg)),
-                                                                              StaticInt(3)))
-
-        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-            jk = j + nnodes(dg) * (k - 1)
-            contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k,
-                                                                      element]
-            contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k,
-                                                                      element]
-            contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k,
-                                                                      element]
-        end
-
-        # Next, we basically inline the volume flux. To allow SIMD vectorization and
-        # still use the symmetry of the volume flux and the derivative matrix, we
-        # loop over the triangular part in an outer loop and use a plain inner loop.
-        for i in eachnode(dg), ii in (i + 1):nnodes(dg)
-            @turbo for jk in Base.OneTo(nnodes(dg)^2)
-                $(loads_x...)
-                $(normals_x...)
-                $flux_call
-                factor_l = alpha * derivative_split[i, ii]
-                factor_r = alpha * derivative_split[ii, i]
-                $(stores_x...)
-            end
-        end
-
-        @turbo for v in eachvariable(equations),
-                   k in eachnode(dg),
-                   j in eachnode(dg),
-                   i in eachnode(dg)
-
-            jk = j + nnodes(dg) * (k - 1)
-            du[i, j, k, v] = du_permuted[jk, i, v]
-        end
-
-        # y direction
-        # We must also permute the contravariant vectors.
-        contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
-                                                                             (StaticInt(nnodes(dg)),
-                                                                              StaticInt(nnodes(dg)),
-                                                                              StaticInt(nnodes(dg)),
-                                                                              StaticInt(3)))
-
-        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-            contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k,
-                                                                        element]
-            contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k,
-                                                                        element]
-            contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k,
-                                                                        element]
-        end
-
-        # A possible permutation of array dimensions with improved opportunities for
-        # SIMD vectorization appeared to be slower than the direct version used here
-        # in preliminary numerical experiments on an AVX2 system.
-        for j in eachnode(dg), jj in (j + 1):nnodes(dg)
-            @turbo for k in eachnode(dg), i in eachnode(dg)
-                $(loads_y...)
-                $(normals_y...)
-                $flux_call
-                factor_l = alpha * derivative_split[j, jj]
-                factor_r = alpha * derivative_split[jj, j]
-                $(stores_y...)
-            end
-        end
-
-        # z direction
-        # The memory layout is already optimal for SIMD vectorization in this loop.
-        # We just squeeze the first two dimensions to make the code slightly faster.
-        GC.@preserve u_prim begin
-            u_prim_reshaped = PtrArray(pointer(u_prim),
-                                       (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
-                                        StaticInt(NAUX)))
-
-            du_reshaped = PtrArray(pointer(du),
-                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
-                                    StaticInt(NVARS)))
-
-            # We must also permute the contravariant vectors.
-            contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
-                                                                                 (StaticInt(nnodes(dg)^2),
-                                                                                  StaticInt(nnodes(dg)),
-                                                                                  StaticInt(3)))
-
-            @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-                ij = i + nnodes(dg) * (j - 1)
-                contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
-                                                                          element]
-                contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
-                                                                          element]
-                contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
-                                                                          element]
-            end
-
-            for k in eachnode(dg), kk in (k + 1):nnodes(dg)
-                @turbo for ij in Base.OneTo(nnodes(dg)^2)
-                    $(loads_z...)
-                    $(normals_z...)
-                    $flux_call
-                    factor_l = alpha * derivative_split[k, kk]
-                    factor_r = alpha * derivative_split[kk, k]
-                    $(stores_z...)
-                end
-            end
-        end # GC.@preserve u_prim begin
-
-        # Finally, we add the temporary RHS computed here to the global RHS in the
-        # given `element`.
-        @turbo for v in eachvariable(equations),
-                   k in eachnode(dg),
-                   j in eachnode(dg),
-                   i in eachnode(dg)
-
-            _du[v, i, j, k, element] += du[i, j, k, v]
-        end
-
-        return nothing
-    end
-end
-
 # Specialize the name of the turbo flux for flux_ranocha.
 @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
 
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index c52fb37a094..9a9ee7d24fe 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -243,45 +243,6 @@ end
     end
 end
 
-@timed_testset "P4estMesh3D, FluxVolumeTurbo(flux_hindenlang_gassner, flux_nonconservative_powell)" begin
-    trixi_include(@__MODULE__,
-                  joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",
-                           "elixir_mhd_alfven_wave_nonperiodic.jl"),
-                  volume_flux = Trixi.FluxVolumeTurbo(flux_hindenlang_gassner,
-                                                      flux_nonconservative_powell))
-    u_ode = copy(sol.u[end])
-    du_ode = zero(u_ode)
-
-    # Preserve original memory since it will be `unsafe_wrap`ped and might
-    # thus otherwise be garbage collected
-    GC.@preserve u_ode du_ode begin
-        u = Trixi.wrap_array(u_ode, semi)
-        du = Trixi.wrap_array(du_ode, semi)
-        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
-
-        # Call the optimized version 
-        du .= 0
-        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
-                                        have_nonconservative_terms, semi.equations,
-                                        semi.solver.volume_integral.volume_flux,
-                                        semi.solver, semi.cache, true)
-        du_specialized = du[:, :, :, :, 1]
-
-        # Call the plain version 
-        du .= 0
-        symmetric_flux = flux_hindenlang_gassner
-        nonconservative_flux = flux_nonconservative_powell
-        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
-                                        have_nonconservative_terms, semi.equations,
-                                        (; symmetric_flux,
-                                         nonconservative_flux), semi.solver,
-                                        semi.cache, true)
-        du_baseline = du[:, :, :, :, 1]
-
-        @test du_specialized ≈ du_baseline
-    end
-end
-
 @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",

From 86ffed2e7f2abcfe7f3671a9d643af8ef140bb39 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Sat, 20 Jun 2026 10:02:45 +0200
Subject: [PATCH 17/39] minor change

---
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 5 +----
 1 file changed, 1 insertion(+), 4 deletions(-)

diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 80a7235ae67..78fedfef2d6 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -316,10 +316,7 @@ end
 @inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
 
 # Number of precomputed variables for the specialization flux_ranocha
-@inline function n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo),
-                                            equations::CompressibleEulerEquations3D)
-    return Val(7)
-end
+@inline n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) = Val(7)
 
 # Transformation from conserved to precomputed variables for flux_ranocha
 @inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo),

From 8c718d4e17c4312e3fcbb70fb3efc4981371d4d3 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Sat, 20 Jun 2026 10:22:22 +0200
Subject: [PATCH 18/39] move fluxes to numerical fluxes

---
 src/equations/numerical_fluxes.jl           | 55 +++++++++++++++++++++
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 53 --------------------
 2 files changed, 55 insertions(+), 53 deletions(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 545155b4889..0af82ed68a4 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -578,4 +578,59 @@ end
 end
 
 Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")")
+
+"""
+    FluxVolumeTurbo(volume_flux)
+
+Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl
+"""
+struct FluxVolumeTurbo{VolumeFlux}
+    volume_flux::VolumeFlux
+    function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux}
+        return new{VolumeFlux}(volume_flux)
+    end
+end
+
+# Helper function for conservative systems.
+function FluxVolumeTurbo(volume_flux)
+    turbo_flux = combined_turbo_flux(volume_flux)
+    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
+end
+
+# By default the turbo flux has no specialization and re-uses
+# the numerical flux in terms of conservative variables.
+@inline combined_turbo_flux(volume_flux) = volume_flux
+
+# By default the turbo flux has the same number of precomputed variables
+# as the number of variables.
+@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
+
+# Transform the conserved variables in precomputed auxiliary variables to speed up the computation
+# of the numerical flux. When no specialization is given, this gives cons2cons.
+@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
+
+# Numerical volume flux that recalls the plain volume flux when no specialization is given.
+@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
+    equations = last(aux_and_normals_and_equations)
+    volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
+                      aux_and_normals_and_equations...)
+end
+
+@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
+                                   aux_and_normals_and_equations...)
+    equations = last(aux_and_normals_and_equations)
+    n = nvariables(equations)
+    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
+    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
+    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
+                               aux_and_normals_and_equations[end - 2],
+                               aux_and_normals_and_equations[end - 1])
+    return volume_flux(u_ll, u_rr, normal_direction, equations)
+end
+
+# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary
+LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
+LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
+
+Base.show(io::IO, f::FluxUpwind) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")")
 end # @muladd
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 78fedfef2d6..facc4adabf2 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -1,56 +1,3 @@
-"""
-    FluxVolumeTurbo(volume_flux)
-
-Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl
-"""
-struct FluxVolumeTurbo{VolumeFlux}
-    volume_flux::VolumeFlux
-    function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux}
-        return new{VolumeFlux}(volume_flux)
-    end
-end
-
-# Helper function for conservative systems.
-function FluxVolumeTurbo(volume_flux)
-    turbo_flux = combined_turbo_flux(volume_flux)
-    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
-end
-
-# By default the turbo flux has no specialization and re-uses
-# the numerical flux in terms of conservative variables.
-@inline combined_turbo_flux(volume_flux) = volume_flux
-
-# By default the turbo flux has the same number of precomputed variables
-# as the number of variables.
-@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
-
-# Transform the conserved variables in precomputed auxiliary variables to speed up the computation
-# of the numerical flux. When no specialization is given, this gives cons2cons.
-@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
-
-# Numerical volume flux that recalls the plain volume flux when no specialization is given.
-@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
-    equations = last(aux_and_normals_and_equations)
-    volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
-                      aux_and_normals_and_equations...)
-end
-
-@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
-                                   aux_and_normals_and_equations...)
-    equations = last(aux_and_normals_and_equations)
-    n = nvariables(equations)
-    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
-    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
-    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
-                               aux_and_normals_and_equations[end - 2],
-                               aux_and_normals_and_equations[end - 1])
-    return volume_flux(u_ll, u_rr, normal_direction, equations)
-end
-
-# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary
-LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
-LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
-
 @inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
                                            MeshT::Type{<:Union{StructuredMesh{3},
                                                                P4estMesh{3}}},

From 11e35510cc3f85bfb2252f0d51b96fbe8834e909 Mon Sep 17 00:00:00 2001
From: MarcoArtiano <artianomarco97@gmail.com>
Date: Sat, 20 Jun 2026 10:23:44 +0200
Subject: [PATCH 19/39] change export position

---
 src/Trixi.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/src/Trixi.jl b/src/Trixi.jl
index 2154e40c961..0edb895a7cf 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -255,7 +255,7 @@ export flux, flux_central, flux_lax_friedrichs, flux_hll, flux_hllc, flux_hlle,
        FluxRotated,
        flux_shima_etal_turbo, flux_ranocha_turbo,
        FluxUpwind,
-       FluxTracerEquationsCentral
+       FluxTracerEquationsCentral, FluxVolumeTurbo
 
 export splitting_steger_warming, splitting_vanleer_haenel,
        splitting_coirier_vanleer, splitting_lax_friedrichs,
@@ -310,7 +310,7 @@ export DG,
        FDSBP,
        BlockFV, UniformFiniteVolumeBasis, VolumeIntegralFiniteVolume,
        VolumeIntegralWeakForm, VolumeIntegralStrongForm,
-       VolumeIntegralFluxDifferencing, FluxVolumeTurbo,
+       VolumeIntegralFluxDifferencing,
        VolumeIntegralPureLGLFiniteVolume, VolumeIntegralPureLGLFiniteVolumeO2,
        VolumeIntegralShockCapturingHG, VolumeIntegralShockCapturingRRG,
        VolumeIntegralShockCapturingHGType,

From d34c047efd419270fd0da657a6c9f160e2ccd06c Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Sat, 20 Jun 2026 10:43:17 +0200
Subject: [PATCH 20/39] fix typo

Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
---
 src/equations/numerical_fluxes.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 0af82ed68a4..183ebf03f15 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -632,5 +632,5 @@ end
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
 
-Base.show(io::IO, f::FluxUpwind) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")")
+Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")")
 end # @muladd

From 93f0373c304cea827e8b8e0b4336455ef707d002 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Sat, 20 Jun 2026 10:50:03 +0200
Subject: [PATCH 21/39] Update src/equations/numerical_fluxes.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 src/equations/numerical_fluxes.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 183ebf03f15..47fd58ba4e8 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -632,5 +632,6 @@ end
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
 
-Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux, ")")
+Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux,
+                                              ")")
 end # @muladd

From 3659c1e5290d8b6cf577ee662c40aa84556c7200 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Tue, 23 Jun 2026 17:15:13 +0200
Subject: [PATCH 22/39] Apply suggestions from code review

Co-authored-by: Hendrik Ranocha <ranocha@users.noreply.github.com>
---
 src/auxiliary/math.jl                       |  8 +++++++-
 src/equations/numerical_fluxes.jl           | 11 ++++++++---
 src/solvers/dgsem_structured/dg_3d_turbo.jl |  2 +-
 test/test_performance_specializations_3d.jl |  4 ++--
 4 files changed, 18 insertions(+), 7 deletions(-)

diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 18b8db8568c..2a173798578 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -136,10 +136,14 @@ end
     See also [`Trixi.set_log_type!`](@ref).
     """
     @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)
-    @inline log(x::AbstractSIMD) = ifelse(x < zero(x), oftype(x, NaN), Base.log(x))
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
     @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
+    
+    # This is required for performance specializations like `FluxTurbo`.
+    # The LoopVectorization.jl ecosystem does not throw an error but
+    # already creates `NaN`s.
+    @inline log(x::AbstractSIMD) = Base.log(x)
 end
 
 """
@@ -206,6 +210,7 @@ Given ε = 1.0e-4, we use the following algorithm.
     end
 end
 
+# This is required for performance specializations like `FluxTurbo`.
 @inline function ln_mean(x::AbstractSIMD, y::AbstractSIMD)
     RealT = eltype(x)
     epsilon_f2 = convert(RealT, 1.0e-4)
@@ -245,6 +250,7 @@ multiplication.
     end
 end
 
+# This is required for performance specializations like `FluxTurbo`.
 @inline function inv_ln_mean(x::AbstractSIMD, y::AbstractSIMD)
     RealT = eltype(x)
     epsilon_f2 = convert(RealT, 1.0e-4)
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 47fd58ba4e8..8634da90bc6 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -580,9 +580,14 @@ end
 Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")")
 
 """
-    FluxVolumeTurbo(volume_flux)
-
-Specialize the volume flux to use the SIMD instructions via LoopVectorization.jl
+    FluxTurbo(numerical_flux)
+
+Create a numerical flux equivalent to the given `numerical_flux`
+except that it may use specialized methods, e.g., when used with
+[`VolumeIntegralFluxDifferencing`](@ref). These specialized methods
+may enable better use of SIMD instructions to increase runtime efficiency
+on modern hardware, e.g., using 
+[LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
 """
 struct FluxVolumeTurbo{VolumeFlux}
     volume_flux::VolumeFlux
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index facc4adabf2..1f9b0e73372 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -25,7 +25,7 @@ end
                                                     dg, cache, alpha, ::Val{NAUX},
                                                     ::Val{NVARS}) where {NAUX, NVARS}
     # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
-    # `flux_1 = f1, etc.  
+    # `flux_1 = f1`, etc.  
     u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
     u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
     flux = [Symbol(:flux_, v) for v in 1:NVARS]
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 9a9ee7d24fe..27c4918cf9a 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -177,7 +177,7 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
                   cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
-                  volume_flux = Trixi.FluxVolumeTurbo(flux_ranocha),
+                  volume_flux = FluxVolumeTurbo(flux_ranocha),
                   surface_flux = flux_ranocha)
     u_ode = copy(sol.u[end])
     du_ode = zero(u_ode)
@@ -195,7 +195,7 @@ end
                                         have_nonconservative_terms, semi.equations,
                                         semi.solver.volume_integral.volume_flux,
                                         semi.solver, semi.cache, true)
-        du_specialized = du[:, :, :, :, 1]
+        du_specialized = copy(du[:, :, :, :, 1])
 
         # Call the plain version 
         du .= 0

From 4c9dedbf5ff229aa5b3bc2dc54c139b35944fee4 Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Tue, 23 Jun 2026 18:14:31 +0200
Subject: [PATCH 23/39] apply some code review suggestions

---
 src/Trixi.jl                                |  4 +-
 src/auxiliary/math.jl                       |  3 +-
 src/equations/numerical_fluxes.jl           | 29 ++++++--------
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 44 +++++++++------------
 4 files changed, 34 insertions(+), 46 deletions(-)

diff --git a/src/Trixi.jl b/src/Trixi.jl
index 0edb895a7cf..b49ca6ba4bb 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -72,7 +72,7 @@ using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Back
 using AcceleratedKernels: AcceleratedKernels
 using LinearMaps: LinearMap
 if _PREFERENCE_LOOPVECTORIZATION
-    using LoopVectorization: LoopVectorization, @turbo, indices
+    using LoopVectorization: LoopVectorization, @turbo, indices, AbstractSIMD
 else
     using LoopVectorization: LoopVectorization, indices
     include("auxiliary/mock_turbo.jl")
@@ -255,7 +255,7 @@ export flux, flux_central, flux_lax_friedrichs, flux_hll, flux_hllc, flux_hlle,
        FluxRotated,
        flux_shima_etal_turbo, flux_ranocha_turbo,
        FluxUpwind,
-       FluxTracerEquationsCentral, FluxVolumeTurbo
+       FluxTracerEquationsCentral, FluxTurbo
 
 export splitting_steger_warming, splitting_vanleer_haenel,
        splitting_coirier_vanleer, splitting_lax_friedrichs,
diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 2a173798578..7bda5efea7c 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -1,4 +1,3 @@
-using LoopVectorization: AbstractSIMD
 # By default, Julia/LLVM does not use fused multiply-add operations (FMAs).
 # Since these FMAs can increase the performance of many numerical algorithms,
 # we need to opt-in explicitly.
@@ -139,7 +138,7 @@ end
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
     @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
-    
+
     # This is required for performance specializations like `FluxTurbo`.
     # The LoopVectorization.jl ecosystem does not throw an error but
     # already creates `NaN`s.
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 8634da90bc6..69f3a4f11aa 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -579,6 +579,7 @@ end
 
 Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")")
 
+#TODO: Expand the docstring and explain what is required for this to work 
 """
     FluxTurbo(numerical_flux)
 
@@ -589,30 +590,25 @@ may enable better use of SIMD instructions to increase runtime efficiency
 on modern hardware, e.g., using 
 [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
 """
-struct FluxVolumeTurbo{VolumeFlux}
-    volume_flux::VolumeFlux
-    function FluxVolumeTurbo{VolumeFlux}(volume_flux) where {VolumeFlux}
-        return new{VolumeFlux}(volume_flux)
+struct FluxTurbo{NumericalFlux}
+    numerical_flux::NumericalFlux
+    function FluxTurbo{NumericalFlux}(numerical_flux) where {NumericalFlux}
+        return new{NumericalFlux}(numerical_flux)
     end
 end
 
 # Helper function for conservative systems.
-function FluxVolumeTurbo(volume_flux)
-    turbo_flux = combined_turbo_flux(volume_flux)
-    return FluxVolumeTurbo{typeof(turbo_flux)}(turbo_flux)
+function FluxTurbo(numerical_flux)
+    return FluxTurbo{typeof(numerical_flux)}(numerical_flux)
 end
 
-# By default the turbo flux has no specialization and re-uses
-# the numerical flux in terms of conservative variables.
-@inline combined_turbo_flux(volume_flux) = volume_flux
-
 # By default the turbo flux has the same number of precomputed variables
 # as the number of variables.
-@inline n_turbo_flux_aux_node_vars(volume_flux, equations) = Val(nvariables(equations))
+@inline nturbovars(numerical_flux, equations) = Val(nvariables(equations))
 
 # Transform the conserved variables in precomputed auxiliary variables to speed up the computation
 # of the numerical flux. When no specialization is given, this gives cons2cons.
-@inline cons2fluxauxiliary(volume_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
+@inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
 # Numerical volume flux that recalls the plain volume flux when no specialization is given.
 @inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
@@ -633,10 +629,9 @@ end
     return volume_flux(u_ll, u_rr, normal_direction, equations)
 end
 
-# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2fluxauxiliary
+# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2turbo
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
-LoopVectorization.can_turbo(::typeof(cons2fluxauxiliary), ::Val) = true
+LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true
 
-Base.show(io::IO, f::FluxVolumeTurbo) = print(io, "FluxVolumeTurbo(", f.volume_flux,
-                                              ")")
+Base.show(io::IO, f::FluxTurbo) = print(io, "FluxTurbo(", f.numerical_flux, ")")
 end # @muladd
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 1f9b0e73372..1b63fb125e0 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -3,13 +3,13 @@
                                                                P4estMesh{3}}},
                                            have_nonconservative_terms::False,
                                            equations,
-                                           volume_flux_turbo::FluxVolumeTurbo, dg::DGSEM,
+                                           volume_flux::FluxTurbo, dg::DGSEM,
                                            cache, alpha)
-    @unpack volume_flux = volume_flux_turbo
+    @unpack numerical_flux = volume_flux
     flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms,
                                     equations,
-                                    volume_flux, dg, cache, alpha,
-                                    n_turbo_flux_aux_node_vars(volume_flux, equations),
+                                    numerical_flux, dg, cache, alpha,
+                                    nturbovars(numerical_flux, equations),
                                     Val(nvariables(equations)))
 end
 
@@ -34,10 +34,10 @@ end
     #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
     #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
     cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS]
-    cons2aux = Expr(:(=), Expr(:tuple, [Symbol(:aux_, v) for v in 1:NAUX]...),
-                    :(cons2fluxauxiliary(volume_flux, $(cons_reads...),
-                                         equations)))
-    cons2aux_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:aux_, v))) for v in 1:NAUX]
+    cons2turbo_ = Expr(:(=), Expr(:tuple, [Symbol(:u_prim_, v) for v in 1:NAUX]...),
+                       :(cons2turbo(volume_flux, $(cons_reads...),
+                                    equations)))
+    cons2turbo_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:u_prim_, v))) for v in 1:NAUX]
 
     # Evaluate the two-point volume flux
     flux_call = Expr(:(=), Expr(:tuple, flux...),
@@ -107,8 +107,8 @@ end
 
         # Convert conserved to auxiliary variables and store them in `u_prim`.
         @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
-            $cons2aux
-            $(cons2aux_writes...)
+            $cons2turbo_
+            $(cons2turbo_writes...)
         end
 
         # x direction
@@ -259,16 +259,13 @@ end
     end
 end
 
-# Specialize the name of the turbo flux for flux_ranocha.
-@inline combined_turbo_flux(flux_conservative::typeof(flux_ranocha)) = flux_ranocha_turbo
-
 # Number of precomputed variables for the specialization flux_ranocha
-@inline n_turbo_flux_aux_node_vars(volume_flux::typeof(flux_ranocha_turbo), equations::CompressibleEulerEquations3D) = Val(7)
+@inline nturbovars(volume_flux::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7)
 
 # Transformation from conserved to precomputed variables for flux_ranocha
-@inline function cons2fluxauxiliary(volume_flux::typeof(flux_ranocha_turbo),
-                                    rho, rho_v1, rho_v2, rho_v3, rho_e,
-                                    equations::CompressibleEulerEquations3D)
+@inline function cons2turbo(volume_flux::typeof(flux_ranocha),
+                            rho, rho_v1, rho_v2, rho_v3, rho_e,
+                            equations::CompressibleEulerEquations3D)
     v1 = rho_v1 / rho
     v2 = rho_v2 / rho
     v3 = rho_v3 / rho
@@ -278,7 +275,7 @@ end
 end
 
 # Computation of the numerical flux_ranocha with respect to precomputed variables
-@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha_turbo),
+@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha),
                                    rho_ll, v1_ll, v2_ll, v3_ll,
                                    p_ll, log_rho_ll, log_p_ll,
                                    rho_rr, v1_rr, v2_rr, v3_rr,
@@ -292,13 +289,10 @@ end
     v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
                  v3_rr * normal_direction_3
 
-    # Compute required mean values.
-    # We inline the logarithmic mean to allow LoopVectorization.jl to optimize it
-    # efficiently: `log` is precomputed once per node in `cons2fluxauxiliary`
-    # (`log_rho`, `log_p`) and reused here for the "regular" branch, while the
-    # "special" branch uses the series expansion to avoid cancellation when x ≈ y.
-    # This is equivalent to
-    #   rho_mean = ln_mean(rho_ll, rho_rr)
+    # Compute required mean values
+    # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
+    # it efficiently. This is equivalent to
+    # rho_mean = ln_mean(rho_ll, rho_rr)
     x1 = rho_ll
     log_x1 = log_rho_ll
     y1 = rho_rr

From 7a51c08d3265117668b153c784d3d3812d5b2cac Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Tue, 23 Jun 2026 18:22:40 +0200
Subject: [PATCH 24/39] more robust tests

---
 test/test_performance_specializations_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 27c4918cf9a..27c41c67a95 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -230,7 +230,7 @@ end
                                         have_nonconservative_terms, semi.equations,
                                         semi.solver.volume_integral.volume_flux,
                                         semi.solver, semi.cache, true)
-        du_specialized = du[:, :, :, :, 1]
+        du_specialized = copy(du[:, :, :, :, 1])
 
         # Call the plain version 
         du .= 0

From 2a1f47a5708020a4ad77454593c134d2953fe765 Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Tue, 23 Jun 2026 18:28:10 +0200
Subject: [PATCH 25/39] remove inner constructor

---
 src/equations/numerical_fluxes.jl | 8 --------
 1 file changed, 8 deletions(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 69f3a4f11aa..e5a95835bca 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -592,14 +592,6 @@ on modern hardware, e.g., using
 """
 struct FluxTurbo{NumericalFlux}
     numerical_flux::NumericalFlux
-    function FluxTurbo{NumericalFlux}(numerical_flux) where {NumericalFlux}
-        return new{NumericalFlux}(numerical_flux)
-    end
-end
-
-# Helper function for conservative systems.
-function FluxTurbo(numerical_flux)
-    return FluxTurbo{typeof(numerical_flux)}(numerical_flux)
 end
 
 # By default the turbo flux has the same number of precomputed variables

From c2fe52dee93d4d54d243f8d55f68d763d8f31d28 Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Tue, 23 Jun 2026 18:48:46 +0200
Subject: [PATCH 26/39] add news and minor changes

---
 NEWS.md                           | 1 +
 src/equations/numerical_fluxes.jl | 9 ++++++++-
 2 files changed, 9 insertions(+), 1 deletion(-)

diff --git a/NEWS.md b/NEWS.md
index 00634fce30f..177bbabc623 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -20,6 +20,7 @@ The new equation types `LinearDiffusionEquation1D` and `LinearDiffusionEquation2
 - Support for 3D subcell limiting was extended by local limiting for nonperiodic `TreeMesh`es ([#2878]).
 - Support for user-defined RHS splitting for IMEX methods via SemidiscretizationHyperbolicSplit ([#2518]). The splitting follows the form `y_t = f_1(y) + f_2(y)`, allowing users to define separate solvers for the stiff (`f_1`) and non-stiff (`f_2`) parts of the right-hand side. Boundary conditions and source terms can be specified independently for the stiff and non-stiff parts.
 - Added postprocessing for kinetic energy spectral analysis via `compute_kinetic_energy_spectrum` for `AbstractCompressibleEulerEquations` on `TreeMesh`/`DGSEM` and on `DGMultiMesh`/`DGMultiSBP` in 2D and 3D; the routine returns matching integer wavenumber shells and the isotropic 1D spectrum `E(k)`.
+- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_turbo_flux`, which computes the numerical flux in terms of the precomputed variables. ([#3090])
 
 #### Changed
 - For performance, `LaplaceDiffusionEntropyVariables` parabolic fluxes for `CompressibleEulerEquations1D`, `CompressibleEulerEquations2D`, and `CompressibleEulerEquations3D` now use explicit Jacobian formulas from Barth 1999 instead of AD ([#3028]). Other equation types continue to use an automatic differentiation fallback.
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index e5a95835bca..7b960e8934b 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -579,7 +579,6 @@ end
 
 Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")")
 
-#TODO: Expand the docstring and explain what is required for this to work 
 """
     FluxTurbo(numerical_flux)
 
@@ -589,6 +588,14 @@ except that it may use specialized methods, e.g., when used with
 may enable better use of SIMD instructions to increase runtime efficiency
 on modern hardware, e.g., using 
 [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
+The following specialization works by default for all numerical fluxes,
+enabling SIMD instructions. However, for some systems, it is cheaper to precompute a set variables
+(e.g., primitive variables and or logarithm of certain variables) and then compute the numerical fluxes. 
+This optimization can be enabled by defining three ingredients: the number of precomputed variables `nturbovars`; 
+the transformation from conservative to precomputed variables `cons2turbo`; and the 
+`volume_turbo_flux(volume_flux::typeof(numercal_flux), ...)`, that computes the numerical flux
+in terms of the precomputed variables.
+See also [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)
 """
 struct FluxTurbo{NumericalFlux}
     numerical_flux::NumericalFlux

From 5eac91d86b36660b34645cdb14060ada367b24b8 Mon Sep 17 00:00:00 2001
From: Hendrik Ranocha <mail@ranocha.de>
Date: Tue, 23 Jun 2026 20:57:48 +0200
Subject: [PATCH 27/39] improve docstring

---
 src/equations/numerical_fluxes.jl | 22 +++++++++++++---------
 1 file changed, 13 insertions(+), 9 deletions(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 7b960e8934b..2c038a3bd92 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -586,16 +586,20 @@ Create a numerical flux equivalent to the given `numerical_flux`
 except that it may use specialized methods, e.g., when used with
 [`VolumeIntegralFluxDifferencing`](@ref). These specialized methods
 may enable better use of SIMD instructions to increase runtime efficiency
-on modern hardware, e.g., using 
+on modern hardware, e.g., using
 [LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
+
 The following specialization works by default for all numerical fluxes,
-enabling SIMD instructions. However, for some systems, it is cheaper to precompute a set variables
-(e.g., primitive variables and or logarithm of certain variables) and then compute the numerical fluxes. 
-This optimization can be enabled by defining three ingredients: the number of precomputed variables `nturbovars`; 
-the transformation from conservative to precomputed variables `cons2turbo`; and the 
-`volume_turbo_flux(volume_flux::typeof(numercal_flux), ...)`, that computes the numerical flux
-in terms of the precomputed variables.
-See also [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)
+enabling SIMD instructions. However, for some systems, it is even better
+to precompute some variables (e.g., primitive variables or logarithms of certain variables)
+and then compute the numerical fluxes.
+This optimization can be enabled by defining three ingredients:
+- the number of precomputed variables `nturbovars`;
+- the transformation from conservative to precomputed variables `cons2turbo`
+- and the  `volume_turbo_flux(volume_flux::typeof(numerical_flux), ...)`,
+  that computes the numerical flux in terms of the precomputed variables.
+
+See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)
 """
 struct FluxTurbo{NumericalFlux}
     numerical_flux::NumericalFlux
@@ -628,7 +632,7 @@ end
     return volume_flux(u_ll, u_rr, normal_direction, equations)
 end
 
-# Allow LoopVectorization to use SIMD instructions on volume_flux_turbo and cons2turbo
+# Allow LoopVectorization.jl to use SIMD instructions on volume_flux_turbo and cons2turbo
 LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true
 

From 8446f34ff6bea6b57314cdcabcf65c08b673b027 Mon Sep 17 00:00:00 2001
From: Hendrik Ranocha <mail@ranocha.de>
Date: Tue, 23 Jun 2026 20:57:53 +0200
Subject: [PATCH 28/39] fix tests

---
 test/test_performance_specializations_3d.jl | 16 ++++++++--------
 1 file changed, 8 insertions(+), 8 deletions(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 27c41c67a95..c523e2fc772 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -173,11 +173,11 @@ end
     end
 end
 
-@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_ranocha)" begin
+@timed_testset "StructuredMesh3D, FluxTurbo(flux_ranocha)" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
                   cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
-                  volume_flux = FluxVolumeTurbo(flux_ranocha),
+                  volume_flux = FluxTurbo(flux_ranocha),
                   surface_flux = flux_ranocha)
     u_ode = copy(sol.u[end])
     du_ode = zero(u_ode)
@@ -189,7 +189,7 @@ end
         du = Trixi.wrap_array(du_ode, semi)
         have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
 
-        # Call the optimized version 
+        # Call the optimized version
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -197,7 +197,7 @@ end
                                         semi.solver, semi.cache, true)
         du_specialized = copy(du[:, :, :, :, 1])
 
-        # Call the plain version 
+        # Call the plain version
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -208,11 +208,11 @@ end
     end
 end
 
-@timed_testset "StructuredMesh3D, FluxVolumeTurbo(flux_shima_etal)" begin
+@timed_testset "StructuredMesh3D, FluxTurbo(flux_shima_etal)" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
                   cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
-                  volume_flux = Trixi.FluxVolumeTurbo(flux_shima_etal),
+                  volume_flux = Trixi.FluxTurbo(flux_shima_etal),
                   surface_flux = flux_shima_etal)
     u_ode = copy(sol.u[end])
     du_ode = zero(u_ode)
@@ -224,7 +224,7 @@ end
         du = Trixi.wrap_array(du_ode, semi)
         have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
 
-        # Call the optimized version 
+        # Call the optimized version
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,
@@ -232,7 +232,7 @@ end
                                         semi.solver, semi.cache, true)
         du_specialized = copy(du[:, :, :, :, 1])
 
-        # Call the plain version 
+        # Call the plain version
         du .= 0
         Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
                                         have_nonconservative_terms, semi.equations,

From d70ceab26c222c68aad9fc173f34fb2618ce663e Mon Sep 17 00:00:00 2001
From: Hendrik Ranocha <mail@ranocha.de>
Date: Tue, 23 Jun 2026 21:04:27 +0200
Subject: [PATCH 29/39] fix name

---
 NEWS.md                           | 2 +-
 src/equations/numerical_fluxes.jl | 2 +-
 2 files changed, 2 insertions(+), 2 deletions(-)

diff --git a/NEWS.md b/NEWS.md
index 177bbabc623..d8ee307dbef 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -20,7 +20,7 @@ The new equation types `LinearDiffusionEquation1D` and `LinearDiffusionEquation2
 - Support for 3D subcell limiting was extended by local limiting for nonperiodic `TreeMesh`es ([#2878]).
 - Support for user-defined RHS splitting for IMEX methods via SemidiscretizationHyperbolicSplit ([#2518]). The splitting follows the form `y_t = f_1(y) + f_2(y)`, allowing users to define separate solvers for the stiff (`f_1`) and non-stiff (`f_2`) parts of the right-hand side. Boundary conditions and source terms can be specified independently for the stiff and non-stiff parts.
 - Added postprocessing for kinetic energy spectral analysis via `compute_kinetic_energy_spectrum` for `AbstractCompressibleEulerEquations` on `TreeMesh`/`DGSEM` and on `DGMultiMesh`/`DGMultiSBP` in 2D and 3D; the routine returns matching integer wavenumber shells and the isotropic 1D spectrum `E(k)`.
-- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_turbo_flux`, which computes the numerical flux in terms of the precomputed variables. ([#3090])
+- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_flux_turbo`, which computes the numerical flux in terms of the precomputed variables. ([#3090])
 
 #### Changed
 - For performance, `LaplaceDiffusionEntropyVariables` parabolic fluxes for `CompressibleEulerEquations1D`, `CompressibleEulerEquations2D`, and `CompressibleEulerEquations3D` now use explicit Jacobian formulas from Barth 1999 instead of AD ([#3028]). Other equation types continue to use an automatic differentiation fallback.
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 2c038a3bd92..8d820b6c926 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -596,7 +596,7 @@ and then compute the numerical fluxes.
 This optimization can be enabled by defining three ingredients:
 - the number of precomputed variables `nturbovars`;
 - the transformation from conservative to precomputed variables `cons2turbo`
-- and the  `volume_turbo_flux(volume_flux::typeof(numerical_flux), ...)`,
+- and the  `volume_flux_turbo(volume_flux::typeof(numerical_flux), ...)`,
   that computes the numerical flux in terms of the precomputed variables.
 
 See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)

From 8a61bb762a13b64e11611ff69c24108388cf7646 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Tue, 23 Jun 2026 22:48:07 +0200
Subject: [PATCH 30/39] Apply suggestions from code review

Co-authored-by: Hendrik Ranocha <ranocha@users.noreply.github.com>
---
 src/Trixi.jl                      | 2 +-
 src/equations/numerical_fluxes.jl | 9 +++++++++
 2 files changed, 10 insertions(+), 1 deletion(-)

diff --git a/src/Trixi.jl b/src/Trixi.jl
index b7c011e9aa5..0b6e5f7c22b 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -74,7 +74,7 @@ using LinearMaps: LinearMap
 if _PREFERENCE_LOOPVECTORIZATION
     using LoopVectorization: LoopVectorization, @turbo, indices, AbstractSIMD
 else
-    using LoopVectorization: LoopVectorization, indices
+    using LoopVectorization: LoopVectorization, indices, AbstractSIMD
     include("auxiliary/mock_turbo.jl")
 end
 
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 8d820b6c926..98823fb7183 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -605,12 +605,21 @@ struct FluxTurbo{NumericalFlux}
     numerical_flux::NumericalFlux
 end
 
+# As a fallback method, the wrapped flux is called.
+@inline (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction,
+                       equations)
+    return f.numerical_flux(u_ll, u_rr, orientation_or_normal_direction, equations)
+end
+
 # By default the turbo flux has the same number of precomputed variables
 # as the number of variables.
 @inline nturbovars(numerical_flux, equations) = Val(nvariables(equations))
 
 # Transform the conserved variables in precomputed auxiliary variables to speed up the computation
 # of the numerical flux. When no specialization is given, this gives cons2cons.
+# Note that the conserved variables are passed as scalars instead of a vector
+# to enable optimizations by LoopVectorization.jl. Thus, we use
+# `conserved_and_equations...` instead of `cons, equations`.
 @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
 # Numerical volume flux that recalls the plain volume flux when no specialization is given.

From 4967a129088401aca13166da063d2aa2428f309a Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Wed, 24 Jun 2026 08:50:49 +0200
Subject: [PATCH 31/39] add fallback tests, fix name

---
 .../elixir_euler_modified_sod.jl              | 18 +++++++-------
 src/equations/numerical_fluxes.jl             | 24 +++++++++----------
 test/test_performance_specializations_3d.jl   | 17 ++++++++++++-
 3 files changed, 38 insertions(+), 21 deletions(-)

diff --git a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
index fd030fc37bd..e1499c62783 100644
--- a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
+++ b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
@@ -1,7 +1,8 @@
 using OrdinaryDiffEqSSPRK
 using Trixi
 
-equations = CompressibleEulerEquations1D(1.4)
+RealT = Float64
+equations = CompressibleEulerEquations1D(RealT(1.4))
 
 """
     initial_condition_modified_sod(x, t, equations::CompressibleEulerEquations1D)
@@ -21,10 +22,11 @@ An entropy-satisfying solution should produce a smooth(!) rarefaction wave.
   [DOI: 10.1016/j.jcp.2023.112677](https://doi.org/10.1016/j.jcp.2023.112677)
 """
 function initial_condition_modified_sod(x, t, ::CompressibleEulerEquations1D)
-    if x[1] < 0.3
-        return prim2cons(SVector(1, 0.75, 1), equations)
+    RealT = eltype(x)
+    if x[1] < RealT(0.3)
+        return prim2cons(SVector(one(RealT), 0.75f0, one(RealT)), equations)
     else
-        return prim2cons(SVector(0.125, 0.0, 0.1), equations)
+        return prim2cons(SVector(0.125f0, zero(eltype(x)), RealT(0.1)), equations)
     end
 end
 initial_condition = initial_condition_modified_sod
@@ -38,11 +40,11 @@ volume_integral = VolumeIntegralFluxDifferencing(flux_ranocha)
 solver = DGSEM(polydeg = 3, surface_flux = flux_hllc,
                volume_integral = volume_integral)
 
-coordinates_min = 0.0
-coordinates_max = 1.0
+coordinates_min = zero(RealT)
+coordinates_max = one(RealT)
 
 mesh = TreeMesh(coordinates_min, coordinates_max,
-                initial_refinement_level = 6,
+                initial_refinement_level = 6, RealT = RealT,
                 n_cells_max = 30_000,
                 periodicity = false)
 
@@ -54,7 +56,7 @@ boundary_conditions = (; x_neg = BoundaryConditionDirichlet(initial_condition),
 semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
                                     boundary_conditions = boundary_conditions)
 
-tspan = (0.0, 0.2)
+tspan = (zero(RealT), RealT(0.2))
 ode = semidiscretize(semi, tspan)
 
 summary_callback = SummaryCallback()
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 98823fb7183..befa2851514 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -606,8 +606,8 @@ struct FluxTurbo{NumericalFlux}
 end
 
 # As a fallback method, the wrapped flux is called.
-@inline (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction,
-                       equations)
+@inline function (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction,
+                                equations)
     return f.numerical_flux(u_ll, u_rr, orientation_or_normal_direction, equations)
 end
 
@@ -623,21 +623,21 @@ end
 @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
 # Numerical volume flux that recalls the plain volume flux when no specialization is given.
-@inline function volume_flux_turbo(volume_flux, aux_and_normals_and_equations...)
-    equations = last(aux_and_normals_and_equations)
+@inline function volume_flux_turbo(volume_flux, turbovars_and_normals_and_equations...)
+    equations = last(turbovars_and_normals_and_equations)
     volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
-                      aux_and_normals_and_equations...)
+                      turbovars_and_normals_and_equations...)
 end
 
 @inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
-                                   aux_and_normals_and_equations...)
-    equations = last(aux_and_normals_and_equations)
+                                   turbovars_and_normals_and_equations...)
+    equations = last(turbovars_and_normals_and_equations)
     n = nvariables(equations)
-    u_ll = SVector(ntuple(v -> aux_and_normals_and_equations[v], Val(n)))
-    u_rr = SVector(ntuple(v -> aux_and_normals_and_equations[n + v], Val(n)))
-    normal_direction = SVector(aux_and_normals_and_equations[end - 3],
-                               aux_and_normals_and_equations[end - 2],
-                               aux_and_normals_and_equations[end - 1])
+    u_ll = SVector(ntuple(v -> turbovars_and_normals_and_equations[v], Val(n)))
+    u_rr = SVector(ntuple(v -> turbovars_and_normals_and_equations[n + v], Val(n)))
+    normal_direction = SVector(turbovars_and_normals_and_equations[end - 3],
+                               turbovars_and_normals_and_equations[end - 2],
+                               turbovars_and_normals_and_equations[end - 1])
     return volume_flux(u_ll, u_rr, normal_direction, equations)
 end
 
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index c523e2fc772..4d840e15efb 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -212,7 +212,7 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
                   cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
-                  volume_flux = Trixi.FluxTurbo(flux_shima_etal),
+                  volume_flux = FluxTurbo(flux_shima_etal),
                   surface_flux = flux_shima_etal)
     u_ode = copy(sol.u[end])
     du_ode = zero(u_ode)
@@ -243,6 +243,21 @@ end
     end
 end
 
+# Test fallback method for mesh and type that are not supported
+@timed_testset "TreeMesh1D, fallback call FluxTurbo(flux_chandrashekar)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"),
+                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar))
+    u_ode = copy(sol.u[end])
+
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"),
+                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)))
+    u_ode_specialized = copy(sol.u[end])
+
+    @test u_ode_specialized ≈ u_ode
+end
+
 @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",

From 9d064d864457fab386ba185abeffe88073a33866 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Wed, 24 Jun 2026 09:02:18 +0200
Subject: [PATCH 32/39] Apply suggestions from code review

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 6 ++++--
 1 file changed, 4 insertions(+), 2 deletions(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 4d840e15efb..02404c47ec4 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -246,12 +246,14 @@ end
 # Test fallback method for mesh and type that are not supported
 @timed_testset "TreeMesh1D, fallback call FluxTurbo(flux_chandrashekar)" begin
     trixi_include(@__MODULE__,
-                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"),
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
+                           "elixir_euler_modified_sod.jl"),
                   volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar))
     u_ode = copy(sol.u[end])
 
     trixi_include(@__MODULE__,
-                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem", "elixir_euler_modified_sod.jl"),
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
+                           "elixir_euler_modified_sod.jl"),
                   volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)))
     u_ode_specialized = copy(sol.u[end])
 

From cd374594c421910f1bbfea25e4c6317551dd8c77 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Wed, 24 Jun 2026 20:02:39 +0200
Subject: [PATCH 33/39] Apply suggestions from code review

Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 02404c47ec4..80a16b01d25 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -248,13 +248,13 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
-                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar))
+                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar), RealT = BigFloat)
     u_ode = copy(sol.u[end])
 
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
-                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)))
+                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), RealT = BigFloat))
     u_ode_specialized = copy(sol.u[end])
 
     @test u_ode_specialized ≈ u_ode

From 59fa6434c0fec2836e75ddbc23dea73bdf896666 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Wed, 24 Jun 2026 20:11:08 +0200
Subject: [PATCH 34/39] Update test/test_performance_specializations_3d.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 80a16b01d25..8b79dc5c8d8 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -254,7 +254,8 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
-                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar), RealT = BigFloat))
+                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar),
+                                                                   RealT = BigFloat))
     u_ode_specialized = copy(sol.u[end])
 
     @test u_ode_specialized ≈ u_ode

From 1d45aeb9cd25a3602e2aae50e1a7a7fa29dc6b83 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Wed, 24 Jun 2026 20:11:16 +0200
Subject: [PATCH 35/39] Update test/test_performance_specializations_3d.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 3 ++-
 1 file changed, 2 insertions(+), 1 deletion(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 8b79dc5c8d8..e34ef4f684e 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -248,7 +248,8 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
-                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar), RealT = BigFloat)
+                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar),
+                  RealT = BigFloat)
     u_ode = copy(sol.u[end])
 
     trixi_include(@__MODULE__,

From 47cc8260a8af4ded83431e5a26b20c2dd100b115 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Thu, 25 Jun 2026 08:43:56 +0200
Subject: [PATCH 36/39] Apply suggestions from code review

Co-authored-by: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 4 ++--
 1 file changed, 2 insertions(+), 2 deletions(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index e34ef4f684e..b5db6749d94 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -255,8 +255,8 @@ end
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
-                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar),
-                                                                   RealT = BigFloat))
+                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)),
+                                                                   RealT = BigFloat)
     u_ode_specialized = copy(sol.u[end])
 
     @test u_ode_specialized ≈ u_ode

From cc92253e2887caa166b41ba882179ac29287b68e Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Thu, 25 Jun 2026 09:01:21 +0200
Subject: [PATCH 37/39] Update test/test_performance_specializations_3d.jl

Co-authored-by: github-actions[bot] <41898282+github-actions[bot]@users.noreply.github.com>
---
 test/test_performance_specializations_3d.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index b5db6749d94..7b953c06c02 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -256,7 +256,7 @@ end
                   joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
                            "elixir_euler_modified_sod.jl"),
                   volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)),
-                                                                   RealT = BigFloat)
+                  RealT = BigFloat)
     u_ode_specialized = copy(sol.u[end])
 
     @test u_ode_specialized ≈ u_ode

From 19885c9bf41e5aa9cb24fc775947a02b5953b899 Mon Sep 17 00:00:00 2001
From: Marco Artiano <57838732+MarcoArtiano@users.noreply.github.com>
Date: Fri, 26 Jun 2026 08:36:02 +0200
Subject: [PATCH 38/39] Update
 examples/tree_1d_dgsem/elixir_euler_modified_sod.jl

Co-authored-by: Hendrik Ranocha <ranocha@users.noreply.github.com>
---
 examples/tree_1d_dgsem/elixir_euler_modified_sod.jl | 2 +-
 1 file changed, 1 insertion(+), 1 deletion(-)

diff --git a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
index e1499c62783..875e8da05e1 100644
--- a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
+++ b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
@@ -26,7 +26,7 @@ function initial_condition_modified_sod(x, t, ::CompressibleEulerEquations1D)
     if x[1] < RealT(0.3)
         return prim2cons(SVector(one(RealT), 0.75f0, one(RealT)), equations)
     else
-        return prim2cons(SVector(0.125f0, zero(eltype(x)), RealT(0.1)), equations)
+        return prim2cons(SVector(0.125f0, zero(RealT), RealT(0.1)), equations)
     end
 end
 initial_condition = initial_condition_modified_sod

From 050943ee89ca603c76f1307e9e46bdc29d6c5c57 Mon Sep 17 00:00:00 2001
From: Marco Artiano <artianomarco97@gmail.com>
Date: Fri, 26 Jun 2026 08:40:50 +0200
Subject: [PATCH 39/39] rename volume flux turbo in flux turbo

---
 src/equations/numerical_fluxes.jl           | 16 +++++------
 src/solvers/dgsem_structured/dg_3d_turbo.jl | 32 ++++++++++-----------
 2 files changed, 24 insertions(+), 24 deletions(-)

diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index befa2851514..6d8893740a7 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -596,7 +596,7 @@ and then compute the numerical fluxes.
 This optimization can be enabled by defining three ingredients:
 - the number of precomputed variables `nturbovars`;
 - the transformation from conservative to precomputed variables `cons2turbo`
-- and the  `volume_flux_turbo(volume_flux::typeof(numerical_flux), ...)`,
+- and the  `flux_turbo(flux_turbo::typeof(numerical_flux), ...)`,
   that computes the numerical flux in terms of the precomputed variables.
 
 See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)
@@ -623,14 +623,14 @@ end
 @inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
 
 # Numerical volume flux that recalls the plain volume flux when no specialization is given.
-@inline function volume_flux_turbo(volume_flux, turbovars_and_normals_and_equations...)
+@inline function flux_turbo(numerical_flux, turbovars_and_normals_and_equations...)
     equations = last(turbovars_and_normals_and_equations)
-    volume_flux_turbo(volume_flux, have_nonconservative_terms(equations),
-                      turbovars_and_normals_and_equations...)
+    flux_turbo(numerical_flux, have_nonconservative_terms(equations),
+               turbovars_and_normals_and_equations...)
 end
 
-@inline function volume_flux_turbo(volume_flux, have_nonconservative_terms::False,
-                                   turbovars_and_normals_and_equations...)
+@inline function flux_turbo(numerical_flux, have_nonconservative_terms::False,
+                            turbovars_and_normals_and_equations...)
     equations = last(turbovars_and_normals_and_equations)
     n = nvariables(equations)
     u_ll = SVector(ntuple(v -> turbovars_and_normals_and_equations[v], Val(n)))
@@ -638,11 +638,11 @@ end
     normal_direction = SVector(turbovars_and_normals_and_equations[end - 3],
                                turbovars_and_normals_and_equations[end - 2],
                                turbovars_and_normals_and_equations[end - 1])
-    return volume_flux(u_ll, u_rr, normal_direction, equations)
+    return numerical_flux(u_ll, u_rr, normal_direction, equations)
 end
 
 # Allow LoopVectorization.jl to use SIMD instructions on volume_flux_turbo and cons2turbo
-LoopVectorization.can_turbo(::typeof(volume_flux_turbo), ::Val) = true
+LoopVectorization.can_turbo(::typeof(flux_turbo), ::Val) = true
 LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true
 
 Base.show(io::IO, f::FluxTurbo) = print(io, "FluxTurbo(", f.numerical_flux, ")")
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
index 1b63fb125e0..d4054ac4085 100644
--- a/src/solvers/dgsem_structured/dg_3d_turbo.jl
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -41,11 +41,11 @@ end
 
     # Evaluate the two-point volume flux
     flux_call = Expr(:(=), Expr(:tuple, flux...),
-                     :(volume_flux_turbo(volume_flux,
-                                         $(u_prim_ll...), $(u_prim_rr...),
-                                         normal_direction_1, normal_direction_2,
-                                         normal_direction_3,
-                                         equations)))
+                     :(flux_turbo(volume_flux,
+                                  $(u_prim_ll...), $(u_prim_rr...),
+                                  normal_direction_1, normal_direction_2,
+                                  normal_direction_3,
+                                  equations)))
 
     # Build the inner `@turbo` loop body. For the x direction `loads` gives
     #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
@@ -260,10 +260,10 @@ end
 end
 
 # Number of precomputed variables for the specialization flux_ranocha
-@inline nturbovars(volume_flux::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7)
+@inline nturbovars(flux_turbo::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7)
 
 # Transformation from conserved to precomputed variables for flux_ranocha
-@inline function cons2turbo(volume_flux::typeof(flux_ranocha),
+@inline function cons2turbo(flux_turbo::typeof(flux_ranocha),
                             rho, rho_v1, rho_v2, rho_v3, rho_e,
                             equations::CompressibleEulerEquations3D)
     v1 = rho_v1 / rho
@@ -275,15 +275,15 @@ end
 end
 
 # Computation of the numerical flux_ranocha with respect to precomputed variables
-@inline function volume_flux_turbo(volume_flux::typeof(flux_ranocha),
-                                   rho_ll, v1_ll, v2_ll, v3_ll,
-                                   p_ll, log_rho_ll, log_p_ll,
-                                   rho_rr, v1_rr, v2_rr, v3_rr,
-                                   p_rr, log_rho_rr, log_p_rr,
-                                   normal_direction_1,
-                                   normal_direction_2,
-                                   normal_direction_3,
-                                   equations::CompressibleEulerEquations3D)
+@inline function flux_turbo(flux_turbo::typeof(flux_ranocha),
+                            rho_ll, v1_ll, v2_ll, v3_ll,
+                            p_ll, log_rho_ll, log_p_ll,
+                            rho_rr, v1_rr, v2_rr, v3_rr,
+                            p_rr, log_rho_rr, log_p_rr,
+                            normal_direction_1,
+                            normal_direction_2,
+                            normal_direction_3,
+                            equations::CompressibleEulerEquations3D)
     v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
                  v3_ll * normal_direction_3
     v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +