diff --git a/NEWS.md b/NEWS.md
index db183182d20..6b17677a2a0 100644
--- a/NEWS.md
+++ b/NEWS.md
@@ -20,6 +20,7 @@ The new equation types `LinearDiffusionEquation1D` and `LinearDiffusionEquation2
 - Support for 3D subcell limiting was extended by local limiting for nonperiodic `TreeMesh`es ([#2878]).
 - Support for user-defined RHS splitting for IMEX methods via SemidiscretizationHyperbolicSplit ([#2518]). The splitting follows the form `y_t = f_1(y) + f_2(y)`, allowing users to define separate solvers for the stiff (`f_1`) and non-stiff (`f_2`) parts of the right-hand side. Boundary conditions and source terms can be specified independently for the stiff and non-stiff parts.
 - Added postprocessing for kinetic energy spectral analysis via `compute_kinetic_energy_spectrum` for `AbstractCompressibleEulerEquations` on `TreeMesh`/`DGSEM` and on `DGMultiMesh`/`DGMultiSBP` in 2D and 3D; the routine returns matching integer wavenumber shells and the isotropic 1D spectrum `E(k)`.
+- Added the wrapper `FluxTurbo` that enables SIMD optimized implementations for conservative systems. Precomputation of primitive variables or other more efficient sets of variables can be enabled by specifying the number of precomputed variables, `nturbovars`; the transformation from conservative to precomputed variables, `cons2turbo`; and the `volume_flux_turbo`, which computes the numerical flux in terms of the precomputed variables. ([#3090])
 - For the `NonIdealCompressibleEulerEquations` with a non-ideal equation of state a new Newton solver interface for the
 temperature has been added.
 This is required for nonideal equations of state, where one cannot explicitly solve for temperature given two other thermodynamic variables.
diff --git a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
index fd030fc37bd..875e8da05e1 100644
--- a/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
+++ b/examples/tree_1d_dgsem/elixir_euler_modified_sod.jl
@@ -1,7 +1,8 @@
 using OrdinaryDiffEqSSPRK
 using Trixi
 
-equations = CompressibleEulerEquations1D(1.4)
+RealT = Float64
+equations = CompressibleEulerEquations1D(RealT(1.4))
 
 """
     initial_condition_modified_sod(x, t, equations::CompressibleEulerEquations1D)
@@ -21,10 +22,11 @@ An entropy-satisfying solution should produce a smooth(!) rarefaction wave.
   [DOI: 10.1016/j.jcp.2023.112677](https://doi.org/10.1016/j.jcp.2023.112677)
 """
 function initial_condition_modified_sod(x, t, ::CompressibleEulerEquations1D)
-    if x[1] < 0.3
-        return prim2cons(SVector(1, 0.75, 1), equations)
+    RealT = eltype(x)
+    if x[1] < RealT(0.3)
+        return prim2cons(SVector(one(RealT), 0.75f0, one(RealT)), equations)
     else
-        return prim2cons(SVector(0.125, 0.0, 0.1), equations)
+        return prim2cons(SVector(0.125f0, zero(RealT), RealT(0.1)), equations)
     end
 end
 initial_condition = initial_condition_modified_sod
@@ -38,11 +40,11 @@ volume_integral = VolumeIntegralFluxDifferencing(flux_ranocha)
 solver = DGSEM(polydeg = 3, surface_flux = flux_hllc,
                volume_integral = volume_integral)
 
-coordinates_min = 0.0
-coordinates_max = 1.0
+coordinates_min = zero(RealT)
+coordinates_max = one(RealT)
 
 mesh = TreeMesh(coordinates_min, coordinates_max,
-                initial_refinement_level = 6,
+                initial_refinement_level = 6, RealT = RealT,
                 n_cells_max = 30_000,
                 periodicity = false)
 
@@ -54,7 +56,7 @@ boundary_conditions = (; x_neg = BoundaryConditionDirichlet(initial_condition),
 semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver,
                                     boundary_conditions = boundary_conditions)
 
-tspan = (0.0, 0.2)
+tspan = (zero(RealT), RealT(0.2))
 ode = semidiscretize(semi, tspan)
 
 summary_callback = SummaryCallback()
diff --git a/src/Trixi.jl b/src/Trixi.jl
index f259a9add71..bb9a6e23220 100644
--- a/src/Trixi.jl
+++ b/src/Trixi.jl
@@ -72,9 +72,9 @@ using KernelAbstractions: KernelAbstractions, @index, @kernel, get_backend, Back
 using AcceleratedKernels: AcceleratedKernels
 using LinearMaps: LinearMap
 if _PREFERENCE_LOOPVECTORIZATION
-    using LoopVectorization: LoopVectorization, @turbo, indices
+    using LoopVectorization: LoopVectorization, @turbo, indices, AbstractSIMD
 else
-    using LoopVectorization: LoopVectorization, indices
+    using LoopVectorization: LoopVectorization, indices, AbstractSIMD
     include("auxiliary/mock_turbo.jl")
 end
 
@@ -255,7 +255,7 @@ export flux, flux_central, flux_lax_friedrichs, flux_hll, flux_hllc, flux_hlle,
        FluxRotated,
        flux_shima_etal_turbo, flux_ranocha_turbo,
        FluxUpwind,
-       FluxTracerEquationsCentral
+       FluxTracerEquationsCentral, FluxTurbo
 
 export splitting_steger_warming, splitting_vanleer_haenel,
        splitting_coirier_vanleer, splitting_lax_friedrichs,
diff --git a/src/auxiliary/math.jl b/src/auxiliary/math.jl
index 8ee1ff5f2fa..7bda5efea7c 100644
--- a/src/auxiliary/math.jl
+++ b/src/auxiliary/math.jl
@@ -135,10 +135,14 @@ end
     See also [`Trixi.set_log_type!`](@ref).
     """
     @inline log(x::Real) = x < zero(x) ? oftype(x, NaN) : Base.log(x)
-
     @inline log(x::Float64) = ccall("llvm.log.f64", llvmcall, Float64, (Float64,), x)
     @inline log(x::Float32) = ccall("llvm.log.f32", llvmcall, Float32, (Float32,), x)
     @inline log(x::Float16) = ccall("llvm.log.f16", llvmcall, Float16, (Float16,), x)
+
+    # This is required for performance specializations like `FluxTurbo`.
+    # The LoopVectorization.jl ecosystem does not throw an error but
+    # already creates `NaN`s.
+    @inline log(x::AbstractSIMD) = Base.log(x)
 end
 
 """
@@ -205,6 +209,20 @@ Given ε = 1.0e-4, we use the following algorithm.
     end
 end
 
+# This is required for performance specializations like `FluxTurbo`.
+@inline function ln_mean(x::AbstractSIMD, y::AbstractSIMD)
+    RealT = eltype(x)
+    epsilon_f2 = convert(RealT, 1.0e-4)
+    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
+    return ifelse(f2 < epsilon_f2,
+                  (x + y) / @evalpoly(f2,
+                            convert(RealT, 2),
+                            convert(RealT, 2 / 3),
+                            convert(RealT, 2 / 5),
+                            convert(RealT, 2 / 7)),
+                  (y - x) / log(y / x))
+end
+
 """
     Trixi.inv_ln_mean(x::Real, y::Real)
 
@@ -231,6 +249,20 @@ multiplication.
     end
 end
 
+# This is required for performance specializations like `FluxTurbo`.
+@inline function inv_ln_mean(x::AbstractSIMD, y::AbstractSIMD)
+    RealT = eltype(x)
+    epsilon_f2 = convert(RealT, 1.0e-4)
+    f2 = (x * (x - 2 * y) + y * y) / (x * (x + 2 * y) + y * y) # f2 = f^2
+    return ifelse(f2 < epsilon_f2,
+                  @evalpoly(f2,
+                            convert(RealT, 2),
+                            convert(RealT, 2 / 3),
+                            convert(RealT, 2 / 5),
+                            convert(RealT, 2 / 7)) / (x + y),
+                  log(y / x) / (y - x))
+end
+
 # `Base.max` and `Base.min` perform additional checks for signed zeros and `NaN`s
 # which are not present in comparable functions in Fortran/C++. For example,
 # ```julia
diff --git a/src/equations/numerical_fluxes.jl b/src/equations/numerical_fluxes.jl
index 545155b4889..6d8893740a7 100644
--- a/src/equations/numerical_fluxes.jl
+++ b/src/equations/numerical_fluxes.jl
@@ -578,4 +578,72 @@ end
 end
 
 Base.show(io::IO, f::FluxUpwind) = print(io, "FluxUpwind(", f.splitting, ")")
+
+"""
+    FluxTurbo(numerical_flux)
+
+Create a numerical flux equivalent to the given `numerical_flux`
+except that it may use specialized methods, e.g., when used with
+[`VolumeIntegralFluxDifferencing`](@ref). These specialized methods
+may enable better use of SIMD instructions to increase runtime efficiency
+on modern hardware, e.g., using
+[LoopVectorization.jl](https://github.com/JuliaSIMD/LoopVectorization.jl).
+
+The following specialization works by default for all numerical fluxes,
+enabling SIMD instructions. However, for some systems, it is even better
+to precompute some variables (e.g., primitive variables or logarithms of certain variables)
+and then compute the numerical fluxes.
+This optimization can be enabled by defining three ingredients:
+- the number of precomputed variables `nturbovars`;
+- the transformation from conservative to precomputed variables `cons2turbo`
+- and the  `flux_turbo(flux_turbo::typeof(numerical_flux), ...)`,
+  that computes the numerical flux in terms of the precomputed variables.
+
+See the implementation of [DGSEM Turbo](https://github.com/trixi-framework/Trixi.jl/blob/main/src/solvers/dgsem_structured/dg_3d_turbo.jl)
+"""
+struct FluxTurbo{NumericalFlux}
+    numerical_flux::NumericalFlux
+end
+
+# As a fallback method, the wrapped flux is called.
+@inline function (f::FluxTurbo)(u_ll, u_rr, orientation_or_normal_direction,
+                                equations)
+    return f.numerical_flux(u_ll, u_rr, orientation_or_normal_direction, equations)
+end
+
+# By default the turbo flux has the same number of precomputed variables
+# as the number of variables.
+@inline nturbovars(numerical_flux, equations) = Val(nvariables(equations))
+
+# Transform the conserved variables in precomputed auxiliary variables to speed up the computation
+# of the numerical flux. When no specialization is given, this gives cons2cons.
+# Note that the conserved variables are passed as scalars instead of a vector
+# to enable optimizations by LoopVectorization.jl. Thus, we use
+# `conserved_and_equations...` instead of `cons, equations`.
+@inline cons2turbo(numerical_flux, conserved_and_equations...) = Base.front(conserved_and_equations)
+
+# Numerical volume flux that recalls the plain volume flux when no specialization is given.
+@inline function flux_turbo(numerical_flux, turbovars_and_normals_and_equations...)
+    equations = last(turbovars_and_normals_and_equations)
+    flux_turbo(numerical_flux, have_nonconservative_terms(equations),
+               turbovars_and_normals_and_equations...)
+end
+
+@inline function flux_turbo(numerical_flux, have_nonconservative_terms::False,
+                            turbovars_and_normals_and_equations...)
+    equations = last(turbovars_and_normals_and_equations)
+    n = nvariables(equations)
+    u_ll = SVector(ntuple(v -> turbovars_and_normals_and_equations[v], Val(n)))
+    u_rr = SVector(ntuple(v -> turbovars_and_normals_and_equations[n + v], Val(n)))
+    normal_direction = SVector(turbovars_and_normals_and_equations[end - 3],
+                               turbovars_and_normals_and_equations[end - 2],
+                               turbovars_and_normals_and_equations[end - 1])
+    return numerical_flux(u_ll, u_rr, normal_direction, equations)
+end
+
+# Allow LoopVectorization.jl to use SIMD instructions on volume_flux_turbo and cons2turbo
+LoopVectorization.can_turbo(::typeof(flux_turbo), ::Val) = true
+LoopVectorization.can_turbo(::typeof(cons2turbo), ::Val) = true
+
+Base.show(io::IO, f::FluxTurbo) = print(io, "FluxTurbo(", f.numerical_flux, ")")
 end # @muladd
diff --git a/src/solvers/dgsem_structured/dg.jl b/src/solvers/dgsem_structured/dg.jl
index 5ac27049151..99b672f4fa5 100644
--- a/src/solvers/dgsem_structured/dg.jl
+++ b/src/solvers/dgsem_structured/dg.jl
@@ -234,4 +234,5 @@ include("dg_2d_subcell_limiters.jl")
 # Specialized implementations used to improve performance
 include("dg_2d_compressible_euler.jl")
 include("dg_3d_compressible_euler.jl")
+include("dg_3d_turbo.jl")
 end # @muladd
diff --git a/src/solvers/dgsem_structured/dg_3d_turbo.jl b/src/solvers/dgsem_structured/dg_3d_turbo.jl
new file mode 100644
index 00000000000..d4054ac4085
--- /dev/null
+++ b/src/solvers/dgsem_structured/dg_3d_turbo.jl
@@ -0,0 +1,339 @@
+@inline function flux_differencing_kernel!(_du::PtrArray, u_cons::PtrArray, element,
+                                           MeshT::Type{<:Union{StructuredMesh{3},
+                                                               P4estMesh{3}}},
+                                           have_nonconservative_terms::False,
+                                           equations,
+                                           volume_flux::FluxTurbo, dg::DGSEM,
+                                           cache, alpha)
+    @unpack numerical_flux = volume_flux
+    flux_differencing_kernel_turbo!(_du, u_cons, element, MeshT, have_nonconservative_terms,
+                                    equations,
+                                    numerical_flux, dg, cache, alpha,
+                                    nturbovars(numerical_flux, equations),
+                                    Val(nvariables(equations)))
+end
+
+# Generated function that writes the equivalent hand-written code as in dg_compressible_euler_3d.jl,
+# but generalizes for each number of variables and precomputed variables.
+@generated function flux_differencing_kernel_turbo!(_du::PtrArray, u_cons::PtrArray,
+                                                    element,
+                                                    MeshT::Type{<:Union{StructuredMesh{3},
+                                                                        P4estMesh{3}}},
+                                                    have_nonconservative_terms::False,
+                                                    equations,
+                                                    volume_flux,
+                                                    dg, cache, alpha, ::Val{NAUX},
+                                                    ::Val{NVARS}) where {NAUX, NVARS}
+    # Per-node scalars used inside the `@turbo` loops, e.g. `u_prim_ll_1 = rho_ll`,
+    # `flux_1 = f1`, etc.  
+    u_prim_ll = [Symbol(:u_prim_ll_, v) for v in 1:NAUX]
+    u_prim_rr = [Symbol(:u_prim_rr_, v) for v in 1:NAUX]
+    flux = [Symbol(:flux_, v) for v in 1:NVARS]
+
+    # Convert conserved to auxiliary variables node-wise, e.g. 
+    #   rho, v1, v2, v3, p = cons2prim(u_cons[:, i, j, k, element], equations)
+    #   u_prim[i, j, k, 1] = rho   # and so on for v2, v3, p, ...
+    cons_reads = [:(u_cons[$v, i, j, k, element]) for v in 1:NVARS]
+    cons2turbo_ = Expr(:(=), Expr(:tuple, [Symbol(:u_prim_, v) for v in 1:NAUX]...),
+                       :(cons2turbo(volume_flux, $(cons_reads...),
+                                    equations)))
+    cons2turbo_writes = [:(u_prim[i, j, k, $v] = $(Symbol(:u_prim_, v))) for v in 1:NAUX]
+
+    # Evaluate the two-point volume flux
+    flux_call = Expr(:(=), Expr(:tuple, flux...),
+                     :(flux_turbo(volume_flux,
+                                  $(u_prim_ll...), $(u_prim_rr...),
+                                  normal_direction_1, normal_direction_2,
+                                  normal_direction_3,
+                                  equations)))
+
+    # Build the inner `@turbo` loop body. For the x direction `loads` gives
+    #   rho_ll = u_prim_permuted[jk, i, 1]   # left node, and so on
+    #   rho_rr = u_prim_permuted[jk, ii, 1]  # right node
+    loads(arr, idx_ll, idx_rr) = vcat([:($(u_prim_ll[v]) = $arr[$(idx_ll...), $v])
+                                       for v in 1:NAUX],
+                                      [:($(u_prim_rr[v]) = $arr[$(idx_rr...), $v])
+                                       for v in 1:NAUX])
+
+    # Compute the average of the normal_direction
+    # e.g. the x direction `normals` gives
+    #   normal_direction_1 = 0.5f0 * (contravariant_vectors_x[jk, i, 1] +
+    #                                 contravariant_vectors_x[jk, ii, 1])   # and 2, 3
+    normals(controvariant_vector, idx_ll, idx_rr) = [:($(Symbol(:normal_direction_, m)) = 0.5f0 *
+                                                                                          ($controvariant_vector[$(idx_ll...),
+                                                                                                                 $m] +
+                                                                                           $controvariant_vector[$(idx_rr...),
+                                                                                                                 $m]))
+                                                     for m in 1:3]
+
+    # Store the fluxes in the permuted du array
+    # e.g. the x direction `stores` gives
+    #   du_permuted[jk, i, 1]  += factor_l * f1   # left node, and so on
+    #   du_permuted[jk, ii, 1] += factor_r * f1   # right node
+    stores(du_arr, idx_ll, idx_rr) = vcat([:($du_arr[$(idx_ll...), $v] += factor_l *
+                                                                          $(flux[v]))
+                                           for v in 1:NVARS],
+                                          [:($du_arr[$(idx_rr...), $v] += factor_r *
+                                                                          $(flux[v]))
+                                           for v in 1:NVARS])
+
+    loads_x = loads(:u_prim_permuted, (:jk, :i), (:jk, :ii))
+    normals_x = normals(:contravariant_vectors_x, (:jk, :i), (:jk, :ii))
+    stores_x = stores(:du_permuted, (:jk, :i), (:jk, :ii))
+
+    loads_y = loads(:u_prim, (:i, :j, :k), (:i, :jj, :k))
+    normals_y = normals(:contravariant_vectors_y, (:i, :j, :k), (:i, :jj, :k))
+    stores_y = stores(:du, (:i, :j, :k), (:i, :jj, :k))
+
+    loads_z = loads(:u_prim_reshaped, (:ij, :k), (:ij, :kk))
+    normals_z = normals(:contravariant_vectors_z, (:ij, :k), (:ij, :kk))
+    stores_z = stores(:du_reshaped, (:ij, :k), (:ij, :kk))
+
+    quote
+        @unpack derivative_split = dg.basis
+        @unpack contravariant_vectors = cache.elements
+
+        # Create a temporary array that will be used to store the RHS with permuted
+        # indices `[i, j, k, v]` to allow using SIMD instructions.
+        # `StrideArray`s with purely static dimensions do not allocate on the heap.
+        du = StrideArray{eltype(u_cons)}(undef,
+                                         (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                          StaticInt(nnodes(dg)), StaticInt(NVARS)))
+
+        # Convert conserved to auxiliary variables on the given `element`.
+        u_prim = StrideArray{eltype(u_cons)}(undef,
+                                             (StaticInt(nnodes(dg)), StaticInt(nnodes(dg)),
+                                              StaticInt(nnodes(dg)), StaticInt(NAUX)))
+
+        # Convert conserved to auxiliary variables and store them in `u_prim`.
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            $cons2turbo_
+            $(cons2turbo_writes...)
+        end
+
+        # x direction
+        # At first, we create new temporary arrays with permuted memory layout to
+        # allow using SIMD instructions along the first dimension (which is contiguous
+        # in memory).
+        du_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                  (StaticInt(nnodes(dg)^2),
+                                                   StaticInt(nnodes(dg)),
+                                                   StaticInt(NVARS)))
+
+        u_prim_permuted = StrideArray{eltype(u_cons)}(undef,
+                                                      (StaticInt(nnodes(dg)^2),
+                                                       StaticInt(nnodes(dg)),
+                                                       StaticInt(NAUX)))
+
+        @turbo for v in indices(u_prim, 4),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            u_prim_permuted[jk, i, v] = u_prim[i, j, k, v]
+        end
+        fill!(du_permuted, zero(eltype(du_permuted)))
+
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_x = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)^2),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            jk = j + nnodes(dg) * (k - 1)
+            contravariant_vectors_x[jk, i, 1] = contravariant_vectors[1, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 2] = contravariant_vectors[2, 1, i, j, k,
+                                                                      element]
+            contravariant_vectors_x[jk, i, 3] = contravariant_vectors[3, 1, i, j, k,
+                                                                      element]
+        end
+
+        # Next, we basically inline the volume flux. To allow SIMD vectorization and
+        # still use the symmetry of the volume flux and the derivative matrix, we
+        # loop over the triangular part in an outer loop and use a plain inner loop.
+        for i in eachnode(dg), ii in (i + 1):nnodes(dg)
+            @turbo for jk in Base.OneTo(nnodes(dg)^2)
+                $(loads_x...)
+                $(normals_x...)
+                $flux_call
+                factor_l = alpha * derivative_split[i, ii]
+                factor_r = alpha * derivative_split[ii, i]
+                $(stores_x...)
+            end
+        end
+
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            jk = j + nnodes(dg) * (k - 1)
+            du[i, j, k, v] = du_permuted[jk, i, v]
+        end
+
+        # y direction
+        # We must also permute the contravariant vectors.
+        contravariant_vectors_y = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                             (StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(nnodes(dg)),
+                                                                              StaticInt(3)))
+
+        @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+            contravariant_vectors_y[i, j, k, 1] = contravariant_vectors[1, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 2] = contravariant_vectors[2, 2, i, j, k,
+                                                                        element]
+            contravariant_vectors_y[i, j, k, 3] = contravariant_vectors[3, 2, i, j, k,
+                                                                        element]
+        end
+
+        # A possible permutation of array dimensions with improved opportunities for
+        # SIMD vectorization appeared to be slower than the direct version used here
+        # in preliminary numerical experiments on an AVX2 system.
+        for j in eachnode(dg), jj in (j + 1):nnodes(dg)
+            @turbo for k in eachnode(dg), i in eachnode(dg)
+                $(loads_y...)
+                $(normals_y...)
+                $flux_call
+                factor_l = alpha * derivative_split[j, jj]
+                factor_r = alpha * derivative_split[jj, j]
+                $(stores_y...)
+            end
+        end
+
+        # z direction
+        # The memory layout is already optimal for SIMD vectorization in this loop.
+        # We just squeeze the first two dimensions to make the code slightly faster.
+        GC.@preserve u_prim begin
+            u_prim_reshaped = PtrArray(pointer(u_prim),
+                                       (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                        StaticInt(NAUX)))
+
+            du_reshaped = PtrArray(pointer(du),
+                                   (StaticInt(nnodes(dg)^2), StaticInt(nnodes(dg)),
+                                    StaticInt(NVARS)))
+
+            # We must also permute the contravariant vectors.
+            contravariant_vectors_z = StrideArray{eltype(contravariant_vectors)}(undef,
+                                                                                 (StaticInt(nnodes(dg)^2),
+                                                                                  StaticInt(nnodes(dg)),
+                                                                                  StaticInt(3)))
+
+            @turbo for k in eachnode(dg), j in eachnode(dg), i in eachnode(dg)
+                ij = i + nnodes(dg) * (j - 1)
+                contravariant_vectors_z[ij, k, 1] = contravariant_vectors[1, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 2] = contravariant_vectors[2, 3, i, j, k,
+                                                                          element]
+                contravariant_vectors_z[ij, k, 3] = contravariant_vectors[3, 3, i, j, k,
+                                                                          element]
+            end
+
+            for k in eachnode(dg), kk in (k + 1):nnodes(dg)
+                @turbo for ij in Base.OneTo(nnodes(dg)^2)
+                    $(loads_z...)
+                    $(normals_z...)
+                    $flux_call
+                    factor_l = alpha * derivative_split[k, kk]
+                    factor_r = alpha * derivative_split[kk, k]
+                    $(stores_z...)
+                end
+            end
+        end # GC.@preserve u_prim begin
+
+        # Finally, we add the temporary RHS computed here to the global RHS in the
+        # given `element`.
+        @turbo for v in eachvariable(equations),
+                   k in eachnode(dg),
+                   j in eachnode(dg),
+                   i in eachnode(dg)
+
+            _du[v, i, j, k, element] += du[i, j, k, v]
+        end
+
+        return nothing
+    end
+end
+
+# Number of precomputed variables for the specialization flux_ranocha
+@inline nturbovars(flux_turbo::typeof(flux_ranocha), equations::CompressibleEulerEquations3D) = Val(7)
+
+# Transformation from conserved to precomputed variables for flux_ranocha
+@inline function cons2turbo(flux_turbo::typeof(flux_ranocha),
+                            rho, rho_v1, rho_v2, rho_v3, rho_e,
+                            equations::CompressibleEulerEquations3D)
+    v1 = rho_v1 / rho
+    v2 = rho_v2 / rho
+    v3 = rho_v3 / rho
+    p = (equations.gamma - 1) *
+        (rho_e - 0.5f0 * (rho_v1 * v1 + rho_v2 * v2 + rho_v3 * v3))
+    return rho, v1, v2, v3, p, log(rho), log(p)
+end
+
+# Computation of the numerical flux_ranocha with respect to precomputed variables
+@inline function flux_turbo(flux_turbo::typeof(flux_ranocha),
+                            rho_ll, v1_ll, v2_ll, v3_ll,
+                            p_ll, log_rho_ll, log_p_ll,
+                            rho_rr, v1_rr, v2_rr, v3_rr,
+                            p_rr, log_rho_rr, log_p_rr,
+                            normal_direction_1,
+                            normal_direction_2,
+                            normal_direction_3,
+                            equations::CompressibleEulerEquations3D)
+    v_dot_n_ll = v1_ll * normal_direction_1 + v2_ll * normal_direction_2 +
+                 v3_ll * normal_direction_3
+    v_dot_n_rr = v1_rr * normal_direction_1 + v2_rr * normal_direction_2 +
+                 v3_rr * normal_direction_3
+
+    # Compute required mean values
+    # We inline the logarithmic mean to allow LoopVectorization.jl to optimize
+    # it efficiently. This is equivalent to
+    # rho_mean = ln_mean(rho_ll, rho_rr)
+    x1 = rho_ll
+    log_x1 = log_rho_ll
+    y1 = rho_rr
+    log_y1 = log_rho_rr
+    x1_plus_y1 = x1 + y1
+    y1_minus_x1 = y1 - x1
+    z1 = y1_minus_x1^2 / x1_plus_y1^2
+    special_path1 = x1_plus_y1 / (2 + z1 * (2 / 3 + z1 * (2 / 5 + 2 / 7 * z1)))
+    regular_path1 = y1_minus_x1 / (log_y1 - log_x1)
+    rho_mean = ifelse(z1 < 1.0e-4, special_path1, regular_path1)
+
+    # Algebraically equivalent to `inv_ln_mean(rho_ll / p_ll, rho_rr / p_rr)`
+    # in exact arithmetic since
+    #     log((ϱₗ/pₗ) / (ϱᵣ/pᵣ)) / (ϱₗ/pₗ - ϱᵣ/pᵣ)
+    #   = pₗ pᵣ log((ϱₗ pᵣ) / (ϱᵣ pₗ)) / (ϱₗ pᵣ - ϱᵣ pₗ)
+    # inv_rho_p_mean = p_ll * p_rr * inv_ln_mean(rho_ll * p_rr, rho_rr * p_ll)
+    x2 = rho_ll * p_rr
+    log_x2 = log_rho_ll + log_p_rr
+    y2 = rho_rr * p_ll
+    log_y2 = log_rho_rr + log_p_ll
+    x2_plus_y2 = x2 + y2
+    y2_minus_x2 = y2 - x2
+    z2 = y2_minus_x2^2 / x2_plus_y2^2
+    special_path2 = (2 + z2 * (2 / 3 + z2 * (2 / 5 + 2 / 7 * z2))) / x2_plus_y2
+    regular_path2 = (log_y2 - log_x2) / y2_minus_x2
+    inv_rho_p_mean = p_ll * p_rr * ifelse(z2 < 1.0e-4, special_path2, regular_path2)
+
+    v1_avg = 0.5f0 * (v1_ll + v1_rr)
+    v2_avg = 0.5f0 * (v2_ll + v2_rr)
+    v3_avg = 0.5f0 * (v3_ll + v3_rr)
+    p_avg = 0.5f0 * (p_ll + p_rr)
+    velocity_square_avg = 0.5f0 * (v1_ll * v1_rr + v2_ll * v2_rr + v3_ll * v3_rr)
+
+    # Calculate fluxes depending on normal_direction
+    f1 = rho_mean * 0.5f0 * (v_dot_n_ll + v_dot_n_rr)
+    f2 = f1 * v1_avg + p_avg * normal_direction_1
+    f3 = f1 * v2_avg + p_avg * normal_direction_2
+    f4 = f1 * v3_avg + p_avg * normal_direction_3
+    f5 = (f1 * (velocity_square_avg + inv_rho_p_mean * equations.inv_gamma_minus_one)
+          +
+          0.5f0 * (p_ll * v_dot_n_rr + p_rr * v_dot_n_ll))
+
+    return (f1, f2, f3, f4, f5)
+end
diff --git a/test/test_performance_specializations_3d.jl b/test/test_performance_specializations_3d.jl
index 967b0f9cf3e..7b953c06c02 100644
--- a/test/test_performance_specializations_3d.jl
+++ b/test/test_performance_specializations_3d.jl
@@ -173,6 +173,95 @@ end
     end
 end
 
+@timed_testset "StructuredMesh3D, FluxTurbo(flux_ranocha)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
+                  cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
+                  volume_flux = FluxTurbo(flux_ranocha),
+                  surface_flux = flux_ranocha)
+    u_ode = copy(sol.u[end])
+    du_ode = zero(u_ode)
+
+    # Preserve original memory since it will be `unsafe_wrap`ped and might
+    # thus otherwise be garbage collected
+    GC.@preserve u_ode du_ode begin
+        u = Trixi.wrap_array(u_ode, semi)
+        du = Trixi.wrap_array(du_ode, semi)
+        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
+
+        # Call the optimized version
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        semi.solver.volume_integral.volume_flux,
+                                        semi.solver, semi.cache, true)
+        du_specialized = copy(du[:, :, :, :, 1])
+
+        # Call the plain version
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        flux_ranocha, semi.solver, semi.cache, true)
+        du_baseline = du[:, :, :, :, 1]
+
+        @test du_specialized ≈ du_baseline
+    end
+end
+
+@timed_testset "StructuredMesh3D, FluxTurbo(flux_shima_etal)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "structured_3d_dgsem", "elixir_euler_ec.jl"),
+                  cells_per_dimension = (1, 1, 1), tspan = (0.0, 0.0), polydeg = 3,
+                  volume_flux = FluxTurbo(flux_shima_etal),
+                  surface_flux = flux_shima_etal)
+    u_ode = copy(sol.u[end])
+    du_ode = zero(u_ode)
+
+    # Preserve original memory since it will be `unsafe_wrap`ped and might
+    # thus otherwise be garbage collected
+    GC.@preserve u_ode du_ode begin
+        u = Trixi.wrap_array(u_ode, semi)
+        du = Trixi.wrap_array(du_ode, semi)
+        have_nonconservative_terms = Trixi.have_nonconservative_terms(semi.equations)
+
+        # Call the optimized version
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        semi.solver.volume_integral.volume_flux,
+                                        semi.solver, semi.cache, true)
+        du_specialized = copy(du[:, :, :, :, 1])
+
+        # Call the plain version
+        du .= 0
+        Trixi.flux_differencing_kernel!(du, u, 1, typeof(semi.mesh),
+                                        have_nonconservative_terms, semi.equations,
+                                        flux_shima_etal, semi.solver, semi.cache, true)
+        du_baseline = du[:, :, :, :, 1]
+
+        @test du_specialized ≈ du_baseline
+    end
+end
+
+# Test fallback method for mesh and type that are not supported
+@timed_testset "TreeMesh1D, fallback call FluxTurbo(flux_chandrashekar)" begin
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
+                           "elixir_euler_modified_sod.jl"),
+                  volume_integral = VolumeIntegralFluxDifferencing(flux_chandrashekar),
+                  RealT = BigFloat)
+    u_ode = copy(sol.u[end])
+
+    trixi_include(@__MODULE__,
+                  joinpath(EXAMPLES_DIR, "tree_1d_dgsem",
+                           "elixir_euler_modified_sod.jl"),
+                  volume_integral = VolumeIntegralFluxDifferencing(FluxTurbo(flux_chandrashekar)),
+                  RealT = BigFloat)
+    u_ode_specialized = copy(sol.u[end])
+
+    @test u_ode_specialized ≈ u_ode
+end
+
 @timed_testset "P4estMesh3D, combine_conservative_and_nonconservative_fluxes" begin
     trixi_include(@__MODULE__,
                   joinpath(EXAMPLES_DIR, "p4est_3d_dgsem",