trixi-framework · ranocha · Jun 24, 2026 · Jun 4, 2026 · Jun 4, 2026 · Jun 4, 2026
diff --git a/examples/p4est_3d_dgsem/elixir_mhd_alfven_wave_combined_fluxes_nonperiodic.jl b/examples/p4est_3d_dgsem/elixir_mhd_alfven_wave_combined_fluxes_nonperiodic.jl
@@ -233,18 +233,16 @@ semi = SemidiscretizationHyperbolic(mesh, equations, initial_condition, solver;
 # ODE solvers, callbacks etc.
 
 tspan = (0.0, 1.0)
-ode = semidiscretize(semi, tspan)
+ode = semidiscretize(semi, tspan; real_type = nothing, storage_type = nothing)
 
 summary_callback = SummaryCallback()
 
 analysis_interval = 100
-analysis_callback = AnalysisCallback(semi, interval = analysis_interval)
-alive_callback = AliveCallback(analysis_interval = analysis_interval)
 
-save_solution = SaveSolutionCallback(interval = 100,
-                                     save_initial_solution = true,
-                                     save_final_solution = true,
-                                     solution_variables = cons2prim)
+analysis_callback = AnalysisCallback(semi, interval = analysis_interval,
+                                     analysis_integrals = (entropy,))
+
+alive_callback = AliveCallback(analysis_interval = analysis_interval)
 
 cfl = 1.0
 stepsize_callback = StepsizeCallback(cfl = cfl)
@@ -253,7 +251,6 @@ glm_speed_callback = GlmSpeedCallback(glm_scale = 0.5, cfl = cfl)
 
 callbacks = CallbackSet(summary_callback,
                         analysis_callback, alive_callback,
-                        save_solution,
                         stepsize_callback,
                         glm_speed_callback)
 

diff --git a/src/callbacks_step/glm_speed.jl b/src/callbacks_step/glm_speed.jl
@@ -59,7 +59,7 @@ function Base.show(io::IO, ::MIME"text/plain",
     end
 end
 
-function GlmSpeedCallback(; glm_scale = 0.5, cfl, semi_indices = Int[])
+function GlmSpeedCallback(; glm_scale = 0.5f0, cfl, semi_indices = Int[])
     @assert 0<=glm_scale<=1 "glm_scale must be between 0 and 1"
 
     cfl_function = isa(cfl, Real) ? Returns(cfl) : cfl
@@ -86,9 +86,9 @@ function update_cleaning_speed!(semi, glm_speed_callback, dt, t)
     @unpack glm_scale, cfl = glm_speed_callback
 
     mesh, equations, solver, cache = mesh_equations_solver_cache(semi)
-
     # compute time step for GLM linear advection equation with c_h=1 (redone due to the possible AMR)
-    c_h_deltat = calc_dt_for_cleaning_speed(cfl(t), mesh, equations, solver, cache)
+    c_h_deltat = calc_dt_for_cleaning_speed(cfl(t), mesh, equations, solver,
+                                            cache)
 
     # c_h is proportional to its own time step divided by the complete MHD time step
     # We use @reset here since the equations are immutable (to work on GPUs etc.).

diff --git a/src/solvers/dgsem_p4est/dg_3d_gpu.jl b/src/solvers/dgsem_p4est/dg_3d_gpu.jl
@@ -20,7 +20,7 @@
             combine_conservative_and_nonconservative_fluxes(volume_integral.volume_flux,
                                                             equations),
             dg,
-            volume_integral, Val(NNODES),
+            volume_integral.volume_flux, Val(NNODES),
             derivative_split,
             contravariant_vectors,
             ndrange = (NNODES, NNODES, NNODES, nelements(dg, cache)))
@@ -33,7 +33,7 @@ end
                                              have_nonconservative_terms::False,
                                              combine_conservative_and_nonconservative_fluxes::False,
                                              dg::DGSEM,
-                                             volume_integral,
+                                             volume_flux,
                                              ::Val{NNODES},
                                              derivative_split,
                                              contravariant_vectors,
@@ -42,8 +42,6 @@ end
     # This can (hopefully) be optimized away due to constant propagation.
     i, j, k, element = @index(Global, NTuple)
 
-    @unpack volume_flux = volume_integral
-
     u_node = get_node_vars(u, equations, dg, i, j, k, element)
 
     # pull the contravariant vectors in each coordinate direction
@@ -137,6 +135,117 @@ end
     end
 end
 
+@kernel function flux_differencing_KAkernel!(du, u, equations,
+                                             MeshT::Type{<:Union{P4estMesh{3},
+                                                                 T8codeMesh{3}}},
+                                             have_nonconservative_terms::True,
+                                             combine_conservative_and_nonconservative_fluxes::True,
+                                             dg::DGSEM,
+                                             volume_flux,
+                                             ::Val{NNODES},
+                                             derivative_split,
+                                             contravariant_vectors,
+                                             alpha = true) where {NNODES}
+    # `true * [some floating point value] == [exactly the same floating point value]`
+    # This can (hopefully) be optimized away due to constant propagation.
+    i, j, k, element = @index(Global, NTuple)
+
+    u_node = get_node_vars(u, equations, dg, i, j, k, element)
+
+    # pull the contravariant vectors in each coordinate direction
+    Ja1_node = get_contravariant_vector(1, contravariant_vectors, i, j, k, element)
+    Ja2_node = get_contravariant_vector(2, contravariant_vectors, i, j, k, element)
+    Ja3_node = get_contravariant_vector(3, contravariant_vectors, i, j, k, element)
+
+    # All diagonal entries of `derivative_split` are zero. Thus, we can skip
+    # the computation of the diagonal terms. In addition, we use the symmetry
+    # of the `volume_flux` to save half of the possible two-point flux
+    # computations.
+    #
+    # Instead of assigning thread i the partners i+1, …, N,
+    # we distribute the half-sweep cyclically: each thread visits
+    # half = div(N,2) partners at a fixed rotating offset.
+    # Every unordered pair is still covered exactly
+    # once, but now every thread performs the same number of loop iterations.
+    # When N is even (odd polynomial degree) the antipodal pair at
+    # offset half is shared by two threads, so its contribution is weighted by
+    # 1/2 to avoid double counting.
+    #
+    # See Section 4.1 (Eq. 6) of
+    # - Waterhouse, Waruszewski, Wilcox, Giraldo (2026)
+    #   GPU Performance of an Entropy-Stable Discontinuous Galerkin Euler Solver
+    #   with Non-Conservative Terms.
+    #   arXiv (pre-print): https://arxiv.org/abs/2605.16684
+
+    half_nnodes = div(NNODES, 2)
+    even_nodes = iseven(NNODES)
+
+    KernelAbstractions.Extras.@unroll for offset in 1:half_nnodes
+        # weight the antipodal pair by 1/2 only when the number of nodes is even
+        weight = (even_nodes && offset == half_nnodes) ? 0.5f0 : 1.0f0
+
+        # first coordinate direction: rotate the partner index along `i`
+        ii = mod(i - 1 + offset, NNODES) + 1
+        u_node_ii = get_node_vars(u, equations, dg, ii, j, k, element)
+        # pull the contravariant vectors and compute the average
+        Ja1_node_ii = get_contravariant_vector(1, contravariant_vectors,
+                                               ii, j, k, element)
+        Ja1_avg = 0.5f0 * (Ja1_node + Ja1_node_ii)
+        # compute the contravariant volume flux in the direction of the
+        # averaged contravariant vector
+        fluxtilde1_left, fluxtilde1_right = volume_flux(u_node, u_node_ii, Ja1_avg,
+                                                        equations)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[i, ii],
+                                           fluxtilde1_left,
+                                           i, j, k, element)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[ii, i],
+                                           fluxtilde1_right,
+                                           ii, j, k, element)
+
+        # second coordinate direction: rotate the partner index along `j`
+        jj = mod(j - 1 + offset, NNODES) + 1
+        u_node_jj = get_node_vars(u, equations, dg, i, jj, k, element)
+        # pull the contravariant vectors and compute the average
+        Ja2_node_jj = get_contravariant_vector(2, contravariant_vectors,
+                                               i, jj, k, element)
+        Ja2_avg = 0.5f0 * (Ja2_node + Ja2_node_jj)
+        # compute the contravariant volume flux in the direction of the
+        # averaged contravariant vector
+        fluxtilde2_left, fluxtilde2_right = volume_flux(u_node, u_node_jj, Ja2_avg,
+                                                        equations)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[j, jj],
+                                           fluxtilde2_left,
+                                           i, j, k, element)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[jj, j],
+                                           fluxtilde2_right,
+                                           i, jj, k, element)
+
+        # third coordinate direction: rotate the partner index along `k`
+        kk = mod(k - 1 + offset, NNODES) + 1
+        u_node_kk = get_node_vars(u, equations, dg, i, j, kk, element)
+        # pull the contravariant vectors and compute the average
+        Ja3_node_kk = get_contravariant_vector(3, contravariant_vectors,
+                                               i, j, kk, element)
+        Ja3_avg = 0.5f0 * (Ja3_node + Ja3_node_kk)
+        # compute the contravariant volume flux in the direction of the
+        # averaged contravariant vector
+        fluxtilde3_left, fluxtilde3_right = volume_flux(u_node, u_node_kk, Ja3_avg,
+                                                        equations)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[k, kk],
+                                           fluxtilde3_left,
+                                           i, j, k, element)
+        multiply_add_to_first_axis_atomic!(du,
+                                           weight * alpha * derivative_split[kk, k],
+                                           fluxtilde3_right,
+                                           i, j, kk, element)
+    end
+end
+
 function prolong2interfaces!(backend::Backend, cache, u,
                              mesh::Union{P4estMesh{3}, T8codeMesh{3}},
                              equations, dg::DG)
@@ -372,30 +481,94 @@ end
                                      surface_integral, dg, cache,
                                      i_index, j_index, k_index, i_node_index,
                                      j_node_index,
-                                     direction_index, element,
-                                     boundary, node_coordinates,
+                                     direction_index, element_index,
+                                     boundary_index, node_coordinates,
                                      contravariant_vectors)
     @unpack surface_flux = surface_integral
 
     # Extract solution data from boundary container
-    u_inner = get_node_vars(u, equations, dg, i_node_index, j_node_index, boundary)
+    u_inner = get_node_vars(u, equations, dg, i_node_index, j_node_index,
+                            boundary_index)
 
     # Outward-pointing normal direction (not normalized)
     normal_direction = get_normal_direction(direction_index, contravariant_vectors,
-                                            i_index, j_index, k_index, element)
+                                            i_index, j_index, k_index, element_index)
 
     # Coordinates at boundary node
     x = get_node_coords(node_coordinates, equations, dg,
-                        i_index, j_index, k_index, element)
+                        i_index, j_index, k_index, element_index)
 
     flux_ = boundary_condition(u_inner, normal_direction, x, t, surface_flux, equations)
 
     # Copy flux to element storage in the correct orientation
     for v in eachvariable(equations)
-        surface_flux_values[v, i_node_index, j_node_index, direction_index, element] = flux_[v]
+        surface_flux_values[v, i_node_index, j_node_index, direction_index, element_index] = flux_[v]
     end
 end
 
+@inline function calc_boundary_flux!(u, surface_flux_values, t, boundary_condition,
+                                     MeshT::Type{<:Union{P4estMesh{3},
+                                                         T8codeMesh{3}}},
+                                     have_nonconservative_terms::True, equations,
+                                     surface_integral, dg, cache, i_index, j_index,
+                                     k_index, i_node_index, j_node_index,
+                                     direction_index,
+                                     element_index, boundary_index, node_coordinates,
+                                     contravariant_vectors)
+    calc_boundary_flux!(u, surface_flux_values, t, boundary_condition, MeshT,
+                        have_nonconservative_terms,
+                        combine_conservative_and_nonconservative_fluxes(surface_integral.surface_flux,
+                                                                        equations),
+                        equations,
+                        surface_integral, dg, cache,
+                        i_index, j_index, k_index, i_node_index, j_node_index,
+                        direction_index, element_index, boundary_index,
+                        node_coordinates, contravariant_vectors)
+    return nothing
+end
+
+@inline function calc_boundary_flux!(u, surface_flux_values, t, boundary_condition,
+                                     MeshT::Type{<:Union{P4estMesh{3},
+                                                         T8codeMesh{3}}},
+                                     have_nonconservative_terms::True,
+                                     combine_conservative_and_nonconservative_fluxes::True,
+                                     equations,
+                                     surface_integral, dg::DG, cache, i_index, j_index,
+                                     k_index, i_node_index, j_node_index,
+                                     direction_index,
+                                     element_index, boundary_index, node_coordinates,
+                                     contravariant_vectors)
+    @unpack surface_flux = surface_integral
+
+    # Extract solution data from boundary container
+    u_inner = get_node_vars(u, equations, dg, i_node_index, j_node_index,
+                            boundary_index)
+
+    # Outward-pointing normal direction (not normalized)
+    normal_direction = get_normal_direction(direction_index, contravariant_vectors,
+                                            i_index, j_index, k_index, element_index)
+
+    # Coordinates at boundary node
+    x = get_node_coords(node_coordinates, equations, dg,
+                        i_index, j_index, k_index, element_index)
+
+    # Call pointwise numerical flux functions for the conservative and nonconservative part
+    # in the normal direction on the boundary
+    flux = boundary_condition(u_inner, normal_direction, x, t,
+                              surface_flux, equations)
+
+    # Copy flux to element storage in the correct orientation
+    for v in eachvariable(equations)
+        # Note the factor 0.5 necessary for the nonconservative fluxes based on
+        # the interpretation of global SBP operators coupled discontinuously via
+        # central fluxes/SATs
+        surface_flux_values[v, i_node_index, j_node_index,
+        direction_index, element_index] = flux[v]
+    end
+
+    return nothing
+end
+
 function calc_surface_integral!(backend::Backend, du, u,
                                 mesh::Union{P4estMesh{3}, T8codeMesh{3}},
                                 equations,