diff --git a/benchmarks/benchmarks.jl b/benchmarks/benchmarks.jl
index 6a16f4ec..6368bf07 100644
--- a/benchmarks/benchmarks.jl
+++ b/benchmarks/benchmarks.jl
@@ -3,4 +3,4 @@ include("n_body.jl")
 include("smoothed_particle_hydrodynamics.jl")
 include("update.jl")
 
-include("plot.jl")
+include("run_benchmarks.jl")
diff --git a/benchmarks/count_neighbors.jl b/benchmarks/count_neighbors.jl
index 6ed4c146..81faa764 100644
--- a/benchmarks/count_neighbors.jl
+++ b/benchmarks/count_neighbors.jl
@@ -2,7 +2,8 @@ using PointNeighbors
 using BenchmarkTools
 
 """
-    benchmark_count_neighbors(neighborhood_search, coordinates; parallel = true)
+    benchmark_count_neighbors(neighborhood_search, coordinates;
+                              parallelization_backend = default_backend(coordinates))
 
 A very cheap and simple neighborhood search benchmark, only counting the neighbors of each
 point. For each point-neighbor pair, only an array entry is incremented.
diff --git a/benchmarks/n_body.jl b/benchmarks/n_body.jl
index 66650cce..3c14531a 100644
--- a/benchmarks/n_body.jl
+++ b/benchmarks/n_body.jl
@@ -2,7 +2,8 @@ using PointNeighbors
 using BenchmarkTools
 
 """
-    benchmark_n_body(neighborhood_search, coordinates; parallel = true)
+    benchmark_n_body(neighborhood_search, coordinates;
+                     parallelization_backend = default_backend(coordinates))
 
 A simple neighborhood search benchmark, computing the right-hand side of an n-body
 simulation with a cutoff (corresponding to the search radius of `neighborhood_search`).
@@ -16,7 +17,6 @@ function benchmark_n_body(neighborhood_search, coordinates_;
                           parallelization_backend = default_backend(coordinates_))
     # Passing a different backend like `CUDA.CUDABackend`
     # allows us to change the type of the array to run the benchmark on the GPU.
-    # Passing `parallel = true` or `parallel = false` will not change anything here.
     coordinates = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates_)
     nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search)
 
diff --git a/benchmarks/plot.jl b/benchmarks/plot.jl
deleted file mode 100644
index bd7588c3..00000000
--- a/benchmarks/plot.jl
+++ /dev/null
@@ -1,86 +0,0 @@
-using Plots
-using BenchmarkTools
-
-# Generate a rectangular point cloud
-include("../test/point_cloud.jl")
-
-"""
-    plot_benchmarks(benchmark, n_points_per_dimension, iterations;
-                    seed = 1, perturbation_factor_position = 1.0,
-                    parallel = true, title = "")
-
-Run a benchmark with several neighborhood searches multiple times for increasing numbers
-of points and plot the results.
-
-# Arguments
-- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref)
-                            and [`benchmark_n_body`](@ref).
-- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
-                            of points. For example, use `(100, 100)` for a 2D benchmark or
-                            `(10, 10, 10)` for a 3D benchmark.
-- `iterations`:             Number of refinement iterations
-
-# Keywords
-- `parallel = true`:        Loop over all points in parallel
-- `title = ""`:             Title of the plot
-- `seed = 1`:               Seed to perturb the point positions. Different seeds yield
-                            slightly different point positions.
-- `perturbation_factor_position = 1.0`: Perturb point positions by this factor. A factor of
-                                        `1.0` corresponds to points being moved by
-                                        a maximum distance of `0.5` along each axis.
-
-# Examples
-```julia
-include("benchmarks/benchmarks.jl")
-
-plot_benchmarks(benchmark_count_neighbors, (10, 10), 3)
-"""
-function plot_benchmarks(benchmark, n_points_per_dimension, iterations;
-                         parallelization_backend = PolyesterBackend(), title = "",
-                         seed = 1, perturbation_factor_position = 1.0)
-    neighborhood_searches_names = ["TrivialNeighborhoodSearch";;
-                                   "GridNeighborhoodSearch";;
-                                   "PrecomputedNeighborhoodSearch"]
-
-    # Multiply number of points in each iteration (roughly) by this factor
-    scaling_factor = 4
-    per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension))
-    sizes = [round.(Int, n_points_per_dimension .* per_dimension_factor^(iter - 1))
-             for iter in 1:iterations]
-
-    n_particles_vec = prod.(sizes)
-    times = zeros(iterations, length(neighborhood_searches_names))
-
-    for iter in 1:iterations
-        coordinates = point_cloud(sizes[iter], seed = seed,
-                                  perturbation_factor_position = perturbation_factor_position)
-
-        search_radius = 3.0
-        NDIMS = size(coordinates, 1)
-        n_particles = size(coordinates, 2)
-
-        neighborhood_searches = [
-            TrivialNeighborhoodSearch{NDIMS}(; search_radius, eachpoint = 1:n_particles),
-            GridNeighborhoodSearch{NDIMS}(; search_radius, n_points = n_particles),
-            PrecomputedNeighborhoodSearch{NDIMS}(; search_radius, n_points = n_particles)
-        ]
-
-        for i in eachindex(neighborhood_searches)
-            neighborhood_search = neighborhood_searches[i]
-            initialize!(neighborhood_search, coordinates, coordinates)
-
-            time = benchmark(neighborhood_search, coordinates; parallelization_backend)
-            times[iter, i] = time
-            time_string = BenchmarkTools.prettytime(time * 1e9)
-            println("$(neighborhood_searches_names[i])")
-            println("with $(join(sizes[iter], "x")) = $(prod(sizes[iter])) particles finished in $time_string\n")
-        end
-    end
-
-    plot(n_particles_vec, times,
-         xaxis = :log, yaxis = :log,
-         xticks = (n_particles_vec, n_particles_vec),
-         xlabel = "#particles", ylabel = "Runtime [s]",
-         legend = :outerright, size = (750, 400), dpi = 200,
-         label = neighborhood_searches_names, title = title)
-end
diff --git a/benchmarks/run_benchmarks.jl b/benchmarks/run_benchmarks.jl
new file mode 100644
index 00000000..54c94ad7
--- /dev/null
+++ b/benchmarks/run_benchmarks.jl
@@ -0,0 +1,228 @@
+using Plots
+using BenchmarkTools
+
+# Generate a rectangular point cloud
+include("../test/point_cloud.jl")
+
+"""
+    run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches;
+                  parallelization_backend = PolyesterBackend(),
+                  names = ["Neighborhood search 1" "Neighborhood search 2" ...],
+                  seed = 1, perturbation_factor_position = 1.0)
+
+Run a benchmark with several neighborhood searches multiple times for increasing numbers
+of points and return the results as `(n_particles_vec, times)`, where `n_particles_vec`
+is a vector containing the number of particles for each iteration and `times` is a matrix
+containing the runtimes for each neighborhood search and iteration.
+
+See also
+- [`plot_benchmark`](@ref) to plot the results,
+- [`run_benchmark_default`](@ref) for the most commonly used implementations,
+- [`run_benchmark_gpu`](@ref) for all GPU-compatible implementations.
+
+# Arguments
+- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
+                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
+                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
+- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
+                            of points. For example, use `(100, 100)` for a 2D benchmark or
+                            `(10, 10, 10)` for a 3D benchmark.
+- `iterations`:             Number of refinement iterations
+- `neighborhood_searches`:  Collection of neighborhood searches to benchmark
+
+# Keywords
+- `parallelization_backend = PolyesterBackend()`: Parallelization strategy to use. See
+                                                  [`@threaded`](@ref) for a list of available
+                                                  backends.
+- `names`:                  Names of the neighborhood searches in the printed output.
+- `seed = 1`:               Seed to perturb the point positions. Different seeds yield
+                            slightly different point positions.
+- `perturbation_factor_position = 1.0`: Scale the point position perturbation by this factor.
+                                        A factor of `1.0` corresponds to a standard deviation
+                                        similar to that of a realistic simulation.
+
+# Examples
+```julia
+include("benchmarks/benchmarks.jl")
+
+run_benchmark(benchmark_count_neighbors, (10, 10), 3,
+              [TrivialNeighborhoodSearch{2}(), GridNeighborhoodSearch{2}()])
+```
+"""
+function run_benchmark(benchmark, n_points_per_dimension, iterations, neighborhood_searches;
+                       parallelization_backend = PolyesterBackend(),
+                       names = permutedims(["Neighborhood search $i"
+                                            for i in eachindex(neighborhood_searches)]),
+                       seed = 1, perturbation_factor_position = 1.0)
+    # Multiply number of points in each iteration (roughly) by this factor
+    scaling_factor = 4
+    per_dimension_factor = scaling_factor^(1 / length(n_points_per_dimension))
+    sizes = [round.(Int, n_points_per_dimension .* per_dimension_factor^(iter - 1))
+             for iter in 1:iterations]
+
+    n_particles_vec = prod.(sizes)
+    times = zeros(iterations, length(neighborhood_searches))
+
+    for iter in 1:iterations
+        coordinates = point_cloud(sizes[iter]; seed, perturbation_factor_position)
+        domain_size = maximum(sizes[iter]) + 1
+
+        # Normalize domain size to 1
+        coordinates ./= domain_size
+
+        # Make this Float32 to make sure that Float32 benchmarks use Float32 exclusively
+        search_radius = 4.0f0 / domain_size
+        n_particles = size(coordinates, 2)
+
+        neighborhood_searches_copy = copy_neighborhood_search.(neighborhood_searches,
+                                                               search_radius, n_particles)
+
+        for i in eachindex(neighborhood_searches_copy)
+            neighborhood_search = neighborhood_searches_copy[i]
+            PointNeighbors.initialize!(neighborhood_search, coordinates, coordinates)
+
+            time = benchmark(neighborhood_search, coordinates; parallelization_backend)
+            times[iter, i] = time
+            time_string = BenchmarkTools.prettytime(time * 1e9)
+            time_string_per_particle = BenchmarkTools.prettytime(time * 1e9 / n_particles)
+            println("$(names[i])")
+            println("with $(join(sizes[iter], "x")) = $(prod(sizes[iter])) particles " *
+                    "finished in $time_string ($time_string_per_particle per particle)\n")
+        end
+    end
+
+    return n_particles_vec, times
+end
+
+"""
+    run_benchmark_default(benchmark, n_points_per_dimension, iterations; kwargs...)
+
+Shortcut to call [`run_benchmark`](@ref) with the most commonly used neighborhood search
+implementations:
+- `GridNeighborhoodSearch`
+- `GridNeighborhoodSearch` with `FullGridCellList`
+- `PrecomputedNeighborhoodSearch`
+
+# Arguments
+- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
+                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
+                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
+- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
+                            of points. For example, use `(100, 100)` for a 2D benchmark or
+                            `(10, 10, 10)` for a 3D benchmark.
+- `iterations`:             Number of refinement iterations
+
+# Keywords
+See [`run_benchmark`](@ref) for a list of available keywords.
+
+# Examples
+```julia
+include("benchmarks/benchmarks.jl")
+
+run_benchmark_default(benchmark_n_body, (10, 10), 3)
+```
+"""
+function run_benchmark_default(benchmark, n_points_per_dimension, iterations; kwargs...)
+    NDIMS = length(n_points_per_dimension)
+    min_corner = 0.0f0 .* n_points_per_dimension
+    max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension))
+    # Templates; `run_benchmark` copies them with the search radius of each iteration
+    neighborhood_searches = [
+        GridNeighborhoodSearch{NDIMS}(),
+        GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0,
+                                      cell_list = FullGridCellList(; search_radius = 0.0f0,
+                                                                   min_corner, max_corner)),
+        PrecomputedNeighborhoodSearch{NDIMS}()
+    ]
+
+    names = ["GridNeighborhoodSearch";;
+             "GridNeighborhoodSearch with FullGridCellList";;
+             "PrecomputedNeighborhoodSearch"]
+
+    return run_benchmark(benchmark, n_points_per_dimension, iterations,
+                         neighborhood_searches; names, kwargs...)
+end
+
+"""
+    run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...)
+
+Shortcut to call [`run_benchmark`](@ref) with all GPU-compatible neighborhood search
+implementations:
+- `GridNeighborhoodSearch` with `FullGridCellList`
+
+# Arguments
+- `benchmark`:              The benchmark function. See [`benchmark_count_neighbors`](@ref),
+                            [`benchmark_n_body`](@ref), [`benchmark_wcsph`](@ref),
+                            [`benchmark_wcsph_fp32`](@ref) and [`benchmark_tlsph`](@ref).
+- `n_points_per_dimension`: Initial resolution as tuple. The product is the initial number
+                            of points. For example, use `(100, 100)` for a 2D benchmark or
+                            `(10, 10, 10)` for a 3D benchmark.
+- `iterations`:             Number of refinement iterations
+
+# Keywords
+See [`run_benchmark`](@ref) for a list of available keywords.
+
+# Examples
+```julia
+include("benchmarks/benchmarks.jl")
+
+run_benchmark_gpu(benchmark_n_body, (10, 10), 3)
+```
+"""
+function run_benchmark_gpu(benchmark, n_points_per_dimension, iterations; kwargs...)
+    NDIMS = length(n_points_per_dimension)
+    # Cell list bounds for the coordinates that `run_benchmark` divides by the domain size
+    min_corner = 0.0f0 .* n_points_per_dimension
+    max_corner = Float32.(n_points_per_dimension ./ maximum(n_points_per_dimension))
+    neighborhood_searches = [
+        GridNeighborhoodSearch{NDIMS}(search_radius = 0.0f0,
+                                      cell_list = FullGridCellList(; search_radius = 0.0f0,
+                                                                   min_corner, max_corner))
+    ]
+
+    names = ["GridNeighborhoodSearch with FullGridCellList";;]
+
+    return run_benchmark(benchmark, n_points_per_dimension, iterations,
+                         neighborhood_searches; names, kwargs...)
+end
+
+"""
+    plot_benchmark(n_particles_vec, times; kwargs...)
+
+Plot the results of a benchmark run with [`run_benchmark`](@ref).
+Note that the arguments are the outputs of that function.
+
+# Arguments
+- `n_particles_vec`: Vector containing the number of particles for each iteration.
+- `times`:           Matrix containing the runtimes for each neighborhood search and iteration.
+
+# Keywords
+Keyword arguments are passed to `Plots.plot`. For example, use `title = "My title"`.
+
+# Examples
+```julia
+include("benchmarks/benchmarks.jl")
+
+n_particles_vec, times = run_benchmark_default(benchmark_count_neighbors, (10, 10), 3)
+plot_benchmark(n_particles_vec, times; title = "Count neighbors benchmark")
+```
+"""
+function plot_benchmark(n_particles_vec, times; kwargs...)
+    function format_n_particles(n)
+        if n >= 1_000_000
+            return "$(round(Int, n / 1_000_000))M"
+        elseif n >= 1_000
+            return "$(round(Int, n / 1_000))k"
+        else
+            return string(n)
+        end
+    end
+    tick_labels = format_n_particles.(n_particles_vec)
+    # `kwargs` are splatted last so that they take precedence over the defaults below
+    return plot(n_particles_vec, times ./ n_particles_vec .* 1e9;
+                xaxis = :log,
+                xticks = (n_particles_vec, tick_labels), linewidth = 2,
+                xlabel = "#particles", ylabel = "runtime per particle [ns]",
+                legend = :outerright, size = (700, 350), dpi = 200, margin = 4 * Plots.mm,
+                palette = palette(:tab10), kwargs...)
+end
diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl
index 25dc3bce..0b97d702 100644
--- a/benchmarks/smoothed_particle_hydrodynamics.jl
+++ b/benchmarks/smoothed_particle_hydrodynamics.jl
@@ -21,7 +21,8 @@ end
 end
 
 """
-    benchmark_wcsph(neighborhood_search, coordinates; parallel = true)
+    benchmark_wcsph(neighborhood_search, coordinates;
+                    parallelization_backend = default_backend(coordinates))
 
 A benchmark of the right-hand side of a full real-life Weakly Compressible
 Smoothed Particle Hydrodynamics (WCSPH) simulation with TrixiParticles.jl.
@@ -30,47 +31,23 @@ This method is used to simulate an incompressible fluid.
 function benchmark_wcsph(neighborhood_search, coordinates;
                          parallelization_backend = default_backend(coordinates))
     density = 1000.0
-    fluid = InitialCondition(; coordinates, density, mass = 0.1)
-
-    # Compact support == 2 * smoothing length for these kernels
-    smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2
-    if ndims(neighborhood_search) == 1
-        smoothing_kernel = SchoenbergCubicSplineKernel{1}()
-    else
-        smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}()
-    end
+    particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3
+    fluid = InitialCondition(; coordinates, density, mass = 0.1, particle_spacing)
 
     sound_speed = 10.0
     state_equation = StateEquationCole(; sound_speed, reference_density = density,
                                        exponent = 1)
 
-    fluid_density_calculator = ContinuityDensity()
     viscosity = ArtificialViscosityMonaghan(alpha = 0.02, beta = 0.0)
     density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1)
 
-    fluid_system = WeaklyCompressibleSPHSystem(fluid, fluid_density_calculator,
-                                               state_equation, smoothing_kernel,
-                                               smoothing_length, viscosity = viscosity,
-                                               density_diffusion = density_diffusion)
-
-    system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system)
-    nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search)
-    semi = DummySemidiscretization(nhs, parallelization_backend)
-
-    v = PointNeighbors.Adapt.adapt(parallelization_backend,
-                                   vcat(fluid.velocity, fluid.density'))
-    u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates)
-    dv = zero(v)
-
-    # Initialize the system
-    TrixiParticles.initialize!(system, semi)
-    TrixiParticles.compute_pressure!(system, v, semi)
-
-    return @belapsed TrixiParticles.interact!($dv, $v, $u, $v, $u, $system, $system, $semi)
+    __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation,
+                            viscosity, density_diffusion, parallelization_backend)
 end
 
 """
-    benchmark_wcsph_fp32(neighborhood_search, coordinates; parallel = true)
+    benchmark_wcsph_fp32(neighborhood_search, coordinates;
+                         parallelization_backend = default_backend(coordinates))
 
 Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers.
 """
@@ -78,30 +55,33 @@ function benchmark_wcsph_fp32(neighborhood_search, coordinates_;
                               parallelization_backend = default_backend(coordinates_))
     coordinates = convert(Matrix{Float32}, coordinates_)
     density = 1000.0f0
-    fluid = InitialCondition(; coordinates, density, mass = 0.1f0)
-
-    # Compact support == 2 * smoothing length for these kernels
-    smoothing_length = convert(Float32,
-                               PointNeighbors.search_radius(neighborhood_search) / 2)
-    if ndims(neighborhood_search) == 1
-        smoothing_kernel = SchoenbergCubicSplineKernel{1}()
-    else
-        smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}()
-    end
+    particle_spacing = PointNeighbors.search_radius(neighborhood_search) / 3
+    fluid = InitialCondition(; coordinates, density, mass = 0.1f0, particle_spacing)
 
     sound_speed = 10.0f0
     state_equation = StateEquationCole(; sound_speed, reference_density = density,
                                        exponent = 1)
 
-    fluid_density_calculator = ContinuityDensity()
     viscosity = ArtificialViscosityMonaghan(alpha = 0.02f0, beta = 0.0f0)
     density_diffusion = DensityDiffusionMolteniColagrossi(delta = 0.1f0)
 
-    fluid_system = WeaklyCompressibleSPHSystem(fluid, fluid_density_calculator,
+    __benchmark_wcsph_inner(neighborhood_search, fluid, state_equation,
+                            viscosity, density_diffusion, parallelization_backend)
+end
+
+function __benchmark_wcsph_inner(neighborhood_search, initial_condition, state_equation,
+                                 viscosity, density_diffusion, parallelization_backend)
+    # Compact support == 2 * smoothing length for these kernels
+    smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2
+    if ndims(neighborhood_search) == 1
+        smoothing_kernel = SchoenbergCubicSplineKernel{1}()
+    else
+        smoothing_kernel = WendlandC2Kernel{ndims(neighborhood_search)}()
+    end
+
+    fluid_system = WeaklyCompressibleSPHSystem(initial_condition, ContinuityDensity(),
                                                state_equation, smoothing_kernel,
                                                smoothing_length, viscosity = viscosity,
-                                               acceleration = ntuple(_ -> 0.0f0,
-                                                                     Val(ndims(neighborhood_search))),
                                                density_diffusion = density_diffusion)
 
     system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system)
@@ -109,8 +89,9 @@ function benchmark_wcsph_fp32(neighborhood_search, coordinates_;
     semi = DummySemidiscretization(nhs, parallelization_backend)
 
     v = PointNeighbors.Adapt.adapt(parallelization_backend,
-                                   vcat(fluid.velocity, fluid.density'))
-    u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates)
+                                   vcat(initial_condition.velocity,
+                                        initial_condition.density'))
+    u = PointNeighbors.Adapt.adapt(parallelization_backend, initial_condition.coordinates)
     dv = zero(v)
 
     # Initialize the system
@@ -121,7 +102,8 @@ function benchmark_wcsph_fp32(neighborhood_search, coordinates_;
 end
 
 """
-    benchmark_tlsph(neighborhood_search, coordinates; parallel = true)
+    benchmark_tlsph(neighborhood_search, coordinates;
+                    parallelization_backend = default_backend(coordinates))
 
 A benchmark of the right-hand side of a full real-life Total Lagrangian
 Smoothed Particle Hydrodynamics (TLSPH) simulation with TrixiParticles.jl.
@@ -133,7 +115,8 @@ function benchmark_tlsph(neighborhood_search, coordinates;
     solid = InitialCondition(; coordinates, density = material.density, mass = 0.1)
 
     # Compact support == 2 * smoothing length for these kernels
-    smoothing_length = PointNeighbors.search_radius(neighborhood_search) / 2
+    smoothing_length_ = PointNeighbors.search_radius(neighborhood_search) / 2
+    smoothing_length = convert(typeof(material.E), smoothing_length_)
     if ndims(neighborhood_search) == 1
         smoothing_kernel = SchoenbergCubicSplineKernel{1}()
     else
diff --git a/benchmarks/update.jl b/benchmarks/update.jl
index 79904700..79fb8bb2 100644
--- a/benchmarks/update.jl
+++ b/benchmarks/update.jl
@@ -5,17 +5,20 @@ using BenchmarkTools
 include("../test/point_cloud.jl")
 
 """
-    benchmark_initialize(neighborhood_search, coordinates; parallel = true)
+    benchmark_initialize(neighborhood_search, coordinates;
+                         parallelization_backend = default_backend(coordinates))
 
 Benchmark neighborhood search initialization with the given `coordinates`.
 """
 function benchmark_initialize(neighborhood_search, coordinates;
                               parallelization_backend = default_backend(coordinates))
-    return @belapsed $initialize!($neighborhood_search, $coordinates, $coordinates)
+    return @belapsed $initialize!($neighborhood_search, $coordinates, $coordinates;
+                                  parallelization_backend = $parallelization_backend)
 end
 
 """
-    benchmark_update_alternating(neighborhood_search, coordinates; parallel = true)
+    benchmark_update_alternating(neighborhood_search, coordinates;
+                                 parallelization_backend = default_backend(coordinates))
 
 A very simple benchmark for neighborhood search update, alternating between two differently
 perturbed point clouds.
@@ -30,15 +33,16 @@ function benchmark_update_alternating(neighborhood_search, coordinates;
     # update in 2D and ~0.7% in 3D.
     # These values are the same as the experimentally computed averages in 2D and 3D SPH
     # dam break simulations. So this benchmark replicates a real-life SPH update.
-    perturb!(coordinates2, 0.015)
+    perturb!(coordinates2, 4e-4 * PointNeighbors.search_radius(neighborhood_search))
 
-    function update_alternating!(neighborhood_search, coordinates, coordinates2)
-        update!(neighborhood_search, coordinates, coordinates)
-        update!(neighborhood_search, coordinates, coordinates2)
+    function update_alternating!(neighborhood_search, coordinates, coordinates2,
+                                 parallelization_backend)
+        update!(neighborhood_search, coordinates, coordinates; parallelization_backend)
+        update!(neighborhood_search, coordinates, coordinates2; parallelization_backend)
     end
 
     result = @belapsed $update_alternating!($neighborhood_search, $coordinates,
-                                            $coordinates2)
+                                            $coordinates2, $parallelization_backend)
 
     # Return average update time
     return 0.5 * result
diff --git a/test/benchmarks.jl b/test/benchmarks.jl
index 573b99f5..6a3e7b3d 100644
--- a/test/benchmarks.jl
+++ b/test/benchmarks.jl
@@ -7,31 +7,38 @@
 
     @testset verbose=true "$(length(size))D" for size in [(50,), (10, 10), (5, 5, 5)]
         @testset verbose=true "`benchmark_count_neighbors`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_count_neighbors, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_count_neighbors, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_count_neighbors, size, 2)
         end
 
         @testset verbose=true "`benchmark_n_body`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_n_body, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_n_body, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_n_body, size, 2)
         end
 
         @testset verbose=true "`benchmark_wcsph`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_wcsph, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_wcsph, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_wcsph, size, 2)
         end
 
         @testset verbose=true "`benchmark_wcsph_fp32`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_wcsph_fp32, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_wcsph_fp32, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_wcsph_fp32, size, 2)
         end
 
         @testset verbose=true "`benchmark_tlsph`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_tlsph, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_tlsph, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_tlsph, size, 2)
         end
 
         @testset verbose=true "`benchmark_initialize`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_initialize, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_initialize, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_initialize, size, 2)
         end
 
         @testset verbose=true "`benchmark_update_alternating`" begin
-            @trixi_test_nowarn plot_benchmarks(benchmark_update_alternating, size, 2)
+            @trixi_test_nowarn run_benchmark_default(benchmark_update_alternating, size, 2)
+            @trixi_test_nowarn run_benchmark_gpu(benchmark_update_alternating, size, 2)
         end
     end
 end;
diff --git a/test/point_cloud.jl b/test/point_cloud.jl
index bbba9d0a..ba99b2e1 100644
--- a/test/point_cloud.jl
+++ b/test/point_cloud.jl
@@ -14,15 +14,19 @@ function point_cloud(n_points_per_dimension;
         coordinates[:, i] .= Tuple(cartesian_indices[i])
     end
 
-    perturb!(coordinates, perturbation_factor_position * 0.5)
+    # A standard deviation of 0.05 in the particle coordinates
+    # corresponds to a standard deviation of 2 in the number of neighbors for a 300 x 100
+    # grid, 1.6 for a 600 x 200 grid and 1.26 for a 1200 x 400 grid.
+    # This is consistent with the standard deviation in a vortex street simulation.
+    # The benchmark results are also consistent with the timer output of the simulation.
+    perturb!(coordinates, perturbation_factor_position * 0.05)
 
     return coordinates
 end
 
-function perturb!(data, amplitude)
+function perturb!(data, std_deviation)
     for i in eachindex(data)
-        # Perturbation in the interval (-amplitude, amplitude)
-        data[i] += 2 * amplitude * rand() - amplitude
+        data[i] += std_deviation * randn()
     end
 
     return data