diff --git a/Project.toml b/Project.toml
index 92fb96c6..70e91f98 100644
--- a/Project.toml
+++ b/Project.toml
@@ -1,7 +1,7 @@
 name = "PointNeighbors"
 uuid = "1c4d5385-0a27-49de-8e2c-43b175c8985c"
 authors = ["Erik Faulhaber <erik.faulhaber@uni-koeln.de>"]
-version = "0.5"
+version = "0.4.9-dev"
 
 [deps]
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
diff --git a/benchmarks/count_neighbors.jl b/benchmarks/count_neighbors.jl
index 9f73add7..6ed4c146 100644
--- a/benchmarks/count_neighbors.jl
+++ b/benchmarks/count_neighbors.jl
@@ -12,18 +12,20 @@ implementations are highlighted. On the other hand, this is the least realistic
 
 For a computationally heavier benchmark, see [`benchmark_n_body`](@ref).
 """
-function benchmark_count_neighbors(neighborhood_search, coordinates; parallel = true)
+function benchmark_count_neighbors(neighborhood_search, coordinates;
+                                   parallelization_backend = default_backend(coordinates))
     n_neighbors = zeros(Int, size(coordinates, 2))
 
-    function count_neighbors!(n_neighbors, coordinates, neighborhood_search, parallel)
+    function count_neighbors!(n_neighbors, coordinates, neighborhood_search,
+                              parallelization_backend)
         n_neighbors .= 0
 
-        foreach_point_neighbor(coordinates, coordinates, neighborhood_search,
-                               parallel = parallel) do i, _, _, _
+        foreach_point_neighbor(coordinates, coordinates, neighborhood_search;
+                               parallelization_backend) do i, _, _, _
             n_neighbors[i] += 1
         end
     end
 
     return @belapsed $count_neighbors!($n_neighbors, $coordinates,
-                                       $neighborhood_search, $parallel)
+                                       $neighborhood_search, $parallelization_backend)
 end
diff --git a/benchmarks/n_body.jl b/benchmarks/n_body.jl
index 6cd02721..66650cce 100644
--- a/benchmarks/n_body.jl
+++ b/benchmarks/n_body.jl
@@ -12,12 +12,13 @@ This is a more realistic benchmark for particle-based simulations than
 However, due to the higher computational cost, differences between neighborhood search
 implementations are less pronounced.
 """
-function benchmark_n_body(neighborhood_search, coordinates_; parallel = true)
+function benchmark_n_body(neighborhood_search, coordinates_;
+                          parallelization_backend = default_backend(coordinates_))
     # Passing a different backend like `CUDA.CUDABackend`
     # allows us to change the type of the array to run the benchmark on the GPU.
     # Passing `parallel = true` or `parallel = false` will not change anything here.
-    coordinates = PointNeighbors.Adapt.adapt(parallel, coordinates_)
-    nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
+    coordinates = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates_)
+    nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search)
 
     # This preserves the data type of `coordinates`, which makes it work for GPU types
     mass = 1e10 * (rand!(similar(coordinates, size(coordinates, 2))) .+ 1)
@@ -25,11 +26,12 @@ function benchmark_n_body(neighborhood_search, coordinates_; parallel = true)
 
     dv = similar(coordinates)
 
-    function compute_acceleration!(dv, coordinates, mass, G, neighborhood_search, parallel)
+    function compute_acceleration!(dv, coordinates, mass, G, neighborhood_search,
+                                   parallelization_backend)
         dv .= 0.0
 
-        foreach_point_neighbor(coordinates, coordinates, neighborhood_search,
-                               parallel = parallel) do i, j, pos_diff, distance
+        foreach_point_neighbor(coordinates, coordinates, neighborhood_search;
+                               parallelization_backend) do i, j, pos_diff, distance
             # Only consider particles with a distance > 0
             distance < sqrt(eps()) && return
 
@@ -43,5 +45,6 @@ function benchmark_n_body(neighborhood_search, coordinates_; parallel = true)
         return dv
     end
 
-    return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs, $parallel)
+    return @belapsed $compute_acceleration!($dv, $coordinates, $mass, $G, $nhs,
+                                            $parallelization_backend)
 end
diff --git a/benchmarks/plot.jl b/benchmarks/plot.jl
index 976cf045..bd7588c3 100644
--- a/benchmarks/plot.jl
+++ b/benchmarks/plot.jl
@@ -36,7 +36,7 @@ include("benchmarks/benchmarks.jl")
 plot_benchmarks(benchmark_count_neighbors, (10, 10), 3)
 """
 function plot_benchmarks(benchmark, n_points_per_dimension, iterations;
-                         parallel = true, title = "",
+                         parallelization_backend = PolyesterBackend(), title = "",
                          seed = 1, perturbation_factor_position = 1.0)
     neighborhood_searches_names = ["TrivialNeighborhoodSearch";;
                                    "GridNeighborhoodSearch";;
@@ -69,7 +69,7 @@ function plot_benchmarks(benchmark, n_points_per_dimension, iterations;
             neighborhood_search = neighborhood_searches[i]
             initialize!(neighborhood_search, coordinates, coordinates)
 
-            time = benchmark(neighborhood_search, coordinates, parallel = parallel)
+            time = benchmark(neighborhood_search, coordinates; parallelization_backend)
             times[iter, i] = time
             time_string = BenchmarkTools.prettytime(time * 1e9)
             println("$(neighborhood_searches_names[i])")
diff --git a/benchmarks/smoothed_particle_hydrodynamics.jl b/benchmarks/smoothed_particle_hydrodynamics.jl
index 2ccfb77d..4dfae330 100644
--- a/benchmarks/smoothed_particle_hydrodynamics.jl
+++ b/benchmarks/smoothed_particle_hydrodynamics.jl
@@ -9,7 +9,8 @@ A benchmark of the right-hand side of a full real-life Weakly Compressible
 Smoothed Particle Hydrodynamics (WCSPH) simulation with TrixiParticles.jl.
 This method is used to simulate an incompressible fluid.
 """
-function benchmark_wcsph(neighborhood_search, coordinates; parallel = true)
+function benchmark_wcsph(neighborhood_search, coordinates;
+                         parallelization_backend = default_backend(coordinates))
     density = 1000.0
     fluid = InitialCondition(; coordinates, density, mass = 0.1)
 
@@ -34,19 +35,12 @@ function benchmark_wcsph(neighborhood_search, coordinates; parallel = true)
                                                smoothing_length, viscosity = viscosity,
                                                density_diffusion = density_diffusion)
 
-    # Note that we cannot just disable parallelism in TrixiParticles.
-    # But passing a different backend like `CUDA.CUDABackend`
-    # allows us to change the type of the array to run the benchmark on the GPU.
-    if parallel isa Bool
-        system = fluid_system
-        nhs = neighborhood_search
-    else
-        system = PointNeighbors.Adapt.adapt(parallel, fluid_system)
-        nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
-    end
+    system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system)
+    nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search)
 
-    v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density'))
-    u = PointNeighbors.Adapt.adapt(parallel, coordinates)
+    v = PointNeighbors.Adapt.adapt(parallelization_backend,
+                                   vcat(fluid.velocity, fluid.density'))
+    u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates)
     dv = zero(v)
 
     # Initialize the system
@@ -61,7 +55,8 @@ end
 
 Like [`benchmark_wcsph`](@ref), but using single precision floating point numbers.
 """
-function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true)
+function benchmark_wcsph_fp32(neighborhood_search, coordinates_;
+                              parallelization_backend = default_backend(coordinates_))
     coordinates = convert(Matrix{Float32}, coordinates_)
     density = 1000.0f0
     fluid = InitialCondition(; coordinates, density, mass = 0.1f0)
@@ -88,19 +83,12 @@ function benchmark_wcsph_fp32(neighborhood_search, coordinates_; parallel = true
                                                acceleration = (0.0f0, 0.0f0, 0.0f0),
                                                density_diffusion = density_diffusion)
 
-    # Note that we cannot just disable parallelism in TrixiParticles.
-    # But passing a different backend like `CUDA.CUDABackend`
-    # allows us to change the type of the array to run the benchmark on the GPU.
-    if parallel isa Bool
-        system = fluid_system
-        nhs = neighborhood_search
-    else
-        system = PointNeighbors.Adapt.adapt(parallel, fluid_system)
-        nhs = PointNeighbors.Adapt.adapt(parallel, neighborhood_search)
-    end
+    system = PointNeighbors.Adapt.adapt(parallelization_backend, fluid_system)
+    nhs = PointNeighbors.Adapt.adapt(parallelization_backend, neighborhood_search)
 
-    v = PointNeighbors.Adapt.adapt(parallel, vcat(fluid.velocity, fluid.density'))
-    u = PointNeighbors.Adapt.adapt(parallel, coordinates)
+    v = PointNeighbors.Adapt.adapt(parallelization_backend,
+                                   vcat(fluid.velocity, fluid.density'))
+    u = PointNeighbors.Adapt.adapt(parallelization_backend, coordinates)
     dv = zero(v)
 
     # Initialize the system
@@ -117,7 +105,8 @@ A benchmark of the right-hand side of a full real-life Total Lagrangian
 Smoothed Particle Hydrodynamics (TLSPH) simulation with TrixiParticles.jl.
 This method is used to simulate an elastic structure.
 """
-function benchmark_tlsph(neighborhood_search, coordinates; parallel = true)
+function benchmark_tlsph(neighborhood_search, coordinates;
+                         parallelization_backend = default_backend(coordinates))
     material = (density = 1000.0, E = 1.4e6, nu = 0.4)
     solid = InitialCondition(; coordinates, density = material.density, mass = 0.1)
 
diff --git a/benchmarks/update.jl b/benchmarks/update.jl
index 2fce971f..79904700 100644
--- a/benchmarks/update.jl
+++ b/benchmarks/update.jl
@@ -9,7 +9,8 @@ include("../test/point_cloud.jl")
 
 Benchmark neighborhood search initialization with the given `coordinates`.
 """
-function benchmark_initialize(neighborhood_search, coordinates; parallel = true)
+function benchmark_initialize(neighborhood_search, coordinates;
+                              parallelization_backend = default_backend(coordinates))
     return @belapsed $initialize!($neighborhood_search, $coordinates, $coordinates)
 end
 
@@ -21,7 +22,8 @@ perturbed point clouds.
 
 This is a good benchmark for incremental updates, since most particles stay in their cells.
 """
-function benchmark_update_alternating(neighborhood_search, coordinates; parallel = true)
+function benchmark_update_alternating(neighborhood_search, coordinates;
+                                      parallelization_backend = default_backend(coordinates))
     coordinates2 = copy(coordinates)
     # Perturb all coordinates with a perturbation factor of `0.015`.
     # This factor was tuned so that ~0.5% of the particles change their cell during an
diff --git a/src/PointNeighbors.jl b/src/PointNeighbors.jl
index 979230ac..f6458409 100644
--- a/src/PointNeighbors.jl
+++ b/src/PointNeighbors.jl
@@ -11,8 +11,8 @@ using LinearAlgebra: dot
 using Polyester: Polyester
 @reexport using StaticArrays: SVector
 
-include("util.jl")
 include("vector_of_vectors.jl")
+include("util.jl")
 include("neighborhood_search.jl")
 include("nhs_trivial.jl")
 include("cell_lists/cell_lists.jl")
@@ -25,9 +25,10 @@ export TrivialNeighborhoodSearch, GridNeighborhoodSearch, PrecomputedNeighborhoo
 export DictionaryCellList, FullGridCellList
 export ParallelUpdate, SemiParallelUpdate, SerialIncrementalUpdate, SerialUpdate,
        ParallelIncrementalUpdate
-export requires_update, requires_resizing
+export requires_update
 export initialize!, update!, initialize_grid!, update_grid!
-export PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend
+export SerialBackend, PolyesterBackend, ThreadsDynamicBackend, ThreadsStaticBackend,
+       default_backend
 export PeriodicBox, copy_neighborhood_search
 
 end # module PointNeighbors
diff --git a/src/cell_lists/full_grid.jl b/src/cell_lists/full_grid.jl
index 552b2634..be68440e 100644
--- a/src/cell_lists/full_grid.jl
+++ b/src/cell_lists/full_grid.jl
@@ -126,7 +126,7 @@ function Base.empty!(cell_list::FullGridCellList)
     (; cells) = cell_list
 
     # `Base.empty!.(cells)`, but for all backends
-    for i in eachindex(cells)
+    @threaded default_backend(cells) for i in eachindex(cells)
         emptyat!(cells, i)
     end
 
@@ -225,6 +225,22 @@ end
 @inline function check_cell_bounds(cell_list::FullGridCellList, cell::Tuple)
     (; linear_indices) = cell_list
 
+    # Make sure that points are not added to the outer padding layer, which is needed
+    # to ensure that neighboring cells in all directions of all non-empty cells exist.
+    if !all(cell[i] in 2:(size(linear_indices, i) - 1) for i in eachindex(cell))
+        size_ = [2:(size(linear_indices, i) - 1) for i in eachindex(cell)]
+        print_size_ = "[$(join(size_, ", "))]"
+        error("particle coordinates are NaN or outside the domain bounds of the cell list\n" *
+              "cell $cell is out of bounds for cell grid of size $print_size_")
+    end
+end
+
+# On GPUs, we can't throw a proper error message because string interpolation is not allowed
+@inline function check_cell_bounds(cell_list::FullGridCellList{<:DynamicVectorOfVectors{<:Any,
+                                                                                        <:AbstractGPUArray}},
+                                   cell::Tuple)
+    (; linear_indices) = cell_list
+
     # Make sure that points are not added to the outer padding layer, which is needed
     # to ensure that neighboring cells in all directions of all non-empty cells exist.
     if !all(cell[i] in 2:(size(linear_indices, i) - 1) for i in eachindex(cell))
diff --git a/src/gpu.jl b/src/gpu.jl
index 9293e19f..cd418c2e 100644
--- a/src/gpu.jl
+++ b/src/gpu.jl
@@ -31,6 +31,3 @@ function Adapt.adapt_structure(to, nhs::GridNeighborhoodSearch)
     return GridNeighborhoodSearch(cell_list, search_radius, periodic_box, n_cells,
                                   cell_size, update_buffer, nhs.update_strategy)
 end
-
-# This is useful to pass the backend directly to `@threaded`
-KernelAbstractions.get_backend(backend::KernelAbstractions.Backend) = backend
diff --git a/src/neighborhood_search.jl b/src/neighborhood_search.jl
index 5a805ab9..e035720b 100644
--- a/src/neighborhood_search.jl
+++ b/src/neighborhood_search.jl
@@ -13,17 +13,8 @@ function requires_update(::AbstractNeighborhoodSearch)
 end
 
 """
-    requires_resizing(search::AbstractNeighborhoodSearch)
-
-Returns `false` if the neighborhood search can be updated with a different number
-of neighbor points (`y`) without re-initializing it.
-"""
-function requires_resizing(::AbstractNeighborhoodSearch)
-    error("`requires_resizing` not implemented for this neighborhood search.")
-end
-
-"""
-    initialize!(search::AbstractNeighborhoodSearch, x, y)
+    initialize!(search::AbstractNeighborhoodSearch, x, y;
+                parallelization_backend = default_backend(x))
 
 Initialize a neighborhood search with the two coordinate arrays `x` and `y`.
 
@@ -32,19 +23,27 @@ all points in `y` whose distances to that point are smaller than the search radi
 `x` and `y` are expected to be matrices, where the `i`-th column contains the coordinates
 of point `i`. Note that `x` and `y` can be identical.
 
+If the neighborhood search type supports parallelization, the keyword argument
+`parallelization_backend` can be used to specify a parallelization backend.
+See [`@threaded`](@ref) for a list of available backends.
+
 See also [`update!`](@ref).
 """
-@inline initialize!(search::AbstractNeighborhoodSearch, x, y) = search
+@inline function initialize!(search::AbstractNeighborhoodSearch, x, y;
+                             parallelization_backend = default_backend(x))
+    return search
+end
 
 """
-    update!(search::AbstractNeighborhoodSearch, x, y; points_moving = (true, true))
+    update!(search::AbstractNeighborhoodSearch, x, y; points_moving = (true, true),
+            parallelization_backend = default_backend(x))
 
 Update an already initialized neighborhood search with the two coordinate arrays `x` and `y`.
 
-Like [`initialize!`](@ref), but reusing the existing data structures of the already
-initialized neighborhood search.
+Like [`initialize!`](@ref), but potentially reusing the existing data structures
+of the already initialized neighborhood search.
 When the points only moved a small distance since the last `update!` or `initialize!`,
-this is significantly faster than `initialize!`.
+this can be significantly faster than `initialize!`.
 
 Not all implementations support incremental updates.
 If incremental updates are not possible for an implementation, `update!` will fall back
@@ -56,20 +55,15 @@ in this case to avoid unnecessary updates.
 The first flag in `points_moving` indicates if points in `x` are moving.
 The second flag indicates if points in `y` are moving.
 
-!!! warning "Experimental Feature: Backend Specification"
-    The keyword argument `parallelization_backend` allows users to specify the
-    multithreading backend. This feature is currently considered experimental!
-
-    Possible parallelization backends are:
-    - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic`
-    - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static`
-    - [`PolyesterBackend`](@ref) to use `Polyester.@batch`
-    - `KernelAbstractions.Backend` to launch a GPU kernel
+If the neighborhood search type supports parallelization, the keyword argument
+`parallelization_backend` can be used to specify a parallelization backend.
+See [`@threaded`](@ref) for a list of available backends.
 
 See also [`initialize!`](@ref).
 """
 @inline function update!(search::AbstractNeighborhoodSearch, x, y;
-                         points_moving = (true, true), parallelization_backend = x)
+                         points_moving = (true, true),
+                         parallelization_backend = default_backend(x))
     return search
 end
 
@@ -129,7 +123,8 @@ end
 
 """
     foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search;
-                           points = axes(system_coords, 2), parallel = true)
+                           parallelization_backend = default_backend(system_coords),
+                           points = axes(system_coords, 2))
 
 Loop for each point in `system_coords` over all points in `neighbor_coords` whose distances
 to that point are smaller than the search radius and execute the function `f(i, j, pos_diff, d)`,
@@ -140,11 +135,13 @@ where
   (`system_coords[:, i]`) and ``y`` the coordinates of the neighbor (`neighbor_coords[:, j]`),
 - `d` the distance between `x` and `y`.
 
-The `neighborhood_search` must have been initialized or updated with `system_coords`
-as first coordinate array and `neighbor_coords` as second coordinate array.
-
 Note that `system_coords` and `neighbor_coords` can be identical.
 
+!!! warning
+    The `neighborhood_search` must have been initialized or updated with `system_coords`
+    as first coordinate array and `neighbor_coords` as second coordinate array.
+    This can be skipped for certain implementations. See [`requires_update`](@ref).
+
 # Arguments
 - `f`: The function explained above.
 - `system_coords`: A matrix where the `i`-th column contains the coordinates of point `i`.
@@ -155,70 +152,26 @@ Note that `system_coords` and `neighbor_coords` can be identical.
 
 # Keywords
 - `points`: Loop over these point indices. By default all columns of `system_coords`.
-- `parallel=true`: Run the outer loop over `points` thread-parallel.
+- `parallelization_backend`: Run the outer loop over `points` in parallel with the
+                             specified backend. By default, the backend is selected
+                             automatically based on the type of `system_coords`.
+                             See [`@threaded`](@ref) for a list of available backends.
 
 See also [`initialize!`](@ref), [`update!`](@ref).
 """
 function foreach_point_neighbor(f::T, system_coords, neighbor_coords, neighborhood_search;
-                                parallel::Union{Bool, ParallelizationBackend} = true,
+                                parallelization_backend::ParallelizationBackend = default_backend(system_coords),
                                 points = axes(system_coords, 2)) where {T}
     # The type annotation above is to make Julia specialize on the type of the function.
     # Otherwise, unspecialized code will cause a lot of allocations
     # and heavily impact performance.
     # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
-    if parallel isa Bool
-        # When `false` is passed, run serially. When `true` is passed, run either a
-        # threaded loop with `Polyester.@batch`, or, when `system_coords` is a GPU array,
-        # launch the loop as a kernel on the GPU.
-        parallel_ = Val(parallel)
-    elseif parallel isa ParallelizationBackend
-        # When a `KernelAbstractions.Backend` is passed, launch the loop as a GPU kernel
-        # on this backend. This is useful to test the GPU code on the CPU by passing
-        # `parallel = KernelAbstractions.CPU()`, even though `system_coords isa Array`.
-        parallel_ = parallel
-    end
-
-    foreach_point_neighbor(f, system_coords, neighbor_coords, neighborhood_search, points,
-                           parallel_)
-end
 
-@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
-                                        neighborhood_search, points, parallel::Val{true})
     # Explicit bounds check before the hot loop (or GPU kernel)
     @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
 
-    @threaded system_coords for point in points
-        # Now we can assume that `point` is inbounds
-        @inbounds foreach_neighbor(f, system_coords, neighbor_coords,
-                                   neighborhood_search, point)
-    end
-
-    return nothing
-end
-
-# When a `KernelAbstractions.Backend` is passed, launch a GPU kernel on this backend
-@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
-                                        neighborhood_search, points,
-                                        backend::ParallelizationBackend)
-    # Explicit bounds check before the hot loop (or GPU kernel)
-    @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
-
-    @threaded backend for point in points
-        # Now we can assume that `point` is inbounds
-        @inbounds foreach_neighbor(f, system_coords, neighbor_coords,
-                                   neighborhood_search, point)
-    end
-
-    return nothing
-end
-
-@inline function foreach_point_neighbor(f, system_coords, neighbor_coords,
-                                        neighborhood_search, points, parallel::Val{false})
-    # Explicit bounds check before the hot loop
-    @boundscheck checkbounds(system_coords, ndims(neighborhood_search), points)
-
-    for point in points
-        # Now we can assume that `point` is inbounds
+    @threaded parallelization_backend for point in points
+        # Now we can safely assume that `point` is inbounds
         @inbounds foreach_neighbor(f, system_coords, neighbor_coords,
                                    neighborhood_search, point)
     end
diff --git a/src/nhs_grid.jl b/src/nhs_grid.jl
index f46316d5..e29ac8c9 100644
--- a/src/nhs_grid.jl
+++ b/src/nhs_grid.jl
@@ -166,18 +166,6 @@ See [`GridNeighborhoodSearch`](@ref) for usage information.
 """
 struct SerialUpdate end
 
-@inline function requires_resizing(::GridNeighborhoodSearch{<:Any, ParallelUpdate})
-    # Update is just a re-initialization, so no re-initialization is needed
-    # when the number of points changes.
-    return false
-end
-
-@inline function requires_resizing(::GridNeighborhoodSearch)
-    # Incremental update strategies require re-initialization
-    # when the number of points changes.
-    return true
-end
-
 # No update buffer needed for non-incremental update/initialize
 @inline function create_update_buffer(::Union{SerialUpdate, ParallelUpdate}, _, _)
     return nothing
@@ -205,12 +193,13 @@ end
 end
 
 function initialize!(neighborhood_search::GridNeighborhoodSearch,
-                     x::AbstractMatrix, y::AbstractMatrix)
-    initialize_grid!(neighborhood_search, y)
+                     x::AbstractMatrix, y::AbstractMatrix;
+                     parallelization_backend = default_backend(x))
+    initialize_grid!(neighborhood_search, y; parallelization_backend)
 end
 
 function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::AbstractMatrix;
-                          parallelization_backend = y)
+                          parallelization_backend = default_backend(y))
     (; cell_list) = neighborhood_search
 
     empty!(cell_list)
@@ -221,9 +210,10 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra
         return neighborhood_search
     end
 
+    # Ignore the parallelization backend here. This cannot be parallelized.
     for point in axes(y, 2)
         # Get cell index of the point's cell
-        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+        point_coords = @inbounds extract_svector(y, Val(ndims(neighborhood_search)), point)
         cell = cell_coords(point_coords, neighborhood_search)
 
         # Add point to corresponding cell
@@ -233,14 +223,9 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch, y::Abstra
     return neighborhood_search
 end
 
-# WARNING! Experimental feature:
-# By default, determine the parallelization backend from the type of `y`.
-# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code
-# on this backend. This can be useful to run GPU kernels on the CPU by passing
-# `parallelization_backend = KernelAbstractions.CPU()`, even though `y isa Array`.
 function initialize_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                       ParallelUpdate},
-                          y::AbstractMatrix; parallelization_backend = y)
+                          y::AbstractMatrix; parallelization_backend = default_backend(y))
     (; cell_list) = neighborhood_search
 
     empty!(cell_list)
@@ -263,14 +248,9 @@ function initialize_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
     return neighborhood_search
 end
 
-# WARNING! Experimental feature:
-# By default, determine the parallelization backend from the type of `x`.
-# Optionally, pass a `KernelAbstractions.Backend` to run the KernelAbstractions.jl code
-# on this backend. This can be useful to run GPU kernels on the CPU by passing
-# `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`.
 function update!(neighborhood_search::GridNeighborhoodSearch,
                  x::AbstractMatrix, y::AbstractMatrix;
-                 points_moving = (true, true), parallelization_backend = x)
+                 points_moving = (true, true), parallelization_backend = default_backend(x))
     # The coordinates of the first set of points are irrelevant for this NHS.
     # Only update when the second set is moving.
     points_moving[2] || return neighborhood_search
@@ -279,19 +259,8 @@ function update!(neighborhood_search::GridNeighborhoodSearch,
 end
 
 # Update only with neighbor coordinates
-function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS},
-                      y::AbstractMatrix; parallelization_backend = y) where {NDIMS}
-    update_grid!(neighborhood_search, i -> extract_svector(y, Val(NDIMS), i);
-                 parallelization_backend)
-end
-
-# Serial and semi-parallel update.
-# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
-function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
-                                                                        SerialIncrementalUpdate},
-                                                 GridNeighborhoodSearch{<:Any,
-                                                                        SemiParallelUpdate}},
-                      coords_fun::Function; parallelization_backend = nothing)
+function update_grid!(neighborhood_search::GridNeighborhoodSearch{NDIMS}, y::AbstractMatrix;
+                      parallelization_backend = default_backend(y)) where {NDIMS}
     (; cell_list, update_buffer) = neighborhood_search
 
     # Empty each thread's list
@@ -301,7 +270,7 @@ function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
 
     # Find all cells containing points that now belong to another cell.
     # This loop is threaded for `update_strategy == SemiParallelUpdate()`.
-    mark_changed_cells!(neighborhood_search, coords_fun, parallelization_backend)
+    mark_changed_cells!(neighborhood_search, y, parallelization_backend)
 
     # Iterate over all marked cells and move the points into their new cells.
     # This is always a serial loop (hence "semi-parallel").
@@ -316,11 +285,10 @@ function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
             # `deleteat_cell!(..., i)` will change the order of points that come after `i`.
             for i in reverse(eachindex(points))
                 point = points[i]
-                cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)
-
-                if !is_correct_cell(cell_list, cell_coords_, cell_index)
-                    new_cell_coords = cell_coords(coords_fun(point), neighborhood_search)
+                point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+                new_cell_coords = cell_coords(point_coords, neighborhood_search)
 
+                if !is_correct_cell(cell_list, new_cell_coords, cell_index)
                     # Add point to new cell or create cell if it does not exist
                     push_cell!(cell_list, new_cell_coords, point)
 
@@ -339,31 +307,33 @@ end
 # See https://docs.julialang.org/en/v1/manual/performance-tips/#Be-aware-of-when-Julia-avoids-specializing
 @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                  SemiParallelUpdate},
-                                     coords_fun::T, parallelization_backend) where {T}
+                                     y, parallelization_backend) where {T}
     (; cell_list) = neighborhood_search
 
     # `each_cell_index(cell_list)` might return a `KeySet`, which has to be `collect`ed
     # first to be able to be used in a threaded loop. This function takes care of that.
     @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
-        mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
+        mark_changed_cell!(neighborhood_search, cell_index, y)
     end
 end
 
 @inline function mark_changed_cells!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                                  SerialIncrementalUpdate},
-                                     coords_fun::T, _) where {T}
+                                     y, _) where {T}
     (; cell_list) = neighborhood_search
 
+    # Ignore the parallelization backend here for `SerialIncrementalUpdate`.
     for cell_index in each_cell_index(cell_list)
-        mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
+        mark_changed_cell!(neighborhood_search, cell_index, y)
     end
 end
 
-@inline function mark_changed_cell!(neighborhood_search, cell_index, coords_fun)
+@inline function mark_changed_cell!(neighborhood_search, cell_index, y)
     (; cell_list, update_buffer) = neighborhood_search
 
     for point in cell_list[cell_index]
-        cell = cell_coords(coords_fun(point), neighborhood_search)
+        point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+        cell = cell_coords(point_coords, neighborhood_search)
 
         # `cell` is a tuple, `cell_index` is the linear index used internally by the
         # cell list to store cells inside `cell`.
@@ -376,11 +346,10 @@ end
     end
 end
 
-# Fully parallel update with atomic push.
-# See the warning above. `parallelization_backend = nothing` will use `Polyester.@batch`.
+# Fully parallel incremental update with atomic push.
 function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
                                                                   ParallelIncrementalUpdate},
-                      coords_fun::Function; parallelization_backend = nothing)
+                      y::AbstractMatrix; parallelization_backend = default_backend(y))
     (; cell_list, update_buffer) = neighborhood_search
 
     # Note that we need two separate loops for adding and removing points.
@@ -399,11 +368,10 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
     @threaded parallelization_backend for cell_index in each_cell_index_threadable(cell_list)
         for i in 1:update_buffer[cell_index]
             point = cell_list.cells.backend[i, cell_index]
-            cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)
-
-            if !is_correct_cell(cell_list, cell_coords_, cell_index)
-                new_cell_coords = cell_coords(coords_fun(point), neighborhood_search)
+            point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+            new_cell_coords = cell_coords(point_coords, neighborhood_search)
 
+            if !is_correct_cell(cell_list, new_cell_coords, cell_index)
                 # Add point to new cell or create cell if it does not exist
                 push_cell_atomic!(cell_list, new_cell_coords, point)
             end
@@ -419,9 +387,10 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
         # `deleteat_cell!(..., i)` will change the order of points that come after `i`.
         for i in reverse(eachindex(points))
             point = points[i]
-            cell_coords_ = cell_coords(coords_fun(point), neighborhood_search)
+            point_coords = extract_svector(y, Val(ndims(neighborhood_search)), point)
+            new_cell_coords = cell_coords(point_coords, neighborhood_search)
 
-            if !is_correct_cell(cell_list, cell_coords_, cell_index)
+            if !is_correct_cell(cell_list, new_cell_coords, cell_index)
                 # Remove moved point from this cell
                 deleteat_cell!(cell_list, cell_index, i)
             end
@@ -431,13 +400,12 @@ function update_grid!(neighborhood_search::GridNeighborhoodSearch{<:Any,
     return neighborhood_search
 end
 
-# Note that this is only defined when a matrix `y` is passed. When updating with a function,
-# it will fall back to the semi-parallel update.
+# Non-incremental update strategies just forward to `initialize_grid!`
 function update_grid!(neighborhood_search::Union{GridNeighborhoodSearch{<:Any,
                                                                         ParallelUpdate},
                                                  GridNeighborhoodSearch{<:Any,
                                                                         SerialUpdate}},
-                      y::AbstractMatrix; parallelization_backend = y)
+                      y::AbstractMatrix; parallelization_backend = default_backend(y))
     initialize_grid!(neighborhood_search, y; parallelization_backend)
 end
 
diff --git a/src/nhs_precomputed.jl b/src/nhs_precomputed.jl
index cb58cfb3..e46926d0 100644
--- a/src/nhs_precomputed.jl
+++ b/src/nhs_precomputed.jl
@@ -47,22 +47,20 @@ end
 
 @inline requires_update(::PrecomputedNeighborhoodSearch) = (true, true)
 
-@inline function requires_resizing(search::PrecomputedNeighborhoodSearch)
-    return requires_resizing(search.neighborhood_search)
-end
-
 @inline function search_radius(search::PrecomputedNeighborhoodSearch)
     return search_radius(search.neighborhood_search)
 end
 
 function initialize!(search::PrecomputedNeighborhoodSearch,
-                     x::AbstractMatrix, y::AbstractMatrix)
+                     x::AbstractMatrix, y::AbstractMatrix;
+                     parallelization_backend = default_backend(x))
     (; neighborhood_search, neighbor_lists) = search
 
     # Initialize grid NHS
-    initialize!(neighborhood_search, x, y)
+    initialize!(neighborhood_search, x, y; parallelization_backend)
 
-    initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y)
+    initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y,
+                               parallelization_backend)
 end
 
 # WARNING! Experimental feature:
@@ -72,7 +70,7 @@ end
 # `parallelization_backend = KernelAbstractions.CPU()`, even though `x isa Array`.
 function update!(search::PrecomputedNeighborhoodSearch,
                  x::AbstractMatrix, y::AbstractMatrix;
-                 points_moving = (true, true), parallelization_backend = x)
+                 points_moving = (true, true), parallelization_backend = default_backend(x))
     (; neighborhood_search, neighbor_lists) = search
 
     # Update grid NHS
@@ -80,11 +78,13 @@ function update!(search::PrecomputedNeighborhoodSearch,
 
     # Skip update if both point sets are static
     if any(points_moving)
-        initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y)
+        initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y,
+                                   parallelization_backend)
     end
 end
 
-function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y)
+function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y,
+                                    parallelization_backend)
     # Initialize neighbor lists
     empty!(neighbor_lists)
     resize!(neighbor_lists, size(x, 2))
@@ -93,7 +93,8 @@ function initialize_neighbor_lists!(neighbor_lists, neighborhood_search, x, y)
     end
 
     # Fill neighbor lists
-    foreach_point_neighbor(x, y, neighborhood_search) do point, neighbor, _, _
+    foreach_point_neighbor(x, y, neighborhood_search;
+                           parallelization_backend) do point, neighbor, _, _
         push!(neighbor_lists[point], neighbor)
     end
 end
diff --git a/src/nhs_trivial.jl b/src/nhs_trivial.jl
index fbc57631..9e7c564b 100644
--- a/src/nhs_trivial.jl
+++ b/src/nhs_trivial.jl
@@ -32,12 +32,14 @@ end
 
 @inline requires_update(::TrivialNeighborhoodSearch) = (false, false)
 
-@inline requires_resizing(::TrivialNeighborhoodSearch) = false
-
-@inline initialize!(search::TrivialNeighborhoodSearch, x, y) = search
+@inline function initialize!(search::TrivialNeighborhoodSearch, x, y;
+                             parallelization_backend = default_backend(x))
+    return search
+end
 
 @inline function update!(search::TrivialNeighborhoodSearch, x, y;
-                         points_moving = (true, true), parallelization_backend = x)
+                         points_moving = (true, true),
+                         parallelization_backend = default_backend(x))
     return search
 end
 
diff --git a/src/util.jl b/src/util.jl
index 7cfbfedc..a188c80b 100644
--- a/src/util.jl
+++ b/src/util.jl
@@ -35,6 +35,13 @@ end
 
 abstract type AbstractThreadingBackend end
 
+"""
+    SerialBackend()
+
+Pass as first argument to the [`@threaded`](@ref) macro to run the loop serially.
+"""
+struct SerialBackend <: AbstractThreadingBackend end
+
 """
     PolyesterBackend()
 
@@ -63,32 +70,41 @@ struct ThreadsStaticBackend <: AbstractThreadingBackend end
 const ParallelizationBackend = Union{AbstractThreadingBackend, KernelAbstractions.Backend}
 
 """
-    @threaded x for ... end
+    default_backend(x)
 
-Run either a threaded CPU loop or launch a kernel on the GPU, depending on the type of `x`.
+Select the recommended backend for an array `x`.
+This allows to write generic code that works for both CPU and GPU arrays.
+
+The default backend for CPU arrays is currently `PolyesterBackend()`.
+For GPU arrays, the respective `KernelAbstractions.Backend` is returned.
+"""
+@inline default_backend(::AbstractArray) = PolyesterBackend()
+@inline default_backend(x::AbstractGPUArray) = KernelAbstractions.get_backend(x)
+@inline default_backend(x::DynamicVectorOfVectors) = default_backend(x.backend)
+
+"""
+    @threaded backend for ... end
+
+Run either a threaded CPU loop or launch a kernel on the GPU, depending on the `backend`.
 Semantically the same as `Threads.@threads` when iterating over a `AbstractUnitRange`
 but without guarantee that the underlying implementation uses `Threads.@threads`
 or works for more general `for` loops.
 
-The first argument must either be a parallelization backend (see below) or an array from
-which the backend can be derived to determine if the loop must be run threaded on the CPU
-or launched as a kernel on the GPU. Passing `KernelAbstractions.CPU()` will run the GPU
-kernel on the CPU.
-
 Possible parallelization backends are:
+- [`SerialBackend`](@ref) to disable multithreading
 - [`PolyesterBackend`](@ref) to use `Polyester.@batch`
 - [`ThreadsDynamicBackend`](@ref) to use `Threads.@threads :dynamic`
 - [`ThreadsStaticBackend`](@ref) to use `Threads.@threads :static`
-- `KernelAbstractions.Backend` to execute the loop as a GPU kernel
+- Any `KernelAbstractions.Backend` to execute the loop as a GPU kernel
 
-In particular, the underlying threading capabilities might be provided by other packages
-such as [Polyester.jl](https://github.com/JuliaSIMD/Polyester.jl).
+Use `default_backend(x)` to select the recommended backend for an array `x`.
+This allows to write generic code that works for both CPU and GPU arrays.
 
 !!! warning "Warning"
     This macro does not necessarily work for general `for` loops. For example,
     it does not necessarily support general iterables such as `eachline(filename)`.
 """
-macro threaded(system, expr)
+macro threaded(backend, expr)
     # Reverse-engineer the for loop.
     # `expr.args[1]` is the head of the for loop, like `i = eachindex(x)`.
     # So, `expr.args[1].args[2]` is the iterator `eachindex(x)`
@@ -100,37 +116,42 @@ macro threaded(system, expr)
     # Assemble the `for` loop again as a call to `parallel_foreach`, using `$i` to use the
     # same loop variable as used in the for loop.
     return esc(quote
-                   PointNeighbors.parallel_foreach($iterator, $system) do $i
+                   PointNeighbors.parallel_foreach($iterator, $backend) do $i
                        $inner_loop
                    end
                end)
 end
 
-# Use `Polyester.@batch` for low-overhead threading
-# This is currently the default when x::Array
-@inline function parallel_foreach(f, iterator, x)
+# Serial loop
+@inline function parallel_foreach(f, iterator, ::SerialBackend)
+    for i in iterator
+        @inline f(i)
+    end
+end
+
+# Use `Polyester.@batch`
+@inline function parallel_foreach(f, iterator, ::PolyesterBackend)
     Polyester.@batch for i in iterator
         @inline f(i)
     end
 end
 
 # Use `Threads.@threads :dynamic`
-@inline function parallel_foreach(f, iterator, x::ThreadsDynamicBackend)
+@inline function parallel_foreach(f, iterator, ::ThreadsDynamicBackend)
     Threads.@threads :dynamic for i in iterator
         @inline f(i)
     end
 end
 
 # Use `Threads.@threads :static`
-@inline function parallel_foreach(f, iterator, x::ThreadsStaticBackend)
+@inline function parallel_foreach(f, iterator, ::ThreadsStaticBackend)
     Threads.@threads :static for i in iterator
         @inline f(i)
     end
 end
 
 # On GPUs, execute `f` inside a GPU kernel with KernelAbstractions.jl
-@inline function parallel_foreach(f, iterator,
-                                  x::Union{AbstractGPUArray, KernelAbstractions.Backend})
+@inline function parallel_foreach(f, iterator, backend::KernelAbstractions.Backend)
     # On the GPU, we can only loop over `1:N`. Therefore, we loop over `1:length(iterator)`
     # and index with `iterator[eachindex(iterator)[i]]`.
     # Note that this only works with vector-like iterators that support arbitrary indexing.
@@ -140,8 +161,6 @@ end
     # Skip empty loops
     ndrange == 0 && return
 
-    backend = KernelAbstractions.get_backend(x)
-
     # Call the generic kernel that is defined below, which only calls a function with
     # the global GPU index.
     generic_kernel(backend)(ndrange = ndrange) do i
diff --git a/test/cell_lists/full_grid.jl b/test/cell_lists/full_grid.jl
index d4a81b55..9cc360e9 100644
--- a/test/cell_lists/full_grid.jl
+++ b/test/cell_lists/full_grid.jl
@@ -15,19 +15,18 @@
             nhs = GridNeighborhoodSearch{N}(; cell_list, search_radius)
             y = rand(N, 10)
             error_string = "particle coordinates are NaN or outside the domain bounds of the cell list"
-            error = ErrorException(error_string)
 
             y[1, 7] = NaN
-            @test_throws error initialize!(nhs, y, y)
+            @test_throws error_string initialize!(nhs, y, y)
 
             y[1, 7] = min_corner[1] - 0.01
-            @test_throws error initialize!(nhs, y, y)
+            @test_throws error_string initialize!(nhs, y, y)
 
             # A bit more than max corner might still be inside the grid,
             # but one search radius more is always outside.
             # Also accounting for 0.001 extra padding (see above).
             y[1, 7] = max_corner[1] + 1.01
-            @test_throws error initialize!(nhs, y, y)
+            @test_throws error_string initialize!(nhs, y, y)
 
             y[1, 7] = 0.0
             @test_nowarn_mod initialize!(nhs, y, y)
@@ -37,16 +36,16 @@
             @test_nowarn_mod update!(nhs, y, y)
 
             y[1, 7] = NaN
-            @test_throws error update!(nhs, y, y)
+            @test_throws error_string update!(nhs, y, y)
 
             # A bit more than max corner might still be inside the grid,
             # but one search radius more is always outside.
             # Also accounting for 0.001 extra padding (see above).
             y[1, 7] = max_corner[1] + 1.01
-            @test_throws error update!(nhs, y, y)
+            @test_throws error_string update!(nhs, y, y)
 
             y[1, 7] = min_corner[1] - 0.01
-            @test_throws error update!(nhs, y, y)
+            @test_throws error_string update!(nhs, y, y)
         end
     end
 end
diff --git a/test/neighborhood_search.jl b/test/neighborhood_search.jl
index e01efa9b..33bd72e4 100644
--- a/test/neighborhood_search.jl
+++ b/test/neighborhood_search.jl
@@ -145,8 +145,10 @@
             neighbors_expected = [Int[] for _ in axes(coords, 2)]
 
             foreach_point_neighbor(coords, coords, trivial_nhs,
-                                   parallel = false) do point, neighbor,
-                                                        pos_diff, distance
+                                   parallelization_backend = SerialBackend()) do point,
+                                                                                 neighbor,
+                                                                                 pos_diff,
+                                                                                 distance
                 append!(neighbors_expected[point], neighbor)
             end
 
@@ -236,8 +238,10 @@
                 neighbors = [Int[] for _ in axes(coords, 2)]
 
                 foreach_point_neighbor(coords, coords, nhs,
-                                       parallel = false) do point, neighbor,
-                                                            pos_diff, distance
+                                       parallelization_backend = SerialBackend()) do point,
+                                                                                     neighbor,
+                                                                                     pos_diff,
+                                                                                     distance
                     append!(neighbors[point], neighbor)
                 end
 
diff --git a/test/nhs_grid.jl b/test/nhs_grid.jl
index e7e3807a..5bd2aaf3 100644
--- a/test/nhs_grid.jl
+++ b/test/nhs_grid.jl
@@ -167,8 +167,7 @@
         coordinates2 = coordinates1 .+ [1.4, -3.5, 0.8]
 
         # Update neighborhood search
-        coords_fun2(i) = coordinates2[:, i]
-        update_grid!(nhs1, coords_fun2)
+        update_grid!(nhs1, coordinates2)
 
         # Get each neighbor for updated NHS
         neighbors2 = sort(collect(PointNeighbors.eachneighbor(point_position1, nhs1)))