trixi-framework
diff --git a/‎Project.toml‎
Lines changed: 4 additions & 3 deletions b/‎Project.toml‎
Lines changed: 4 additions & 3 deletions
diff --git a/‎docs/literate/src/tut_custom_kernel.jl‎
Lines changed: 17 additions & 16 deletions b/‎docs/literate/src/tut_custom_kernel.jl‎
Lines changed: 17 additions & 16 deletions
diff --git a/‎docs/make.jl‎
Lines changed: 2 additions & 1 deletion b/‎docs/make.jl‎
Lines changed: 2 additions & 1 deletion
diff --git a/‎docs/src/development.md‎
Lines changed: 42 additions & 0 deletions b/‎docs/src/development.md‎
Lines changed: 42 additions & 0 deletions
@@ -1,7 +1,7 @@
 name = "TrixiParticles"
 uuid = "66699cd8-9c01-4e9d-a059-b96c86d16b3a"
-authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 version = "0.4.5-dev"
+authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
 
 [deps]
 Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -18,7 +18,6 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
 JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
 KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
-MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
 PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c"
 Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
 Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -37,14 +36,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
 [weakdeps]
 OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
 OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 
 [extensions]
 TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
+TrixiParticlesCUDAExt = "CUDA"
 
 [compat]
 Accessors = "0.1.43"
 Adapt = "4"
 CSV = "0.10"
+CUDA = "5.9.1"
 DataFrames = "1.6"
 DelimitedFiles = "1"
 DiffEqCallbacks = "4"
@@ -54,7 +56,6 @@ ForwardDiff = "1"
 GPUArraysCore = "0.2"
 JSON = "1"
 KernelAbstractions = "0.9"
-MuladdMacro = "0.2"
 OrdinaryDiffEq = "6.91"
 OrdinaryDiffEqCore = "2, 3"
 PointNeighbors = "0.6.5"
 
@@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end
 
 # By looking at the implementation of existing kernels in TrixiParticles.jl,
 # we can see that a kernel implementation requires three functions.
-# `TrixiParticles.kernel`, which is the kernel function itself,
-# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
-# and `TrixiParticles.compact_support`, which defines the compact support of the
-# kernel in relation to the smoothing length.
+# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
+# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
+# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
+# the compact support of the kernel in relation to the smoothing length.
 # The latter is relevant for determining the search radius of the neighborhood search.
-function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
+#
+# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly, since
+# this avoids an extra division in the hot loop and is robust near ``r=0``.
+# The public function `TrixiParticles.kernel_deriv` is then defined automatically by
+# TrixiParticles from this method (multiplying by ``r`` again where needed).
+# The "unsafe" functions do not check the compact support themselves; that check is done
+# by the safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv`, which
+# use the support radius defined in `TrixiParticles.compact_support`.
+function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * exp(-q^2)
-    end
-
-    return 0.0
+    return 1 / (pi * h^2) * exp(-q^2)
 end
 
-function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
+function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
     q = r / h
 
-    if q < 2
-        return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
-    end
-
-    return 0.0
+    # The factor `r` cancels analytically, so no division by `r` (0/0 = NaN at r = 0)
+    return -2 / (pi * h^2) * exp(-q^2) / h^2
 end
 
 TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h
 
@@ -72,7 +72,8 @@ makedocs(sitename="TrixiParticles.jl",
          plugins=[bib],
          # Run doctests and check docs for the following modules
          modules=[TrixiParticles, TrixiBase],
-         format=Documenter.HTML(; assets=Asciicast.assets()),
+         # Set edit_link explicitly to avoid `git remote show origin` lookups.
+         format=Documenter.HTML(; assets=Asciicast.assets(), edit_link="main"),
          # Explicitly specify documentation structure
          pages=[
              "Home" => "index.md",
 
@@ -66,3 +66,45 @@ To create a new release for TrixiParticles.jl, perform the following steps:
    version should be `v0.3.1-dev`. If you just released `v0.2.4`, the new development
    version should be `v0.2.5-dev`.
 
+## [Writing GPU-compatible code](@id writing_gpu_code)
+
+When implementing new functionality that should run on both CPUs and GPUs,
+follow these rules:
+
+1. Data structures must be generic and parametric.
+   Do not hardcode concrete CPU array types like `Vector` or `Matrix` in fields.
+   Use type parameters, so the same structure can store CPU arrays and GPU arrays.
+2. Add an Adapt.jl rule in `src/general/gpu.jl`.
+   Register the new type with `Adapt.@adapt_structure ...`, so `adapt` can recursively
+   convert all arrays inside the structure to GPU arrays.
+   This conversion is then applied automatically inside `semidiscretize`.
+3. Use `@threaded` for all loops.
+   Accessing GPU arrays inside regular loops is not allowed.
+   With a GPU backend, `@threaded` loops are compiled to GPU kernels.
+4. Write type-stable code and do not allocate inside `@threaded` loops.
+   This is required for GPU kernels and is also essential for fast multithreaded CPU code.
+
+## [Writing fast GPU code](@id writing_fast_gpu_code)
+
+The following rules improve kernel performance and avoid common GPU pitfalls:
+
+1. Avoid exceptions and bounds errors inside kernels.
+   Perform all required checks before entering `@threaded` loops (that is, before GPU kernels).
+   Then use `@inbounds` directly at the loop where bounds are guaranteed.
+   In TrixiParticles.jl, we do not place `@inbounds` inside inner helper functions.
+   Instead, mark helper functions with `@propagate_inbounds` so the loop-level
+   `@inbounds` is propagated.
+2. Avoid implicit `Float64` literals in arithmetic.
+   For example, prefer `x / 2` over `0.5 * x` so `Float32` simulations stay in `Float32`.
+   Verify this with `@device_code`, or by confirming the kernel runs on an Apple GPU
+   (most Apple GPUs do not support `Float64`).
+3. Use `div_fast` for performance-critical divisions, but only after benchmarking.
+   It can significantly speed up kernels, but should not be applied indiscriminately.
+   When introducing `div_fast` in code, add a comment that refers to [this section](@ref writing_fast_gpu_code)
+   and documents the rationale and benchmarking context, for example:
+   ```julia
+   # Since this is one of the most performance critical functions, using fast divisions
+   # here gives a significant speedup on GPUs.
+   # See the docs page "Development" for more details on `div_fast`.
+   result = div_fast(dividend, divisor)
+   ```