Skip to content

Commit 63ac124

Browse files
authored
Merge branch 'main' into semi_improv
2 parents 7b449c3 + ca35699 commit 63ac124

File tree

40 files changed

+1236
-659
lines changed

40 files changed

+1236
-659
lines changed

Project.toml

Lines changed: 4 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
name = "TrixiParticles"
22
uuid = "66699cd8-9c01-4e9d-a059-b96c86d16b3a"
3-
authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
43
version = "0.4.5-dev"
4+
authors = ["erik.faulhaber <44124897+efaulhaber@users.noreply.github.com>"]
55

66
[deps]
77
Accessors = "7d9f7c33-5ae7-4f3b-8dc6-eff91059b697"
@@ -18,7 +18,6 @@ GPUArraysCore = "46192b85-c4d5-4398-a991-12ede77f4527"
1818
JSON = "682c06a0-de6a-54ab-a142-c8b1cf79cde6"
1919
KernelAbstractions = "63c18a36-062a-441e-b654-da1e3ab1ce7c"
2020
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
21-
MuladdMacro = "46d2c3a1-f734-5fdb-9937-b9b9aeba4221"
2221
PointNeighbors = "1c4d5385-0a27-49de-8e2c-43b175c8985c"
2322
Polyester = "f517fe37-dbe3-4b94-8317-1923a5111588"
2423
Printf = "de0858da-6303-5e67-8744-51eddeeeb8d7"
@@ -37,14 +36,17 @@ WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
3736
[weakdeps]
3837
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
3938
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
39+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4040

4141
[extensions]
4242
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
43+
TrixiParticlesCUDAExt = "CUDA"
4344

4445
[compat]
4546
Accessors = "0.1.43"
4647
Adapt = "4"
4748
CSV = "0.10"
49+
CUDA = "5.9.1"
4850
DataFrames = "1.6"
4951
DelimitedFiles = "1"
5052
DiffEqCallbacks = "4"
@@ -54,7 +56,6 @@ ForwardDiff = "1"
5456
GPUArraysCore = "0.2"
5557
JSON = "1"
5658
KernelAbstractions = "0.9"
57-
MuladdMacro = "0.2"
5859
OrdinaryDiffEq = "6.91"
5960
OrdinaryDiffEqCore = "2, 3"
6061
PointNeighbors = "0.6.5"

docs/literate/src/tut_custom_kernel.jl

Lines changed: 17 additions & 16 deletions
Original file line numberDiff line numberDiff line change
@@ -60,29 +60,30 @@ struct MyGaussianKernel <: TrixiParticles.AbstractSmoothingKernel{2} end
6060

6161
# By looking at the implementation of existing kernels in TrixiParticles.jl,
6262
# we can see that a kernel implementation requires three functions.
63-
# `TrixiParticles.kernel`, which is the kernel function itself,
64-
# `TrixiParticles.kernel_deriv`, which is the derivative of the kernel function,
65-
# and `TrixiParticles.compact_support`, which defines the compact support of the
66-
# kernel in relation to the smoothing length.
63+
# `TrixiParticles.kernel_unsafe`, which is the kernel function itself,
64+
# `TrixiParticles.kernel_deriv_div_r_unsafe`, which is the derivative of the
65+
# kernel divided by ``r``, and `TrixiParticles.compact_support`, which defines
66+
# the compact support of the kernel in relation to the smoothing length.
6767
# The latter is relevant for determining the search radius of the neighborhood search.
68-
function TrixiParticles.kernel(kernel::MyGaussianKernel, r, h)
68+
#
69+
# We implement `kernel_deriv_div_r_unsafe` instead of `kernel_deriv` directly since
70+
# this avoids an extra division in the hot loop and is robust near ``r=0``.
71+
# The public function `TrixiParticles.kernel_deriv` is defined automatically by
72+
# TrixiParticles from this method (and multiplies by ``r`` again when needed).
73+
# In the unsafe functions, we do not check the compact support; this is handled in the
74+
# safe wrappers `TrixiParticles.kernel` and `TrixiParticles.kernel_deriv` based on the
75+
# compact support defined in `TrixiParticles.compact_support`.
76+
function TrixiParticles.kernel_unsafe(kernel::MyGaussianKernel, r::Real, h)
6977
q = r / h
7078

71-
if q < 2
72-
return 1 / (pi * h^2) * exp(-q^2)
73-
end
74-
75-
return 0.0
79+
return 1 / (pi * h^2) * exp(-q^2)
7680
end
7781

78-
function TrixiParticles.kernel_deriv(kernel::MyGaussianKernel, r, h)
82+
function TrixiParticles.kernel_deriv_div_r_unsafe(kernel::MyGaussianKernel, r::Real, h)
7983
q = r / h
8084

81-
if q < 2
82-
return 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
83-
end
84-
85-
return 0.0
85+
kernel_deriv = 1 / (pi * h^2) * (-2 * q) * exp(-q^2) / h
86+
return kernel_deriv / r
8687
end
8788

8889
TrixiParticles.compact_support(::MyGaussianKernel, h) = 2 * h

docs/make.jl

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -72,7 +72,8 @@ makedocs(sitename="TrixiParticles.jl",
7272
plugins=[bib],
7373
# Run doctests and check docs for the following modules
7474
modules=[TrixiParticles, TrixiBase],
75-
format=Documenter.HTML(; assets=Asciicast.assets()),
75+
# Set edit_link explicitly to avoid `git remote show origin` lookups.
76+
format=Documenter.HTML(; assets=Asciicast.assets(), edit_link="main"),
7677
# Explicitly specify documentation structure
7778
pages=[
7879
"Home" => "index.md",

docs/src/development.md

Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -66,3 +66,45 @@ To create a new release for TrixiParticles.jl, perform the following steps:
6666
version should be `v0.3.1-dev`. If you just released `v0.2.4`, the new development
6767
version should be `v0.2.5-dev`.
6868

69+
## [Writing GPU-compatible code](@id writing_gpu_code)
70+
71+
When implementing new functionality that should run on both CPUs and GPUs,
72+
follow these rules:
73+
74+
1. Data structures must be generic and parametric.
75+
Do not hardcode concrete CPU array types like `Vector` or `Matrix` in fields.
76+
Use type parameters, so the same structure can store CPU arrays and GPU arrays.
77+
2. Add an Adapt.jl rule in `src/general/gpu.jl`.
78+
Register the new type with `Adapt.@adapt_structure ...`, so `adapt` can recursively
79+
convert all arrays inside the structure to GPU arrays.
80+
This conversion is then applied automatically inside `semidiscretize`.
81+
3. Use `@threaded` for all loops.
82+
Accessing GPU arrays inside regular loops is not allowed.
83+
With a GPU backend, `@threaded` loops are compiled to GPU kernels.
84+
4. Write type-stable code and do not allocate inside `@threaded` loops.
85+
This is required for GPU kernels and is also essential for fast multithreaded CPU code.
86+
87+
## [Writing fast GPU code](@id writing_fast_gpu_code)
88+
89+
The following rules improve kernel performance and avoid common GPU pitfalls:
90+
91+
1. Avoid exceptions and bounds errors inside kernels.
92+
Perform all required checks before entering `@threaded` loops (that is, before GPU kernels).
93+
Then use `@inbounds` directly at the loop where bounds are guaranteed.
94+
In TrixiParticles.jl, we do not place `@inbounds` inside inner helper functions.
95+
Instead, mark helper functions with `@propagate_inbounds` so the loop-level
96+
`@inbounds` is propagated.
97+
2. Avoid implicit `Float64` literals in arithmetic.
98+
For example, prefer `x / 2` over `0.5 * x` so `Float32` simulations stay in `Float32`.
99+
Verify this with `@device_code`, or by confirming the kernel runs on an Apple GPU
100+
(most Apple GPUs do not support `Float64`).
101+
3. Use `div_fast` in performance-critical divisions, but only after benchmarking (!).
102+
It can significantly speed up kernels, but should not be applied indiscriminately.
103+
When introducing `div_fast` in code, add a reference to [this section](@ref writing_fast_gpu_code)
104+
to document the rationale and benchmarking context, e.g., like so:
105+
```julia
106+
# Since this is one of the most performance critical functions, using fast divisions
107+
# here gives a significant speedup on GPUs.
108+
# See the docs page "Development" for more details on `div_fast`.
109+
result = div_fast(dividend, divisor)
110+
```

0 commit comments

Comments
 (0)