Skip to content

Commit d628a6d

Browse files
authored
Merge branch 'main' into dev
2 parents b4f5ed6 + 6ee0bf4 commit d628a6d

File tree

12 files changed

+239
-161
lines changed

12 files changed

+239
-161
lines changed

Project.toml

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -34,13 +34,13 @@ TrixiBase = "9a0f1c46-06d5-4909-a5a3-ce25d3fa3284"
3434
WriteVTK = "64499a7a-5c06-52f2-abe2-ccb03c286192"
3535

3636
[weakdeps]
37+
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
3738
OrdinaryDiffEq = "1dea7af3-3e70-54e6-95c3-0bf5283fa5ed"
3839
OrdinaryDiffEqCore = "bbf590c4-e513-4bbe-9b18-05decba2e5d8"
39-
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
4040

4141
[extensions]
42-
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
4342
TrixiParticlesCUDAExt = "CUDA"
43+
TrixiParticlesOrdinaryDiffEqExt = ["OrdinaryDiffEq", "OrdinaryDiffEqCore"]
4444

4545
[compat]
4646
Accessors = "0.1.43"

src/general/abstract_system.jl

Lines changed: 10 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -68,17 +68,18 @@ end
6868
end
6969

7070
# Return `A[:, :, i]` as an `SMatrix`.
71-
@inline function extract_smatrix(A, system, particle)
72-
@boundscheck checkbounds(A, ndims(system), ndims(system), particle)
71+
@propagate_inbounds function extract_smatrix(A, system, particle)
72+
return extract_smatrix(A, Val(ndims(system)), particle)
73+
end
74+
75+
@inline function extract_smatrix(A, ::Val{NDIMS}, particle) where {NDIMS}
76+
@boundscheck checkbounds(A, NDIMS, NDIMS, particle)
7377

7478
# Extract the matrix elements for this particle as a tuple to pass to SMatrix
75-
return SMatrix{ndims(system),
76-
ndims(system)}(ntuple(@inline(i->@inbounds A[mod(i - 1,
77-
ndims(system)) + 1,
78-
div(i - 1,
79-
ndims(system)) + 1,
80-
particle]),
81-
Val(ndims(system)^2)))
79+
return SMatrix{NDIMS, NDIMS}(ntuple(@inline(i->@inbounds A[mod(i - 1, NDIMS) + 1,
80+
div(i - 1, NDIMS) + 1,
81+
particle]),
82+
Val(NDIMS^2)))
8283
end
8384

8485
# Specifically get the current coordinates of a particle for all system types.

src/general/neighborhood_search.jl

Lines changed: 6 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -11,6 +11,12 @@ function PointNeighbors.foreach_point_neighbor(f, system, neighbor_system,
1111
points, parallelization_backend)
1212
end
1313

14+
@propagate_inbounds function foreach_neighbor(f, system_coords, neighbor_coords,
15+
neighborhood_search, backend, particle)
16+
PointNeighbors.foreach_neighbor(f, system_coords, neighbor_coords,
17+
neighborhood_search, particle)
18+
end
19+
1420
# === Compact support selection ===
1521
# -- Generic
1622
@inline function compact_support(system, neighbor)

src/schemes/boundary/wall_boundary/system.jl

Lines changed: 5 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -179,6 +179,11 @@ end
179179
return kernel(smoothing_kernel, distance, smoothing_length)
180180
end
181181

182+
@inline function smoothing_kernel_unsafe(system::WallBoundarySystem, distance, particle)
183+
(; smoothing_kernel, smoothing_length) = system.boundary_model
184+
return kernel_unsafe(smoothing_kernel, distance, smoothing_length)
185+
end
186+
182187
@inline function smoothing_length(system::WallBoundarySystem, particle)
183188
return smoothing_length(system.boundary_model, particle)
184189
end

src/schemes/structure/total_lagrangian_sph/penalty_force.jl

Lines changed: 40 additions & 25 deletions
Original file line numberDiff line numberDiff line change
@@ -14,38 +14,53 @@ struct PenaltyForceGanzenmueller{ELTYPE}
1414
end
1515
end
1616

17-
@inline function dv_penalty_force(penalty_force::Nothing,
18-
particle, neighbor, initial_pos_diff, initial_distance,
19-
current_pos_diff, current_distance,
20-
system, m_a, m_b, rho_a, rho_b)
21-
return zero(initial_pos_diff)
17+
@inline function dv_penalty_force!(dv_particle, penalty_force::Nothing,
18+
particle, neighbor, initial_pos_diff, initial_distance,
19+
current_pos_diff, current_distance,
20+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
21+
return dv_particle
2222
end
2323

24-
@propagate_inbounds function dv_penalty_force(penalty_force::PenaltyForceGanzenmueller,
25-
particle, neighbor, initial_pos_diff,
26-
initial_distance,
27-
current_pos_diff, current_distance,
28-
system, m_a, m_b, rho_a, rho_b)
29-
volume_a = m_a / rho_a
30-
volume_b = m_b / rho_b
24+
@propagate_inbounds function dv_penalty_force!(dv_particle,
25+
penalty_force::PenaltyForceGanzenmueller,
26+
particle, neighbor, initial_pos_diff,
27+
initial_distance,
28+
current_pos_diff, current_distance,
29+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
30+
(; alpha) = penalty_force
3131

32-
kernel_weight = smoothing_kernel(system, initial_distance, particle)
32+
# Since this is one of the most performance critical functions, using fast divisions
33+
# here gives a significant speedup on GPUs.
34+
# See the docs page "Development" for more details on `div_fast`.
35+
volume_a = div_fast(m_a, rho_a)
36+
volume_b = div_fast(m_b, rho_b)
3337

34-
F_a = deformation_gradient(system, particle)
35-
F_b = deformation_gradient(system, neighbor)
38+
# This function is called after a compact support check, so we can use the unsafe
39+
# kernel function, which does not check the distance again.
40+
kernel_weight = smoothing_kernel_unsafe(system, initial_distance, particle)
3641

37-
inv_current_distance = 1 / current_distance
42+
E_a = young_modulus(system, particle)
43+
E_b = young_modulus(system, neighbor)
3844

39-
# Use the symmetry of epsilon to simplify computations
40-
eps_sum = (F_a + F_b) * initial_pos_diff - 2 * current_pos_diff
41-
delta_sum = dot(eps_sum, current_pos_diff) * inv_current_distance
45+
# Note that this is actually ϵ^b_ab = -ϵ^b_ba in the paper, so we later compute
46+
# δ^b_ab instead of δ^b_ba, but δ^b_ab = δ^b_ba because of antisymmetry of x_ab and ϵ_ab.
47+
eps_a = F_a * initial_pos_diff - current_pos_diff
48+
eps_b = F_b * initial_pos_diff - current_pos_diff
4249

43-
E = young_modulus(system, particle)
50+
# This is (E_a * delta_a + E_b * delta_b) * current_distance.
51+
# Pulling the division by `current_distance` out allows us to do one division by
52+
# `current_distance^2` instead.
53+
delta_sum = E_a * dot(eps_a, current_pos_diff) + E_b * dot(eps_b, current_pos_diff)
4454

45-
f = (penalty_force.alpha / 2) * volume_a * volume_b *
46-
kernel_weight / initial_distance^2 * E * delta_sum * current_pos_diff *
47-
inv_current_distance
55+
# The division contains all scalar factors, which are then multiplied by
56+
# the vector `current_pos_diff` at the end.
57+
# We already divide by `m_a` to obtain an acceleration.
58+
# Since this is one of the most performance critical functions, using fast divisions
59+
# here gives a significant speedup on GPUs.
60+
# See the docs page "Development" for more details on `div_fast`.
61+
dv_particle[] += div_fast((alpha / 2) * volume_a * volume_b * kernel_weight * delta_sum,
62+
initial_distance^2 * current_distance^2 * m_a) *
63+
current_pos_diff
4864

49-
# Divide force by mass to obtain acceleration
50-
return f / m_a
65+
return dv_particle
5166
end

src/schemes/structure/total_lagrangian_sph/rhs.jl

Lines changed: 54 additions & 39 deletions
Original file line numberDiff line numberDiff line change
@@ -19,6 +19,8 @@ end
1919

2020
# Everything here is done in the initial coordinates
2121
system_coords = initial_coordinates(system)
22+
neighborhood_search = get_neighborhood_search(system, semi)
23+
backend = semi.parallelization_backend
2224

2325
# For `distance == 0`, the analytical gradient is zero, but the unsafe gradient
2426
# and the density diffusion divide by zero.
@@ -29,48 +31,61 @@ end
2931
h = initial_smoothing_length(system)
3032
almostzero = sqrt(eps(h^2))
3133

32-
# Loop over all pairs of particles and neighbors within the kernel cutoff.
33-
# For structure-structure interaction, this has to happen in the initial coordinates.
34-
foreach_point_neighbor(system, system, system_coords, system_coords, semi;
35-
points=each_integrated_particle(system)) do particle, neighbor,
36-
initial_pos_diff,
37-
initial_distance
38-
# Skip neighbors with the same position because the kernel gradient is zero.
39-
# Note that `return` only exits the closure, i.e., skips the current neighbor.
40-
skip_zero_distance(system) && initial_distance < almostzero && return
41-
42-
# Now that we know that `distance` is not zero, we can safely call the unsafe
43-
# version of the kernel gradient to avoid redundant zero checks.
44-
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
45-
initial_distance, particle)
46-
47-
rho_a = @inbounds system.material_density[particle]
48-
rho_b = @inbounds system.material_density[neighbor]
49-
34+
@threaded semi for particle in each_integrated_particle(system)
35+
# We are looping over the particles of `system`, so it is guaranteed
36+
# that `particle` is in bounds of `system`.
5037
m_a = @inbounds system.mass[particle]
51-
m_b = @inbounds system.mass[neighbor]
52-
38+
rho_a = @inbounds system.material_density[particle]
5339
# PK1 / rho^2
5440
pk1_rho2_a = @inbounds pk1_rho2(system, particle)
55-
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
56-
57-
current_pos_diff_ = @inbounds current_coords(system, particle) -
58-
current_coords(system, neighbor)
59-
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
60-
current_pos_diff = convert.(eltype(system), current_pos_diff_)
61-
current_distance = norm(current_pos_diff)
62-
63-
dv_stress = m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
64-
65-
dv_penalty_force_ = @inbounds dv_penalty_force(penalty_force, particle, neighbor,
66-
initial_pos_diff, initial_distance,
67-
current_pos_diff, current_distance,
68-
system, m_a, m_b, rho_a, rho_b)
69-
70-
dv_particle = Ref(dv_stress + dv_penalty_force_)
71-
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
72-
current_pos_diff, current_distance,
73-
m_a, m_b, rho_a, rho_b, grad_kernel)
41+
current_coords_a = @inbounds current_coords(system, particle)
42+
F_a = @inbounds deformation_gradient(system, particle)
43+
44+
# Accumulate the RHS contributions over all neighbors before writing to `dv`
45+
# to reduce the number of memory writes.
46+
# Note that we need a `Ref` in order to be able to update these variables
47+
# inside the closure in the `foreach_neighbor` loop.
48+
dv_particle = Ref(zero(current_coords_a))
49+
50+
# Loop over all neighbors within the kernel cutoff
51+
@inbounds foreach_neighbor(system_coords, system_coords,
52+
neighborhood_search, backend,
53+
particle) do particle, neighbor,
54+
initial_pos_diff, initial_distance
55+
# Skip neighbors with the same position because the kernel gradient is zero.
56+
# Note that `return` only exits the closure, i.e., skips the current neighbor.
57+
skip_zero_distance(system) && initial_distance < almostzero && return
58+
59+
# Now that we know that `distance` is not zero, we can safely call the unsafe
60+
# version of the kernel gradient to avoid redundant zero checks.
61+
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
62+
initial_distance, particle)
63+
64+
rho_b = @inbounds system.material_density[neighbor]
65+
m_b = @inbounds system.mass[neighbor]
66+
# PK1 / rho^2
67+
pk1_rho2_b = @inbounds pk1_rho2(system, neighbor)
68+
current_coords_b = @inbounds current_coords(system, neighbor)
69+
70+
# The compiler is smart enough to optimize this away if no penalty force is used
71+
F_b = @inbounds deformation_gradient(system, neighbor)
72+
73+
current_pos_diff_ = current_coords_a - current_coords_b
74+
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
75+
current_pos_diff = convert.(eltype(system), current_pos_diff_)
76+
current_distance = norm(current_pos_diff)
77+
78+
dv_particle[] += m_b * (pk1_rho2_a + pk1_rho2_b) * grad_kernel
79+
80+
@inbounds dv_penalty_force!(dv_particle, penalty_force, particle, neighbor,
81+
initial_pos_diff, initial_distance,
82+
current_pos_diff, current_distance,
83+
system, m_a, m_b, rho_a, rho_b, F_a, F_b)
84+
85+
@inbounds dv_viscosity_tlsph!(dv_particle, system, v_system, particle, neighbor,
86+
current_pos_diff, current_distance,
87+
m_a, m_b, rho_a, rho_b, F_a, grad_kernel)
88+
end
7489

7590
for i in 1:ndims(system)
7691
@inbounds dv[i, particle] += dv_particle[][i]

src/schemes/structure/total_lagrangian_sph/system.jl

Lines changed: 40 additions & 21 deletions
Original file line numberDiff line numberDiff line change
@@ -489,30 +489,49 @@ end
489489

490490
# Loop over all pairs of particles and neighbors within the kernel cutoff
491491
initial_coords = initial_coordinates(system)
492-
foreach_point_neighbor(system, system, initial_coords, initial_coords,
493-
semi) do particle, neighbor, initial_pos_diff, initial_distance
494-
# Skip neighbors with the same position because the kernel gradient is zero.
495-
# Note that `return` only exits the closure, i.e., skips the current neighbor.
496-
skip_zero_distance(system) && initial_distance < almostzero && return
492+
neighborhood_search = get_neighborhood_search(system, semi)
493+
backend = semi.parallelization_backend
497494

498-
# Now that we know that `distance` is not zero, we can safely call the unsafe
499-
# version of the kernel gradient to avoid redundant zero checks.
500-
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
501-
initial_distance, particle)
502-
503-
volume = @inbounds mass[neighbor] / material_density[neighbor]
504-
pos_diff_ = @inbounds current_coords(system, particle) -
505-
current_coords(system, neighbor)
506-
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
507-
pos_diff = convert.(eltype(system), pos_diff_)
508-
509-
# Multiply by L_{0a}
510-
L = @inbounds correction_matrix(system, particle)
511-
512-
result = volume * pos_diff * grad_kernel' * L'
495+
# The deformation gradient is computed for all particles, including the clamped ones
496+
@threaded semi for particle in eachparticle(system)
497+
# We are looping over the particles of `system`, so it is guaranteed
498+
# that `particle` is in bounds of `system`.
499+
current_coords_a = @inbounds current_coords(system, particle)
500+
L_a = @inbounds correction_matrix(system, particle)
501+
502+
# Accumulate the contributions over all neighbors before writing
503+
# to `deformation_grad` to reduce the number of memory writes.
504+
# Note that we need a `Ref` in order to be able to update these variables
505+
# inside the closure in the `foreach_neighbor` loop.
506+
result = Ref(zero(L_a))
507+
508+
# Loop over all neighbors within the kernel cutoff
509+
@inbounds foreach_neighbor(initial_coords, initial_coords,
510+
neighborhood_search, backend,
511+
particle) do particle, neighbor,
512+
initial_pos_diff, initial_distance
513+
# Skip neighbors with the same position because the kernel gradient is zero.
514+
# Note that `return` only exits the closure, i.e., skips the current neighbor.
515+
skip_zero_distance(system) && initial_distance < almostzero && return
516+
517+
# Now that we know that `distance` is not zero, we can safely call the unsafe
518+
# version of the kernel gradient to avoid redundant zero checks.
519+
grad_kernel = smoothing_kernel_grad_unsafe(system, initial_pos_diff,
520+
initial_distance, particle)
521+
522+
volume = @inbounds mass[neighbor] / material_density[neighbor]
523+
current_coords_b = @inbounds current_coords(system, neighbor)
524+
pos_diff_ = current_coords_a - current_coords_b
525+
# On GPUs, convert `Float64` coordinates to `Float32` after computing the difference
526+
pos_diff = convert.(eltype(system), pos_diff_)
527+
528+
# The tensor product pos_diff ⊗ (L_{0a} * ∇W) is equivalent to multiplication
529+
# by the transpose: pos_diff * (L_{0a} * ∇W)ᵀ = pos_diff * ∇Wᵀ * L_{0a}ᵀ.
530+
result[] -= volume * pos_diff * grad_kernel' * L_a'
531+
end
513532

514533
for j in 1:ndims(system), i in 1:ndims(system)
515-
@inbounds deformation_grad[i, j, particle] -= result[i, j]
534+
@inbounds deformation_grad[i, j, particle] += result[][i, j]
516535
end
517536
end
518537

0 commit comments

Comments
 (0)