Skip to content

Commit 2b3dcda

Browse files
abussyniklasschmitz
authored andcommitted
GPU optimized symmetry operations (#1097)
1 parent f2f1e75 commit 2b3dcda

4 files changed

Lines changed: 64 additions & 80 deletions

File tree

src/gpu/gpu_arrays.jl

Lines changed: 0 additions & 8 deletions
Original file line numberDiff line numberDiff line change
@@ -5,14 +5,6 @@ using Preferences
55
# https://github.com/JuliaGPU/CUDA.jl/issues/1565
66
LinearAlgebra.dot(x::AbstractGPUArray, D::Diagonal, y::AbstractGPUArray) = x' * (D * y)
77

8-
function lowpass_for_symmetry!::AbstractGPUArray, basis; symmetries=basis.symmetries)
9-
all(isone, symmetries) && return ρ
10-
# lowpass_for_symmetry! currently uses scalar indexing, so we have to do this very ugly
11-
# thing for cases where ρ sits on a device (e.g. GPU)
12-
ρ_CPU = lowpass_for_symmetry!(to_cpu(ρ), basis; symmetries)
13-
ρ .= to_device(basis.architecture, ρ_CPU)
14-
end
15-
168
for fun in (:potential_terms, :kernel_terms)
179
@eval function DftFunctionals.$fun(fun::DispatchFunctional, ρ::AT,
1810
args...) where {AT <: AbstractGPUArray{Float64}}

src/symmetry.jl

Lines changed: 46 additions & 36 deletions
Original file line numberDiff line numberDiff line change
@@ -269,48 +269,58 @@ function apply_symop(symop::SymOp, basis, ρin; kwargs...)
269269
end
270270

271271
# Accumulates the symmetrized versions of the density ρin into ρout (in Fourier space).
272-
# No normalization is performed
272+
# No normalization is performed. This function is optimized for CPU and GPU.
273273
function accumulate_over_symmetries!(ρaccu, ρin, basis::PlaneWaveBasis{T}, symmetries) where {T}
274-
for symop in symmetries
275-
# Common special case, where ρin does not need to be processed
276-
if isone(symop)
277-
ρaccu .+= ρin
278-
continue
279-
end
280-
281-
# Transform ρin -> to the partial density at S * k.
282-
#
283-
# Since the eigenfunctions of the Hamiltonian at k and Sk satisfy
284-
# u_{Sk}(x) = u_{k}(S^{-1} (x - τ))
285-
# with Fourier transform
286-
# ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G)
287-
# equivalently
288-
# ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G)
289-
invS = Mat3{Int}(inv(symop.S))
290-
for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
291-
igired = index_G_vectors(basis, invS * G)
292-
isnothing(igired) && continue
293-
294-
if iszero(symop.τ)
295-
@inbounds ρaccu[ig] += ρin[igired]
296-
else
297-
factor = cis2pi(-T(dot(G, symop.τ)))
298-
@inbounds ρaccu[ig] += factor * ρin[igired]
299-
end
274+
# For each G vector and symmetry S:
275+
# Transform ρin -> to the partial density at S * k.
276+
#
277+
# Since the eigenfunctions of the Hamiltonian at k and Sk satisfy
278+
# u_{Sk}(x) = u_{k}(S^{-1} (x - τ))
279+
# with Fourier transform
280+
# ̂u_{Sk}(G) = e^{-i G \cdot τ} ̂u_k(S^{-1} G)
281+
# equivalently
282+
# ρ ̂_{Sk}(G) = e^{-i G \cdot τ} ̂ρ_k(S^{-1} G)
283+
Gs = reshape(G_vectors(basis), size(ρaccu))
284+
fft_size = basis.fft_size
285+
286+
# Need to transfer symmetry data to the GPU as isbit data
287+
symm_invS = to_device(basis.architecture, [Mat3{Int}(inv(symop.S)) for symop in symmetries])
288+
symm_τ = to_device(basis.architecture, [symop.τ for symop in symmetries])
289+
n_symm = length(symmetries)
290+
291+
# Looping over symmetries inside of map! on G vectors allow for a single GPU kernel launch
292+
map!(ρaccu, Gs) do G
293+
acc = zero(complex(T))
294+
# Explicit loop over indicies because AMDGPU does not support zip() in map!
295+
for i_symm in 1:n_symm
296+
invS = symm_invS[i_symm]
297+
τ = symm_τ[i_symm]
298+
idx = index_G_vectors(fft_size, invS * G)
299+
acc += isnothing(idx) ? zero(complex(T)) : cis2pi(-T(dot(G, τ))) * ρin[idx]
300300
end
301-
end # symop
301+
acc
302+
end
302303
ρaccu
303304
end
304305

305306
# Low-pass filters ρ (in Fourier) so that symmetry operations acting on it stay in the grid
307+
# This function is optimized for CPU and GPU.
306308
function lowpass_for_symmetry!::AbstractArray, basis; symmetries=basis.symmetries)
307-
for symop in symmetries
308-
isone(symop) && continue
309-
for (ig, G) in enumerate(G_vectors_generator(basis.fft_size))
310-
if index_G_vectors(basis, symop.S * G) === nothing
311-
ρ[ig] = 0
312-
end
309+
all(isone, symmetries) && return ρ
310+
311+
Gs = reshape(G_vectors(basis), size(ρ))
312+
fft_size = basis.fft_size
313+
314+
symm_S = to_device(basis.architecture, [symop.S for symop in symmetries])
315+
316+
# Loop structure optimized for both CPU and GPU
317+
map!(ρ, ρ, Gs) do ρ_i, G
318+
acc = ρ_i
319+
for S in symm_S
320+
idx = index_G_vectors(fft_size, S * G)
321+
acc *= isnothing(idx) ? 0 : 1
313322
end
323+
acc
314324
end
315325
ρ
316326
end
@@ -320,15 +330,15 @@ Symmetrize a density by applying all the basis (by default) symmetries and formi
320330
"""
321331
@views @timing function symmetrize_ρ(basis, ρ::AbstractArray{T};
322332
symmetries=basis.symmetries, do_lowpass=true) where {T}
323-
ρin_fourier = to_cpu(fft(basis, ρ))
333+
ρin_fourier = fft(basis, ρ)
324334
ρout_fourier = zero(ρin_fourier)
325335
for σ = 1:size(ρ, 4)
326336
accumulate_over_symmetries!(ρout_fourier[:, :, :, σ],
327337
ρin_fourier[:, :, :, σ], basis, symmetries)
328338
do_lowpass && lowpass_for_symmetry!(ρout_fourier[:, :, :, σ], basis; symmetries)
329339
end
330340
inv_fft = T <: Real ? irfft : ifft
331-
inv_fft(basis, to_device(basis.architecture, ρout_fourier) ./ length(symmetries))
341+
inv_fft(basis, ρout_fourier ./ length(symmetries))
332342
end
333343

334344
"""

src/terms/hartree.jl

Lines changed: 3 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -28,19 +28,18 @@ struct TermHartree <: TermNonlinear
2828
end
2929
function compute_poisson_green_coeffs(basis::PlaneWaveBasis{T}, scaling_factor;
3030
q=zero(Vec3{T})) where {T}
31-
model = basis.model
31+
recip_lattice = basis.model.recip_lattice
3232

3333
# Solving the Poisson equation ΔV = -4π ρ in Fourier space
3434
# is multiplying elementwise by 4π / |G|^2.
35-
poisson_green_coeffs = 4T(π) ./ [sum(abs2, model.recip_lattice * (G + q))
36-
for G in to_cpu(G_vectors(basis))]
35+
poisson_green_coeffs = map(G -> 4T(π) / sum(abs2, recip_lattice * (G + q)),
36+
G_vectors(basis))
3737
if iszero(q)
3838
# Compensating charge background => Zero DC.
3939
GPUArraysCore.@allowscalar poisson_green_coeffs[1] = 0
4040
# Symmetrize Fourier coeffs to have real iFFT.
4141
enforce_real!(poisson_green_coeffs, basis)
4242
end
43-
poisson_green_coeffs = to_device(basis.architecture, poisson_green_coeffs)
4443
scaling_factor .* poisson_green_coeffs
4544
end
4645
function TermHartree(basis::PlaneWaveBasis{T}, scaling_factor) where {T}

test/symmetry_issues.jl

Lines changed: 15 additions & 32 deletions
Original file line numberDiff line numberDiff line change
@@ -28,49 +28,32 @@
2828
@testset "Inlining" begin
2929
using PseudoPotentialData
3030

31-
# Test that the index_G_vectors function is properly inlined by comparing timing
32-
# with a locally defined function known not to be inlined. Issue initially tackled
33-
# in PR https://github.com/JuliaMolSim/DFTK.jl/pull/1025
34-
function index_G_vectors_slow(basis, G::AbstractVector{<:Integer})
35-
start = .- cld.(basis.fft_size .- 1, 2)
36-
stop = fld.(basis.fft_size .- 1, 2)
37-
lengths = stop .- start .+ 1
38-
39-
# FFTs store wavevectors as [0 1 2 3 -2 -1] (example for N=5)
40-
function G_to_index(length, G)
41-
G >= 0 && return 1 + G
42-
return 1 + length + G
43-
end
44-
if all(start .<= G .<= stop)
45-
CartesianIndex(Tuple(G_to_index.(lengths, G)))
46-
else
47-
nothing # Outside range of valid indices
48-
end
49-
end
31+
# Test that the index_G_vectors function is properly inlined.
32+
# Issue initially tackled in PR https://github.com/JuliaMolSim/DFTK.jl/pull/1025
5033

5134
# This is a bare-bone version of the accumulate_over_symmetries() function, only
5235
# keeping calls to the index_G_vectors() function for which we test inlining
53-
function G_vectors_calls(basis, test_func)
36+
function G_vectors_calls(basis)
5437
for symop in basis.symmetries
5538
invS = Mat3{Int}(inv(symop.S))
5639
for (ig, G) in enumerate(DFTK.G_vectors_generator(basis.fft_size))
57-
igired = test_func(basis, invS * G)
40+
igired = DFTK.index_G_vectors(basis, invS * G)
5841
end
5942
end
6043
end
6144

62-
silicon = TestCases.silicon
63-
Si = ElementPsp(silicon.atnum, PseudoFamily("cp2k.nc.sr.lda.v0_1.semicore.gth"))
64-
atoms = [Si, Si]
65-
model = model_DFT(silicon.lattice, atoms, silicon.positions;
66-
functionals=[:lda_x, :lda_c_vwn])
67-
Ecut = 32
68-
kgrid = [1, 1, 1]
69-
basis = PlaneWaveBasis(model; Ecut, kgrid)
45+
# If a function is inlined, its name disappers from the output of code_typed(;optimize=true)
46+
# When optimize=false, the function is not inlined and we can see its name in the output.
47+
function is_inlined(optimize)
48+
info = code_typed(G_vectors_calls, (DFTK.PlaneWaveBasis,), optimize=optimize)
49+
codeinfo = first(info).first
50+
output = sprint(show, MIME"text/plain"(), codeinfo.code)
51+
return !occursin("index_G_vectors", output)
52+
end
7053

71-
actual_alloc = @allocated G_vectors_calls(basis, DFTK.index_G_vectors)
72-
slow_alloc = @allocated G_vectors_calls(basis, index_G_vectors_slow)
73-
@test slow_alloc > actual_alloc
54+
# Test that the index_G_vectors function is inlined
55+
@test is_inlined(true)
56+
@test !is_inlined(false)
7457
end
7558
end
7659

0 commit comments

Comments
 (0)