GPU optimization of LOBPCG (#1068)

abussy · web-flow · commit 359996b95fdc · 2025-05-15T10:42:04.000Z
diff --git a/src/DFTK.jl b/src/DFTK.jl
@@ -138,6 +138,7 @@ export PreconditionerNone
 export lobpcg_hyper
 export diag_full
 export diagonalize_all_kblocks
+include("eigen/linalg.jl")
 include("eigen/preconditioners.jl")
 include("eigen/diag.jl")
 
@@ -231,7 +232,10 @@ include("postprocess/refine.jl")
 # Workarounds
 include("workarounds/dummy_inplace_fft.jl")
 include("workarounds/forwarddiff_rules.jl")
-include("workarounds/gpu_arrays.jl")
+
+# Optimized generic GPU functions and GPU workarounds
+include("gpu/linalg.jl")
+include("gpu/gpu_arrays.jl")
 
 # Precompilation block with a basic workflow
 
diff --git a/src/eigen/linalg.jl b/src/eigen/linalg.jl
@@ -0,0 +1,22 @@
+# Returns the vector of Euclidean norms of the columns of X,
+# i.e. sqrt(sum(abs2, X[:, i])) for every column i.
+function columnwise_norms(X::AbstractArray{T}) where {T}
+    vec(sqrt.(sum(abs2, X; dims=1)))
+end
+
+# Returns a vector of real(dot(A[:, i], B[:, i])), for all columns of A, B.
+# Throws a DimensionMismatch if A and B differ in their number of columns.
+@views function columnwise_dots(A::AbstractArray{T}, B::AbstractArray{T}) where {T}
+    size(A, 2) == size(B, 2) ||
+        throw(DimensionMismatch("A and B must have the same number of columns"))
+    [real(dot(A[:, i], B[:, i])) for i in axes(A, 2)]
+end
+
+# Returns a vector of real(dot(A[:, i], M, B[:, i])), for all columns of
+# A, B, and matrix M. Throws a DimensionMismatch if A and B differ in their
+# number of columns.
+@views function columnwise_dots(A::AbstractArray{T}, M, B::AbstractArray{T}) where {T}
+    size(A, 2) == size(B, 2) ||
+        throw(DimensionMismatch("A and B must have the same number of columns"))
+    [real(dot(A[:, i], M, B[:, i])) for i in axes(A, 2)]
+end
diff --git a/src/eigen/lobpcg_hyper_impl.jl b/src/eigen/lobpcg_hyper_impl.jl
@@ -33,6 +33,9 @@
 # other eigenvectors (which is not the case in many - all ? - other
 # implementations)
 
+# - Some functions are reimplemented in a GPU-optimized way for generic
+# GPU arrays (src/gpu/linalg.jl).
+
 
 ## TODO micro-optimization of buffer reuse
 ## TODO write a version that doesn't assume that B is well-conditioned, and doesn't reuse B applications at all
@@ -170,7 +173,7 @@ function B_ortho!(X, BX)
     rdiv!(BX, U)
 end
 
-normest(M) = maximum(abs.(diag(M))) + norm(M - Diagonal(diag(M)))
+normest(M) = maximum(abs, diag(M)) + norm(M - Diagonal(diag(M)))
 # Orthogonalizes X to tol
 # Returns the new X, the number of Cholesky factorizations algorithm, and the
 # growth factor by which small perturbations of X can have been magnified
@@ -252,24 +255,15 @@ end
 
 # Randomize the columns of X if the norm is below tol
 function drop_small!(X::AbstractArray{T}; tol=2eps(real(T))) where {T}
-    dropped = Int[]
-    for i=1:size(X,2)
-        n = norm(@views X[:,i])
-        if n <= tol
-            X[:,i] = randn(T, size(X,1))
-            push!(dropped, i)
-        end
-    end
+    dropped = findall(n -> n <= tol, columnwise_norms(X))
+    @views randn!(TaskLocalRNG(), X[:, dropped])
     dropped
 end
 
 # Find X that is orthogonal, and B-orthogonal to Y, up to a tolerance tol.
 @timing "ortho! X vs Y" function ortho!(X::AbstractArray{T}, Y, BY; tol=2eps(real(T))) where {T}
     # normalize to try to cheaply improve conditioning
-    parallel_loop_over_range(1:size(X, 2)) do i
-        n = norm(@views X[:,i])
-        @views X[:,i] ./= n
-    end
+    X ./= columnwise_norms(X)'
 
     niter = 1
     ninners = zeros(Int, 0)
@@ -322,7 +316,7 @@ end
 end
 
 function final_retval(X, AX, BX, λ, resid_history, niter, n_matvec)
-    λ_host = oftype(ones(eltype(λ), 1), λ)  # Copy to CPU for element-wise access
+    λ_host = to_cpu(λ)  # Copy to CPU for element-wise access
     if !issorted(λ_host)
         p = sortperm(λ_host)
         λ_host = λ_host[p]
@@ -336,6 +330,12 @@ function final_retval(X, AX, BX, λ, resid_history, niter, n_matvec)
      residual_history=resid_history[:, 1:niter+1], n_matvec)
 end
 
+# Computes λ = real((X' * AX) / (X' * BX)), for each column of X
+function compute_λ(X, AX, BX)
+    λs = @views [real((X[:, n]'*AX[:, n]) / (X[:, n]'BX[:, n])) for n=1:size(X, 2)]
+    oftype(real(X[:, 1]), λs)  # Offload to GPU if needed
+end
+
 ### The algorithm is Xn+1 = rayleigh_ritz(hcat(Xn, A*Xn, Xn-Xn-1))
 ### We follow the strategy of Hetmaniuk and Lehoucq, and maintain a B-orthonormal basis Y = (X,R,P)
 ### After each rayleigh_ritz step, the B-orthonormal X and P are deduced by an orthogonal rotation from Y
@@ -389,8 +389,7 @@ end
     end
     nlocked = 0
     niter = 0  # the first iteration is fake
-    λs = @views [real((X[:, n]'*AX[:, n]) / (X[:, n]'BX[:, n])) for n=1:M]
-    λs = oftype(real(X[:, 1]), λs)  # Offload to GPU if needed
+    λs = compute_λ(X, AX, BX)
     new_X  = X
     new_AX = AX
     new_BX = BX
@@ -431,9 +430,8 @@ end
         ### Compute new residuals
         @timing "Update residuals" begin
             new_R = new_AX .- new_BX .* λs'
-            @views for i = 1:size(X, 2)
-                resid_history[i + nlocked, niter+1] = norm(new_R[:, i])
-            end
+            norms = to_cpu(columnwise_norms(new_R))
+            @views resid_history[1 + nlocked: size(new_R, 2) + nlocked, niter+1] .= norms[:]
         end
         @debug niter resid_history[:, niter+1]
 
@@ -512,10 +510,9 @@ end
         end
 
         # Quick sanity check
-        for i = 1:size(X, 2)
-            @views if abs(BX[:, i]'X[:, i] - 1) >= sqrt(eps(real(eltype(X))))
-                error("LOBPCG is badly failing to keep the vectors normalized; this should never happen")
-            end
+        diffs = abs.(columnwise_dots(BX, X) .-1)
+        if any(diffs .>= sqrt(eps(real(eltype(X)))))
+           error("LOBPCG is badly failing to keep the vectors normalized; this should never happen")
         end
 
         # Restrict all views to active
diff --git a/src/eigen/preconditioners.jl b/src/eigen/preconditioners.jl
@@ -24,11 +24,11 @@ PreconditionerNone(::HamiltonianBlock) = I
 (Simplified version of)
 [Tetter-Payne-Allan preconditioning](https://doi.org/10.1103/physrevb.40.12255).
 """
-mutable struct PreconditionerTPA{T <: Real}
+mutable struct PreconditionerTPA{T <: Real, Tkin <: AbstractVector{T}}
     basis::PlaneWaveBasis
     kpt::Kpoint
-    kin::AbstractVector{T}  # kinetic energy of every G
-    mean_kin::Union{Nothing, Vector{T}}  # mean kinetic energy of every band
+    kin::Tkin  # kinetic energy of every G
+    mean_kin::Union{Nothing, Tkin}  # mean kinetic energy of every band
     default_shift::T  # if mean_kin is not set by `precondprep!`, this will be used for the shift
 end
 
@@ -40,7 +40,7 @@ function PreconditionerTPA(basis::PlaneWaveBasis{T}, kpt::Kpoint; default_shift=
     #      it's better to pass a HamiltonianBlock directly and read the computed values.
     kinetic_term = only(kinetic_term)
     kin = kinetic_energy(kinetic_term, basis.Ecut, Gplusk_vectors_cart(basis, kpt))
-    PreconditionerTPA{T}(basis, kpt, kin, nothing, default_shift)
+    PreconditionerTPA{T, typeof(kin)}(basis, kpt, kin, nothing, default_shift)
 end
 function PreconditionerTPA(ham::HamiltonianBlock; kwargs...)
     PreconditionerTPA(ham.basis, ham.kpoint; kwargs...)
@@ -50,7 +50,7 @@ end
     if P.mean_kin === nothing
         ldiv!(Y, Diagonal(P.kin .+ P.default_shift), R)
     else
-    parallel_loop_over_range(1:size(Y, 2)) do n
+        parallel_loop_over_range(1:size(Y, 2)) do n
             Y[:, n] .= P.mean_kin[n] ./ (P.mean_kin[n] .+ P.kin) .* R[:, n]
         end
     end
@@ -73,7 +73,7 @@ end
 (Base.:*)(P::PreconditionerTPA, R) = mul!(copy(R), P, R)
 
 function precondprep!(P::PreconditionerTPA, X::AbstractArray)
-    P.mean_kin = [real(dot(x, Diagonal(P.kin), x)) for x in eachcol(X)]
+    P.mean_kin = vec(real(columnwise_dots(X, Diagonal(P.kin), X)))
 end
 precondprep!(P::PreconditionerTPA, ::Nothing) = 1  # fallback for edge cases
 
diff --git a/src/gpu/gpu_arrays.jl b/src/gpu/gpu_arrays.jl
diff --git a/src/gpu/linalg.jl b/src/gpu/linalg.jl
@@ -0,0 +1,63 @@
+### GPU-specific implementations of functions called during LOBPCG
+# The massive parallelism of the GPU can only be fully exploited when
+# operating on whole arrays. For performance reasons, one should avoid
+# explicitly looping over columns or elements. This approach is not
+# necessarily the most performant on CPU, as the allocation of large
+# temporary arrays hurts cache locality. It is also harder to read.
+
+using LinearAlgebra
+using GPUArraysCore
+
+# Computes λ = real((X' * AX) / (X' * BX)), for each column of X, and
+# returns the values as a vector. conj is broadcast (conj.) so that it
+# fuses with the product instead of first allocating a conjugated copy.
+function compute_λ(X::AbstractGPUArray{T}, AX::AbstractGPUArray{T}, BX::AbstractGPUArray{T}) where {T}
+    num = sum(conj.(X) .* AX; dims=1)
+    den = sum(conj.(X) .* BX; dims=1)
+    vec(real.(num ./ den))
+end
+
+# Returns a vector of real(dot(A[:, i], B[:, i])), for all columns of A, B.
+# The 1×N result of the reduction is flattened and made real so that it has
+# the same form as the result of the generic method.
+function columnwise_dots(A::AbstractGPUArray{T}, B::AbstractGPUArray{T}) where {T}
+    vec(real.(sum(conj.(A) .* B; dims=1)))
+end
+
+# Returns a vector of real(dot(A[:, i], M, B[:, i])), for all columns of
+# A, B, and matrix M.
+function columnwise_dots(A::AbstractGPUArray{T}, M, B::AbstractGPUArray{T}) where {T}
+    vec(real.(sum(conj.(A) .* (M * B); dims=1)))
+end
+
+# Same as above for a diagonal matrix: D.diag is broadcast against B
+# in place of the matrix product D * B.
+function columnwise_dots(A::AbstractGPUArray{T}, D::Diagonal, B::AbstractGPUArray{T}) where {T}
+    vec(real.(sum(conj.(A) .* (D.diag .* B); dims=1)))
+end
+
+# Y = P \ R. Column n of R is multiplied elementwise by
+# mean_kin[n] / (mean_kin[n] + kin); if mean_kin has not been set,
+# the diagonal kin .+ default_shift is used instead.
+function ldiv!(Y::AbstractGPUArray{T}, P::PreconditionerTPA, R::AbstractGPUArray{T}) where {T}
+    if P.mean_kin === nothing
+        ldiv!(Y, Diagonal(P.kin .+ P.default_shift), R)
+    else
+        # mean_kin' is a row and kin a column, so the quotient broadcasts
+        # to a length(kin) × length(mean_kin) matrix.
+        Y .= (P.mean_kin' ./ (P.mean_kin' .+ P.kin)) .* R
+    end
+    Y
+end
+
+# Y = P * R. Column n of R is multiplied elementwise by
+# (mean_kin[n] + kin) / mean_kin[n], the reciprocal of the ldiv! factor;
+# if mean_kin has not been set, the diagonal kin .+ default_shift is used.
+function mul!(Y::AbstractGPUArray{T}, P::PreconditionerTPA, R::AbstractGPUArray{T}) where {T}
+    if P.mean_kin === nothing
+        mul!(Y, Diagonal(P.kin .+ P.default_shift), R)
+    else
+        Y .= ((P.mean_kin' .+ P.kin) ./ P.mean_kin') .* R
+    end
+    Y
+end
diff --git a/src/interpolation.jl b/src/interpolation.jl
@@ -105,12 +105,11 @@ function interpolate_kpoint(data_in::AbstractVecOrMat,
     n_bands  = size(data_in, 2)
     n_Gk_out = length(G_vectors(basis_out, kpoint_out))
     data_out = similar(data_in, n_Gk_out, n_bands) .= 0
-    # TODO: use a map, or this will not be GPU compatible (scalar indexing)
-    for iin = 1:size(data_in, 1)
-        idx_fft = kpoint_in.mapping[iin]
-        idx_fft in keys(kpoint_out.mapping_inv) || continue
-        iout = kpoint_out.mapping_inv[idx_fft]
-        data_out[iout, :] = data_in[iin, :]
-    end
+
+    max_nG = max(length(G_vectors(basis_in)), length(G_vectors(basis_out)))
+    tmp = similar(data_in, max_nG, n_bands) .= 0
+
+    tmp[kpoint_in.mapping, :] .= data_in
+    data_out .= @view tmp[kpoint_out.mapping, :]
     ortho_qr(data_out)  # Re-orthogonalize and renormalize
 end