
Avoid implementing LAPACK interfaces directly #2332

Open
@maleadt

Description

Some of the existing CUSOLVER wrappers, and the ones added in #1806, directly implement LinearAlgebra.LAPACK methods for greater reuse of Base functionality. We used to do that, but moved away from it, opting instead to implement/override higher-level APIs from LinearAlgebra on top of a private, CUSOLVER-specific LAPACK-style interface. The main reasons were that NVIDIA's libraries generally do not exactly match the CPU LAPACK interface (that is not their intent; drop-in replacement is what libraries like NVBLAS are for), and that the higher-level APIs often need to be overridden anyway because they introduce Array allocations or perform GPU-incompatible operations (like element-wise for loops).
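
To make the distinction concrete, here's a minimal sketch of the two approaches for Cholesky. `gpu_potrf!` is a hypothetical stand-in for a private CUSOLVER wrapper (not an actual CUDA.jl function), and the `cholesky!` override shown is illustrative only, not the actual CUDA.jl implementation:

using LinearAlgebra
using CUDA

# (a) What this issue argues against: extend the CPU LAPACK entry point so that
#     Base's generic factorization code "just works" for CuMatrix, e.g.
#       LinearAlgebra.LAPACK.potrf!(uplo::AbstractChar, A::CuMatrix) = gpu_potrf!(uplo, A)
#     This ties us to LAPACK's exact signature and still runs generic Base code
#     that may allocate Arrays or loop over elements.

# (b) The preferred pattern: keep the LAPACK-style wrapper private and override
#     the user-facing LinearAlgebra API directly.
function gpu_potrf!(uplo::Char, A::CuMatrix)
    # a real wrapper would call cusolverDn<t>potrf here; this stub only shows the
    # expected return convention (factor plus an info code, 0 meaning success)
    return A, 0
end

function LinearAlgebra.cholesky!(A::Hermitian{T,<:CuMatrix}, ::NoPivot = NoPivot();
                                 check::Bool = true) where {T<:Union{Float32,Float64}}
    C, info = gpu_potrf!(A.uplo, parent(A))
    check && LinearAlgebra.checkpositivedefinite(info)
    return Cholesky(C, A.uplo, info)
end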

Simply removing the overloads obviously doesn't work, because we've already started relying on higher-level LinearAlgebra functionality again:

  ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
  Stacktrace:
    [1] unsafe_convert(::Type{Ptr{Float32}}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
      @ CUDA ~/Julia/pkg/CUDA/src/array.jl:484
    [2] potrf!(uplo::Char, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
      @ LinearAlgebra.LAPACK ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/lapack.jl:3226
    [3] _chol!
      @ ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:187 [inlined]
    [4] cholesky!(A::Hermitian{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}}, ::NoPivot; check::Bool)
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:268
    [5] cholesky!
      @ ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:267 [inlined]
    [6] cholesky!(A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::NoPivot; check::Bool)
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:301
    [7] cholesky! (repeats 2 times)
      @ ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:295 [inlined]
    [8] cholesky(A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, ::NoPivot; check::Bool)
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/cholesky.jl:401
  ArgumentError: cannot take the CPU address of a CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}
  Stacktrace:
    [1] unsafe_convert(::Type{Ptr{Float32}}, x::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer})
      @ CUDA ~/Julia/pkg/CUDA/src/array.jl:484
    [2] ormqr!(side::Char, trans::Char, A::CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, tau::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, C::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
      @ LinearAlgebra.LAPACK ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/lapack.jl:2926
    [3] lmul!(adjQ::LinearAlgebra.AdjointQ{Float32, LinearAlgebra.QRPackedQ{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, B::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/abstractq.jl:347
    [4] mul!(C::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}, Q::LinearAlgebra.AdjointQ{Float32, LinearAlgebra.QRPackedQ{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, B::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/abstractq.jl:212
    [5] *(Q::LinearAlgebra.AdjointQ{Float32, LinearAlgebra.QRPackedQ{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}}, B::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
      @ LinearAlgebra ~/Julia/depot/juliaup/julia-1.10.2+0.x64.linux.gnu/share/julia/stdlib/v1.10/LinearAlgebra/src/abstractq.jl:171
    [6] ldiv!(_qr::QR{Float32, CuArray{Float32, 2, CUDA.Mem.DeviceBuffer}, CuArray{Float32, 1, CUDA.Mem.DeviceBuffer}}, b::CuArray{Float32, 1, CUDA.Mem.DeviceBuffer})
      @ CUDA.CUSOLVER ~/Julia/pkg/CUDA/lib/cusolver/linalg.jl:153
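
Both failures come from ordinary high-level calls. A hypothetical reproduction (array sizes arbitrary; with the overloads in place these calls go through CUSOLVER, and only fall through to the CPU LAPACK wrappers once the overloads are removed):

using LinearAlgebra
using CUDA

A = CUDA.rand(Float32, 4, 4)
b = CUDA.rand(Float32, 4)

# first trace: dense Cholesky ends up in LinearAlgebra.LAPACK.potrf! on a CuArray
cholesky(A * A')

# second trace: the QR-based solve computes Q' * b, which ends up in
# LinearAlgebra.LAPACK.ormqr! on CuArrays
F = qr(A)
ldiv!(F, copy(b))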

... and there are more failures like these. The last one is actually a good example of why we probably don't want this, because we already have several factorizations implemented at the higher API level, and it's confusing to have both those and methods inherited from LinearAlgebra:

using LinearAlgebra: Factorization, AbstractQ, QRCompactWY, QRCompactWYQ, QRPackedQ
## QR
LinearAlgebra.qr!(A::CuMatrix{T}) where T = QR(geqrf!(A::CuMatrix{T})...)
# conversions
CuMatrix(F::Union{QR,QRCompactWY}) = CuArray(AbstractArray(F))
CuArray(F::Union{QR,QRCompactWY}) = CuMatrix(F)
CuMatrix(F::QRPivoted) = CuArray(AbstractArray(F))
CuArray(F::QRPivoted) = CuMatrix(F)
function LinearAlgebra.ldiv!(_qr::QR, b::CuVector)
    m,n = size(_qr)
    _x = UpperTriangular(_qr.R[1:min(m,n), 1:n]) \ ((_qr.Q' * b)[1:n])
    b[1:n] .= _x
    unsafe_free!(_x)
    return b[1:n]
end
function LinearAlgebra.ldiv!(_qr::QR, B::CuMatrix)
    m,n = size(_qr)
    _x = UpperTriangular(_qr.R[1:min(m,n), 1:n]) \ ((_qr.Q' * B)[1:n, 1:size(B, 2)])
    B[1:n, 1:size(B, 2)] .= _x
    unsafe_free!(_x)
    return B[1:n, 1:size(B, 2)]
end
function LinearAlgebra.ldiv!(x::CuArray, _qr::QR, b::CuArray)
    _x = ldiv!(_qr, b)
    x .= vec(_x)
    unsafe_free!(_x)
    return x
end
# AbstractQ's `size` is the size of the full matrix,
# while `Matrix(Q)` only gives the compact Q.
# See JuliaLang/julia#26591 and JuliaGPU/CUDA.jl#969.
CuArray(Q::AbstractQ) = CuMatrix(Q)
CuArray{T}(Q::AbstractQ) where {T} = CuMatrix{T}(Q)
CuMatrix(Q::AbstractQ{T}) where {T} = CuMatrix{T}(Q)
CuMatrix{T}(Q::QRPackedQ{S}) where {T,S} =
    CuMatrix{T}(lmul!(Q, CuMatrix{S}(I, size(Q, 1), min(size(Q.factors)...))))
CuMatrix{T, B}(Q::QRPackedQ{S}) where {T, B, S} = CuMatrix{T}(Q)
CuMatrix{T}(Q::QRCompactWYQ) where {T} = error("QRCompactWY format is not supported")
# avoid the CPU array in the above mul!
Matrix{T}(Q::QRPackedQ{S,<:CuArray,<:CuArray}) where {T,S} = Array(CuMatrix{T}(Q))
Matrix{T}(Q::QRCompactWYQ{S,<:CuArray,<:CuArray}) where {T,S} = Array(CuMatrix{T}(Q))
if VERSION < v"1.10-"
    # extracting the full matrix can be done with `collect` (which defaults to `Array`)
    function Base.collect(src::Union{QRPackedQ{<:Any,<:CuArray,<:CuArray},
                                     QRCompactWYQ{<:Any,<:CuArray,<:CuArray}})
        dest = similar(src)
        copyto!(dest, I)
        lmul!(src, dest)
        collect(dest)
    end
    # avoid the generic similar fallback that returns a CPU array
    Base.similar(Q::Union{QRPackedQ{<:Any,<:CuArray,<:CuArray},
                          QRCompactWYQ{<:Any,<:CuArray,<:CuArray}},
                 ::Type{T}, dims::Dims{N}) where {T,N} =
        CuArray{T,N}(undef, dims)
end
function Base.getindex(Q::QRPackedQ{<:Any, <:CuArray}, ::Colon, j::Int)
    y = CUDA.zeros(eltype(Q), size(Q, 2))
    y[j] = 1
    lmul!(Q, y)
end
# multiplication by Q
LinearAlgebra.lmul!(A::QRPackedQ{T,<:CuArray,<:CuArray},
                    B::CuVecOrMat{T}) where {T<:BlasFloat} =
    ormqr!('L', 'N', A.factors, A.τ, B)
LinearAlgebra.lmul!(adjA::Adjoint{T,<:QRPackedQ{T,<:CuArray,<:CuArray}},
                    B::CuVecOrMat{T}) where {T<:BlasReal} =
    ormqr!('L', 'T', parent(adjA).factors, parent(adjA).τ, B)
LinearAlgebra.lmul!(adjA::Adjoint{T,<:QRPackedQ{T,<:CuArray,<:CuArray}},
                    B::CuVecOrMat{T}) where {T<:BlasComplex} =
    ormqr!('L', 'C', parent(adjA).factors, parent(adjA).τ, B)
LinearAlgebra.lmul!(trA::Transpose{T,<:QRPackedQ{T,<:CuArray,<:CuArray}},
                    B::CuVecOrMat{T}) where {T<:BlasFloat} =
    ormqr!('L', 'T', parent(trA).factors, parent(trA).τ, B)
LinearAlgebra.rmul!(A::CuVecOrMat{T},
                    B::QRPackedQ{T,<:CuArray,<:CuArray}) where {T<:BlasFloat} =
    ormqr!('R', 'N', B.factors, B.τ, A)
LinearAlgebra.rmul!(A::CuVecOrMat{T},
                    adjB::Adjoint{<:Any,<:QRPackedQ{T,<:CuArray,<:CuArray}}) where {T<:BlasReal} =
    ormqr!('R', 'T', parent(adjB).factors, parent(adjB).τ, A)
LinearAlgebra.rmul!(A::CuVecOrMat{T},
                    adjB::Adjoint{<:Any,<:QRPackedQ{T,<:CuArray,<:CuArray}}) where {T<:BlasComplex} =
    ormqr!('R', 'C', parent(adjB).factors, parent(adjB).τ, A)
LinearAlgebra.rmul!(A::CuVecOrMat{T},
                    trA::Transpose{<:Any,<:QRPackedQ{T,<:CuArray,<:CuArray}}) where {T<:BlasFloat} =
    ormqr!('R', 'T', parent(trA).factors, parent(trA).τ, A)
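
For the second trace specifically, the preferred style would be to cover Julia 1.10's AdjointQ wrapper with lmul! methods analogous to the Adjoint ones above, again dispatching to CUSOLVER's private ormqr! wrapper rather than LinearAlgebra.LAPACK.ormqr!. A rough sketch, not what CUDA.jl actually ships:

using LinearAlgebra
using LinearAlgebra: AdjointQ, QRPackedQ, BlasReal, BlasComplex
using CUDA

# route Q' * B for CuArray-backed packed Q factors through CUSOLVER
LinearAlgebra.lmul!(adjQ::AdjointQ{T,<:QRPackedQ{T,<:CuArray,<:CuArray}},
                    B::CUDA.CuVecOrMat{T}) where {T<:BlasReal} =
    CUDA.CUSOLVER.ormqr!('L', 'T', parent(adjQ).factors, parent(adjQ).τ, B)
LinearAlgebra.lmul!(adjQ::AdjointQ{T,<:QRPackedQ{T,<:CuArray,<:CuArray}},
                    B::CUDA.CuVecOrMat{T}) where {T<:BlasComplex} =
    CUDA.CUSOLVER.ormqr!('L', 'C', parent(adjQ).factors, parent(adjQ).τ, B)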

I don't feel particularly strongly about this because I'm not too familiar with the LinearAlgebra codebase, but it doesn't seem ideal from a maintainability perspective to mix both approaches.

cc @amontoison

Labels: cuda libraries (Stuff about CUDA library wrappers.), speculative (Not sure about this one yet.)
