Use sharedarrays on Mtl (#75)

matthieugomez · web-flow · commit b39ad90bc49a · 2026-02-02T13:24:45.000-06:00
* Change nthreads default to nothing in solver

* Update AbstractFixedEffectSolver to use optional nthreads

* Allow optional nthreads parameter in AbstractFixedEffectSolver

* Change nthreads parameter to 'nothing' in solver functions

* Bump version from 2.5.2 to 2.6.0

* Update method and double_precision arguments in functions

* Fix capitalization of 'Metal' in method argument

* Update benchmark_Metal.jl

* Reformat function signatures for consistency

* Decrease maxiter by 1 in lsmr! call

* safer to use Int for big arrays and not more costly

* Update MetalExt.jl

* better to do chunks of 100_000 even if it means more threads than Threads.nthreads

* Update SolverCPU.jl

* Use shared arrays for Metal

* Update Project.toml

* Update AbstractFixedEffectSolver.jl

* Update MetalExt.jl

* Remove nthreads
diff --git a/Project.toml b/Project.toml
@@ -1,6 +1,7 @@
 name = "FixedEffects"
 uuid = "c8885935-8500-56a7-9867-7708b20db0eb"
-version = "2.6.0"
+version = "2.7.0"
+
 
 [deps]
 LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
diff --git a/ext/CUDAExt.jl b/ext/CUDAExt.jl
@@ -1,6 +1,6 @@
 module CUDAExt
 using FixedEffects, CUDA
-using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!,  lsmr!, AbstractFixedEffectLinearMap
+using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!,  lsmr!, AbstractFixedEffectLinearMap, copy_internal!
 CUDA.allowscalar(false)
 
 ##############################################################################
@@ -36,17 +36,17 @@ mutable struct FixedEffectLinearMapCUDA{T} <: AbstractFixedEffectLinearMap{T}
 	fes::Vector{<:FixedEffect}
 	scales::Vector{<:AbstractVector}
 	caches::Vector{<:AbstractVector}
-	nthreads::Int
 end
 
-function FixedEffectLinearMapCUDA{T}(fes::Vector{<:FixedEffect}, nthreads) where {T}
+function FixedEffectLinearMapCUDA{T}(fes::Vector{<:FixedEffect}) where {T}
 	fes = [_cu(T, fe) for fe in fes]
 	scales = [CUDA.zeros(T, fe.n) for fe in fes]
 	caches = [CUDA.zeros(T, length(fes[1].interaction)) for fe in fes]
-	return FixedEffectLinearMapCUDA{T}(fes, scales, caches, nthreads)
+	return FixedEffectLinearMapCUDA{T}(fes, scales, caches)
 end
 
-function FixedEffects.gather!(fecoef::CuVector, refs::CuVector, α::Number, y::CuVector, cache::CuVector, nthreads::Integer)
+function FixedEffects.gather!(fecoef::CuVector, refs::CuVector, α::Number, y::CuVector, cache::CuVector)
+	nthreads = 256
 	nblocks = cld(length(y), nthreads) 
 	@cuda threads=nthreads blocks=nblocks gather_kernel!(fecoef, refs, α, y, cache)    
 end
@@ -61,7 +61,8 @@ function gather_kernel!(fecoef, refs, α, y, cache)
 	end
 end
 
-function FixedEffects.scatter!(y::CuVector, α::Number, fecoef::CuVector, refs::CuVector, cache::CuVector, nthreads::Integer)
+function FixedEffects.scatter!(y::CuVector, α::Number, fecoef::CuVector, refs::CuVector, cache::CuVector)
+	nthreads = 256
 	nblocks = cld(length(y), nthreads)
 	@cuda threads=nthreads blocks=nblocks scatter_kernel!(y, α, fecoef, refs, cache)
 end
@@ -101,11 +102,7 @@ function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, w
 end
 
 function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:CUDA}}, nthreads = nothing) where {T}
-	if nthreads === nothing
-		nthreads = 256
-	end
-	nthreads = prevpow(2, nthreads)
-	m = FixedEffectLinearMapCUDA{T}(fes, nthreads)
+	m = FixedEffectLinearMapCUDA{T}(fes)
 	b = CUDA.zeros(T, length(weights))
 	r = CUDA.zeros(T, length(weights))
 	x = FixedEffectCoefficients([CUDA.zeros(T, fe.n) for fe in fes])
@@ -120,15 +117,16 @@ end
 function FixedEffects.update_weights!(feM::FixedEffectSolverCUDA{T}, weights::AbstractWeights) where {T}
 	copyto!(feM.weights, _cu(T, weights))
 	for (scale, fe) in zip(feM.m.scales, feM.m.fes)
-		scale!(scale, fe.refs, fe.interaction, feM.weights, feM.m.nthreads)
+		scale!(scale, fe.refs, fe.interaction, feM.weights)
 	end
 	for (cache, scale, fe) in zip(feM.m.caches, feM.m.scales, feM.m.fes)
-		cache!(cache, fe.refs, fe.interaction, feM.weights, scale, feM.m.nthreads)
+		cache!(cache, fe.refs, fe.interaction, feM.weights, scale)
 	end	
 	return feM
 end
 
-function scale!(scale::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, nthreads::Integer)
+function scale!(scale::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector)
+	nthreads = 256
 	nblocks = cld(length(refs), nthreads) 
     fill!(scale, 0)
 	@cuda threads=nthreads blocks=nblocks scale_kernel!(scale, refs, interaction, weights)
@@ -145,7 +143,8 @@ function scale_kernel!(scale, refs, interaction, weights)
 	end
 end
 
-function cache!(cache::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, scale::CuVector, nthreads::Integer)
+function cache!(cache::CuVector, refs::CuVector, interaction::CuVector, weights::CuVector, scale::CuVector)
+	nthreads = 256
 	nblocks = cld(length(cache), nthreads) 
 	@cuda threads=nthreads blocks=nblocks cache!_kernel!(cache, refs, interaction, weights, scale)
 end
@@ -160,6 +159,15 @@ function cache!_kernel!(cache, refs, interaction, weights, scale)
 	end
 end
 
+function FixedEffects.copy_internal!(feM::FixedEffectSolverCUDA, field::Symbol, r::AbstractVector)
+	copyto!(feM.tmp, r)
+	copyto!(getfield(feM, field), feM.tmp)
+end
+
+function FixedEffects.copy_internal!(r::AbstractVector, feM::FixedEffectSolverCUDA, field::Symbol)
+	copyto!(feM.tmp, getfield(feM, field))
+	copyto!(r, feM.tmp)
+end
 
 
 end
diff --git a/ext/MetalExt.jl b/ext/MetalExt.jl
@@ -1,6 +1,6 @@
 module MetalExt
 using FixedEffects, Metal
-using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!, lsmr!, AbstractFixedEffectLinearMap
+using FixedEffects: FixedEffectCoefficients, AbstractWeights, UnitWeights, LinearAlgebra, Adjoint, mul!, rmul!, lsmr!, AbstractFixedEffectLinearMap, copy_internal!
 Metal.allowscalar(false)
 
 ##############################################################################
@@ -35,50 +35,53 @@ mutable struct FixedEffectLinearMapMetal{T} <: AbstractFixedEffectLinearMap{T}
 	fes::Vector{<:FixedEffect}
 	scales::Vector{<:AbstractVector}
 	caches::Vector
-	nthreads::Int
 end
 
 function bucketize_refs(refs::Vector, n::Int)
 	# count the number of obs per group
-    counts = zeros(Int, n)
-    @inbounds for r in refs
-        counts[r] += 1
-    end
+  counts = zeros(Int, n)
+  @inbounds for r in refs
+    counts[r] += 1
+  end
 	# offsets is vcat(1, cumsum(counts))
-    offsets = Vector{Int}(undef, n + 1)
+    offsets_mtl = Metal.@sync Metal.zeros(Int, n + 1; storage = Metal.SharedStorage)
+    offsets = unsafe_wrap(Array{Int}, offsets_mtl, size(offsets_mtl))
     offsets[1] = 1
     @inbounds for k in 1:n
         offsets[k+1] = offsets[k] + counts[k]
     end
+
+    perm_mtl = Metal.@sync Metal.zeros(Int, length(refs); storage = Metal.SharedStorage)
+    perm = unsafe_wrap(Array{Int}, perm_mtl, size(perm_mtl))
     next = offsets[1:n]
-    perm = Vector{Int}(undef, length(refs))
     @inbounds for i in eachindex(refs)
         r = refs[i]
         p = next[r]
         perm[p] = i
         next[r] = p + 1
     end
-    return perm, offsets
+    return perm_mtl, offsets_mtl
 end
 
-function FixedEffectLinearMapMetal{T}(fes::Vector{<:FixedEffect}, nthreads) where {T}
+function FixedEffectLinearMapMetal{T}(fes::Vector{<:FixedEffect}) where {T}
 	fes2 = [_mtl(T, fe) for fe in fes]
 	scales = [Metal.zeros(T, fe.n) for fe in fes]
-	caches = [[Metal.zeros(T, length(fe.refs)), Metal.zeros(Int, 1), Metal.zeros(Int, 1)] for fe in fes]
+	caches = [Any[Metal.zeros(T, length(fe.refs)), Metal.zeros(Int, 1), Metal.zeros(Int, 1)] for fe in fes]
 	Threads.@threads for i in 1:length(fes)
 		refs = fes[i].refs
 		n = fes[i].n
 		if n < min(100_000,  div(length(refs), 16))	
 			out = bucketize_refs(refs, n)
-			caches[i][2] = MtlArray(out[1])
-			caches[i][3] = MtlArray(out[2])
+			caches[i][2] = out[1]
+			caches[i][3] = out[2]
 		end
 	end
-	return FixedEffectLinearMapMetal{T}(fes2, scales, caches, nthreads)
+	return FixedEffectLinearMapMetal{T}(fes2, scales, caches)
 end
 
-function FixedEffects.gather!(fecoef::MtlVector, refs::MtlVector, α::Number, y::MtlVector, cache::Vector, nthreads::Integer)
+function FixedEffects.gather!(fecoef::MtlVector, refs::MtlVector, α::Number, y::MtlVector, cache::Vector)
 	n = length(fecoef)
+	nthreads = Int(device().maxThreadsPerThreadgroup.width)
 	if n < min(100_000,  div(length(refs), 16))
 		Metal.@sync @metal threads=nthreads groups=n gather_kernel_bin!(fecoef, refs, α, y, cache[1], cache[2], cache[3], Val(nthreads))
 	else
@@ -138,7 +141,8 @@ function gather_kernel!(fecoef, refs, α, y, cache)
 	return nothing
 end
 
-function FixedEffects.scatter!(y::MtlVector, α::Number, fecoef::MtlVector, refs::MtlVector, cache::Vector, nthreads::Integer)
+function FixedEffects.scatter!(y::MtlVector, α::Number, fecoef::MtlVector, refs::MtlVector, cache::Vector)
+	nthreads = Int(device().maxThreadsPerThreadgroup.width)
 	nblocks = cld(length(y), nthreads)
 	Metal.@sync @metal threads=nthreads groups=nblocks scatter_kernel!(y, α, fecoef, refs, cache[1])
 end
@@ -168,40 +172,36 @@ mutable struct FixedEffectSolverMetal{T} <: FixedEffects.AbstractFixedEffectSolv
 	v::FixedEffectCoefficients{<: AbstractVector{T}}
 	h::FixedEffectCoefficients{<: AbstractVector{T}}
 	hbar::FixedEffectCoefficients{<: AbstractVector{T}}
-	tmp::Vector{T} # used to convert AbstractVector to Vector{T}
 	fes::Vector{<:FixedEffect}
 end
+
 	
 function FixedEffects.AbstractFixedEffectSolver{T}(fes::Vector{<:FixedEffect}, weights::AbstractWeights, ::Type{Val{:Metal}}, nthreads = nothing) where {T}
-	if nthreads === nothing
-		nthreads = Int(device().maxThreadsPerThreadgroup.width)
-	end
-	nthreads = prevpow(2, nthreads)
-	m = FixedEffectLinearMapMetal{T}(fes, nthreads)
-	b = Metal.zeros(T, length(weights))
-	r = Metal.zeros(T, length(weights))
+	m = FixedEffectLinearMapMetal{T}(fes)
+	b = Metal.zeros(T, length(weights); storage = Metal.SharedStorage)
+	r = Metal.zeros(T, length(weights); storage = Metal.SharedStorage)
 	x = FixedEffectCoefficients([Metal.zeros(T, fe.n) for fe in fes])
 	v = FixedEffectCoefficients([Metal.zeros(T, fe.n) for fe in fes])
 	h = FixedEffectCoefficients([Metal.zeros(T, fe.n) for fe in fes])
 	hbar = FixedEffectCoefficients([Metal.zeros(T, fe.n) for fe in fes])
-	tmp = zeros(T, length(weights))
-	feM = FixedEffectSolverMetal{T}(m, Metal.zeros(T, length(weights)), b, r, x, v, h, hbar, tmp, fes)
+	feM = FixedEffectSolverMetal{T}(m, Metal.zeros(T, length(weights)), b, r, x, v, h, hbar, fes)
 	FixedEffects.update_weights!(feM, weights)
 end
 
 
 function FixedEffects.update_weights!(feM::FixedEffectSolverMetal{T}, weights::AbstractWeights) where {T}
 	copyto!(feM.weights, _mtl(T, weights))
 	for (scale, fe) in zip(feM.m.scales, feM.m.fes)
-		scale!(scale, fe.refs, fe.interaction, feM.weights, feM.m.nthreads)
+		scale!(scale, fe.refs, fe.interaction, feM.weights)
 	end
 	for (cache, scale, fe) in zip(feM.m.caches, feM.m.scales, feM.m.fes)
-		cache!(cache, fe.refs, fe.interaction, feM.weights, scale, feM.m.nthreads)
+		cache!(cache, fe.refs, fe.interaction, feM.weights, scale)
 	end	
 	return feM
 end
 
-function scale!(scale::MtlVector, refs::MtlVector, interaction::MtlVector, weights::MtlVector, nthreads::Integer)
+function scale!(scale::MtlVector, refs::MtlVector, interaction::MtlVector, weights::MtlVector)
+	nthreads = Int(device().maxThreadsPerThreadgroup.width)
 	nblocks = cld(length(refs), nthreads) 
     fill!(scale, 0)
 	Metal.@sync @metal threads=nthreads groups=nblocks scale_kernel!(scale, refs, interaction, weights)
@@ -224,7 +224,8 @@ function inv_kernel!(scale, T)
 	return nothing
 end
 
-function cache!(cache, refs::MtlVector, interaction::MtlVector, weights::MtlVector, scale::MtlVector, nthreads::Integer)
+function cache!(cache, refs::MtlVector, interaction::MtlVector, weights::MtlVector, scale::MtlVector)
+	nthreads = Int(device().maxThreadsPerThreadgroup.width)
 	nblocks = cld(length(cache[1]), nthreads) 
 	Metal.@sync @metal threads=nthreads groups=nblocks cache!_kernel!(cache[1], refs, interaction, weights, scale)
 end
@@ -237,5 +238,17 @@ function cache!_kernel!(cache, refs, interaction, weights, scale)
 	return nothing
 end
 
+function FixedEffects.copy_internal!(feM::FixedEffectSolverMetal{T}, field::Symbol, r::AbstractVector) where {T}
+	synchronize()
+	feM_r = unsafe_wrap(Array{T}, getfield(feM, field), size(getfield(feM, field)))
+	copyto!(feM_r, r)
+end
+
+function FixedEffects.copy_internal!(r::AbstractVector, feM::FixedEffectSolverMetal{T}, field::Symbol) where {T}
+	synchronize()
+	feM_r = unsafe_wrap(Array{T}, getfield(feM, field), size(getfield(feM, field)))
+	copyto!(r, feM_r)
+end
+
 
 end
diff --git a/src/AbstractFixedEffectLinearMap.jl b/src/AbstractFixedEffectLinearMap.jl
@@ -29,7 +29,7 @@ function LinearAlgebra.mul!(fecoefs::FixedEffectCoefficients,
 	fem = adjoint(Cfem)
 	rmul!(fecoefs, β)
 	for (fecoef, fe, cache) in zip(fecoefs.x, fem.fes, fem.caches)
-		gather!(fecoef, fe.refs, α, y, cache, fem.nthreads)
+		gather!(fecoef, fe.refs, α, y, cache)
 	end
 	return fecoefs
 end
@@ -38,7 +38,7 @@ function LinearAlgebra.mul!(y::AbstractVector, fem::AbstractFixedEffectLinearMap
 			  fecoefs::FixedEffectCoefficients, α::Number, β::Number)
 	rmul!(y, β)
 	for (fecoef, fe, cache) in zip(fecoefs.x, fem.fes, fem.caches)
-		scatter!(y, α, fecoef, fe.refs, cache, fem.nthreads)
+		scatter!(y, α, fecoef, fe.refs, cache)
 	end
 	return y
 end
diff --git a/src/AbstractFixedEffectSolver.jl b/src/AbstractFixedEffectSolver.jl
@@ -6,7 +6,6 @@
 ##
 ##############################################################################
 abstract type AbstractFixedEffectSolver{T} end
-works_with_view(::AbstractFixedEffectSolver) = false
 
 """
 `solve_residuals!(y, fes, w; method = :cpu, double_precision = method == :cpu, tol = 1e-8, maxiter = 10000)`
@@ -43,22 +42,17 @@ function solve_residuals!(y::Union{AbstractVector{<: Real}, AbstractMatrix{<: Re
 	nthreads = nothing)
 	any((length(fe) != size(y, 1) for fe in fes)) && throw("FixedEffects must have the same length as y")
 	any(ismissing.(fes)) && throw("FixedEffects must not have missing values")
-	feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, w, Val{method}, nthreads)
+	feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, w, Val{method})
 	solve_residuals!(y, feM; maxiter = maxiter, tol = tol)
 end
 
 
 
 function solve_residuals!(r::AbstractVector{<:Real}, feM::AbstractFixedEffectSolver{T}; tol::Real = sqrt(eps(T)), maxiter::Integer = 100_000) where {T}
 	# One cannot copy view of Vector (r) on GPU, so first collect the vector
-	if works_with_view(feM)
-		copyto!(feM.r, r)
-	else
-		copyto!(feM.tmp, r)
-		copyto!(feM.r, feM.tmp)
-	end
+	copy_internal!(feM, :r, r)
 	if !(feM.weights isa UnitWeights)
-		 feM.r .*= sqrt.(feM.weights)
+		feM.r .*= sqrt.(feM.weights)
 	end
 	copyto!(feM.b, feM.r)
 	mul!(feM.x, feM.m', feM.b, 1, 0)
@@ -71,12 +65,7 @@ function solve_residuals!(r::AbstractVector{<:Real}, feM::AbstractFixedEffectSol
 	if !(feM.weights isa UnitWeights)
 		feM.r ./=  sqrt.(feM.weights)
 	end
-	if works_with_view(feM)
-		copyto!(r, feM.r)
-	else
-		copyto!(feM.tmp, feM.r)
-		copyto!(r, feM.tmp)
-	end
+	copy_internal!(r, feM, :r)
 	return r, iter, converged
 end
 
@@ -160,18 +149,13 @@ function solve_coefficients!(y::AbstractVector{<: Number}, fes::AbstractVector{<
 		nthreads = nothing)
 	any(ismissing.(fes)) && throw("Some FixedEffect has a missing value for reference or interaction")
 	any((length(fe) != length(y) for fe in fes))  && throw("FixedEffects must have the same length as y")
-	feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, w, Val{method}, nthreads)
+	feM = AbstractFixedEffectSolver{double_precision ? Float64 : Float32}(fes, w, Val{method})
 	solve_coefficients!(y, feM; maxiter = maxiter, tol = tol)
 end
 
 function FixedEffects.solve_coefficients!(r::AbstractVector, feM::AbstractFixedEffectSolver{T}; tol::Real = sqrt(eps(T)), maxiter::Integer = 100_000) where {T}
 	# One cannot copy view of Vector (r) on GPU, so first collect the vector
-	if works_with_view(feM)
-		copyto!(feM.b, r)
-	else
-		copyto!(feM.tmp, r)
-		copyto!(feM.b, feM.tmp)
-	end
+	copy_internal!(feM, :b, r)
 	if !(feM.weights isa UnitWeights)
 		feM.b .*= sqrt.(feM.weights)
 	end
diff --git a/src/SolverCPU.jl b/src/SolverCPU.jl