gpu #47 (Open)

48 commits
d9baba2
Update typings to be more type-stable
v-morlock Nov 12, 2023
63af6ed
Reduce memory allocation and implement other performance improvements
v-morlock Nov 13, 2023
38372ae
add array dimensions
v-morlock Nov 19, 2023
fcfc558
re-enable multi threading
v-morlock Nov 19, 2023
0278c77
make Calculate_Q type stable
v-morlock Nov 19, 2023
ac01fe8
use AppleAccelerate in certain places
v-morlock Nov 23, 2023
de16e11
add intelvectormath
v-morlock Nov 23, 2023
ecbda77
minor
v-morlock Nov 23, 2023
6f5827a
more optimized methods
v-morlock Nov 24, 2023
4cb0ae9
minor
v-morlock Dec 13, 2023
8958e6b
log
v-morlock Dec 13, 2023
4455096
add benchmark results
v-morlock Dec 15, 2023
f41b969
fixes, make fp local
v-morlock Dec 15, 2023
51dee61
update project.toml
v-morlock Dec 15, 2023
3a34caa
precalculate y^rho
v-morlock Dec 16, 2023
9efe43a
things are faster now
v-morlock Dec 16, 2023
c84e8b2
optimize more sums
v-morlock Dec 16, 2023
9e607e6
fix sphering LL
behinger Dec 18, 2023
9df37ae
speed improvement from 272µs to 640ns
behinger Dec 19, 2023
3629500
insane additional 100x speed improvement in calculate_y
behinger Dec 19, 2023
16a39f6
this should work, but havent tested -oops
behinger Dec 19, 2023
df72b27
added missing broadcast to first minus, 340 to 230 µs
behinger Dec 19, 2023
fe60308
actual 10% slower, but less allocs. try later again to use?
behinger Dec 19, 2023
3bb8a13
15% or so improvement, much better on allocations
behinger Dec 19, 2023
fc2c85d
tried to pull out generation of Q
behinger Dec 19, 2023
6e41c09
merged the u changes, maybe not optimal, bt ok
behinger Dec 19, 2023
c8ba5fb
improved y_rho, removed experimental exp/pow
behinger Dec 19, 2023
abab784
1.46s/it vs. 0.6s/it
behinger Dec 21, 2023
0747103
finally works again....
behinger Dec 21, 2023
07eb2a4
rearrange indizes from [n, N, m] to [m, n, N] and implement intel powx
v-morlock Dec 30, 2023
7ce5a07
powx works
v-morlock Dec 30, 2023
c86b416
handling for pcs without intel vector math (apple silicon)
v-morlock Dec 31, 2023
884b96a
unsure
behinger Jan 18, 2024
2353207
Merge branch 'loopiloop_outsourceQ' into perf3
behinger Jan 18, 2024
63a6b5c
Move some things to the Amica object, format all files
v-morlock Jan 21, 2024
968dd9e
0,81 s / iter
v-morlock Jan 21, 2024
7711464
minor
v-morlock Jan 21, 2024
5a4494d
0,57s/iter
v-morlock Feb 2, 2024
9f49c41
minor
v-morlock Feb 2, 2024
6b8676c
ivm.abs
v-morlock Feb 2, 2024
02c1c23
replace pinv(x)*x with ldiv!()
behinger Feb 2, 2024
77a99f2
sum -> loop
v-morlock Feb 20, 2024
c99d809
one more sum to loop
v-morlock Feb 22, 2024
cb52bac
fix float32
behinger Feb 22, 2024
9bab7a3
fix float32
behinger Feb 22, 2024
480b5ba
supposedly now whitening does something?
behinger Feb 22, 2024
9ab9b5f
better testcases maybe
behinger Feb 22, 2024
d2e837b
added GPU support
behinger Feb 22, 2024
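
One commit above, "replace pinv(x)*x with ldiv!()", names a common allocation-saving substitution. The lines below are a minimal standalone sketch of that pattern in Julia, not code from this PR; the matrix names and sizes are illustrative only.

using LinearAlgebra

A = rand(4, 4) + 4I   # well-conditioned square system
B = rand(4, 3)

# Old pattern: form the pseudoinverse explicitly, then multiply (extra allocations).
X_old = pinv(A) * B

# New pattern: factorize once and solve in place into a preallocated buffer.
X_new = similar(B)
ldiv!(X_new, lu(A), B)

@assert X_old ≈ X_new
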
2 changes: 2 additions & 0 deletions .gitignore
@@ -4,3 +4,5 @@
/Manifest.toml
/docs/build/
/.CondaPkg/
.DS_Store
*.mem
14 changes: 14 additions & 0 deletions Project.toml
@@ -4,17 +4,31 @@ authors = ["Alexander Lulkin", "Benedikt V. Ehinger"]
version = "0.1.0"

[deps]
AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924"
BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf"
CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b"
CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0"
ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66"
DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0"
Diagonalizations = "9cd687f3-b62d-43f3-8fd3-ffcd9e581047"
Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f"
FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549"
GaussianMixtures = "cc18c42c-b769-54ff-9e2a-b28141a64aae"
IntelVectorMath = "c8ce9da6-5d36-5c03-b118-5a70151be7bc"
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688"
LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890"
MAT = "23992714-dd62-5051-b70f-ba57cb901cac"
MATLAB = "10e44e05-a98a-55b3-a45b-ba969058deb6"
MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7"
Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a"
ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca"
Revise = "295af30f-e4ad-537b-8983-00126c2a3abe"
SignalAnalysis = "df1fea92-c066-49dd-8b36-eace3378ea47"
SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc"

[compat]
ComponentArrays = "0.14"
114 changes: 66 additions & 48 deletions src/Amica.jl
@@ -1,55 +1,73 @@
#Amica.jl is based on a MATLAB implementation of AMICA by Jason Palmer.
module Amica
using LinearAlgebra
using GaussianMixtures
using Distributions
using SpecialFunctions
using ProgressMeter
#using ComponentArrays
using Diagonalizations
using LogExpFunctions
#using MultivariateStats
#using StatsAPI
include("types.jl")
include("helper.jl")
include("likelihood.jl")
include("parameters.jl")
include("newton.jl")
include("main.jl")

export amica!
export fit,fit!
export AbstractAmica,MultiModelAmica,SingleModelAmica


import Base.show

function Base.show(io::Core.IO,m::SingleModelAmica)
try
global like = m.LL[findlast(m.LL .!= 0)]
catch
global like = "not run"
end
println(io,"""
Amica with:
- signal-size: $(size(m.source_signals))
- likelihood: $(like)
""")
using LinearAlgebra
using GaussianMixtures
using Distributions
using MKL_jll
using IntelVectorMath
using SpecialFunctions
using ProgressMeter
using LoopVectorization
using AppleAccelerate
using StaticArrays
#using ComponentArrays
using Diagonalizations
using LogExpFunctions
using CUDA
import CUDA: CuArray
using Tullio

#using MultivariateStats
#using StatsAPI
include("types.jl")
include("types_gpu.jl")
include("helper.jl")
include("likelihood.jl")
include("parameters.jl")
include("newton.jl")
include("main.jl")

export amica!
export fit, fit!
export AbstractAmica, MultiModelAmica, SingleModelAmica, CuSingleModelAmica


import Base.show

function Base.show(io::Core.IO, m::Union{CuSingleModelAmica,SingleModelAmica})
try
ix = findlast(.!isnan.(m.LL))
@show ix
global like = string(m.LL[ix]) * " (after $(string(ix)) iterations)"
catch
global like = "not run"
end
println(
io,
"""
$(typeof(m)) with:
- signal-size: $(size(m.source_signals))
- likelihood: $(like)
"""
)
end

function Base.show(io::Core.IO,m::MultiModelAmica)
try
global like = m.LL[findlast(m.LL .!= 0)]
catch
global like = "not run"
end
println(io,"""
Amica with:
- models: $(length(m.models))
- signal-size: $(size(m.models[1].source_signals))
- likelihood: $(like)
""")
function Base.show(io::Core.IO, m::MultiModelAmica)
try
global like = m.LL[findlast(m.LL .!= 0)]
catch
global like = "not run"
end
println(
io,
"""
Amica with:
- models: $(length(m.models))
- signal-size: $(size(m.models[1].source_signals))
- likelihood: $(like)
"""
)
end



end
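
The reworked show method above finds the last finished iteration with findlast(.!isnan.(m.LL)) instead of findlast(m.LL .!= 0), i.e. the log-likelihood trace is now treated as NaN-padded rather than zero-padded. A tiny standalone illustration of that lookup (the vector here is made up; only the indexing idiom is taken from the diff):

# Log-likelihood trace, padded with NaN for iterations that have not run yet.
LL = [-3.1, -2.4, -2.0, NaN, NaN]

ix = findlast(.!isnan.(LL))   # index of the last completed iteration, 3 here
last_ll = LL[ix]              # -2.0
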
188 changes: 164 additions & 24 deletions src/helper.jl
@@ -1,43 +1,183 @@
#removes mean from nxN float matrix
function removeMean!(input)
mn = mean(input,dims=2)
(n,N) = size(input)
for i in 1:n
input[i,:] = input[i,:] .- mn[i]
end
return mn
mn = mean(input, dims=2)
(n, N) = size(input)
for i in 1:n
input[i, :] .= input[i, :] .- mn[i]
end
return mn
end

#Returns sphered data x. todo:replace with function from lib
function sphering(x)
(n,N) = size(x)
Us,Ss,Vs = svd(x*x'/N)
S = Us * diagm(vec(1 ./sqrt.(Ss))) * Us'
return x = S*x
function sphering_manual!(x::T) where {T}
(_, N) = size(x)
Us, Ss = svd(x * x' / N)
#@debug typeof(Us), typeof(diagm(1 ./ sqrt.(Ss)))
S = Us * T(diagm((1 ./ sqrt.(Ss)))) * Us'
x .= S * x
# @show x
return S
end

function bene_sphering(data)
d_memory_whiten = whitening(data) # Todo: make the dimensionality reduction optional
return d_memory_whiten.iF * data
function sphering!(data)
d_memory_whiten = whitening(data; simple=true)
data .= d_memory_whiten.iF * data
return d_memory_whiten
end

#Adds means back to model centers
add_means_back!(myAmica::SingleModelAmica,removed_mean) = nothing
add_means_back!(myAmica::AbstractAmica, removed_mean) = nothing

function add_means_back!(myAmica::MultiModelAmica, removed_mean)
M = size(myAmica.models,1)
for h in 1:M
myAmica.models[h].centers = myAmica.models[h].centers + removed_mean #add mean back to model centers
end
M = size(myAmica.models, 1)
for h in 1:M
myAmica.models[h].centers = myAmica.models[h].centers + removed_mean #add mean back to model centers
end
end

#taken from amica_a.m
#L = det(A) * mult p(s|θ)
function logpfun(x,rho)
return (.-abs.(x).^rho .- log(2) .- loggamma.(1+1/rho))

function logpfun!(out::CuArray{T,3}, y_rho::CuArray{T,3}, shape::CuArray{T,2}) where {T<:Real}
out .= 1 .+ 1 ./ shape

out .= .-y_rho .- log(2) .- loggamma.(out)
end
function logpfun!(out::AbstractArray{T,3}, y_rho::AbstractArray{T,3}, shape::AbstractArray{T,2}) where {T<:Real}
out .= 1 .+ 1 ./ shape
IVM.lgamma!(out)
out .= .-y_rho .- log(2) .- out
end

#taken from amica_a.m
function ffun(x,rho)
return rho .* sign.(x) .* abs.(x) .^(rho.-1)
function ffun!(fp::AbstractArray{T,3}, y::AbstractArray{T,3}, rho::AbstractArray{T,2}) where {T<:Real}
(m, n, N) = size(y)

fp .= abs.(y)

for i in 1:n
for j in 1:m
@views _fp = fp[j, i, :]
@views optimized_pow!(_fp, _fp, rho[j, i] - 1)
end
end

fp .*= sign.(y) .* rho
end

# intelvectormath Pow

function optimized_pow(lhs::AbstractArray{T,1}, rhs::T)::AbstractArray{T,1} where {T<:Real}
out = similar(lhs)
optimized_pow!(out, lhs, rhs)
return out

end



function optimized_pow!(out::AbstractArray{Float32}, lhs::AbstractArray{Float32}, rhs::Float32)

if !hasproperty(MKL_jll, :libmkl_rt) || out isa SubArray{Float32,1,<:CuArray}
out .= lhs .^ rhs
return
end

sta = IVM.stride1(lhs)
sto = IVM.stride1(out)
dense = (sta == 1 && sto == 1)

if dense
@ccall MKL_jll.libmkl_rt.vsPowx(length(lhs)::Cint, lhs::Ptr{Float32}, rhs::Float32, out::Ptr{Float32})::Cvoid
else
@ccall MKL_jll.libmkl_rt.vsPowxI(length(lhs)::Cint, lhs::Ptr{Float32}, sta::Cint, rhs::Float32, out::Ptr{Float32}, sto::Cint)::Cvoid
end
end

#function optimized_pow!(out::SubArray{T,1,<:CuArray{T}}, lhs::SubArray{T,1,<:CuArray{T}}, rhs::T) where {T<:Union{Float32,Float64}}
# out .= lhs .^ rhs
#end
function optimized_pow!(out::AbstractArray{Float64}, lhs::AbstractArray{Float64}, rhs::Float64)
if !hasproperty(MKL_jll, :libmkl_rt) || out isa SubArray{Float64,1,<:CuArray}
out .= lhs .^ rhs
return
end

sta = IVM.stride1(lhs)
sto = IVM.stride1(out)
dense = (sta == 1 && sto == 1)

if dense
@ccall MKL_jll.libmkl_rt.vdPowx(length(lhs)::Cint, lhs::Ptr{Float64}, rhs::Float64, out::Ptr{Float64})::Cvoid
else
@ccall MKL_jll.libmkl_rt.vdPowxI(length(lhs)::Cint, lhs::Ptr{Float64}, sta::Cint, rhs::Float64, out::Ptr{Float64}, sto::Cint)::Cvoid
end
end

# intelvectormath Log

function optimized_log(in::CuArray)
log.(in)
end
function optimized_log(in::AbstractArray{T})::AbstractArray{T} where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
return log.(in)
end

return IVM.log(in)
end

function optimized_log!(inout::AbstractArray{T}) where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
inout .= log.(inout)
return
end
IVM.log!(inout)
end

function optimized_log!(out::AbstractArray{T}, in::AbstractArray{T}) where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
out .= log.(in)
return
end

IVM.log!(out, in)
end


# intelvectormath Exp

function optimized_exp(in::AbstractArray{T})::AbstractArray{T} where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
return exp.(in)
end

return IVM.exp(in)
end

function optimized_exp!(inout::CuArray)
inout .= exp.(inout)
end
function optimized_exp!(inout::AbstractArray{T}) where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
inout .= exp.(inout)
return
end

IVM.exp!(inout)
end

function optimized_exp!(out::AbstractArray{T}, in::AbstractArray{T}) where {T<:Real}
if !hasproperty(MKL_jll, :libmkl_rt)
out .= exp.(in)
return
end

IVM.exp!(out, in)
end

function optimized_abs!(myAmica::CuSingleModelAmica)
myAmica.y_rho .= abs.(myAmica.y)
end
function optimized_abs!(myAmica)
IVM.abs!(myAmica.y_rho, myAmica.y)
end
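
The optimized_* helpers above all follow the same dispatch pattern: call MKL's vectorized kernels when MKL_jll.libmkl_rt is available and the target is not a CUDA array or view, and fall back to plain broadcasting otherwise. A short usage sketch under that assumption; these are internal helpers, so outside the module they would be reached as Amica.optimized_pow! and so on.

x = rand(Float32, 1_000)
out = similar(x)

# Elementwise x .^ 2.5f0; dispatches to vsPowx when MKL is present, broadcasting otherwise.
optimized_pow!(out, x, 2.5f0)

# In-place elementwise log and exp with the same MKL-or-broadcast fallback.
optimized_log!(out)
optimized_exp!(out)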