diff --git a/.gitignore b/.gitignore index eb7e1f7..1c958d7 100755 --- a/.gitignore +++ b/.gitignore @@ -4,3 +4,5 @@ /Manifest.toml /docs/build/ /.CondaPkg/ +.DS_Store +*.mem \ No newline at end of file diff --git a/Project.toml b/Project.toml index 1f4c43b..ae24eb8 100755 --- a/Project.toml +++ b/Project.toml @@ -4,17 +4,31 @@ authors = ["Alexander Lulkin", "Benedikt V. Ehinger"] version = "0.1.0" [deps] +AppleAccelerate = "13e28ba4-7ad8-5781-acae-3021b1ed3924" +BenchmarkTools = "6e4b80f9-dd63-53aa-95a3-0cdb28fa8baf" +CSV = "336ed68f-0bac-5ca0-87d4-7b16caf5d00b" +CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba" +CairoMakie = "13f3f980-e62b-5c42-98c6-ff1f3baf88f0" ComponentArrays = "b0b7db55-cfe3-40fc-9ded-d10e2dbeff66" +DataFrames = "a93c6f00-e57d-5684-b7b6-d8193f3e46c0" Diagonalizations = "9cd687f3-b62d-43f3-8fd3-ffcd9e581047" Distributions = "31c24e10-a181-5473-b8eb-7969acd0382f" +FileIO = "5789e2e9-d7fb-5bc7-8068-2c6fae9b9549" GaussianMixtures = "cc18c42c-b769-54ff-9e2a-b28141a64aae" +IntelVectorMath = "c8ce9da6-5d36-5c03-b118-5a70151be7bc" LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e" LogExpFunctions = "2ab3a3ac-af41-5b50-aa03-7779005ae688" +LoopVectorization = "bdcacae8-1622-11e9-2a5c-532679323890" MAT = "23992714-dd62-5051-b70f-ba57cb901cac" +MATLAB = "10e44e05-a98a-55b3-a45b-ba969058deb6" +MKL_jll = "856f044c-d86e-5d09-b602-aeab76dc8ba7" Parameters = "d96e819e-fc66-5662-9728-84c9c7592b0a" ProgressMeter = "92933f4c-e287-5a05-a399-4b506db050ca" +Revise = "295af30f-e4ad-537b-8983-00126c2a3abe" SignalAnalysis = "df1fea92-c066-49dd-8b36-eace3378ea47" SpecialFunctions = "276daf66-3868-5448-9aa4-cd146d93841b" +StaticArrays = "90137ffa-7385-5640-81b9-e52037218182" +Tullio = "bc48ee85-29a4-5162-ae0b-a64e1601d4bc" [compat] ComponentArrays = "0.14" diff --git a/src/Amica.jl b/src/Amica.jl index 7401a1a..4c297b5 100755 --- a/src/Amica.jl +++ b/src/Amica.jl @@ -1,55 +1,73 @@ #Amica.jl is based on a MATLAB implementation of AMICA by Jason Palmer. module Amica - using LinearAlgebra - using GaussianMixtures - using Distributions - using SpecialFunctions - using ProgressMeter - #using ComponentArrays - using Diagonalizations - using LogExpFunctions - #using MultivariateStats - #using StatsAPI - include("types.jl") - include("helper.jl") - include("likelihood.jl") - include("parameters.jl") - include("newton.jl") - include("main.jl") - - export amica! - export fit,fit! - export AbstractAmica,MultiModelAmica,SingleModelAmica - - - import Base.show - - function Base.show(io::Core.IO,m::SingleModelAmica) - try - global like = m.LL[findlast(m.LL .!= 0)] - catch - global like = "not run" - end - println(io,""" - Amica with: - - signal-size: $(size(m.source_signals)) - - likelihood: $(like) - """) +using LinearAlgebra +using GaussianMixtures +using Distributions +using MKL_jll +using IntelVectorMath +using SpecialFunctions +using ProgressMeter +using LoopVectorization +using AppleAccelerate +using StaticArrays +#using ComponentArrays +using Diagonalizations +using LogExpFunctions +using CUDA +import CUDA: CuArray +using Tullio + +#using MultivariateStats +#using StatsAPI +include("types.jl") +include("types_gpu.jl") +include("helper.jl") +include("likelihood.jl") +include("parameters.jl") +include("newton.jl") +include("main.jl") + +export amica! +export fit, fit! 
+export AbstractAmica, MultiModelAmica, SingleModelAmica, CuSingleModelAmica
+
+import Base.show
+
+function Base.show(io::Core.IO, m::Union{CuSingleModelAmica,SingleModelAmica})
+    try
+        ix = findlast(.!isnan.(m.LL))
+        global like = string(m.LL[ix]) * " (after $(string(ix)) iterations)"
+    catch
+        global like = "not run"
+    end
+    println(
+        io,
+        """
+$(typeof(m)) with:
+  - signal-size: $(size(m.source_signals))
+  - likelihood: $(like)
+"""
+    )
+end

-	function Base.show(io::Core.IO,m::MultiModelAmica)
-		try
-			global like = m.LL[findlast(m.LL .!= 0)]
-		catch
-			global like = "not run"
-		end
-		println(io,"""
-		Amica with:
-		- models: $(length(m.models))
-		- signal-size: $(size(m.models[1].source_signals))
-		- likelihood: $(like)
-		""")
+function Base.show(io::Core.IO, m::MultiModelAmica)
+    try
+        global like = m.LL[findlast(m.LL .!= 0)]
+    catch
+        global like = "not run"
 	end
+    println(
+        io,
+        """
+Amica with:
+  - models: $(length(m.models))
+  - signal-size: $(size(m.models[1].source_signals))
+  - likelihood: $(like)
+"""
+    )
+end

-end
+end
diff --git a/src/helper.jl b/src/helper.jl
index bc2ee9e..b6a4882 100755
--- a/src/helper.jl
+++ b/src/helper.jl
@@ -1,43 +1,183 @@
 #removes mean from nxN float matrix
 function removeMean!(input)
-	mn = mean(input,dims=2)
-	(n,N) = size(input)
-	for i in 1:n
-		input[i,:] = input[i,:] .- mn[i]
-	end
-	return mn
+    mn = mean(input, dims=2)
+    (n, N) = size(input)
+    for i in 1:n
+        input[i, :] .= input[i, :] .- mn[i]
+    end
+    return mn
 end
 
-#Returns sphered data x. todo:replace with function from lib
-function sphering(x)
-	(n,N) = size(x)
-	Us,Ss,Vs = svd(x*x'/N)
-	S = Us * diagm(vec(1 ./sqrt.(Ss))) * Us'
-	return x = S*x
+#Spheres x in place and returns the sphering matrix S. todo: replace with a library function
+function sphering_manual!(x::T) where {T}
+    (_, N) = size(x)
+    Us, Ss = svd(x * x' / N)
+    S = Us * T(diagm((1 ./ sqrt.(Ss)))) * Us'
+    x .= S * x
+    return S
 end
 
-function bene_sphering(data)
-	d_memory_whiten = whitening(data) # Todo: make the dimensionality reduction optional
-	return d_memory_whiten.iF * data
+function sphering!(data)
+    d_memory_whiten = whitening(data; simple=true)
+    data .= d_memory_whiten.iF * data
+    return d_memory_whiten
 end
 
 #Adds means back to model centers
-add_means_back!(myAmica::SingleModelAmica,removed_mean) = nothing
+add_means_back!(myAmica::AbstractAmica, removed_mean) = nothing
 
 function add_means_back!(myAmica::MultiModelAmica, removed_mean)
-	M = size(myAmica.models,1)
-	for h in 1:M
-		myAmica.models[h].centers = myAmica.models[h].centers + removed_mean #add mean back to model centers
-	end
+    M = size(myAmica.models, 1)
+    for h in 1:M
+        myAmica.models[h].centers = myAmica.models[h].centers + removed_mean #add mean back to model centers
+    end
 end
 
 #taken from amica_a.m
 #L = det(A) * mult p(s|θ)
-function logpfun(x,rho)
-	return (.-abs.(x).^rho .- log(2) .- loggamma.(1+1/rho))
+function logpfun!(out::CuArray{T,3}, y_rho::CuArray{T,3}, shape::CuArray{T,2}) where {T<:Real}
+    out .= 1 .+ 1 ./ shape
+    out .= .-y_rho .- log(2) .- loggamma.(out)
+end
+function logpfun!(out::AbstractArray{T,3}, y_rho::AbstractArray{T,3}, shape::AbstractArray{T,2}) where {T<:Real}
+    out .= 1 .+ 1 ./ shape
+    IVM.lgamma!(out)
+    out .= .-y_rho .- log(2) .- out
+end
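Reviewer note (not part of the diff): `logpfun!` takes the precomputed `y_rho = |y|^ρ` and evaluates the standardized generalized-Gaussian log-density `-|y|^ρ - log(2) - logΓ(1 + 1/ρ)`. Since `Γ(1 + 1/ρ) = Γ(1/ρ)/ρ`, this equals the `PGeneralizedGaussian(0, 1, ρ)` logpdf from Distributions.jl, which gives a cheap unit test:

```julia
using Distributions, SpecialFunctions

y, ρ = 0.7, 1.5
lp_manual = -abs(y)^ρ - log(2) - loggamma(1 + 1 / ρ)
lp_dist = logpdf(PGeneralizedGaussian(0.0, 1.0, ρ), y)
@assert isapprox(lp_manual, lp_dist; atol=1e-12)
```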
-#taken from amica_a.m
-function ffun(x,rho)
-	return rho .* sign.(x) .* abs.(x) .^(rho.-1)
+function ffun!(fp::AbstractArray{T,3}, y::AbstractArray{T,3}, rho::AbstractArray{T,2}) where {T<:Real}
+    (m, n, N) = size(y)
+
+    fp .= abs.(y)
+
+    for i in 1:n
+        for j in 1:m
+            @views _fp = fp[j, i, :]
+            optimized_pow!(_fp, _fp, rho[j, i] - 1)
+        end
+    end
+
+    fp .*= sign.(y) .* rho
+end
+
+# IntelVectorMath pow
+
+function optimized_pow(lhs::AbstractArray{T,1}, rhs::T)::AbstractArray{T,1} where {T<:Real}
+    out = similar(lhs)
+    optimized_pow!(out, lhs, rhs)
+    return out
+end
+
+function optimized_pow!(out::AbstractArray{Float32}, lhs::AbstractArray{Float32}, rhs::Float32)
+    if !hasproperty(MKL_jll, :libmkl_rt) || out isa SubArray{Float32,1,<:CuArray}
+        out .= lhs .^ rhs
+        return
+    end
+
+    sta = IVM.stride1(lhs)
+    sto = IVM.stride1(out)
+    dense = (sta == 1 && sto == 1)
+
+    if dense
+        @ccall MKL_jll.libmkl_rt.vsPowx(length(lhs)::Cint, lhs::Ptr{Float32}, rhs::Float32, out::Ptr{Float32})::Cvoid
+    else
+        @ccall MKL_jll.libmkl_rt.vsPowxI(length(lhs)::Cint, lhs::Ptr{Float32}, sta::Cint, rhs::Float32, out::Ptr{Float32}, sto::Cint)::Cvoid
+    end
+end
+
+function optimized_pow!(out::AbstractArray{Float64}, lhs::AbstractArray{Float64}, rhs::Float64)
+    if !hasproperty(MKL_jll, :libmkl_rt) || out isa SubArray{Float64,1,<:CuArray}
+        out .= lhs .^ rhs
+        return
+    end
+
+    sta = IVM.stride1(lhs)
+    sto = IVM.stride1(out)
+    dense = (sta == 1 && sto == 1)
+
+    if dense
+        @ccall MKL_jll.libmkl_rt.vdPowx(length(lhs)::Cint, lhs::Ptr{Float64}, rhs::Float64, out::Ptr{Float64})::Cvoid
+    else
+        @ccall MKL_jll.libmkl_rt.vdPowxI(length(lhs)::Cint, lhs::Ptr{Float64}, sta::Cint, rhs::Float64, out::Ptr{Float64}, sto::Cint)::Cvoid
+    end
+end
+
+# IntelVectorMath log
+
+function optimized_log(in::CuArray)
+    log.(in)
+end
+function optimized_log(in::AbstractArray{T})::AbstractArray{T} where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        return log.(in)
+    end
+    return IVM.log(in)
+end
+
+function optimized_log!(inout::AbstractArray{T}) where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        inout .= log.(inout)
+        return
+    end
+    IVM.log!(inout)
+end
+
+function optimized_log!(out::AbstractArray{T}, in::AbstractArray{T}) where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        out .= log.(in)
+        return
+    end
+    IVM.log!(out, in)
+end
+
+# IntelVectorMath exp
+
+function optimized_exp(in::AbstractArray{T})::AbstractArray{T} where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        return exp.(in)
+    end
+    return IVM.exp(in)
+end
+
+function optimized_exp!(inout::CuArray)
+    inout .= exp.(inout)
+end
+function optimized_exp!(inout::AbstractArray{T}) where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        inout .= exp.(inout)
+        return
+    end
+    IVM.exp!(inout)
+end
+
+function optimized_exp!(out::AbstractArray{T}, in::AbstractArray{T}) where {T<:Real}
+    if !hasproperty(MKL_jll, :libmkl_rt)
+        out .= exp.(in)
+        return
+    end
+    IVM.exp!(out, in)
+end
+
+function optimized_abs!(myAmica::CuSingleModelAmica)
+    myAmica.y_rho .= abs.(myAmica.y)
+end
+function optimized_abs!(myAmica)
+    IVM.abs!(myAmica.y_rho, myAmica.y)
 end
\ No newline at end of file
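Reviewer note (not part of the diff): the `optimized_pow!` variants are drop-in replacements for a broadcast power; MKL's `vsPowx`/`vdPowx` is only an optional fast path, and the broadcast branch is the semantic reference. A quick equivalence check (assumes the package is loaded so the unexported helper can be qualified):

```julia
using Amica

x = rand(Float32, 1_000)
out = similar(x)
Amica.optimized_pow!(out, x, 1.5f0)
@assert out ≈ x .^ 1.5f0
```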
diff --git a/src/likelihood.jl b/src/likelihood.jl
index ce35c92..1b0bf96 100755
--- a/src/likelihood.jl
+++ b/src/likelihood.jl
@@ -1,163 +1,123 @@
 #Calculates log-likelihood for whole model. todo: make the calculate LLs one function
-function calculate_LL!(myAmica::SingleModelAmica)
-	(n,N) = size(myAmica.source_signals)
-	push!(myAmica.LL,sum(myAmica.Lt) / (n*N))
+function calculate_LL!(myAmica::AbstractAmica)
+    (n, N) = size(myAmica.source_signals)
+    push!(myAmica.LL, sum(myAmica.Lt) / (n * N))
 end
 
 #Calculates LL for whole ICA mixture model. Uses LL for dominant model for each time sample.
 function calculate_LL!(myAmica::MultiModelAmica)
-	M = size(myAmica.models,1)
-	(n,N) = size(myAmica.models[1].source_signals)
-	Ltmax = ones(size(myAmica.models[1].Lt,1)) #Ltmax = (M x N)
-	Lt_i = zeros(M)
-	P = zeros(N)
-	for i in 1:N
-		for h in 1:M
-			Lt_i[h] = myAmica.models[h].Lt[i]
-		end
-		Ltmax[i] = maximum(Lt_i) #Look for the maximum ith entry among all models
-		for h in 1:M
-			P[i] = P[i] + exp(myAmica.models[h].Lt[i] - Ltmax[i])
-		end
-	end
-	push!(myAmica.LL, sum(Ltmax .+ log.(P)) / (n*N))
+    M = size(myAmica.models, 1)
+    (n, N) = size(myAmica.models[1].source_signals)
+    Ltmax = ones(size(myAmica.models[1].Lt, 1)) #Ltmax = (M x N)
+    Lt_i = zeros(M)
+    P = zeros(N)
+    for i in 1:N
+        for h in 1:M
+            Lt_i[h] = myAmica.models[h].Lt[i]
+        end
+        Ltmax[i] = maximum(Lt_i) #Look for the maximum ith entry among all models
+        for h in 1:M
+            P[i] = P[i] + exp(myAmica.models[h].Lt[i] - Ltmax[i])
+        end
+    end
+    push!(myAmica.LL, sum(Ltmax .+ log.(P)) / (n * N))
 end
 
 #Update loop for Lt and u (which is saved in z). Todo: Rename
-function loopiloop(myAmica::SingleModelAmica)
-	(n,N) = size(myAmica.source_signals)
-	m = myAmica.m
-	Q = zeros(m,N)
-	@debug (:prop,myAmica.learnedParameters.proportions[1,1],:shape,myAmica.learnedParameters.shape[1],:y,myAmica.y[1])
-	for i in 1:n
-		Q = calculate_Q(myAmica,Q,i) # myAmica.Q
-		calculate_u!(myAmica,Q,i) # myAmica.z
-		@debug (:z,myAmica.z[1,1:5,1])
-		calculate_Lt!(myAmica,Q) # myAmica.Q
-	end
-	@debug (:Q,Q[1,1])
-end
+function loopiloop!(myAmica)
+    gg = myAmica.learnedParameters
+    calculate_Q!(myAmica.Q, gg.proportions, gg.scale, gg.shape, myAmica.y_rho)
+    calculate_u!(myAmica.z, myAmica.Q, myAmica.u_intermed)
+    calculate_Lt!(myAmica.Lt, myAmica.Q)
+end
 
-function loopiloop(myAmica::MultiModelAmica)
-	M = size(myAmica.models,1)
-	(n,N) = size(myAmica.models[1].source_signals)
-	m = myAmica.m
-	Q = zeros(m,N)
-
-	for h in 1:M #run along models
-		Threads.@threads for i in 1:n #run along components
-			#uses SingleModel versions of functions
-			Q = calculate_Q(myAmica.models[h],Q,i)
-			calculate_u!(myAmica.models[h],Q,i)
-			calculate_Lt!(myAmica.models[h],Q)
-		end
-	end
-end
+function loopiloop!(myAmica::MultiModelAmica)
+    M = size(myAmica.models, 1)
+
+    for h in 1:M #run along models; uses the SingleModel versions of the update functions
+        model = myAmica.models[h]
+        gg = model.learnedParameters
+        calculate_Q!(model.Q, gg.proportions, gg.scale, gg.shape, model.y_rho)
+        calculate_u!(model.z, model.Q, model.u_intermed)
+        calculate_Lt!(model.Lt, model.Q)
+    end
+end
 
+function calculate_Q!(Q::AbstractArray{T,3}, proportions::AbstractArray{T,2}, scale::AbstractArray{T,2}, shape::AbstractArray{T,2}, y_rho::AbstractArray{T,3}) where {T<:Real}
+    Q .= y_rho
+    logpfun!(Q, y_rho, shape)
+    #storing the broadcast in a temporary is measurably faster than fusing it into the .+= below
+    add = (log.(proportions) .+ 0.5 .* log.(scale))
+    Q .+= add
+end
 
 #calculates u but saves it into z. MultiModel also uses the SingleModel version
-function calculate_u!(myAmica::SingleModelAmica, Q, i)
-	m = size(myAmica.learnedParameters.scale,1)
-	if m > 1
-		for j in 1:m
-			Qj = ones(m,1) .* Q[j,:]'
-			myAmica.z[i,:,j] = 1 ./ sum(exp.(Q-Qj),dims = 1)
-		end
-	end
-end
-
-#no longer in use, multimodel also uses the singlemodel version (see loopiloop)
-function calculate_u!(myAmica::MultiModelAmica,Q,i,h)
-	m = myAmica.m
-
-	if m > 1
-		for j in 1:m
-			Qj = ones(m,1) .* Q[j,:]'
-			myAmica.models[h].z[i,:,j] = 1 ./ sum(exp.(Q-Qj),dims = 1)
-		end
-	end
-end
-
-#Calculates densities for each generalized Gaussian j. Currently used by MultiModelAmica too
-function calculate_Q(myAmica::SingleModelAmica, Q, i)
-	m = size(myAmica.learnedParameters.scale, 1) #m = number of GGs, can't use myAmica.m in case this gets used by MultiModelAmica
-
-	for j in 1:m
-		Q[j,:] = log(myAmica.learnedParameters.proportions[j,i]) + 0.5*log(myAmica.learnedParameters.scale[j,i]) .+ logpfun(myAmica.y[i,:,j],myAmica.learnedParameters.shape[j,i])
-	end
-	return Q
-end
-
-#no longer in use, Q is no longer saved in Amica object but instead in loopiloop
-function calculate_Q!(myAmica::MultiModelAmica, i, h)
-	m = myAmica.m
-	for j in 1:m
-		myAmica.Q[j,:] = log(myAmica.models[h].learnedParameters.proportions[j,i]) + 0.5*log(myAmica.models[h].learnedParameters.scale[j,i]) .+ logpfun(myAmica.models[h].y[i,:,j],myAmica.models[h].learnedParameters.shape[j,i])
-	end
-end
+@views function calculate_u!(z::AbstractArray{T,3}, Q::AbstractArray{T,3}, u_intermed::AbstractArray{T,4}) where {T<:Real}
+    (m, n, N) = size(z)
+
+    if (m <= 1)
+        z .= 1 ./ z
+        return
+    end
+
+    #@tullio u_intermed[j, i, k, j1] := Q[j1, i, k] - Q[j, i, k]
+    for k = 1:N, i = 1:n, j = 1:m, j1 = 1:m
+        @inbounds u_intermed[j, i, k, j1] = Q[j1, i, k] - Q[j, i, k]
+    end
+
+    optimized_exp!(u_intermed)
+
+    #@tullio z[j, i, k] := u_intermed[j, i, k, j1]
+    z .= zero(T) #the reduction below only accumulates, so clear the previous iteration's values first
+    for k = 1:N, i = 1:n, j = 1:m, j1 = 1:m
+        @inbounds z[j, i, k] += u_intermed[j, i, k, j1]
+    end
+
+    z .= 1 ./ z
+end
 
 #Applies location and scale parameter to source signals (per generalized Gaussian)
-function calculate_y!(myAmica::SingleModelAmica)
-	n = size(myAmica.A,1)
-	Threads.@threads for i in 1:n
-		for j in 1:myAmica.m
-			myAmica.y[i,:,j] = sqrt(myAmica.learnedParameters.scale[j,i]) * (myAmica.source_signals[i,:] .- myAmica.learnedParameters.location[j,i])
-		end
-	end
-end
-
-function calculate_y!(myAmica::MultiModelAmica)
-	n = size(myAmica.models[1].A,1)
-	for h in 1:size(myAmica.models,1)
-		#=Threads.@threads=# for i in 1:n
-			for j in 1:myAmica.m
-				myAmica.models[h].y[i,:,j] = sqrt(myAmica.models[h].learnedParameters.scale[j,i]) * (myAmica.models[h].source_signals[i,:] .- myAmica.models[h].learnedParameters.location[j,i])
-			end
-		end
-	end
-end
+@views function calculate_y!(myAmica::AbstractAmica)
+    for j in 1:myAmica.m
+        myAmica.y[j, :, :] .= sqrt.(myAmica.learnedParameters.scale[j, :]) .* (myAmica.source_signals .- myAmica.learnedParameters.location[j, :])
+    end
+end
+
+function calculate_y!(myAmica::MultiModelAmica)
+    calculate_y!.(myAmica.models)
+end
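Reviewer note (not part of the diff): `calculate_u!` computes the mixture responsibilities z[j, i, k] = exp(Q[j, i, k]) / Σ_{j′} exp(Q[j′, i, k]) via 1 / Σ_{j′} exp(Q[j′] − Q[j]), which avoids overflow without materializing a running maximum. A minimal reference implementation for testing, using the package's existing LogExpFunctions dependency:

```julia
using LogExpFunctions

# numerically stable softmax over the mixture dimension (dims=1)
responsibilities_reference(Q::AbstractArray{<:Real,3}) = exp.(Q .- logsumexp(Q; dims=1))

Q = randn(3, 4, 100)
z = responsibilities_reference(Q)
@assert all(sum(z; dims=1) .≈ 1)
```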
-#Calculates Likelihood for each time sample and for each ICA model
-function calculate_Lt!(myAmica::SingleModelAmica,Q)
-	m = size(myAmica.learnedParameters.scale,1)
-	if m > 1
-		Qmax = ones(m,1).*maximum(Q,dims=1);
-		myAmica.Lt[:] = myAmica.Lt' .+ Qmax[1,:]' .+ logsumexp(Q - Qmax,dims = 1)
-	else
-		myAmica.Lt[:] = myAmica.Lt .+ Q[1,:] #todo: test
-	end
-end
-
-#no longer in use, multimodel amica also uses singlemodel version
-function calculate_Lt!(myAmica::MultiModelAmica,Q,h)
-	m = myAmica.m
-
-	if m > 1
-		Qmax = ones(m,1).*maximum(Q,dims=1);
-		myAmica.models[h].Lt[:] = myAmica.models[h].Lt[:]' .+ Qmax[1,:]' .+ log.(sum(exp.(Q - Qmax),dims = 1))
-	else
-		myAmica.models[h].Lt[:] = myAmica.models[h].Lt[:] .+ Q[1,:] #todo: test
-	end
-end
+#Calculates Likelihood for each time sample and for each ICA model
+function calculate_Lt!(Lt::AbstractArray{T,1}, Q::AbstractArray{T,3}) where {T<:Real}
+    (m, _, _) = size(Q)
+
+    if m > 1
+        Lt .+= sum(logsumexp(Q; dims=1), dims=2)[1, 1, :]
+    else
+        #for m == 1 the logsumexp over the singleton mixture dimension is the identity,
+        #so just sum the per-component log-densities. todo: test
+        Lt .+= dropdims(sum(Q, dims=(1, 2)), dims=(1, 2))
+    end
+end
 
 #Initializes the likelihoods for each time sample with the determinant of the mixing matrix
-function initialize_Lt!(myAmica::SingleModelAmica)
-	myAmica.Lt .= myAmica.ldet
+function initialize_Lt!(myAmica)
+    myAmica.Lt .= myAmica.ldet
 end
 
 #Initializes the likelihoods of each time sample with the determinant of the mixing matrix and the weights for each ICA model
 function initialize_Lt!(myAmica::MultiModelAmica)
-	M = size(myAmica.models,1)
-	for h in 1:M
-		myAmica.models[h].Lt .= log(myAmica.normalized_ica_weights[h]) + myAmica.models[h].ldet
-	end
+    M = size(myAmica.models, 1)
+    for h in 1:M
+        myAmica.models[h].Lt .= log(myAmica.normalized_ica_weights[h]) + myAmica.models[h].ldet
+    end
 end
 
 # calculate loglikelihood for each sample in vector x, given a parameterization of a mixture of PGeneralizedGaussians (not in use)
-function loglikelihoodMMGG(μ::AbstractVector,prop::AbstractVector,shape::AbstractVector,data::AbstractVector,π::AbstractVector)
-	# take the vectors of μ,prop,shape and generate a GG from each
-	GGvec = PGeneralizedGaussian.(μ,prop,shape)
-	MM = MixtureModel(GGvec,Vector(π)) # make it a mixture model with prior probabilities π
-	return loglikelihood.(MM,data) # apply the loglikelihood to each sample individually (note the "." infront of .(MM,x))
+function loglikelihoodMMGG(μ::AbstractVector, prop::AbstractVector, shape::AbstractVector, data::AbstractVector, π::AbstractVector)
+    # take the vectors of μ,prop,shape and generate a GG from each
+    GGvec = PGeneralizedGaussian.(μ, prop, shape)
+    MM = MixtureModel(GGvec, Vector(π)) # make it a mixture model with prior probabilities π
+    return loglikelihood.(MM, data) # apply the loglikelihood to each sample individually (note the "." in front of .(MM, x))
 end
diff --git a/src/main.jl b/src/main.jl
index 82951d5..47ccb21 100755
--- a/src/main.jl
+++ b/src/main.jl
@@ -3,102 +3,137 @@
 
 """
 Main AMICA algorithm
 """
-function fit(amicaType::Type{T}, data; m = 3, maxiter = 500, location = nothing, scale = nothing, A = nothing, kwargs...) where {T<:AbstractAmica}
-	amica = T(data; m = m, maxiter = maxiter, location = location, scale = scale, A = A)
-	fit!(amica, data; kwargs...)
-	return amica
+function fit(amicaType::Type{T}, data; m=3, maxiter=500, location=nothing, scale=nothing, A=nothing, kwargs...) where {T<:AbstractAmica}
+    amica = T(data; m=m, maxiter=maxiter, location=location, scale=scale, A=A)
+    fit!(amica, data; kwargs...)
+    return amica
 end
 
 function fit!(amica::AbstractAmica, data; kwargs...)
-	amica!(amica, data; kwargs...)
+    amica!(amica, data; kwargs...)
 end
 
 function amica!(myAmica::AbstractAmica,
-	data;
-	lrate = LearningRate(),
-	shapelrate = LearningRate(;lrate = 0.1,minimum=0.5,maximum=5,init=1.5),
-	remove_mean = true,
-	do_sphering = true,
-	show_progress = true,
-	maxiter = myAmica.maxiter,
-	do_newton = 1,
-	newt_start_iter = 25,
-	iterwin = 10,
-	update_shape = 1,
-	mindll = 1e-8,
-
-	kwargs...)
- - initialize_shape_parameter(myAmica,shapelrate) - - (n, N) = size(data) - m = myAmica.m - - #Prepares data by removing means and/or sphering - if remove_mean - removed_mean = removeMean!(data) - end - if do_sphering - data = sphering(data) - end - - dLL = zeros(1, maxiter) - fp = zeros(m ,N) - - #todo put them into object - lambda = zeros(n, 1) - kappa = zeros(n, 1) - sigma2 = zeros(n, 1) + data_in::AbstractMatrix{T}; + lrate::LearningRate{T}=LearningRate{T}(), + shapelrate::LearningRate{T}=LearningRate{T}(; lrate=0.1, minimum=0.5, maximum=5, init=1.5), + remove_mean::Bool=true, + do_sphering::Bool=true, + show_progress::Bool=true, + maxiter::Int=myAmica.maxiter, + do_newton::Bool=true, + newt_start_iter::Int=25, + iterwin::Int=10, + update_shape::Bool=true, + mindll::T=T(1e-8), kwargs...) where {T<:Real} + + initialize_shape_parameter!(myAmica, shapelrate) + data = deepcopy(data_in) + (n, N) = size(data) + m = myAmica.m + + println("m[j]: $(m), n[i]: $(n), N: $(N)") + + #Prepares data by removing means and/or sphering + if remove_mean + removed_mean = removeMean!(data) + end + if do_sphering + S = sphering_manual!(data) + myAmica.S = S + + LLdetS = logabsdet(S)[1] + else + myAmica.S = collect(T.(I(size(data, 1)))) + LLdetS = 0 + end + + dLL = zeros(1, maxiter) prog = ProgressUnknown("Minimizing"; showspeed=true) - for iter in 1:maxiter - #E-step - update_sources!(myAmica, data) - calculate_ldet!(myAmica) - initialize_Lt!(myAmica) - calculate_y!(myAmica) - loopiloop(myAmica) #Updates y and Lt. Todo: Rename - calculate_LL!(myAmica) - @debug (:LL,myAmica.LL) - #Calculate difference in loglikelihood between iterations - if iter > 1 - dLL[iter] = myAmica.LL[iter] - myAmica.LL[iter-1] - end - if iter > iterwin +1 - calculate_lrate!(dLL, lrate, iter,newt_start_iter, do_newton, iterwin) - #Calculates average likelihood change over multiple itertions - sdll = sum(dLL[iter-iterwin+1:iter])/iterwin - #Checks termination criterion - if (sdll > 0) && (sdll < mindll) - println("LL increase to low. Stop at iteration ", iter) - break - end - end - - #M-step - try - #Updates parameters and mixing matrix - update_loop!(myAmica, fp, lambda, shapelrate, update_shape, iter, do_newton, newt_start_iter, lrate) - catch e - #Terminates if NaNs are detected in parameters - if isa(e,AmicaNaNException) - println("\nNaN detected. Better stop. 
Current iteration: ", iter)
-				@goto escape_from_NaN
-			else
-				rethrow()
-			end
-		end
-
-		reparameterize!(myAmica, data)
-		#Shows current progress
-		show_progress && ProgressMeter.next!(prog; showvalues=[(:LL, myAmica.LL[iter])])
-
-	end
-	#If parameters contain NaNs, the algorithm skips the A update and terminates by jumping here
+    for iter in 1:maxiter
+        gg = myAmica.learnedParameters
+        #@debug :scale, gg.scale[2], :location, gg.location[2], :proportions, gg.proportions[2], :shape, gg.shape[2]
+        #E-step
+        #@debug :A myAmica.A[1:2, 1:2]
+        #@debug :source_signals myAmica.source_signals[1], myAmica.source_signals[end]
+        update_sources!(myAmica, data)
+        calculate_ldet!(myAmica)
+        initialize_Lt!(myAmica)
+        myAmica.Lt .+= LLdetS
+
+        calculate_y!(myAmica)
+
+        #@debug :y, myAmica.y[1, 1:3, 1]
+        # pre-calculate abs(y)^rho
+        optimized_abs!(myAmica)
+
+        for i in 1:n
+            for j in 1:m
+                @views _y_rho = myAmica.y_rho[j, i, :]
+                optimized_pow!(_y_rho, _y_rho, myAmica.learnedParameters.shape[j, i])
+            end
+        end
+
+        loopiloop!(myAmica) #Updates z and Lt. Todo: Rename
+
+        calculate_LL!(myAmica)
+
+        #@debug (:LL, myAmica.LL)
+        #Calculate difference in loglikelihood between iterations
+        if iter > 1
+            dLL[iter] = myAmica.LL[iter] - myAmica.LL[iter-1]
+        end
+        if iter > iterwin + 1
+            calculate_lrate!(dLL, lrate, iter, newt_start_iter, do_newton, iterwin)
+            #Calculates average likelihood change over multiple iterations
+            sdll = sum(dLL[iter-iterwin+1:iter]) / iterwin
+            #Checks termination criterion
+            if (sdll > 0) && (sdll < mindll)
+                println("LL increase too low. Stopping at iteration ", iter)
+                break
+            end
+        end
+
+        #M-step
+        try
+            #Updates parameters and mixing matrix
+            update_loop!(myAmica, shapelrate, update_shape, iter, do_newton, newt_start_iter, lrate)
+        catch e
+            #Terminates if NaNs are detected in parameters
+            if isa(e, AmicaNaNException)
+                println("\nNaN detected. Better stop. 
Current iteration: ", iter) + @goto escape_from_NaN + else + rethrow() + end + end + #@debug iter, :A myAmica.A[1:2, 1:2] + + reparameterize!(myAmica, data) + + #@debug iter, :A myAmica.A[1:2, 1:2] + #Shows current progress + show_progress && ProgressMeter.next!(prog; showvalues=[(:LL, myAmica.LL[iter])]) + + end + #If parameters contain NaNs, the algorithm skips the A update and terminates by jumping here @label escape_from_NaN - #If means were removed, they are added back - if remove_mean - add_means_back!(myAmica, removed_mean) - end - return myAmica + + + #If means were removed, they are added back + if remove_mean + add_means_back!(myAmica, removed_mean) + end + @debug myAmica.LL + return myAmica end \ No newline at end of file diff --git a/src/newton.jl b/src/newton.jl index f48053a..e8b3044 100644 --- a/src/newton.jl +++ b/src/newton.jl @@ -1,67 +1,67 @@ #Updates the mixing matrix with the newton method -function newton_method!(myAmica::SingleModelAmica, iter, g, kappa, do_newton, newt_start_iter, lrate::LearningRate, lambda) - - lnatrate = lrate.natural_rate - lrate = lrate.lrate - (n,N) = size(myAmica.source_signals) +function newton_method!(myAmica::Union{<:CuSingleModelAmica{T},<:SingleModelAmica{T}}, iter::Int, g, kappa, do_newton::Bool, newt_start_iter::Int, lrate::LearningRate, lambda::AbstractArray{T,1}) where {T<:Real} - sigma2 = sum(myAmica.source_signals.^2,dims=2) / N #is probably called sigma2 cause always squared + lnatrate = lrate.natural_rate + lrate = lrate.lrate + (n, N) = size(myAmica.source_signals) - dA = Matrix{Float64}(I, n, n) - g * myAmica.source_signals[:,:]' - bflag = 0 - B = zeros(n,n) - - for i in 1:n - for k = 1:n - if i == k - B[i,i] = dA[i,i] / (lambda[i]) - else - denom = kappa[i]*kappa[k]*sigma2[i]*sigma2[k] - 1 - if denom > 0 - B[i,k] = (-kappa[k] * sigma2[i] * dA[i,k] + dA[k,i]) / denom - else - bflag = 1 - end - end - end - end - if (bflag == 0) && (do_newton == 1) && (iter > newt_start_iter) - myAmica.A[:,:] = myAmica.A[:,:] + lrate * myAmica.A[:,:] * B - else - myAmica.A[:,:] = myAmica.A[:,:] - lnatrate * myAmica.A[:,:] * dA - end + sigma2 = sum(myAmica.source_signals .^ 2, dims=2) / N #is probably called sigma2 cause always squared + + dA = I - g * myAmica.source_signals' + bflag = false + B = zeros(T, n, n) + + for i in 1:n + for k = 1:n + if i == k + B[i, i] = dA[i, i] / (lambda[i]) + else + denom = kappa[i] * kappa[k] * sigma2[i] * sigma2[k] - 1 + if denom > 0 + B[i, k] = (-kappa[k] * sigma2[i] * dA[i, k] + dA[k, i]) / denom + else + bflag = true + end + end + end + end + if (bflag == false) && (do_newton == 1) && (iter > newt_start_iter) + myAmica.A += lrate * myAmica.A * B + else + myAmica.A -= lnatrate * myAmica.A * dA + end end -function newton_method!(myAmica::MultiModelAmica, h, iter, g, kappa, do_newton, newt_start_iter, lrate::LearningRate, lambda) - - lnatrate = lrate.natural_rate - lrate = lrate.lrate - M = size(myAmica.models) - (n,N) = size(myAmica.models[1].source_signals) +@views function newton_method!(myAmica::MultiModelAmica, h, iter, g, kappa, do_newton, newt_start_iter, lrate::LearningRate, lambda) + + lnatrate = lrate.natural_rate + lrate = lrate.lrate + M = size(myAmica.models) + (n, N) = size(myAmica.models[1].source_signals) + + sigma2 = myAmica.models[h].source_signals .^ 2 * myAmica.ica_weights_per_sample[h, :] / myAmica.ica_weights[h] - sigma2 = myAmica.models[h].source_signals[:,:].^2 * myAmica.ica_weights_per_sample[h,:] /myAmica.ica_weights[h] + dA = Matrix{Float64}(I, n, n) - g * 
myAmica.models[h].source_signals' + bflag = 0 + B = zeros(n, n) - dA = Matrix{Float64}(I, n, n) - g * myAmica.models[h].source_signals[:,:]' - bflag = 0 - B = zeros(n,n) - - for i in 1:n - for k = 1:n - if i == k - B[i,i] = dA[i,i] / (lambda[i]) - else - denom = kappa[i]*kappa[k]*sigma2[i]*sigma2[k] - 1 - if denom > 0 - B[i,k] = (-kappa[k] * sigma2[i] * dA[i,k] + dA[k,i]) / denom - else - bflag = 1 - end - end - end - end - if (bflag == 0) && (do_newton == 1) && (iter > newt_start_iter) - myAmica.models[h].A[:,:] = myAmica.models[h].A[:,:] + lrate * myAmica.models[h].A[:,:] * B - else - myAmica.models[h].A[:,:] = myAmica.models[h].A[:,:] - lnatrate * myAmica.models[h].A[:,:] * dA - end + for i in 1:n + for k = 1:n + if i == k + B[i, i] = dA[i, i] / (lambda[i]) + else + denom = kappa[i] * kappa[k] * sigma2[i] * sigma2[k] - 1 + if denom > 0 + B[i, k] = (-kappa[k] * sigma2[i] * dA[i, k] + dA[k, i]) / denom + else + bflag = 1 + end + end + end + end + if (bflag == 0) && (do_newton == 1) && (iter > newt_start_iter) + myAmica.models[h].A = myAmica.models[h].A + lrate * myAmica.models[h].A * B + else + myAmica.models[h].A = myAmica.models[h].A - lnatrate * myAmica.models[h].A * dA + end end \ No newline at end of file diff --git a/src/parameters.jl b/src/parameters.jl index b75a49a..036f392 100644 --- a/src/parameters.jl +++ b/src/parameters.jl @@ -1,388 +1,396 @@ #Normalizes source density location parameter (mu), scale parameter (beta) and model centers -function reparameterize!(myAmica::SingleModelAmica, data) - (n,N) = size(myAmica.source_signals) - mu = myAmica.learnedParameters.location - beta = myAmica.learnedParameters.scale - - for i in 1:n - tau = norm(myAmica.A[:,i]) - myAmica.A[:,i] = myAmica.A[:,i] / tau - mu[:,i] = mu[:,i] * tau - beta[:,i] = beta[:,i] / tau^2 - end - - myAmica.learnedParameters.location .= mu - myAmica.learnedParameters.scale .= beta - - return myAmica +function reparameterize!(myAmica, data) + + tau = norm.(eachcol(myAmica.A)) + if myAmica.A isa CuArray + tau = CuVector(tau) + end + @debug typeof(tau), size(tau), typeof(tau') + myAmica.A = myAmica.A ./ tau' + myAmica.learnedParameters.location = myAmica.learnedParameters.location .* tau' + myAmica.learnedParameters.scale = myAmica.learnedParameters.scale ./ tau' .^ 2 + end #Reparameterizes the parameters for the active models function reparameterize!(myAmica::MultiModelAmica, data) - (n,N) = size(myAmica.models[1].source_signals) - M = size(myAmica.models,1) - - for h = 1:M - mu = myAmica.models[h].learnedParameters.location - beta = myAmica.models[h].learnedParameters.scale - - if myAmica.normalized_ica_weights[h] == 0 - continue - end - for i in 1:n - tau = norm(myAmica.models[h].A[:,i]) - myAmica.models[h].A[:,i] = myAmica.models[h].A[:,i] / tau - mu[:,i] = mu[:,i] * tau - beta[:,i] = beta[:,i] / tau^2 - end - - if M > 1 - cnew = data * myAmica.ica_weights_per_sample[h,:] /(sum(myAmica.ica_weights_per_sample[h,:])) #todo: check why v not inverted - for i in 1:n - Wh = pinv(myAmica.models[h].A[:,:]) - mu[:,i] = mu[:,i] .- Wh[i,:]' * (cnew-myAmica.models[h].centers[:]) - end - myAmica.models[h].centers = cnew - end - myAmica.models[h].learnedParameters.location .= mu - myAmica.models[h].learnedParameters.scale .= beta - end + (n, N) = size(myAmica.models[1].source_signals) + M = size(myAmica.models, 1) + + for h = 1:M + mu = myAmica.models[h].learnedParameters.location + beta = myAmica.models[h].learnedParameters.scale + + if myAmica.normalized_ica_weights[h] == 0 + continue + end + for i in 1:n + tau = 
norm(myAmica.models[h].A[:, i]) + myAmica.models[h].A[:, i] = myAmica.models[h].A[:, i] / tau + mu[:, i] = mu[:, i] * tau + beta[:, i] = beta[:, i] / tau^2 + end + + if M > 1 + cnew = data * myAmica.ica_weights_per_sample[h, :] / (sum(myAmica.ica_weights_per_sample[h, :])) #todo: check why v not inverted + for i in 1:n + Wh = pinv(myAmica.models[h].A[:, :]) + mu[:, i] = mu[:, i] .- Wh[i, :]' * (cnew - myAmica.models[h].centers[:]) + end + myAmica.models[h].centers = cnew + end + myAmica.models[h].learnedParameters.location .= mu + myAmica.models[h].learnedParameters.scale .= beta + end end #Calculates sum of z. Returns N if there is just one generalized Gaussian -function calculate_sumz(myAmica::SingleModelAmica,i,j) - if myAmica.m == 1 - return size(myAmica.source_signals,2) - else - return sum(myAmica.z[i,:,j]) - end -end +function calculate_sumz(z::AbstractArray{T,3})::AbstractArray{T,2} where {T<:Real} + (m, n, N) = size(z) -function calculate_sumz(myAmica::MultiModelAmica,i,j,h) - return sum(myAmica.models[h].z[i,:,j]) -end + if m == 1 + return ones(m, n) * N + else + sumz = sum(z, dims=3)[:, :, 1] + sumz[sumz.<0] .= 1 -#Calculates densities for each sample per ICA model and per Gaussian mixture -calculate_z!(myAmica::SingleModelAmica, i,j) = nothing -function calculate_z!(myAmica::MultiModelAmica, i,j,h) - if myAmica.m > 1 - myAmica.models[h].z[i,:,j] .= myAmica.ica_weights_per_sample[h,:] .* myAmica.models[h].z[i,:,j] - elseif myAmica.m == 1 - myAmica.models[h].z[i,:,j] = myAmica.ica_weights_per_sample[h,:] - end + return sumz + end end -#Updates the Gaussian mixture proportion parameter -function update_mixture_proportions!(sumz, myAmica::SingleModelAmica,j,i) - N = size(myAmica.source_signals,2) - if myAmica.m > 1 - myAmica.learnedParameters.proportions[j,i] = sumz / N - end +@views function calculate_sumz(myAmica::MultiModelAmica, h) + return sum(myAmica.models[h].z, dims=3) end -function update_mixture_proportions!(sumz, myAmica::MultiModelAmica,j,i,h) - if myAmica.m > 1 - myAmica.models[h].learnedParameters.proportions[j,i] = sumz / myAmica.ica_weights[h] - end +#Calculates densities for each sample per ICA model and per Gaussian mixture +calculate_z!(myAmica::SingleModelAmica, i, j) = nothing +function calculate_z!(myAmica::MultiModelAmica, i, j, h) + if myAmica.m > 1 + myAmica.models[h].z[j, i, :] .= myAmica.ica_weights_per_sample[h, :] .* myAmica.models[h].z[j, i, :] + elseif myAmica.m == 1 + myAmica.models[h].z[j, i, :] .= myAmica.ica_weights_per_sample[h, :] + end end #Updates the Gaussian mixture location parameter. 
Todo: merge again with MultiModel version
-function update_location(myAmica::SingleModelAmica,shape,zfp,y,location,scale,kp)
-	m = myAmica.m
-	if shape <= 2
-		if (m > 1)
-			dm = sum(zfp./y)
-			if dm > 0
-				return location + (1/sqrt(scale)) * sum(zfp) / dm
-			end
-		end
-	else
-		if (m > 1) && kp > 0
-			return location + sqrt(scale) * sum(zfp) / kp
-		end
-	end
-	return location
+function update_location!(location::AbstractArray{T,2}, shape::AbstractArray{T,2}, zfp::AbstractArray{T,3}, y::AbstractArray{T,3}, scale::AbstractArray{T,2}, kp::AbstractArray{T,2}) where {T<:Real}
+    (m, n, N) = size(y)
+
+    if m <= 1
+        return
+    end
+
+    dm = zeros(T, m, n) |> typeof(location)
+
+    for k = 1:N, i = 1:n, j = 1:m
+        @inbounds dm[j, i] += zfp[j, i, k] / y[j, i, k]
+    end
+
+    sum_zfp = sum(zfp, dims=3)[:, :, 1]
+
+    #mirror the old per-(j,i) guards: only update where dm > 0 (resp. kp > 0)
+    a = (shape .<= 2) .&& (dm .> 0)
+    b = (shape .> 2) .&& (kp .> 0)
+    location[a] .+= (1 ./ sqrt.(scale[a])) .* sum_zfp[a] ./ dm[a]
+    location[b] .+= sqrt.(scale[b]) .* sum_zfp[b] ./ kp[b]
 end
 
 function update_location(myAmica::MultiModelAmica, shape, zfp, y, location, scale, kp)
-	if shape <= 2
-		dm = sum(zfp./y)
-		if dm > 0
-			return location + (1/sqrt(scale)) * sum(zfp) / dm
-		end
-	else
-		if kp > 0
-			return location + sqrt(scale) * sum(zfp) / kp
-		end
-	end
-	return location
+    if shape <= 2
+        dm = sum(zfp ./ y)
+        if dm > 0
+            return location + (1 / sqrt(scale)) * sum(zfp) / dm
+        end
+    else
+        if kp > 0
+            return location + sqrt(scale) * sum(zfp) / kp
+        end
+    end
+    return location
 end
 
 #Updates the Gaussian mixture scale parameter
-function update_scale(zfp,y,scale,z,shape)
-	if shape <= 2
-		db = sum(zfp.*y)
-		if db > 0
-			scale = scale ./ db
-		end
-	else
-		db = (shape .* sum(z.*abs.(y).^shape)).^(.- 2 ./ shape)
-		scale = scale .* db
-	end
-	return scale
-end
+#scalar version, kept because the MultiModelAmica path still uses it
+function update_scale(zfp, y, scale, z, shape)
+    if shape <= 2
+        db = sum(zfp .* y)
+        if db > 0
+            scale = scale ./ db
+        end
+    else
+        db = (shape .* sum(z .* abs.(y) .^ shape)) .^ (.-2 ./ shape)
+        scale = scale .* db
+    end
+    return scale
+end
+
+@views function update_scale!(scale::AbstractArray{T,2}, zfp::AbstractArray{T,3}, y::AbstractArray{T,3}, z::AbstractArray{T,3}, shape::AbstractArray{T,2}, y_rho::AbstractArray{T,3}) where {T<:Real}
+    (m, n, N) = size(y)
+
+    db = zeros(T, m, n)
+
+    @inbounds for k = 1:N, i = 1:n, j = 1:m
+        if shape[j, i] <= 2
+            db[j, i] += zfp[j, i, k] * y[j, i, k]
+        else
+            db[j, i] += z[j, i, k] * y_rho[j, i, k]
+        end
+    end
+
+    @inbounds for i = 1:n, j = 1:m
+        if shape[j, i] <= 2
+            if db[j, i] > 0
+                scale[j, i] /= db[j, i]
+            end
+        else
+            #matches the old update: beta .*= (rho * sum(z .* |y|^rho))^(-2/rho)
+            scale[j, i] *= (shape[j, i] * db[j, i])^(-2 / shape[j, i])
+        end
+    end
+end
 
 #Sets the initial value for the shape parameter of the GeneralizedGaussians for each Model
-function initialize_shape_parameter(myAmica::SingleModelAmica, shapelrate::LearningRate)
-	myAmica.learnedParameters.shape .= shapelrate.init .*myAmica.learnedParameters.shape
+function initialize_shape_parameter!(myAmica, shapelrate::LearningRate)
+    myAmica.learnedParameters.shape = shapelrate.init .* myAmica.learnedParameters.shape
 end
 
-function initialize_shape_parameter(myAmica::MultiModelAmica, shapelrate::LearningRate)
-	M = size(myAmica.models,1)
-	for h in 1:M
-		myAmica.models[h].learnedParameters.shape .= shapelrate.init .*myAmica.models[h].learnedParameters.shape
-	end
+function initialize_shape_parameter!(myAmica::MultiModelAmica, shapelrate::LearningRate)
+    initialize_shape_parameter!.(myAmica.models, Ref(shapelrate))
 end
 
 #Updates Gaussian mixture Parameters and mixing matrix. todo: rename since it's not a loop for single model
-function update_loop!(myAmica::SingleModelAmica, fp, lambda, shapelrate, update_shape, iter, do_newton, newt_start_iter, lrate)
-	#Update parameters
-	g, kappa, lambda = update_parameters!(myAmica, fp, lambda, shapelrate, update_shape)
-
-	#Checks for NaN in parameters before updating the mixing matrix
-	if any(isnan, kappa) || any(isnan, myAmica.source_signals) || any(isnan, lambda) || any(isnan, g) || any(isnan, myAmica.learnedParameters.proportions)
-		throw(AmicaNaNException())
-	end
-	#Update mixing matrix via Newton method
-	newton_method!(myAmica, iter, g, kappa, do_newton, newt_start_iter, lrate, lambda)
+function update_loop!(myAmica::AbstractAmica, shapelrate::LearningRate, update_shape::Bool, iter::Int, do_newton::Bool, newt_start_iter::Int, lrate::LearningRate)
+    #Update parameters
+    g, kappa = update_parameters!(myAmica, shapelrate, update_shape)
+    #Checks for NaN in parameters before updating the mixing matrix
+    if any(isnan, kappa) || any(isnan, myAmica.source_signals) || any(isnan, myAmica.lambda) || any(isnan, g) || any(isnan, myAmica.learnedParameters.proportions)
+        throw(AmicaNaNException())
+    end
+    #Update mixing matrix via Newton method
+    newton_method!(myAmica, iter, g, kappa, do_newton, newt_start_iter, lrate, myAmica.lambda)
 end
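Reviewer note (not part of the diff): `update_location!` above replaces the old per-(j, i) `if` branches with boolean masks; a toy illustration of the masked-broadcast pattern, with made-up values:

```julia
location = zeros(2, 3)
shape = [1.5 2.0 3.0; 2.5 1.0 4.0]
scale, kp, sum_zfp, dm = ones(2, 3), rand(2, 3), rand(2, 3), rand(2, 3)

a = (shape .<= 2) .&& (dm .> 0)   # components with ρ ≤ 2, guarded by dm > 0
b = (shape .> 2) .&& (kp .> 0)    # heavy-shape components, guarded by kp > 0
location[a] .+= (1 ./ sqrt.(scale[a])) .* sum_zfp[a] ./ dm[a]
location[b] .+= sqrt.(scale[b]) .* sum_zfp[b] ./ kp[b]
```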
#Updates Gaussian mixture Parameters and mixing matrix.
-function update_loop!(myAmica::MultiModelAmica, fp, lambda, shapelrate, update_shape, iter, do_newton, newt_start_iter, lrate)
-	(n,N) = size(myAmica.models[1].source_signals)
-	M = size(myAmica.models,1)
-
-	myAmica.ica_weights_per_sample = ones(M,N)
-	for h in 1:M
-		#Calcutes ICA model weights
-		myAmica.ica_weights_per_sample[h,:] = zeros(N)
-		for i in 1:M
-			myAmica.ica_weights_per_sample[h,:] = myAmica.ica_weights_per_sample[h,:] + exp.(myAmica.models[i].Lt-myAmica.models[h].Lt)
-		end
-		myAmica.ica_weights_per_sample[h,:] = 1 ./ myAmica.ica_weights_per_sample[h,:]
-		myAmica.ica_weights[h] = sum(myAmica.ica_weights_per_sample[h,:])
-		myAmica.normalized_ica_weights[h] = myAmica.ica_weights[h] / N
-
-		#If model weight equals 0 skip update for this model
-		if myAmica.normalized_ica_weights[h] == 0
-			continue
-		end
-
-		g, kappa, lambda = update_parameters!(myAmica, h, fp, lambda, shapelrate, update_shape)#todo: remove return
-
-		#Checks for NaN in parameters before updating the mixing matrix
-		if any(isnan, kappa) || any(isnan, myAmica.models[h].source_signals) || any(isnan, lambda) || any(isnan, g) || any(isnan, myAmica.models[h].learnedParameters.proportions)
-			throw(AmicaNaNException())
-		end
-		#Update mixing matrix via Newton method
-		newton_method!(myAmica, h, iter, g, kappa, do_newton, newt_start_iter, lrate, lambda)
-	end
+function update_loop!(myAmica::MultiModelAmica, shapelrate::LearningRate, update_shape::Bool, iter::Int, do_newton::Bool, newt_start_iter::Int, lrate::LearningRate)
+    (n, N) = size(myAmica.models[1].source_signals)
+    M = size(myAmica.models, 1)
+
+    myAmica.ica_weights_per_sample = ones(M, N)
+    for h in 1:M
+        #Calculates ICA model weights
+        myAmica.ica_weights_per_sample[h, :] = zeros(N)
+        for i in 1:M
+            myAmica.ica_weights_per_sample[h, :] = myAmica.ica_weights_per_sample[h, :] + exp.(myAmica.models[i].Lt - myAmica.models[h].Lt)
+        end
+        myAmica.ica_weights_per_sample[h, :] = 1 ./ myAmica.ica_weights_per_sample[h, :]
+        myAmica.ica_weights[h] = sum(myAmica.ica_weights_per_sample[h, :])
+        myAmica.normalized_ica_weights[h] = myAmica.ica_weights[h] / N
+
+        #If model weight equals 0 skip update for this model
+        if myAmica.normalized_ica_weights[h] == 0
+            continue
+        end
+
+        g, kappa, lambda = update_parameters!(myAmica, h, shapelrate, update_shape) #todo: remove return
+
+        #Checks for NaN in parameters before updating the mixing matrix
+        if any(isnan, kappa) || any(isnan, myAmica.models[h].source_signals) || any(isnan, lambda) || any(isnan, g) || any(isnan, myAmica.models[h].learnedParameters.proportions)
+            throw(AmicaNaNException())
+        end
+        #Update mixing matrix via Newton method
+        newton_method!(myAmica, h, iter, g, kappa, do_newton, newt_start_iter, lrate, lambda)
+    end
 end
 
 #Updates Gaussian mixture parameters. It also returns g and kappa, which are needed to apply the Newton method.
 #Todo: Save g, kappa, lambda in structure, remove return
-function update_parameters!(myAmica::SingleModelAmica, fp, lambda, lrate_rho::LearningRate, update_shape)
-	alpha = myAmica.learnedParameters.proportions
-	beta = myAmica.learnedParameters.scale
-	mu = myAmica.learnedParameters.location
-	rho = myAmica.learnedParameters.shape
-	(n,N) = size(myAmica.source_signals)
-	m = myAmica.m
-	g = zeros(n,N)
-	kappa = zeros(n,1)
-	zfp = zeros(m, N)
-
-	#Threads.@threads
-	for i in 1:n
-		for j in 1:m
-			sumz = 0
-			sumz = calculate_sumz(myAmica,i,j)
-			update_mixture_proportions!(sumz,myAmica,j,i)
-			if sumz > 0
-				if (m > 1)
-					myAmica.z[i,:,j] .= myAmica.z[i,:,j] / sumz
-				end
-			else
-				continue
-			end
-
-			fp[j,:] = ffun(myAmica.y[i,:,j], rho[j,i])
-			zfp[j,:] = myAmica.z[i,:,j] .* fp[j,:]
-			g[i,:] = g[i,:] .+ alpha[j,i] .* sqrt(beta[j,i]) .*zfp[j,:]
-
-			kp = beta[j,i] .* sum(zfp[j,:].*fp[j,:])
-
-			kappa[i] = kappa[i] + alpha[j,i] * kp
-
-			lambda[i] = lambda[i] .+ alpha[j,i] .* (sum(myAmica.z[i,:,j].*(fp[j,:].*myAmica.y[i,:,j] .-1).^2) .+ mu[j,i]^2 .* kp)
-			mu[j,i] = update_location(myAmica,rho[j,i],zfp[j,:],myAmica.y[i,:,j],mu[j,i],beta[j,i],kp)
-
-			beta[j,i] = update_scale(zfp[j,:],myAmica.y[i,:,j],beta[j,i],myAmica.z[i,:,j],rho[j,i])
-
-			if update_shape == 1
-				update_shape!(myAmica, rho, j, i, lrate_rho)
-			end
-		end
-	end
-	myAmica.learnedParameters.proportions = alpha
-	myAmica.learnedParameters.scale = beta
-	myAmica.learnedParameters.location = mu
-	myAmica.learnedParameters.shape = rho
-	return g, kappa, lambda
+@views function update_parameters!(myAmica::Union{SingleModelAmica{T,ncomps,nmix},CuSingleModelAmica{T,ncomps,nmix}}, shapelrate::LearningRate, upd_shape::Bool) where {T,ncomps,nmix}
+    (m, n, N) = size(myAmica.y)
+
+    gg = myAmica.learnedParameters
+
+    sumz = calculate_sumz(myAmica.z)
+
+    if m > 0
+        myAmica.z ./= sumz
+    end
+
+    if m > 1
+        myAmica.learnedParameters.proportions = sumz ./ N
+    end
+
+    ffun!(myAmica.fp, myAmica.y, gg.shape)
+    myAmica.zfp .= myAmica.z .* myAmica.fp
+
+    # calculate g
+    myAmica.g .= zero(T)
+    for k = 1:N, i = 1:n, j = 1:m
+        @inbounds myAmica.g[i, k] += gg.proportions[j, i] * sqrt(gg.scale[j, i]) * myAmica.zfp[j, i, k]
+    end
+
+    # calculate kp; similar() returns uninitialized memory, so zero it before accumulating
+    kp = similar(gg.scale)
+    kp .= zero(T)
+    for k = 1:N, i = 1:n, j = 1:m
+        @inbounds kp[j, i] += myAmica.zfp[j, i, k] * myAmica.fp[j, i, k]
+    end
+    kp .*= gg.scale
+
+    # calculate lambda; the (fp*y - 1)^2 term matches the previous implementation
+    for k = 1:N, i = 1:n, j = 1:m
+        @inbounds myAmica.lambda[i] += gg.proportions[j, i] * ((myAmica.z[j, i, k] * (myAmica.fp[j, i, k] * myAmica.y[j, i, k] - 1)^2) + (gg.location[j, i]^2 * kp[j, i]) / N)
+    end
+
+    kappa = sum(gg.proportions .* kp, dims=1)[1, :]
+
+    update_location!(gg.location, gg.shape, myAmica.zfp, myAmica.y, gg.scale, kp)
+    update_scale!(gg.scale, myAmica.zfp, myAmica.y, myAmica.z, gg.shape, myAmica.y_rho)
+
+    if upd_shape
+        update_shape!(gg.shape, myAmica.z, myAmica.y_rho, shapelrate)
+    end
+
+    return myAmica.g, kappa
+end
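Reviewer note (not part of the diff): the `fp` buffer filled by `ffun!` holds the negative score of the standardized generalized Gaussian, f(y) = ρ·sign(y)·|y|^(ρ−1) = −d/dy log p(y); a finite-difference spot check:

```julia
ρ, y, h = 1.8, 0.9, 1e-6
logp(y) = -abs(y)^ρ                        # the y-dependent part of the log-density
f_analytic = ρ * sign(y) * abs(y)^(ρ - 1)
f_numeric = -(logp(y + h) - logp(y - h)) / (2h)
@assert isapprox(f_analytic, f_numeric; atol=1e-5)
```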
 #Updates Gaussian mixture parameters. It also returns g, kappa and lambda, which are needed to apply the Newton method.
 #Todo: Save g, kappa, lambda in structure, remove return
-function update_parameters!(myAmica::MultiModelAmica, h, fp, lambda, lrate_rho::LearningRate, update_shape)
-	alpha = myAmica.models[h].learnedParameters.proportions #todo: move into loop and add h
-	beta = myAmica.models[h].learnedParameters.scale
-	mu = myAmica.models[h].learnedParameters.location
-	rho = myAmica.models[h].learnedParameters.shape
-
-	(n,N) = size(myAmica.models[1].source_signals)
-	m = myAmica.m
-	g = zeros(n,N)
-	kappa = zeros(n,1)
-	zfp = zeros(m, N)
-
-	#=Threads.@threads=# for i in 1:n
-		for j in 1:m
-			sumz = 0
-			calculate_z!(myAmica, i, j, h)
-			sumz = calculate_sumz(myAmica,i,j,h)
-			update_mixture_proportions!(sumz,myAmica,j,i,h)
-			if sumz > 0
-				myAmica.models[h].z[i,:,j] .= myAmica.models[h].z[i,:,j] / sumz
-			else
-				continue
-			end
-
-			fp[j,:] = ffun(myAmica.models[h].y[i,:,j], rho[j,i])
-			zfp[j,:] = myAmica.models[h].z[i,:,j] .* fp[j,:]
-			g[i,:] = g[i,:] .+ alpha[j,i] .* sqrt(beta[j,i]) .*zfp[j,:]
-
-			kp = beta[j,i] .* sum(zfp[j,:].*fp[j,:])
-
-			kappa[i] = kappa[i] + alpha[j,i] * kp
-
-			lambda[i] = lambda[i] .+ alpha[j,i] .* (sum(myAmica.models[h].z[i,:,j].*(fp[j,:].*myAmica.models[h].y[i,:,j] .-1).^2) .+ mu[j,i]^2 .* kp)
-			mu[j,i] = update_location(myAmica,rho[j,i],zfp[j,:],myAmica.models[h].y[i,:,j],mu[j,i],beta[j,i],kp)
-
-			beta[j,i] = update_scale(zfp[j,:],myAmica.models[h].y[i,:,j],beta[j,i],myAmica.models[h].z[i,:,j],rho[j,i])
-
-			if update_shape == 1
-				update_shape!(myAmica.models[h], rho, j, i, lrate_rho) #uses SingleModel version on each model
-			end
-		end
-	end
-	myAmica.models[h].learnedParameters.proportions = alpha
-	myAmica.models[h].learnedParameters.scale = beta
-	myAmica.models[h].learnedParameters.location = mu
-	myAmica.models[h].learnedParameters.shape = rho
-	return g, kappa, lambda
+@views function update_parameters!(myAmica::MultiModelAmica, h, lrate_rho::LearningRate, upd_shape)
+    model = myAmica.models[h]
+    alpha = model.learnedParameters.proportions #todo: move into loop and add h
+    beta = model.learnedParameters.scale
+    mu = model.learnedParameters.location
+    rho = model.learnedParameters.shape
+
+    (n, N) = size(myAmica.models[1].source_signals)
+    m = myAmica.m
+    g = zeros(n, N)
+    kappa = zeros(n, 1)
+    lambda = zeros(n)
+    fp = zeros(m, N)
+    zfp = zeros(m, N)
+
+    for i in 1:n
+        for j in 1:m
+            calculate_z!(myAmica, i, j, h)
+            sumz = sum(model.z[j, i, :])
+            #inlined update_mixture_proportions!
+            if m > 1
+                model.learnedParameters.proportions[j, i] = sumz / myAmica.ica_weights[h]
+            end
+            if sumz > 0
+                model.z[j, i, :] ./= sumz
+            else
+                continue
+            end
+
+            #inlined scalar ffun: rho .* sign.(y) .* abs.(y) .^ (rho - 1)
+            fp[j, :] .= rho[j, i] .* sign.(model.y[j, i, :]) .* abs.(model.y[j, i, :]) .^ (rho[j, i] - 1)
+            zfp[j, :] .= model.z[j, i, :] .* fp[j, :]
+            g[i, :] .+= alpha[j, i] .* sqrt(beta[j, i]) .* zfp[j, :]
+
+            kp = beta[j, i] .* sum(zfp[j, :] .* fp[j, :])
+
+            kappa[i] += alpha[j, i] * kp
+
+            lambda[i] += alpha[j, i] .* (sum(model.z[j, i, :] .* (fp[j, :] .* model.y[j, i, :] .- 1) .^ 2) .+ mu[j, i]^2 .* kp)
+            mu[j, i] = update_location(myAmica, rho[j, i], zfp[j, :], model.y[j, i, :], mu[j, i], beta[j, i], kp)
+
+            beta[j, i] = update_scale(zfp[j, :], model.y[j, i, :], beta[j, i], model.z[j, i, :], rho[j, i])
+
+            if upd_shape
+                #scalar port of the old shape update
+                ytmp = abs.(model.y[j, i, :]) .^ rho[j, i]
+                dr = sum(model.z[j, i, :] .* log.(ytmp) .* ytmp)
+                if rho[j, i] > 2
+                    dr2 = digamma(1 + 1 / rho[j, i]) / rho[j, i] - dr
+                    isnan(dr2) || (rho[j, i] += 0.5 * dr2)
+                else
+                    dr2 = 1 - rho[j, i] * dr / digamma(1 + 1 / rho[j, i])
+                    isnan(dr2) || (rho[j, i] += lrate_rho.lrate * dr2)
+                end
+                rho[j, i] = clamp(rho[j, i], lrate_rho.minimum, lrate_rho.maximum)
+            end
+        end
+    end
+    model.learnedParameters.proportions = alpha
+    model.learnedParameters.scale = beta
+    model.learnedParameters.location = mu
+    model.learnedParameters.shape = rho
+    return g, kappa, lambda
 end
 
 #Updates the Gaussian mixture shape parameter
-function update_shape!(myAmica::SingleModelAmica, rho, j, i, lrate_rho::LearningRate)
-	rhomin, rhomax, shapelrate = lrate_rho.minimum, lrate_rho.maximum, lrate_rho.lrate
-	ytmp = abs.(myAmica.y[i,:,j]).^rho[j,i]
-	dr = sum(myAmica.z[i,:,j].*log.(ytmp).*ytmp)
-
-	if rho[j,i] > 2
-		dr2 = digamma(1+1/rho[j,i]) / rho[j,i] - dr
-		if ~isnan(dr2)
-			rho[j,i] = rho[j,i] + 0.5 * dr2
-		end
-	else
-		dr2 = 1 - rho[j,i] * dr / digamma(1+1/rho[j,i])
-		if ~isnan(dr2)
-			rho[j,i] = rho[j,i] + shapelrate *dr2
-		end
-	end
-	rho[j,i] = min(rhomax, rho[j,i])
-	rho[j,i] = max(rhomin, rho[j,i])
-end
-
-#Updates the Gaussian mixture shape parameter. MultiModel version no longer in use
-function update_shape!(myAmica::MultiModelAmica, rho, j, i, h, lrate_rho::LearningRate)
-	rhomin, rhomax, shapelrate = lrate_rho.minimum, lrate_rho.maximum, lrate_rho.lrate
-	ytmp = abs.(myAmica.models[h].y[i,:,j]).^rho[j,i]
-	dr = sum(myAmica.models[h].z[i,:,j].*log.(ytmp).*ytmp)
-
-	if rho[j,i] > 2
-		dr2 = digamma(1+1/rho[j,i]) / rho[j,i] - dr
-		if ~isnan(dr2)
-			rho[j,i] = rho[j,i] + 0.5 * dr2
-		end
-	else
-		dr2 = 1 - rho[j,i] * dr / digamma(1+1/rho[j,i])
-		if ~isnan(dr2)
-			rho[j,i] = rho[j,i] + shapelrate *dr2
-		end
-	end
-	rho[j,i] = min(rhomax, rho[j,i])
-	rho[j,i] = max(rhomin, rho[j,i])
+@views function update_shape!(shape::AbstractArray{T,2}, z::AbstractArray{T,3}, y_rho::AbstractArray{T,3}, shapelrate::LearningRate) where {T<:Real}
+    (m, n, N) = size(z)
+
+    dr = zeros(T, m, n)
+    log_y_rho = optimized_log(y_rho)
+
+    #accumulate dr over all samples (the old code summed over k as well)
+    for k = 1:N, i = 1:n, j = 1:m
+        @inbounds dr[j, i] += z[j, i, k] * log_y_rho[j, i, k] * y_rho[j, i, k]
+    end
+
+    for i in 1:n
+        for j in 1:m
+            _shape = shape[j, i]
+            if _shape > 2
+                dr2 = digamma(1 + 1 / _shape) / _shape - dr[j, i]
+                if !isnan(dr2)
+                    shape[j, i] += 0.5 * dr2
+                end
+            else
+                dr2 = 1 - _shape * dr[j, i] / digamma(1 + 1 / _shape)
+                if !isnan(dr2)
+                    shape[j, i] += shapelrate.lrate * dr2
+                end
+            end
+        end
+    end
+
+    clamp!(shape, shapelrate.minimum, shapelrate.maximum)
 end
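Reviewer note (not part of the diff): a scalar version of the shape-update rule implemented by `update_shape!` above, convenient for unit-testing the two digamma branches and the clamping:

```julia
using SpecialFunctions

function shape_step(ρ, dr; lrate=0.1, lo=0.5, hi=5.0)
    if ρ > 2
        dr2 = digamma(1 + 1 / ρ) / ρ - dr
        isnan(dr2) || (ρ += 0.5 * dr2)
    else
        dr2 = 1 - ρ * dr / digamma(1 + 1 / ρ)
        isnan(dr2) || (ρ += lrate * dr2)
    end
    return clamp(ρ, lo, hi)
end
```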
#Calculates determinant of mixing matrix A (via log); first log-likelihood part of L = |A| * p(sources)
-function calculate_ldet!(myAmica::SingleModelAmica)
-	#myAmica.ldet = -log(abs(det(myAmica.A)))
-	myAmica.ldet = -logabsdet(myAmica.A)[1]
-	@debug :ldet myAmica.ldet
+function calculate_ldet!(myAmica)
+    #myAmica.ldet = -log(abs(det(myAmica.A)))
+    myAmica.ldet = -logabsdet(myAmica.A)[1]
+    @debug :ldet myAmica.ldet
 end
 
 function calculate_ldet!(myAmica::MultiModelAmica)
-	for h in 1:length(myAmica.models)
-		myAmica.models[h].ldet = -log(abs(det(myAmica.models[h].A)))
-	end
+    for h in 1:length(myAmica.models)
+        myAmica.models[h].ldet = -log(abs(det(myAmica.models[h].A)))
+    end
 end
+
+function update_sources!(myAmica::CuSingleModelAmica, data)
+    myAmica.source_signals .= pinv(myAmica.A) * data
+end
 #Updates source signal estimations by unmixing the data
-function update_sources!(myAmica::SingleModelAmica, data)
-	b = pinv(myAmica.A) * data
-	myAmica.source_signals = b
+function update_sources!(myAmica, data)
+    #alternatives tried: myAmica.A \ data, ldiv! with a plain matrix, lu!, and pinv(myAmica.A) * data
+    LinearAlgebra.ldiv!(myAmica.source_signals, LinearAlgebra.qr(myAmica.A), data)
 end
 
 function update_sources!(myAmica::MultiModelAmica, data)
-	n = size(myAmica.models[1].A, 1)
-	for h in 1:length(myAmica.models)
-		for i in 1:n
-			Wh = pinv(myAmica.models[h].A)
-			myAmica.models[h].source_signals[i,:] = Wh[i,:]' * data .- Wh[i,:]' * myAmica.models[h].centers
-		end
-	end
+    n = size(myAmica.models[1].A, 1)
+    for h in 1:length(myAmica.models)
+        for i in 1:n
+            Wh = pinv(myAmica.models[h].A)
+            myAmica.models[h].source_signals[i, :] = Wh[i, :]' * data .- Wh[i, :]' * myAmica.models[h].centers
+        end
+    end
 end
 
 #Adjusts the learning rate depending on the log-likelihood growth over the past iterwin iterations. 
Uses LearningRate type from types.jl function calculate_lrate!(dLL, lrateType::LearningRate, iter, newt_start_iter, do_newton, iterwin) - lratefact,lnatrate,lratemax, = lrateType.decreaseFactor, lrateType.natural_rate, lrateType.maximum - lrate = lrateType.lrate - sdll = sum(dLL[iter-iterwin+1:iter])/iterwin + lratefact, lnatrate, lratemax, = lrateType.decreaseFactor, lrateType.natural_rate, lrateType.maximum + lrate = lrateType.lrate + sdll = sum(dLL[iter-iterwin+1:iter]) / iterwin if sdll < 0 println("Likelihood decreasing!") lrate = lrate * lratefact else if (iter > newt_start_iter) && do_newton == 1 - lrate = min(lratemax,lrate + min(0.1,lrate)) + lrate = min(lratemax, lrate + min(0.1, lrate)) else - lrate = min(lnatrate,lrate + min(0.1,lrate)) + lrate = min(lnatrate, lrate + min(0.1, lrate)) end end - lrateType.lrate = lrate + lrateType.lrate = lrate end \ No newline at end of file diff --git a/src/types.jl b/src/types.jl index 2459b4d..cc60d29 100755 --- a/src/types.jl +++ b/src/types.jl @@ -1,141 +1,185 @@ -mutable struct GGParameters - proportions::AbstractArray{Float64} #source density mixture proportions - scale::AbstractArray{Float64} #source density inverse scale parameter - location::AbstractArray{Float64} #source density location parameter - shape::AbstractArray{Float64} #source density shape paramters +mutable struct GGParameters{T,ncomps,nmix} + proportions::Array{T,2} #source density mixture proportions + scale::Array{T,2} #source density inverse scale parameter + location::Array{T,2} #source density location parameter + shape::Array{T,2} #source density shape paramters end + abstract type AbstractAmica end -mutable struct SingleModelAmica <:AbstractAmica - source_signals #Unmixed signals - learnedParameters::GGParameters #Parameters of the Gaussian mixtures - m::Union{Integer, Nothing} #Number of gaussians - A::AbstractArray #Mixing matrix - z::AbstractArray #Densities for each sample per Gaussian (normalized) - y::AbstractArray #Source signals (scaled and shifted with scale and location parameter) - centers::AbstractArray #Model centers - Lt::AbstractVector #Log likelihood of time point for each model ( M x N ) - LL::Union{AbstractVector, Nothing} #Log-Likelihood - ldet::Float64 #log determinant of A - maxiter::Union{Int, Nothing} #maximum number of iterations, can be nothing because it's not needed for multimodel +mutable struct SingleModelAmica{T,ncomps,nmix} <: AbstractAmica + source_signals::Array{T,2} + learnedParameters::GGParameters{T,ncomps,nmix} + m::Int #Number of gaussians + A::Array{T,2} # unmixing matrices for each model + S::Array{T,2} # sphering matrix + z::Array{T,3} + y::Array{T,3} + centers::Array{T,1} #model centers + Lt::Array{T,1} #log likelihood of time point for each model ( M x N ) + LL::Array{T,1} #log likelihood over iterations todo: change to tuple + ldet::T + maxiter::Int + + # --- intermediary values + # precalculated abs(y)^rho + y_rho::Array{T,3} + lambda::Array{T,1} + fp::Array{T,3} + # z * fp + zfp::Array{T,3} + g::Array{T,2} + Q::Array{T,3} + + u_intermed::Array{T,4} end -mutable struct MultiModelAmica <:AbstractAmica - models::Array{SingleModelAmica} #Array of SingleModelAmicas - normalized_ica_weights #Model weights (normalized) - ica_weights_per_sample #Model weight for each sample - ica_weights #Model weight for all samples - maxiter::Int #Number of iterations - m::Int #Number of Gaussians - LL::AbstractVector #Log-Likelihood + + +mutable struct MultiModelAmica{T} <: AbstractAmica + models::Array{SingleModelAmica{T}} #Array of 
SingleModelAmicas
+    normalized_ica_weights #Model weights (normalized)
+    ica_weights_per_sample #Model weight for each sample
+    ica_weights #Model weight for all samples
+    maxiter::Int #Number of iterations
+    m::Int #Number of Gaussians
+    LL::Array{T,1} #Log-Likelihood
 end
 
 #Structure for Learning Rate type with initial value, minimum, maximum etc. Used for learning rate and shape lrate
 using Parameters
-@with_kw mutable struct LearningRate
-	lrate::Real = 0.1
-	init::Float64 = 0.1
-	minimum::Float64 = 0.
-	maximum::Float64 = 1.0
-	natural_rate::Float64 = 0.1
-	decreaseFactor::Float64 = 0.5
+@with_kw mutable struct LearningRate{T}
+    lrate::T = 0.1
+    init::T = 0.1
+    minimum::T = 0.0
+    maximum::T = 1.0
+    natural_rate::T = 0.1
+    decreaseFactor::T = 0.5
 end
 
 #Data type for AMICA with just one ICA model. todo: rename gg parameters
 function SingleModelAmica(data::AbstractArray{T}; m=3, maxiter=500, A=nothing, location=nothing, scale=nothing, kwargs...) where {T<:Real}
-	(n, N) = size(data)
-	#initialize parameters
-
-	centers = zeros(n)
-	eye = Matrix(I, n, n)
-	if isnothing(A)
-		A = zeros(n,n)
-		A[:,:] = eye[n] .+ 0.1*rand(n,n)
-		for i in 1:n
-			A[:,i] = A[:,i] / norm(A[:,i])
-		end
-	end
-
-	proportions = (1/m) * ones(m,n)
-	if isnothing(location)
-		if m > 1
-			location = 0.1 * randn(m, n)
-		else
-			location = zeros(m, n)
-		end
-	end
-	if isnothing(scale)
-		scale = ones(m, n) + 0.1 * randn(m, n)
-	end
-	shape = ones(m, n)
-
-	y = zeros(n,N,m)
-
-	Lt = zeros(N)
-	z = ones(n,N,m)/N
-
-	#Sets some parameters to nothing if used by MultiModel to only have them once
-	if isnothing(maxiter)
-		LL = nothing
-		m = nothing
-	else
-		LL = Float64[]
-	end
-	ldet = 0.0
-	source_signals = zeros(n,N)
-
-	return SingleModelAmica(source_signals,GGParameters(proportions,scale,location,shape),m,A,z,y,#=Q,=#centers,Lt,LL,ldet,maxiter)
+    (n, N) = size(data)
+    ncomps = n
+    nmix = m
+    #initialize parameters
+
+    centers = zeros(T, n)
+    eye = Matrix(I, n, n)
+    if isnothing(A)
+        A = zeros(T, n, n)
+        A[:, :] = eye .+ 0.1 * rand(n, n) #initialize close to the identity
+        for i in 1:n
+            A[:, i] = A[:, i] / norm(A[:, i])
+        end
+    end
+
+    proportions = (1 / m) * ones(T, m, n)
+    if isnothing(location)
+        if m > 1
+            location = 0.1 * randn(T, m, n)
+        else
+            location = zeros(T, m, n)
+        end
+    end
+    if isnothing(scale)
+        scale = ones(T, m, n) + 0.1 * randn(T, m, n)
+    end
+    shape = ones(T, m, n)
+
+    y = zeros(T, m, n, N)
+    y_rho = zeros(T, m, n, N)
+
+    Lt = zeros(T, N)
+    z = ones(T, m, n, N) / N
+
+    #MultiModelAmica constructs its submodels with maxiter=nothing; the typed struct needs concrete values
+    if isnothing(maxiter)
+        maxiter = 0
+    end
+    LL = T[]
+    ldet = zero(T)
+    source_signals = zeros(T, n, N)
+    lambda = zeros(T, n)
+    fp = zeros(T, m, n, N)
+    zfp = zeros(T, m, n, N)
+    Q = zeros(T, m, n, N)
+    g = zeros(T, n, N)
+    u_intermed = zeros(T, m, n, N, m)
+
+    return SingleModelAmica{T,ncomps,nmix}(
+        source_signals,
+        GGParameters{T,ncomps,nmix}(proportions, scale, location, shape),
+        m,
+        A,
+        Matrix{T}(I, size(A, 1), size(A, 1)),
+        z,
+        y,
+        centers,
+        Lt,
+        LL,
+        ldet,
+        maxiter,
+        y_rho,
+        lambda,
+        fp,
+        zfp,
+        g,
+        Q,
+        u_intermed)
 end
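Minimal usage sketch (not part of the diff), mirroring test/bene_testAmica.jl — note that with sphering enabled the full unmixing matrix is `pinv(am.A) * am.S`:

```julia
using Amica, LinearAlgebra

X = randn(4, 10_000)  # 4 channels, 10_000 samples
am = fit(SingleModelAmica, X; m=3, maxiter=200, do_sphering=true)

W = pinv(am.A) * am.S # unmixing, including the sphering matrix
sources = W * X
```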
 #Data type for AMICA with multiple ICA models
 function MultiModelAmica(data::Array; m=3, M=2, maxiter=500, A=nothing, location=nothing, scale=nothing, kwargs...)
-    models = Array{SingleModelAmica}(undef, M) #Array of SingleModelAmica opjects
-    normalized_ica_weights = (1/M) * ones(M,1)
-    (n, N) = size(data)
-    ica_weights_per_sample = ones(M,N)
-    ica_weights = zeros(M)
-    LL = Float64[]
-
-    #This part only exists to allow for initial values to be set by the user. They are still required to have the old format (something x something x M)
-    eye = Matrix(I, n, n)
-    if isnothing(A)
-        A = zeros(n,n,M)
-        for h in 1:M
-            A[:,:,h] = eye[n] .+ 0.1*rand(n,n)
-            for i in 1:n
-                A[:,i,h] = A[:,i,h] / norm(A[:,i,h])
-            end
-        end
-    end
-
-    if isnothing(location)
-        if m > 1
-            location = 0.1 * randn(m, n, M)
-        else
-            location = zeros(m, n, M)
-        end
-    end
-
-    if isnothing(scale)
-        scale = ones(m, n, M) + 0.1 * randn(m, n, M)
-    end
-
-    for h in 1:M
-        models[h] = SingleModelAmica(data; m, maxiter=nothing, A=A[:,:,h], location=location[:,:,h], scale=scale[:,:,h], kwargs...)
-    end
-    return MultiModelAmica(models,normalized_ica_weights,ica_weights_per_sample,ica_weights,maxiter,m,LL#=,Q=#)
+    models = Array{SingleModelAmica}(undef, M) #Array of SingleModelAmica objects
+    normalized_ica_weights = (1 / M) * ones(M, 1)
+    (n, N) = size(data)
+    ica_weights_per_sample = ones(M, N)
+    ica_weights = zeros(M)
+    LL = Float64[]
+
+    #This part only exists to allow for initial values to be set by the user. They are still required to have the old format (something x something x M)
+    eye = Matrix(I, n, n)
+    if isnothing(A)
+        A = zeros(n, n, M)
+        for h in 1:M
+            A[:, :, h] = eye .+ 0.1 * rand(n, n)
+            for i in 1:n
+                A[:, i, h] = A[:, i, h] / norm(A[:, i, h])
            end
+        end
+    end
+
+    if isnothing(location)
+        if m > 1
+            location = 0.1 * randn(m, n, M)
+        else
+            location = zeros(m, n, M)
+        end
+    end
+
+    if isnothing(scale)
+        scale = ones(m, n, M) + 0.1 * randn(m, n, M)
+    end
+
+    for h in 1:M
+        models[h] = SingleModelAmica(data; m, maxiter=nothing, A=A[:, :, h], location=location[:, :, h], scale=scale[:, :, h], kwargs...)
+    end
+    return MultiModelAmica(models, normalized_ica_weights, ica_weights_per_sample, ica_weights, maxiter, m, LL) #=,Q=#
 end
 # import Base.getproperty
-# Base.getproperty(x::AbstractAmica, s::Symbol) = Base.getproperty(x, Val(s))
-# Base.getproperty(x::AbstractAmica, ::Val{s}) where s = getfield(x, s)
+# Base.getproperty(x<:AbstractAmica, s::Symbol) = Base.getproperty(x, Val(s))
+# Base.getproperty(x<:AbstractAmica, ::Val{s}) where s = getfield(x, s)
-# Base.getproperty(m::AbstractAmica, ::Val{:N}) = size(m.Lt,1)
-# Base.getproperty(m::AbstractAmica, ::Val{:n}) = size(m.A,1)
+# Base.getproperty(m<:AbstractAmica, ::Val{:N}) = size(m.Lt,1)
+# Base.getproperty(m<:AbstractAmica, ::Val{:n}) = size(m.A,1)
 # Base.getproperty(m::MultiModelAmica, ::Val{:M}) = length(m.models)
-# Base.getproperty(m::SingleModelAmica, ::Val{:M}) = 1
+# Base.getproperty(m<:AbstractAmica, ::Val{:M}) = 1
 # function Base.getproperty(multiModel::MultiModelAmica, prop::Symbol)
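The intended invariant of the initialization above, as a standalone sketch: each model starts from an identity-plus-noise mixing matrix whose columns are normalized to unit length.

using LinearAlgebra
n, M = 4, 2
A = zeros(n, n, M)
for h in 1:M
    A[:, :, h] = Matrix(I, n, n) .+ 0.1 * rand(n, n)  # identity plus small noise
    for i in 1:n
        A[:, i, h] ./= norm(A[:, i, h])               # unit-norm columns
    end
end
all(norm(A[:, i, h]) ≈ 1 for i in 1:n, h in 1:M)      # true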
diff --git a/src/types_gpu.jl b/src/types_gpu.jl
new file mode 100644
index 0000000..8cce3bf
--- /dev/null
+++ b/src/types_gpu.jl
@@ -0,0 +1,60 @@
+mutable struct CuGGParameters{T,ncomps,nmix}
+    proportions::CuArray{T,2} #source density mixture proportions
+    scale::CuArray{T,2} #source density inverse scale parameter
+    location::CuArray{T,2} #source density location parameter
+    shape::CuArray{T,2} #source density shape parameters
+end
+
+
+
+
+mutable struct CuSingleModelAmica{T,ncomps,nmix} <: AbstractAmica
+    source_signals::CuArray{T,2}
+    learnedParameters::CuGGParameters{T,ncomps,nmix}
+    m::Int #Number of gaussians
+    A::CuArray{T,2} # mixing matrix (its inverse unmixes)
+    S::CuArray{T,2} # sphering matrix
+    z::CuArray{T,3}
+    y::CuArray{T,3}
+    centers::CuArray{T,1} #model centers
+    Lt::CuArray{T,1} #log likelihood of time point for each model ( M x N )
+    LL::CuArray{T,1} #log likelihood over iterations todo: change to tuple
+    ldet::T
+    maxiter::Int
+
+    # --- intermediary values
+    # precalculated abs(y)^rho
+    y_rho::CuArray{T,3}
+    lambda::CuArray{T,1}
+    fp::CuArray{T,3}
+    # z * fp
+    zfp::CuArray{T,3}
+    g::CuArray{T,2}
+    Q::CuArray{T,3}
+
+    u_intermed::CuArray{T,4}
+end
+
+myconv(x::GGParameters) = CuArray(x)
+myconv(x::AbstractArray) = CuArray(x)
+myconv(x) = x
+function CuSingleModelAmica(data::AbstractArray{T}; kwargs...) where {T}
+
+    init = SingleModelAmica(data; kwargs...)
+    eval(Expr(:call, CuSingleModelAmica,
+        [
+            :(myconv($init.$field))
+            for field in fieldnames(SingleModelAmica)
+        ]...))
+
+end
+
+
+function CUDA.CuArray(gg::Amica.GGParameters{T,n,m}) where {T,n,m}
+    CuGGParameters{T,n,m}(
+        gg.proportions |> CuArray,
+        gg.scale |> CuArray,
+        gg.location |> CuArray,
+        gg.shape |> CuArray
+    )
+end
\ No newline at end of file
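The eval(Expr(:call, ...)) construction above builds a CuSingleModelAmica by pushing every field of a CPU-initialized model through myconv. An eval-free sketch with the same semantics (cu_single_model is a hypothetical name; the implicit constructor infers {T,ncomps,nmix} from the converted CuGGParameters field):

function cu_single_model(data::AbstractArray{T}; kwargs...) where {T}
    init = SingleModelAmica(data; kwargs...)
    # convert each field: arrays -> CuArray, GGParameters -> CuGGParameters, rest unchanged
    fields = map(f -> myconv(getfield(init, f)), fieldnames(SingleModelAmica))
    return CuSingleModelAmica(fields...)
end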
diff --git a/test/bene_testAmica.jl b/test/bene_testAmica.jl
index ae93f87..bd5aa2c 100644
--- a/test/bene_testAmica.jl
+++ b/test/bene_testAmica.jl
@@ -1,65 +1,69 @@
 #s = sin.(t * collect(0.5:0.8:pi)')'#rand(10,10000)
+using Revise
 using SignalAnalysis
 using Amica
-
-t = range(0,20*π,length=10000)
-s =rand(PinkGaussian(length(t)),20)'
-
-s[1,:] = sin.(t)
-s[2,:] = cos.(2 .* t)
-s[3,:] = sin.(10 .* t)
-s[4,:] = cos.(20 .* t)
+includet("/scratch/projects/fapra_amica/src/simulate_data.jl")
+includet("/scratch/projects/fapra_amica/src/fortran_tools.jl")
+n_chan = 4
+n_time = 1000
+x, _A, s = simulate_data(; T=50, n_chan, n_time, type=:gg6)
+i_scale, i_location, i_A = init_params(; n_chan, n_time)
+
+#s[3, :] = sin.(10 .* t)
+#s[4, :] = cos.(20 .* t)
 #s = s .* [1,2,3,4]
-A = rand(size(s,1),size(s,1))
+A = rand(size(s, 1), size(s, 1))
 A = [1 1 0 0; 0 1 1 0; 0 0 1 1; 1 0 1 0]
-
-x = A*s
+#A = [1 2; 0.5 -3]
+x = A * s
 #A = [1 1 0 1; 1 1 0 0; 1 0 1 1; 0 0 0 1]
 #x = hcat(x,A*s)
-am = fit(SingleModelAmica,x;maxiter=500)
-amm = fit(MultiModelAmica,x;maxiter=500,M=1)
+am = Amica.fit(SingleModelAmica, x; maxiter=50, do_sphering=true)
+#amm = fit(MultiModelAmica,x;maxiter=500,M=1)
 size(am.A)
-W = inv(am.A[:,:,1]) #previously [:,:,2]
-
 #---
+W = pinv(am.A)
+W = pinv(am.A) * am.S
+#W = pinv(pinv(am.S) * am.A)*(am.S)
+#W = pinv(pinv(am.S) * am.A * am.S)
 using CairoMakie
 f = Figure()
-series(f[1,1],s[:,1:500])
-ax,h = heatmap(f[1,2],A)
-Colorbar(f[1,3],h)
+series(f[1, 1], s[:, 1:500])
+ax, h = heatmap(f[1, 2], A)
+Colorbar(f[1, 3], h)
-series(f[2,1],W*x)
-ax,h = heatmap(f[2,2],am.A[:,:,1])
-Colorbar(f[2,3],h)
+series(f[2, 1], (W*x)[:, 1:500])
+ax, h = heatmap(f[2, 2], am.A[:, :, 1])
+Colorbar(f[2, 3], h)
-series(f[4,1],x[:,1:500])
-series(f[4,2],(W*x)[:,1:500])
+series(f[4, 1], s[:, 1:50])
+series(f[4, 2], (W*x)[:, 1:50])
 f
 #series(f[4,1],inv(W)'*x)
 #series(f[6,1],W'*x)
 #----
 using PyMNE
-data_path = pyconvert(String,@py(str(PyMNE.datasets.ssvep.data_path())))
-bids_fname = joinpath(data_path,"sub-02","ses-01","eeg","sub-02_ses-01_task-ssvep_eeg.vhdr")
+data_path = pyconvert(String, @py(str(PyMNE.datasets.ssvep.data_path())))
+bids_fname = joinpath(data_path, "sub-02", "ses-01", "eeg", "sub-02_ses-01_task-ssvep_eeg.vhdr")
 raw = PyMNE.io.read_raw_brainvision(bids_fname, preload=true, verbose=false)
 raw.resample(128)
 raw.filter(l_freq=1, h_freq=nothing, fir_design="firwin")
-d = pyconvert(Array,raw.get_data(;units="uV"))
+d = pyconvert(Array, raw.get_data(; units="uV"))
-am = fit(MultiModelAmica,d;maxiter=500,M=2)
+am = fit(MultiModelAmica, d; maxiter=500, M=2)
 #----
 raw_memory = PyMNE.io.read_epochs_eeglab("/data/export/users/ehinger/amica_recompile/amica/Memorize.set")
-d_memory = pyconvert(Array,raw_memory.get_data(;units="uV"))
+d_memory = pyconvert(Array, raw_memory.get_data(; units="uV"))
-d_memory = reshape(permutedims(d_memory,(2,3,1)),71,:)
+d_memory = reshape(permutedims(d_memory, (2, 3, 1)), 71, :)
-am2 = fit(SingleModelAmica,d_memory,M=1)
\ No newline at end of file
+am2 = fit(SingleModelAmica, d_memory, M=1)
\ No newline at end of file
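On the two W assignments in the script above: with do_sphering=true the mixing matrix is learned on the sphered data, so mapping raw data back to sources composes the sphering matrix with the (pseudo-)inverse of A. As a sketch:

# model: am.S * x ≈ am.A * sources   =>   sources ≈ pinv(am.A) * am.S * x
W = pinv(am.A) * am.S
s_hat = W * x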
diff --git a/test/compare_amica_implementations.jl b/test/compare_amica_implementations.jl
index 4b38a78..aac6318 100644
--- a/test/compare_amica_implementations.jl
+++ b/test/compare_amica_implementations.jl
@@ -1,75 +1,117 @@
+ENV["MATLAB_ROOT"] = "/opt/common/apps/matlab/r2021a/"
+using Revise
+
 using MATLAB
 using Amica
 using SignalAnalysis
 using LinearAlgebra
-using Revise
+#---
 using CairoMakie
 #---
-includet("src/simulate_data.jl")
-includet("src/fortran_tools.jl")
-n_chan=3
-n_time=5000
-x,A,s = simulate_data(;T=50,n_chan,n_time,type=:gg6)
-i_scale,i_location,i_A = init_params(;n_chan,n_time)
-
-f = Figure()
-series(f[1,1],s,axis=(;title="source"))
+includet("/scratch/projects/fapra_amica/src/simulate_data.jl")
+includet("/scratch/projects/fapra_amica/src/fortran_tools.jl")
+n_chan = 20
+n_time = 60_000
+x, _A, s = simulate_data(; T=50, n_chan, n_time, type=:gg6)
+i_scale, i_location, i_A = init_params(; n_chan, n_time)
+
+#---
+f = Figure()
+series(f[1, 1], s, axis=(; title="source"))
 #xlims!(1,20)
-series(f[2,1],x,axis=(;title="mixed"))
+series(f[2, 1], x, axis=(; title="mixed"))
 f
 #---
-maxiter = 500
+maxiter = 50
+#=
 # matlab run
 mat"""
+tic
 [mA,mc,mLL,mLt,mgm,malpha,mmu,mbeta,mrho] = amica_a($x,1,3,$maxiter,$i_location,$i_scale,$i_A,0);
+mt = toc
 """
 mat"""
+tic
 [mAopt,mWopt,mSopt,mkhindsopt,mcopt,mLLopt,mLtopt,mgmopt,malphaopt,mmuopt,mbetaopt,mrhoopt] = amica_optimized($x,1,3,$maxiter,1,1,$i_location,$i_scale,$i_A);
+mtopt = toc
+
 """
 mA = @mget(mA); # get matlab var
 mAopt = @mget(mAopt); # get matlab opt var
+t_mopt = @mget(mtopt); # get matlab var
+t_m = @mget(mt); # get matlab var
+
+=#
+#---
 # Fortran setup + ran
-fortran_setup(x;max_threads=1,max_iter=maxiter)
-run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
-fA = reshape(reinterpret(Float64,(read("amicaout/A"))),n_chan,n_chan)
+fortran_setup(Float32.(x); max_threads=1, max_iter=maxiter)
+t_f32 = @elapsed run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
+fA = reshape(reinterpret(Float64, (read("amicaout/A"))), n_chan, n_chan)
+
+fortran_setup(Float32.(x); max_threads=20, max_iter=maxiter)
+t_f32_parallel = @elapsed run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
+
+
+
+fortran_setup(Float64.(x); max_threads=1, max_iter=maxiter, dble_data=1)
+t_f64 = @elapsed run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
 # Julia run
-am = SingleModelAmica(x;maxiter=maxiter,A=i_A,location=i_location,scale=i_scale)
-fit!(am,x)
+am32 = SingleModelAmica(Float32.(x); maxiter=maxiter, A=deepcopy(i_A), location=deepcopy(i_location), scale=deepcopy(i_scale))
+t_am32 = @elapsed fit!(am32, Float32.(x))
+
+am64 = SingleModelAmica(Float64.(x); maxiter=maxiter, A=deepcopy(i_A), location=deepcopy(i_location), scale=deepcopy(i_scale))
+t_am64 = @elapsed fit!(am64, Float64.(x))
+#am = SingleModelAmica(Float16.(x);maxiter=maxiter,A=i_A,location=i_location,scale=i_scale)
+#@time fit!(am,x)
 #vcat(@mget(mLL),am.LL')
 #---
-f2 = Figure(size=(800,800))
-series(f2[1,1],inv(am.A)*x, axis=(;title="unmixed julia"))
-series(f2[2,1],inv(mA)*x, axis=(;title="unmixed matlab"))
-series(f2[3,1],inv(mAopt)*x, axis=(;title="unmixed matlab_optimizd"))
-series(f2[4,1],.-inv(fA')*x, axis=(;title="unmixed fortran"))
-series(f2[5,1],s,axis=(;title="original source"))
-series(f2[6,1],x, axis=(;title="original mixed"))
-hidedecorations!.(f2.content)
+
 linkxaxes!(f2.content...)
-xlims!(0,100)
+xlims!(0, 100)
 f2
 #--- compare LLs
-LL = am.LL
-mLL = @mget mLL
-mLLopt = @mget mLLopt
-fLL = reinterpret(Float64,(read("amicaout/LL")))
-f = Figure()
-ax = f[1,1] = Axis(f)
-labels = ["julia","matlab", "matlab opt","fortran"]
-for (ix,d) = enumerate([LL,mLL[1,:],mLLopt[1,:],fLL])
-    lines!(ax,d;label=labels[ix])
+
+visibnoise = 0.02 # add little bit of noise to distinguish identical lines
+#mLL = @mget mLL
+#mLLopt = @mget mLLopt
+fLL = reinterpret(Float64, (read("amicaout/LL")))
+
+f = Figure(size=(1024, 512))
+ax = f[1, 1] = Axis(f)
+labels = ["julia64", "julia32", "fortran",] #"matlab", "matlab opt", ]
+for (ix, d) = enumerate([am64.LL, am32.LL, fLL]) #mLL[1, :], mLLopt[1, :],])
+    lines!(ax, d .+ rand(size(d)...) .* visibnoise; label=labels[ix])
 end
 axislegend(ax)
-ylims!(ax,-1.3,-1)
+#ylims!(ax, -5, 0.1)
+
+
+ax_T = f[1, 2] = Axis(f)
+scatter!(ax_T, [t_am32, t_am64, t_f32, t_f64])# t_m, t_mopt,])
+ax_T.xticks = 1:4
+ax_T.xtickformat = x -> ["am32", "am64", "f32", "f64"][Int.(x)] #"mat", "matopt",
+
 f
-#--- compare A
+
+#---
+amcu = SingleModelAmica(CuArray(Float32.(x)); maxiter=10)#,A=deepcopy(i_A),location=deepcopy(i_location),scale=deepcopy(i_scale))
+fit!(amcu, CuArray(Float32.(x)))
+
+
+#----
+am32 = SingleModelAmica(Float32.(x); maxiter=10)#,A=deepcopy(i_A),location=deepcopy(i_location),scale=deepcopy(i_scale))
+@profview fit!(am32, Float32.(x))
+
+
+fortran_setup(Float32.(x); max_threads=10, max_iter=20)
+t_f32 = @elapsed run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
\ No newline at end of file
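One caveat on the @elapsed timings in the script above: the first fit! call for a given element type also pays JIT compilation. A sketch of a warm-up run before timing, reusing the script's own names:

am_warm = SingleModelAmica(Float32.(x); maxiter=1)  # throwaway model
fit!(am_warm, Float32.(x))                          # compile fit! for Float32 once
# subsequent @elapsed measurements then reflect runtime rather than compilation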
diff --git a/test/minimal_testcase.jl b/test/minimal_testcase.jl
new file mode 100644
index 0000000..6661379
--- /dev/null
+++ b/test/minimal_testcase.jl
@@ -0,0 +1,84 @@
+using MATLAB: mxSINGLE_CLASS
+ENV["MATLAB_ROOT"] = "/opt/common/apps/matlab/r2021a/"
+using Revise
+
+using MATLAB
+using Amica
+using SignalAnalysis
+using LinearAlgebra
+#---
+using CairoMakie
+#---
+includet("/scratch/projects/fapra_amica/src/simulate_data.jl")
+includet("/scratch/projects/fapra_amica/src/fortran_tools.jl")
+n_chan = 4
+n_time = 10_000#60_000
+x, _A, s = simulate_data(; T=50, n_chan, n_time, type=:gg6)
+i_scale, i_location, i_A = init_params(; n_chan, n_time)
+
+#---
+f = Figure()
+series(f[1, 1], s[:, 1:100], axis=(; title="source"))
+#xlims!(1,20)
+series(f[2, 1], x[:, 1:100], axis=(; title="mixed"))
+f
+
+#----
+maxiter = 50
+fortran_setup(Float32.(x); max_threads=10, max_iter=maxiter, dble_data=0)
+run(`/scratch/projects/fapra_amica/fortran/amica15test julia.param`)
+
+mat"""
+tic
+[mA,mc,mLL,mLt,mgm,malpha,mmu,mbeta,mrho] = amica_a($x,1,3,$maxiter,$i_location,$i_scale,$i_A,0);
+mt = toc
+"""
+
+# Julia run
+am = SingleModelAmica(x; maxiter=maxiter)#, A=deepcopy(i_A), location=deepcopy(i_location), scale=deepcopy(i_scale))
+fit!(am, x; do_sphering=true)
+
+#---
+fLL = reinterpret(Float64, (read("amicaout/LL")))
+scatter(vec(@mget(mLL)), label="matlab")
+scatter!(am.LL, label="julia")
+scatter!(fLL, label="fortran")
+ylims!(-2, 0)
+axislegend()
+current_figure()
+
+#---
+fA = reshape(reinterpret(Float64, (read("amicaout/A"))), n_chan, n_chan)
+fS = reshape(reinterpret(Float64, (read("amicaout/S"))), n_chan, n_chan)
+
+
+f2 = Figure()#size=(800, 800))
+#series(f2[1, 1], (inv(a.A) * x), axis=(; title="unmixed julia64"))
+series(f2[end+1, 1], inv(am.A) * am.S * x[:, 1:100], axis=(; title="unmixed julia32"))
+#series(f2[end+1, 1], inv(mA) * x, axis=(; title="unmixed matlab"))
+#series(f2[end+1, 1], inv(mAopt) * x, axis=(; title="unmixed matlab_optimized"))
+series(f2[end+1, 1], (inv(fA)*fS*x)[:, 1:100], axis=(; title="unmixed fortran"))
+series(f2[end+1, 1], s[:, 1:100], axis=(; title="original source"))
+series(f2[end+1, 1], x[:, 1:100], axis=(; title="original mixed"))
+hidedecorations!.(f2.content)
+f2
+
+#---
+xc = Amica.CuArray(Float32.(x))
+amc = Amica.fit(CuSingleModelAmica, xc; maxiter=3)
+
+
+#---
+import IntelVectorMath as IVM
+function A(myAmica)
+    IVM.abs!(myAmica.y_rho, myAmica.y)
+    for i in 1:size(myAmica.y_rho, 2)
+        for j in 1:size(myAmica.y_rho, 1)
+            @views _y_rho = myAmica.y_rho[j, i, :]
+            Amica.optimized_pow!(_y_rho, _y_rho, myAmica.learnedParameters.shape[j, i])
        end
+    end
+
+end
+
+CUDA.@profile A(amc)
diff --git a/test/profile.jl b/test/profile.jl
new file mode 100644
index 0000000..6038918
--- /dev/null
+++ b/test/profile.jl
@@ -0,0 +1,33 @@
+using Amica
+using MAT
+using Profile
+using Test
+
+file = matopen("test/testdata/eeg_data.mat")
+#file = matopen("test/testdata/pink_sinus_data.mat")
+
+x = read(file, "x")
+
+beta_init = read(file, "beta_init")
+mu_init = read(file, "mu_init")
+A_init = read(file, "A_init")
+
+close(file)
+# before: ca. 1.2 s / iter
+# last best: 0.53 s / iter
+
+# SingleModelAmica{Float64, 32, 3} with:
+#  - signal-size: (32, 59850)
+#  - likelihood: -0.7354624215419704 (after 30 iterations)
+
+@profview myAmica = fit(SingleModelAmica, x; maxiter=30, do_sphering=true, remove_mean=true, m=3, scale=beta_init[:, :, 1], location=mu_init[:, :, 1], A=copy(A_init[:, :, 1]))
+
+# x = [1 4; 4 1]*Float64.([1.0 2 3; 4 5 6])
+# A_init = [1.0 0.003; -0.05 1.0]
+# beta_init = [1.1 0.9; 1.0 0.9; 0.9 0.8]
+# mu_init = [0.1 0.9; -0.01 0.0; 0.0 -0.02]
+# am = fit(SingleModelAmica,x;maxiter=6, m=3, scale=beta_init, location=mu_init, A=copy(A_init), do_sphering = false)
+# @test am.A ≈ [0.8761290481633254 0.7147631091971822; 0.48207664428431446 0.6993666404188701]
+# @test am.LL[6] ≈ -1.701977346216155
+# @test am.Lt[3] ≈ -3.4842563175935526
+
diff --git a/test/runtests.jl b/test/runtests.jl
index 343ac21..1c86945 100755
--- a/test/runtests.jl
+++ b/test/runtests.jl
@@ -10,9 +10,9 @@ using MAT
     beta_init = [1.1 0.9; 1.0 0.9; 0.9 0.8]
     mu_init = [0.1 0.9; -0.01 0.0; 0.0 -0.02]
     am = fit(SingleModelAmica,x;maxiter=6, m=3, scale=beta_init, location=mu_init, A=copy(A_init), do_sphering = false)
-    @test am.A == [0.8761290481633254 0.7147631091971822; 0.48207664428431446 0.6993666404188701]
-    @test am.LL[6] == -1.701977346216155
-    @test am.Lt[3] == -3.4842563175935526
+    @test am.A ≈ [0.8761290481633254 0.7147631091971822; 0.48207664428431446 0.6993666404188701]
+    @test am.LL[6] ≈ -1.701977346216155
+    @test am.Lt[3] ≈ -3.4842563175935526

     #Test multi model
     file = matopen("test/testdata/supersmall_data.mat")
@@ -23,8 +23,8 @@ using MAT
     close(file)
     file = matopen("test/testdata/supersmall_data_results.mat") #contains results for A and LL after 4 iterations (2 Models, 3 GGs)
     am = fit(MultiModelAmica,x;maxiter=4, m=3,M = 2,scale=beta_init, location=mu_init, A=copy(A_init), do_sphering = false)
-    @test am.LL == read(file,"LL_after_4iter")
-    @test am.models[1].A == read(file, "A1_after_4iter")
-    @test am.models[2].A == read(file, "A2_after_4iter")
+    @test am.LL ≈ read(file,"LL_after_4iter")
+    @test am.models[1].A ≈ read(file, "A1_after_4iter")
+    @test am.models[2].A ≈ read(file, "A2_after_4iter")
     close(file)
 end
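The switch from == to ≈ makes these tests robust to last-bit floating-point differences: isapprox with default arguments uses a relative tolerance of sqrt(eps(Float64)) ≈ 1.5e-8 (and compares arrays via norms). A quick sketch:

a = -1.701977346216155
a ≈ a + 1e-12   # true:  relative difference is far below the default rtol
a ≈ a + 1e-6    # false: relative difference ≈ 6e-7 exceeds sqrt(eps(Float64))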