Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Start towards caching and perf optimizations #45

Merged
merged 10 commits into from
Mar 22, 2024
11 changes: 2 additions & 9 deletions Project.toml
Original file line number Diff line number Diff line change
Expand Up @@ -15,19 +15,12 @@ NonlinearSolve = "8913a72c-1f9b-4ce2-8d82-65094dcecaec"
Optimization = "7f7a1694-90dd-40f0-9382-eb1efda571ba"
QuasiMonteCarlo = "8a4e6c94-4038-4cdc-81c3-7e6ffdb2a71b"
Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
SciMLBase = "0bca4576-84f4-4d90-8ffe-ffa030f20462"
Setfield = "efcf1570-3423-57d1-acb7-fd33fddbac46"
SimpleChains = "de6bee2f-e2f4-4ec7-b6ed-219cc6f6e9e5"
SimpleNonlinearSolve = "727e6d20-b764-4bd8-a329-72de5adea6c7"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"

[compat]
julia = "1.6"

[extras]
LinearAlgebra = "37e2e46d-f89d-539d-b4ee-838fcccc9c8e"
StaticArrays = "90137ffa-7385-5640-81b9-e52037218182"
Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

[targets]
test = ["Test", "StaticArrays", "LinearAlgebra", "Optimization", "ForwardDiff"]
julia = "1.6"
7 changes: 6 additions & 1 deletion src/PSOGPU.jl
Original file line number Diff line number Diff line change
Expand Up @@ -3,11 +3,15 @@ module PSOGPU
using SciMLBase, StaticArrays, Setfield, KernelAbstractions
using QuasiMonteCarlo, Optimization, SimpleNonlinearSolve, ForwardDiff
import Adapt
import Adapt: adapt
import Enzyme: autodiff_deferred, Active, Reverse
import KernelAbstractions: @atomic, @atomicreplace, @atomicswap
using QuasiMonteCarlo
import DiffEqGPU: GPUTsit5, make_prob_compatible, vectorized_solve, vectorized_asolve

using Reexport
@reexport using SciMLBase

## Use lb and ub either as StaticArray or pass them separately as CuArrays
## Passing as CuArrays makes more sense, or maybe SArray? The based on no. of dimension
struct SPSOParticle{T1, T2 <: eltype(T1)}
Expand Down Expand Up @@ -63,10 +67,11 @@ include("./utils.jl")
include("./ode_pso.jl")
include("./kernels.jl")
include("./lowerlevel_solve.jl")
include("init.jl")
include("./solve.jl")
include("./bfgs.jl")
include("./hybrid.jl")

export ParallelPSOKernel,
ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO, OptimizationProblem, solve
ParallelSyncPSOKernel, ParallelPSOArray, SerialPSO
end
8 changes: 8 additions & 0 deletions src/algorithms.jl
Original file line number Diff line number Diff line change
@@ -1,6 +1,14 @@

abstract type PSOAlgorithm end
abstract type HybridPSOAlgorithm{LocalOpt} end
abstract type GPUSamplingAlgorithm end

struct GPUUniformSampler <: GPUSamplingAlgorithm
end

struct GPUUnboundedSampler <: GPUSamplingAlgorithm
end

"""
```julia
ParallelPSOKernel(num_particles = 100)
Expand Down
25 changes: 12 additions & 13 deletions src/hybrid.jl
Original file line number Diff line number Diff line change
Expand Up @@ -5,27 +5,26 @@
result[i] = sol.u
end

function SciMLBase.__solve(prob::SciMLBase.OptimizationProblem,
opt::HybridPSO{Backend, LocalOpt},
args...;
function SciMLBase.solve!(
cache::HybridPSOCache, opt::HybridPSO{Backend, LocalOpt}, args...;
abstol = nothing,
reltol = nothing,
maxiters = 100,
local_maxiters = 10,
kwargs...) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
t0 = time()
psoalg = opt.pso
local_opt = opt.local_opt
backend = opt.backend
maxiters = 100, local_maxiters = 10, kwargs...) where {
Backend, LocalOpt <: Union{LBFGS, BFGS}}
pso_cache = cache.pso_cache

sol_pso = solve(prob, psoalg, args...; maxiters, kwargs...)
sol_pso = solve!(pso_cache)
x0s = sol_pso.original
prob = remake(prob, lb = nothing, ub = nothing)

backend = opt.backend

prob = remake(cache.prob, lb = nothing, ub = nothing)
f = Base.Fix2(prob.f.f, prob.p)
∇f = instantiate_gradient(f, prob.f.adtype)

kernel = simplebfgs_run!(backend)
result = KernelAbstractions.allocate(backend, typeof(prob.u0), length(x0s))
result = cache.start_points
copyto!(result, x0s)
nlprob = NonlinearProblem{false}(∇f, prob.u0)

nlalg = LocalOpt isa LBFGS ?
Expand Down
166 changes: 166 additions & 0 deletions src/init.jl
Original file line number Diff line number Diff line change
@@ -0,0 +1,166 @@
mutable struct PSOCache{TP, TAlg, TPart, TGbest, TSampler}
prob::TP
alg::TAlg
particles::TPart
gbest::TGbest
sampler::TSampler
end

struct HybridPSOCache{TPc, TSp, TAlg}
pso_cache::TPc
start_points::TSp
alg::TAlg
end

function __init!(particles, prob::OptimizationProblem,
opt::Union{ParallelPSOKernel, ParallelSyncPSOKernel}, sampler::T,
args...; kwargs...) where {T <: QuasiMonteCarlo.SamplingAlgorithm}
backend = opt.backend

qmc_samples = QuasiMonteCarlo.sample(opt.num_particles, prob.lb, prob.ub, sampler)

qmc_samples = adapt(backend, qmc_samples)

kernel! = gpu_init_particles!(backend)

kernel!(
particles, qmc_samples, prob, opt, typeof(prob.u0), T; ndrange = opt.num_particles)

best_particle = minimum(particles)
init_gbest = SPSOGBest(best_particle.best_position, best_particle.best_cost)

return particles, init_gbest
end

function __init!(particles, prob::OptimizationProblem,
opt::Union{ParallelPSOKernel, ParallelSyncPSOKernel}, sampler::T,
args...; kwargs...) where {T <: GPUSamplingAlgorithm}
backend = opt.backend

kernel! = gpu_init_particles!(backend)

kernel!(particles, prob, opt, typeof(prob.u0), T; ndrange = opt.num_particles)

best_particle = minimum(particles)

init_gbest = SPSOGBest(best_particle.best_position, best_particle.best_cost)

particles, init_gbest
end

function SciMLBase.init(
prob::OptimizationProblem, opt::ParallelPSOKernel, args...; sampler = GPUUniformSampler(), kwargs...)
@assert prob.u0 isa SArray

## Bounds check
lb, ub = check_init_bounds(prob)
lb, ub = check_init_bounds(prob)
prob = remake(prob; lb = lb, ub = ub)

particles = KernelAbstractions.allocate(
opt.backend, SPSOParticle{typeof(prob.u0), eltype(typeof(prob.u0))}, opt.num_particles)

_sampler = if lb === nothing || ub === nothing || (all(isinf, lb) && all(isinf, ub))
GPUUnboundedSampler()
else
sampler
end

particles, _init_gbest = __init!(particles, prob, opt, _sampler, args...; kwargs...)

init_gbest = KernelAbstractions.allocate(opt.backend, typeof(_init_gbest), (1,))
copyto!(init_gbest, [_init_gbest])

return PSOCache{
typeof(prob), typeof(opt), typeof(particles), typeof(init_gbest), typeof(_sampler)}(
prob, opt, particles, init_gbest, _sampler)
end

function SciMLBase.init(
prob::OptimizationProblem, opt::ParallelSyncPSOKernel, args...; sampler = GPUUniformSampler(), kwargs...)
@assert prob.u0 isa SArray

## Bounds check
lb, ub = check_init_bounds(prob)
lb, ub = check_init_bounds(prob)
prob = remake(prob; lb = lb, ub = ub)

particles = KernelAbstractions.allocate(
opt.backend, SPSOParticle{typeof(prob.u0), eltype(typeof(prob.u0))}, opt.num_particles)

_sampler = if lb === nothing || ub === nothing || (all(isinf, lb) && all(isinf, ub))
GPUUnboundedSampler()
else
sampler
end

particles, init_gbest = __init!(particles, prob, opt, _sampler, args...; kwargs...)

return PSOCache{
typeof(prob), typeof(opt), typeof(particles), typeof(init_gbest), typeof(_sampler)}(
prob, opt, particles, init_gbest, _sampler)
end

function SciMLBase.reinit!(cache::Union{PSOCache, HybridPSOCache})
reinit_cache!(cache, cache.alg)
end

function reinit_cache!(cache::PSOCache, opt::ParallelPSOKernel)
prob = cache.prob
particles = cache.particles

particles, _init_gbest = __init!(particles, prob, opt, cache.sampler)

copyto!(cache.gbest, [_init_gbest])

return nothing
end

function reinit_cache!(cache::PSOCache, opt::ParallelSyncPSOKernel)
prob = cache.prob
particles = cache.particles

particles, init_gbest = __init!(particles, prob, opt, cache.sampler)

cache.gbest = init_gbest

return nothing
end

function SciMLBase.init(
prob::OptimizationProblem, opt::HybridPSO{Backend, LocalOpt}, args...;
kwargs...) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
psoalg = opt.pso
backend = opt.backend

pso_cache = init(prob, psoalg, args...; kwargs...)

start_points = KernelAbstractions.allocate(
backend, typeof(prob.u0), opt.pso.num_particles)

return HybridPSOCache{
typeof(pso_cache), typeof(start_points), typeof(opt)}(pso_cache, start_points, opt)
end

function reinit_cache!(cache::HybridPSOCache,
opt::HybridPSO{Backend, LocalOpt}) where {Backend, LocalOpt <: Union{LBFGS, BFGS}}
reinit!(cache.pso_cache)
fill!(cache.start_points, zero(eltype(cache.start_points)))
return nothing
end

function Base.getproperty(cache::HybridPSOCache, name::Symbol)
if name ∈ (:start_points, :pso_cache, :alg)
return getfield(cache, name)
else
return getproperty(cache.pso_cache, name)
end
end

function Base.setproperty!(cache::HybridPSOCache, name::Symbol, val)
if name ∈ (:start_points, :pso_cache, :alg)
return setfield!(cache, name, val)
else
return setproperty!(cache.pso_cache, name, val)
end
end
99 changes: 49 additions & 50 deletions src/solve.jl
Original file line number Diff line number Diff line change
Expand Up @@ -2,6 +2,55 @@ function get_pos(particle)
return particle.position
end

function SciMLBase.solve!(
cache::Union{PSOCache, HybridPSOCache}, args...; maxiters = 100, kwargs...)
solve!(cache, cache.alg, args...; maxiters, kwargs...)
end

function SciMLBase.solve!(
cache::PSOCache, opt::ParallelPSOKernel, args...; maxiters = 100, kwargs...)
prob = cache.prob
t0 = time()
gbest, particles = vectorized_solve!(cache.prob,
cache.gbest,
cache.particles,
opt,
Val(opt.global_update),
args...;
maxiters, kwargs...)
t1 = time()

particles_positions = get_pos.(particles)
SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, prob.f(gbest.position, prob.p), original = particles_positions,
stats = Optimization.OptimizationStats(; time = t1 - t0))
end

function SciMLBase.solve!(
cache::PSOCache, opt::ParallelSyncPSOKernel, args...; maxiters = 100, kwargs...)
prob = cache.prob
t0 = time()
gbest, particles = vectorized_solve!(prob,
cache.gbest,
cache.particles,
opt,
args...;
maxiters,
kwargs...)
t1 = time()

particles_positions = get_pos.(particles)
SciMLBase.build_solution(SciMLBase.DefaultOptimizationCache(prob.f, prob.p), opt,
gbest.position, prob.f(gbest.position, prob.p), original = particles_positions,
stats = Optimization.OptimizationStats(; time = t1 - t0))
end

function SciMLBase.solve(prob::OptimizationProblem,
opt::Union{ParallelPSOKernel, ParallelSyncPSOKernel, HybridPSO},
args...; maxiters = 100, kwargs...)
solve!(init(prob, opt, args...; kwargs...), opt, args...; maxiters, kwargs...)
end

function SciMLBase.__solve(prob::OptimizationProblem,
opt::PSOAlgorithm,
args...;
Expand All @@ -18,34 +67,6 @@ function SciMLBase.__solve(prob::OptimizationProblem,
stats = Optimization.OptimizationStats(; time = solve_time))
end

function pso_solve(prob::OptimizationProblem,
opt::ParallelPSOKernel,
args...;
kwargs...)
backend = opt.backend
@assert prob.u0 isa SArray
init_gbest, particles = init_particles(prob, opt, typeof(prob.u0))
# TODO: Do the equivalent of cu()/roc()
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend,
particles_eltype,
size(particles))
copyto!(gpu_particles, particles)
gpu_init_gbest = KernelAbstractions.allocate(backend, typeof(init_gbest), (1,))
copyto!(gpu_init_gbest, [init_gbest])

t0 = time()
gbest, particles = vectorized_solve!(prob,
gpu_init_gbest,
gpu_particles,
opt,
Val(opt.global_update),
args...;
kwargs...)
t1 = time()
gbest, particles, t1 - t0
end

function pso_solve(prob::OptimizationProblem,
opt::ParallelPSOArray,
args...;
Expand All @@ -69,25 +90,3 @@ function pso_solve(prob::OptimizationProblem, opt::SerialPSO, args...; kwargs...
t1 = time()
gbest, particles, t1 - t0
end

function pso_solve(prob::OptimizationProblem,
opt::ParallelSyncPSOKernel,
args...;
kwargs...)
backend = opt.backend
init_gbest, particles = init_particles(prob, opt, typeof(prob.u0))
particles_eltype = eltype(particles) === Float64 ? Float32 : eltype(particles)
gpu_particles = KernelAbstractions.allocate(backend, particles_eltype, size(particles))
copyto!(gpu_particles, particles)
init_gbest = init_gbest

t0 = time()
gbest, particles = vectorized_solve!(prob,
init_gbest,
gpu_particles,
opt,
args...;
kwargs...)
t1 = time()
gbest, particles, t1 - t0
end
Loading
Loading