
Commit d8c159d

Jpsl/update flux (#1086)
* Update Flux and GPUArrays compatibility in Project.toml; refactor FluxApproximator and TargetNetwork implementations
* Refactor target network optimization and update test assertions for consistency
* Simplify FluxApproximator's optimise! method by using a single-line function definition
* Bump version to 0.15.4 in Project.toml
* Update NEWS.md for v0.15.4: Upgrade Flux.jl to v0.16 and resolve deprecation warnings
* Add Conda dependency and update test environment setup
* Update test environment setup to use pip for gym installation
* Fix RLEnv tests
* Fix optimizer reference in stock trading environment example
* Refactor optimizer implementation in DDPGPolicy to use OptimiserChain
1 parent 35c2092 commit d8c159d

File tree: 10 files changed (+88, -78 lines)


docs/homepage/blog/ospp_report_210370190/index.md (+2, -2)
@@ -491,11 +491,11 @@ create_critic(critic_dim) = Chain(
 create_policy(player) = DDPGPolicy(
     behavior_actor = NeuralNetworkApproximator(
         model = create_actor(player),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     behavior_critic = NeuralNetworkApproximator(
         model = create_critic(critic_dim),
-        optimizer = Flux.Optimise.Optimiser(ClipNorm(0.5), Adam(1e-2)),
+        optimizer = OptimiserChain(ClipNorm(0.5), Adam(1e-2)),
     ),
     target_actor = NeuralNetworkApproximator(
         model = create_actor(player),
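Note: the deprecated implicit-style `Flux.Optimise.Optimiser` wrapper is replaced here by `OptimiserChain` from Optimisers.jl, which Flux re-exports. A minimal sketch of the replacement pattern with the explicit `setup`/`update!` API follows; the model, loss, and data are illustrative, not from this repository:

    using Flux  # OptimiserChain, ClipNorm, Adam are re-exported from Optimisers.jl

    model = Chain(Dense(4 => 16, relu), Dense(16 => 2))   # stand-in network
    opt = OptimiserChain(ClipNorm(0.5), Adam(1e-2))       # clip gradient norm, then take an Adam step
    opt_state = Flux.setup(opt, model)

    x, y = rand(Float32, 4, 32), rand(Float32, 2, 32)     # dummy batch
    grads = Flux.gradient(m -> Flux.Losses.mse(m(x), y), model)
    Flux.update!(opt_state, model, grads[1])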

src/ReinforcementLearningCore/NEWS.md (+4, -0)
@@ -1,5 +1,9 @@
 # ReinforcementLearningCore.jl Release Notes
 
+#### v0.15.4
+
+- Update `Flux.jl` to `v0.16` and fix deprecation warnings and method errors
+
 #### v0.15.3
 
 - Make `FluxApproximator` work with `QBasedPolicy`

src/ReinforcementLearningCore/Project.toml (+3, -3)
@@ -1,6 +1,6 @@
 name = "ReinforcementLearningCore"
 uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
-version = "0.15.3"
+version = "0.15.4"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
@@ -31,8 +31,8 @@ CircularArrayBuffers = "0.1.12"
 Crayons = "4"
 Distributions = "0.25"
 FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
-Flux = "0.14"
-GPUArrays = "8, 9, 10"
+Flux = "0.14, 0.15, 0.16"
+GPUArrays = "8, 9, 10, 11"
 Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"

src/ReinforcementLearningCore/src/policies/learners/flux_approximator.jl (+2, -2)
@@ -43,5 +43,5 @@ Flux.@layer FluxApproximator trainable=(model,)
 forward(A::FluxApproximator, args...; kwargs...) = A.model(args...; kwargs...)
 forward(A::FluxApproximator, env::E, player::AbstractPlayer=current_player(env)) where {E <: AbstractEnv} = env |> (x -> state(x, player)) |> (x -> forward(A, x))
 
-RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) =
-    Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
+RLBase.optimise!(A::FluxApproximator, grad::NamedTuple) = Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
+
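For reference, a minimal usage sketch of this explicit-gradient path, assuming the keyword constructor shown in the tests below; the loss and input are illustrative:

    using Flux
    using ReinforcementLearningCore

    approx = FluxApproximator(model = Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser = Adam())
    x = rand(Float32, 10)

    # Differentiate w.r.t. the approximator itself; the gradient is a NamedTuple
    # whose `model` field mirrors the wrapped Chain.
    grad = Flux.gradient(a -> sum(a.model(x)), approx)
    RLCore.optimise!(approx, grad[1])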

src/ReinforcementLearningCore/src/policies/learners/target_network.jl (+6, -3)
@@ -74,9 +74,12 @@ function RLBase.optimise!(tn::TargetNetwork, grad::NamedTuple)
     tn.n_optimise += 1
 
     if tn.n_optimise % tn.sync_freq == 0
-        # polyak averaging
-        for (dest, src) in zip(Flux.params(target(tn)), Flux.params(tn.network))
-            dest .= tn.ρ .* dest .+ (1 - tn.ρ) .* src
+        # Polyak averaging
+        src_layers = RLCore.model(tn)
+        dest_layers = RLCore.target(tn)
+        for i in 1:length(src_layers)
+            dest_layers[i].weight .= tn.ρ .* dest_layers[i].weight .+ (1 - tn.ρ) .* src_layers[i].weight
+            dest_layers[i].bias .= tn.ρ .* dest_layers[i].bias .+ (1 - tn.ρ) .* src_layers[i].bias
         end
         tn.n_optimise = 0
     end
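The loop applies the Polyak update θ_target ← ρ·θ_target + (1 − ρ)·θ_online to each layer's weight and bias arrays. A standalone sketch of the same rule, assuming both networks are plain Chains of Dense layers (names are illustrative):

    using Flux

    online = Chain(Dense(10, 5, relu), Dense(5, 2))
    target_net = deepcopy(online)
    ρ = 0.99f0

    for (dst, src) in zip(target_net.layers, online.layers)
        dst.weight .= ρ .* dst.weight .+ (1 - ρ) .* src.weight   # soft update of weights
        dst.bias   .= ρ .* dst.bias   .+ (1 - ρ) .* src.bias     # soft update of biases
    end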

src/ReinforcementLearningCore/test/policies/learners/target_network.jl (+14, -14)
@@ -10,9 +10,9 @@ using ReinforcementLearningCore
             @test_throws "AssertionError: `FluxApproximator` model is not on GPU." TargetNetwork(FluxApproximator(model, optimiser), use_gpu=true)
         end
         @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
-        @test TargetNetwork(FluxApproximator(model, optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
+        @test TargetNetwork(FluxApproximator(model=model, optimiser=optimiser, use_gpu=true), use_gpu=true) isa TargetNetwork
 
-        approx = FluxApproximator(model, optimiser, use_gpu=false)
+        approx = FluxApproximator(model=model, optimiser=optimiser, use_gpu=false)
         target_network = TargetNetwork(approx, use_gpu=false)
 
 
@@ -38,7 +38,7 @@ using ReinforcementLearningCore
     @testset "Optimise" begin
         optimiser = Adam()
         model = Chain(Dense(10, 5, relu), Dense(5, 2))
-        approximator = FluxApproximator(model, optimiser)
+        approximator = FluxApproximator(model=model, optimiser=optimiser)
         target_network = TargetNetwork(approximator)
         input = rand(Float32, 10)
         grad = Flux.Zygote.gradient(target_network) do model
@@ -54,7 +54,7 @@ using ReinforcementLearningCore
 
     @testset "Sync" begin
         optimiser = Adam()
-        model = FluxApproximator(Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser)
+        model = FluxApproximator(model=Chain(Dense(10, 5, relu), Dense(5, 2)), optimiser=optimiser)
         target_network = TargetNetwork(model, sync_freq=2, ρ=0.5)
 
         input = rand(Float32, 10)
@@ -75,9 +75,9 @@ end
     m = Chain(Dense(4,1))
     app = FluxApproximator(model = m, optimiser = Flux.Adam(), use_gpu=true)
     tn = TargetNetwork(app, sync_freq = 3, use_gpu=true)
-    @test typeof(model(tn)) == typeof(target(tn))
-    p1 = Flux.destructure(model(tn))[1]
-    pt1 = Flux.destructure(target(tn))[1]
+    @test typeof(RLCore.model(tn)) == typeof(RLCore.target(tn))
+    p1 = Flux.destructure(RLCore.model(tn))[1]
+    pt1 = Flux.destructure(RLCore.target(tn))[1]
     @test p1 == pt1
     input = gpu(ones(Float32, 4))
     grad = Flux.Zygote.gradient(tn) do model
@@ -87,16 +87,16 @@ end
     grad_model = grad[1]
 
     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
-    @test p1 == Flux.destructure(target(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
+    @test p1 == Flux.destructure(RLCore.target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p1 != Flux.destructure(model(tn))[1]
+    @test p1 != Flux.destructure(RLCore.model(tn))[1]
     @test p1 == Flux.destructure(target(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test Flux.destructure(target(tn))[1] == Flux.destructure(model(tn))[1]
+    @test Flux.destructure(RLCore.target(tn))[1] == Flux.destructure(RLCore.model(tn))[1]
     @test p1 != Flux.destructure(target(tn))[1]
-    p2 = Flux.destructure(model(tn))[1]
+    p2 = Flux.destructure(RLCore.model(tn))[1]
     RLCore.optimise!(tn, grad_model)
-    @test p2 != Flux.destructure(model(tn))[1]
-    @test p2 == Flux.destructure(target(tn))[1]
+    @test p2 != Flux.destructure(RLCore.model(tn))[1]
+    @test p2 == Flux.destructure(RLCore.target(tn))[1]
 end
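The parameter comparisons above rely on `Flux.destructure`, which flattens every trainable array of a model into one vector and returns a function that rebuilds the model. A small sketch of that behaviour, with an illustrative model:

    using Flux

    m = Chain(Dense(2, 3, relu), Dense(3, 1))
    θ, re = Flux.destructure(m)   # θ: flat parameter vector, re: rebuilds a model from such a vector
    m2 = re(θ)                    # same architecture and parameters as m
    @assert Flux.destructure(m2)[1] == θ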

src/ReinforcementLearningCore/test/utils/networks.jl (+48, -50)
@@ -22,13 +22,13 @@ import ReinforcementLearningBase: RLBase
         q_values = NN(rand(Float32, 2))
         @test size(q_values) == (3,)
 
-        gs = gradient(params(NN)) do
+        gs = gradient(NN) do
             sum(NN(rand(Float32, 2, 5)))
         end
 
-        old_params = deepcopy(collect(params(NN).params))
+        old_params = deepcopy(collect(Flux.trainable(NN).params))
         push!(NN, gs)
-        new_params = collect(params(NN).params)
+        new_params = collect(Flux.trainable(NN).params)
 
         @test old_params != new_params
     end
@@ -72,42 +72,40 @@ import ReinforcementLearningBase: RLBase
     end
     @testset "Correctness of gradients" begin
         @testset "One action per state" begin
-            @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
+            @test Flux.trainable(gn).pre == gn.pre
+            @test Flux.trainable(gn).μ == gn.μ
+            @test Flux.trainable(gn).σ == gn.σ
             action_saver = Matrix[]
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(state, is_sampling = true, is_return_log_prob = true)
+            g = Flux.gradient(gn) do model
+                a, logp = model(state, is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
             #Check that gradients are identical
-            for (grad1, grad2) in zip(g,g2)
-                @test grad1 ≈ grad2
-            end
+            @test g == g2
         end
         @testset "Multiple actions per state" begin
             #Same with multiple actions sampled
             action_saver = []
             state = unsqueeze(state, dims = 2)
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(state, 3)
+            g1 = Flux.gradient(gn) do model
+                a, logp = model(state, 3)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 sum(logp)
             end
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(state, only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(state, only(action_saver))
                 sum(logp)
             end
-            for (grad1, grad2) in zip(g,g2)
-                @test grad1 ≈ grad2
-            end
+            @test g1 == g2
         end
     end
 end
@@ -117,7 +115,6 @@ import ReinforcementLearningBase: RLBase
         gn = GaussianNetwork(Dense(20,15), Dense(15,10), Dense(15,10, softplus)) |> gpu
         state = rand(Float32, 20,3) |> gpu #batch of 3 states
         @testset "Forward pass compatibility" begin
-            @test Flux.params(gn) == Flux.Params([gn.pre.weight, gn.pre.bias, gn.μ.weight, gn.μ.bias, gn.σ.weight, gn.σ.bias])
             m, L = gn(state)
             @test size(m) == size(L) == (10,3)
             a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
@@ -134,15 +131,15 @@ import ReinforcementLearningBase: RLBase
         @testset "Backward pass compatibility" begin
             @testset "One action sampling" begin
                 action_saver = CuMatrix[]
-                g = Flux.gradient(Flux.params(gn)) do
-                    a, logp = gn(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
+                g = Flux.gradient(gn) do model
+                    a, logp = model(CUDA.CURAND.RNG(), state, is_sampling = true, is_return_log_prob = true)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
                 #Check that gradients are identical
@@ -153,15 +150,15 @@ import ReinforcementLearningBase: RLBase
             @testset "Multiple actions sampling" begin
                 action_saver = []
                 state = unsqueeze(state, dims = 2)
-                g = Flux.gradient(Flux.params(gn)) do
+                g = Flux.gradient(gn) do
                     a, logp = gn(CUDA.CURAND.RNG(), state, 3)
                     ChainRulesCore.ignore_derivatives() do
                         push!(action_saver, a)
                     end
                     sum(logp)
                 end
-                g2 = Flux.gradient(Flux.params(gn)) do
-                    logp = gn(state, only(action_saver))
+                g2 = Flux.gradient(gn) do model
+                    logp = model(state, only(action_saver))
                     sum(logp)
                 end
                 for (grad1, grad2) in zip(g,g2)
@@ -202,7 +199,10 @@ import ReinforcementLearningBase: RLBase
         μ = Dense(15,10)
         Σ = Dense(15,10*11÷2)
         gn = CovGaussianNetwork(pre, μ, Σ)
-        @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
+        @test Flux.trainable(gn).pre == pre
+        @test Flux.trainable(gn).μ == μ
+        @test Flux.trainable(gn).Σ == Σ
+
         state = rand(Float32, 20,3) #batch of 3 states
         #Check that it works in 2D
         m, L = gn(state)
@@ -233,35 +233,34 @@ import ReinforcementLearningBase: RLBase
         logp_truth = [logpdf(mvn, a) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
         @test stack(logp_truth; dims=2) ≈ dropdims(logps,dims = 1) #test against ground truth
         action_saver = []
-        g = Flux.gradient(Flux.params(gn)) do
-            a, logp = gn(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+        g1 = Flux.gradient(gn) do model
+            a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
             ChainRulesCore.ignore_derivatives() do
                 push!(action_saver, a)
             end
             mean(logp)
         end
-        g2 = Flux.gradient(Flux.params(gn)) do
-            logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+        g2 = Flux.gradient(gn) do model
+            logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
             mean(logp)
         end
-        for (grad1, grad2) in zip(g,g2)
-            @test grad1 ≈ grad2
-        end
+        @test g1 == g2
+
         empty!(action_saver)
-        g3 = Flux.gradient(Flux.params(gn)) do
-            a, logp = gn(Flux.unsqueeze(state,dims = 2), 3)
+
+        g3 = Flux.gradient(gn) do model
+            a, logp = model(Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
             ChainRulesCore.ignore_derivatives() do
                 push!(action_saver, a)
             end
             mean(logp)
         end
-        g4 = Flux.gradient(Flux.params(gn)) do
-            logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+        g4 = Flux.gradient(gn) do model
+            logp = model(Flux.unsqueeze(state, dims = 2), only(action_saver))
             mean(logp)
         end
-        for (grad1, grad2) in zip(g4,g3)
-            @test grad1 ≈ grad2
-        end
+
+        @test g4 == g3
     end
     @testset "CUDA" begin
         if (@isdefined CUDA) && CUDA.functional()
@@ -271,7 +270,6 @@ import ReinforcementLearningBase: RLBase
             μ = Dense(15,10) |> gpu
             Σ = Dense(15,10*11÷2) |> gpu
             gn = CovGaussianNetwork(pre, μ, Σ)
-            @test Flux.params(gn) == Flux.Params([pre.weight, pre.bias, μ.weight, μ.bias, Σ.weight, Σ.bias])
             state = rand(Float32, 20,3)|> gpu #batch of 3 states
             m, L = gn(Flux.unsqueeze(state,dims = 2))
             @test size(m) == (10,1,3)
@@ -292,31 +290,31 @@ import ReinforcementLearningBase: RLBase
             logp_truth = [logpdf(mvn, cpu(a)) for (mvn, a) in zip(mvnormals, eachslice(as, dims = 3))]
             @test reduce(hcat, collect(logp_truth)) ≈ dropdims(cpu(logps); dims=1) #test against ground truth
             action_saver = []
-            g = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
+            g = Flux.gradient(gn) do model
+                a, logp = model(rng, Flux.unsqueeze(state,dims = 2), is_sampling = true, is_return_log_prob = true)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 mean(logp)
             end
 
-            g2 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+            g2 = Flux.gradient(gn) do model
+                logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
                 mean(logp)
             end
             for (grad1, grad2) in zip(g,g2)
                 @test grad1 ≈ grad2
             end
             empty!(action_saver)
-            g3 = Flux.gradient(Flux.params(gn)) do
-                a, logp = gn(rng, Flux.unsqueeze(state,dims = 2), 3)
+            g3 = Flux.gradient(gn) do model
+                a, logp = model(rng, Flux.unsqueeze(state,dims = 2), 3)
                 ChainRulesCore.ignore_derivatives() do
                     push!(action_saver, a)
                 end
                 mean(logp)
             end
-            g4 = Flux.gradient(Flux.params(gn)) do
-                logp = gn(Flux.unsqueeze(state,dims = 2), only(action_saver))
+            g4 = Flux.gradient(gn) do model
+                logp = model(Flux.unsqueeze(state,dims = 2), only(action_saver))
                 mean(logp)
             end
             for (grad1, grad2) in zip(g4,g3)
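These test edits all follow the same migration: the implicit `Flux.params`-based gradients of Flux 0.14 become explicit gradients with respect to the model itself, returning a NamedTuple that mirrors the model's structure. A minimal sketch of the two styles side by side, with an illustrative model and loss:

    using Flux

    model = Chain(Dense(2, 3, relu), Dense(3, 1))
    x = rand(Float32, 2, 8)

    # Old implicit style (Flux <= 0.14, later removed):
    #   gs = gradient(() -> sum(model(x)), Flux.params(model))
    # New explicit style:
    gs = Flux.gradient(m -> sum(m(x)), model)
    gs[1].layers[1].weight   # the gradient NamedTuple mirrors the Chain's structure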

src/ReinforcementLearningEnvironments/test/environments/examples/stock_trading_env.jl (+1, -1)
@@ -26,7 +26,7 @@ end
                 Dense(ns, 64, relu),
                 Dense(64, na, relu),
             ),
-            Flux.Optimise.Optimiser(ClipNorm(0.5), ADAM(1e-5)),
+            OptimiserChain(ClipNorm(0.5), Adam(1e-5)),
         ),
         explorer = EpsilonGreedyExplorer(ϵ_stable=0.01),
     ),

src/ReinforcementLearningEnvironments/test/runtests.jl (+7, -2)
@@ -14,8 +14,13 @@ using TimerOutputs
 using Conda
 using JLD2
 
-Conda.add("gym")
-Conda.add("numpy")
+ENV["CONDA_JL_USE_MINIFORGE"] = "1"
+
+Conda.add("python", Conda.ROOTENV)
+Conda.add("numpy", Conda.ROOTENV)
+Conda.pip_interop(true, Conda.ROOTENV)
+Conda.pip("install", "gym", Conda.ROOTENV)
+
 
 @testset "ReinforcementLearningEnvironments" begin
     include("environments/environments.jl")
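With pip interop enabled on the Conda root environment, the pip-installed gym package becomes importable from the Python that Conda.jl manages. A quick sanity check, assuming PyCall is configured to use that same Conda Python (this snippet is illustrative and not part of the test script):

    using PyCall

    gym = pyimport("gym")            # resolves from Conda.ROOTENV when PyCall uses Conda.jl's Python
    env = gym.make("CartPole-v1")    # any installed gym environment id works here
    env.reset()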

src/ReinforcementLearningFarm/Project.toml (+1, -1)
@@ -13,7 +13,7 @@ ReinforcementLearning = "158674fc-8238-5cab-b5ba-03dfc80d1318"
 
 [compat]
 FillArrays = "1"
-Flux = "0.14"
+Flux = "0.14, 0.15, 0.16"
 CircularArrayBuffers = "0.1.12"
 Distributions = "0.25"
 ReinforcementLearning = "0.11"
