
Commit a982d70

GPU Code Migration Part 2.1 (#1029)
* Rearrange approximator setup
* Drop Cuda references from RLCore
* Fix test error
* Move device out of RLCore
* Remove CUDA from RLCore dependencies
* Drop excess test
* Fix RLEnv version
* Dropping device is breaking change, bump version
* Version bump RLCore in other pkg compat entries
* Syntax error
* Fix RLEnv dependencies
* Mostly fixed...
* Fix deps
* Drop RLCore import
* Fix dependency issue
* Drop excess entries
* Install Python in RLEnv tests
* Fix refs
* Fix conda caching issue
* Drop python install
* Port DQN to new gpu syntax
* Fix type piracy
* Add back explorer / learner plan method
* Fix duplicate method, add missing method
* Port dqn
* Revert dqn fix
* Tweak dqn optimise call
* Drop cache for RLEnv due to cache issues
* dqn works
* iqn fixes
* iqn passes
* NFQ works
* NFQ works
* prio dqn works
* qr_dqn works
* rem_dqn works
* rainbow passes
* drop policy gradients temporarily
* deactivate further experiments for now
* drop device
* temporarily drop cql_sac
* Fix runtests to use testsets
* Fix rainbow
* Move CUDA to extras
* bump RLZoo version
* Update Project.toml
* Update Project.toml
* cuda missing from tests

---------

Co-authored-by: Jeremiah Lewis <--get>
1 parent 7ecfb2e commit a982d70

File tree: 35 files changed, +295 -289 lines


.github/workflows/ci.yml
+1 -1

@@ -163,7 +163,7 @@ jobs:
         with:
           version: ${{ matrix.version }}
           arch: ${{ matrix.arch }}
-      - uses: julia-actions/cache@v1
+      # - uses: julia-actions/cache@v1
       - name: Get changed files
         id: RLEnvironments-changed
         uses: tj-actions/changed-files@v42

Project.toml
+3 -5

@@ -8,17 +8,15 @@ Pkg = "44cfe95a-1eb2-52ea-b672-e2afdf69b78f"
 Reexport = "189a3867-3050-52da-a836-e630ba90ab69"
 ReinforcementLearningBase = "e575027e-6cd6-5018-9292-cdc6200d2b44"
 ReinforcementLearningCore = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
-ReinforcementLearningDatasets = "dd1544ca-2576-438c-a599-ae96278fd687"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
 ReinforcementLearningZoo = "d607f57d-ee1e-4ba7-bcf2-7734c1e31854"
 
 [compat]
 Reexport = "0.2, 1"
-julia = "1.6"
-ReinforcementLearningBase = "0.10"
-ReinforcementLearningCore = "0.13"
+ReinforcementLearningBase = "0.12"
+ReinforcementLearningCore = "0.14"
 ReinforcementLearningEnvironments = "0.8"
-ReinforcementLearningZoo = "0.6"
+julia = "1.6"
 
 [extras]
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"

src/DistributedReinforcementLearning/Project.toml
+2 -2

@@ -13,8 +13,8 @@ StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 
 [compat]
 julia = "1"
-Flux = "0.11"
+Flux = "0.14"
 ReinforcementLearningBase = "0.8.5"
-ReinforcementLearningCore = "0.5.1"
+ReinforcementLearningCore = "0.14"
 ReinforcementLearningEnvironments = "0.3.3"
 StatsBase = "0.33, 0.34"

src/ReinforcementLearningCore/Project.toml
+5 -7

@@ -1,11 +1,10 @@
 name = "ReinforcementLearningCore"
 uuid = "de1b191a-4ae0-4afa-a27b-92d07f46b2d6"
-version = "0.13.1"
+version = "0.14.0"
 
 [deps]
 AbstractTrees = "1520ce14-60c1-5f80-bbc7-55ef81b5835c"
 Adapt = "79e6a3ab-5dfb-504d-930d-738a2a938a0e"
-CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 ChainRulesCore = "d360d2e6-b24c-11e9-a2a3-2a2ae2dbcce4"
 CircularArrayBuffers = "9de3a189-e0c0-4e15-ba3b-b14b9fb0aec1"
 Crayons = "a8cc5b0e-0ffa-5ad4-8c14-923d3ee1735f"

@@ -25,18 +24,16 @@ Statistics = "10745b16-79ce-11e8-11f9-7d13ad32a3b2"
 StatsBase = "2913bbd2-ae8a-5f71-8c99-4fb6c76f3a91"
 TimerOutputs = "a759f4b9-e2f1-59dc-863e-4aeb61b1ea8f"
 UnicodePlots = "b8865327-cd53-5732-bb35-84acbb429228"
-cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [compat]
 AbstractTrees = "0.3, 0.4"
 Adapt = "3, 4"
-CUDA = "4, 5"
 ChainRulesCore = "1"
 CircularArrayBuffers = "0.1.12"
 Crayons = "4"
 Distributions = "0.25"
 FillArrays = "0.8, 0.9, 0.10, 0.11, 0.12, 0.13, 1"
-Flux = "0.13, 0.14"
+Flux = "0.14"
 Functors = "0.1, 0.2, 0.3, 0.4"
 GPUArrays = "8, 9, 10"
 Metal = "1.0"

@@ -49,17 +46,18 @@ Statistics = "1"
 StatsBase = "0.32, 0.33, 0.34"
 TimerOutputs = "0.5"
 UnicodePlots = "1.3, 2, 3"
-cuDNN = "1"
 julia = "1.9"
 
 [extras]
 CommonRLInterface = "d842c3ba-07a1-494f-bbec-f5741b0a3e98"
+CUDA = "052768ef-5323-5732-b1bb-66c8b64840ba"
 DomainSets = "5b8099bc-c8ec-5219-889f-1d9e522a28bf"
 Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
+cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"
 
 [targets]
-test = ["CommonRLInterface", "DomainSets", "Metal", "Preferences", "Test", "UUIDs"]
+test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "Test", "UUIDs"]

src/ReinforcementLearningCore/src/policies/explorers/epsilon_greedy_explorer.jl
+4 -4

@@ -126,7 +126,7 @@ RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, x::Vector{I}, mask::Trues) w
 function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, values::Vector{I}, mask::M) where {I<:Real, M<:Union{BitVector, Vector{Bool}}}
     ϵ = get_ϵ(s)
     s.step += 1
-    rand(s.rng) >= ϵ ? findmax(values, mask)[2] : rand(s.rng, findall(mask))
+    rand(s.rng) >= ϵ ? findmax_masked(values, mask)[2] : rand(s.rng, findall(mask))
 end
 
 #####

@@ -188,7 +188,7 @@ function RLBase.prob(s::EpsilonGreedyExplorer{<:Any,false}, values, mask)
     ϵ, n = get_ϵ(s), length(values)
     probs = zeros(n)
     probs[mask] .= ϵ / sum(mask)
-    probs[findmax(values, mask)[2]] += 1 - ϵ
+    probs[findmax_masked(values, mask)[2]] += 1 - ϵ
     Categorical(probs; check_args=false)
 end
 

@@ -201,7 +201,7 @@ struct GreedyExplorer <: AbstractExplorer end
 RLBase.plan!(s::GreedyExplorer, x, mask::Trues) = s(x)
 
 RLBase.plan!(s::GreedyExplorer, values) = findmax(values)[2]
-RLBase.plan!(s::GreedyExplorer, values, mask) = findmax(values, mask)[2]
+RLBase.plan!(s::GreedyExplorer, values, mask) = findmax_masked(values, mask)[2]
 
 RLBase.prob(s::GreedyExplorer, values) =
     Categorical(onehot(findmax(values)[2], 1:length(values)); check_args=false)

@@ -210,4 +210,4 @@ RLBase.prob(s::GreedyExplorer, values, action::Integer) =
     findmax(values)[2] == action ? 1.0 : 0.0
 
 RLBase.prob(s::GreedyExplorer, values, mask) =
-    Categorical(onehot(findmax(values, mask)[2], length(values)); check_args=false)
+    Categorical(onehot(findmax_masked(values, mask)[2], length(values)); check_args=false)
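For context, a minimal sketch of how the masked explorer methods behave after this rename (the values and mask below are illustrative, not from the repository):

using ReinforcementLearningBase, ReinforcementLearningCore

values = [1.0, 5.0, 3.0]      # action values estimated by a learner
mask   = [true, false, true]  # action 2 is illegal in the current state

# GreedyExplorer now routes through findmax_masked, so the masked-out 5.0 cannot be selected
RLBase.plan!(GreedyExplorer(), values, mask)  # returns 3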

src/ReinforcementLearningCore/src/policies/learners.jl

-26
This file was deleted.
src/ReinforcementLearningCore/src/policies/learners/abstract_learner.jl (new file)
+51

@@ -0,0 +1,51 @@
+export AbstractLearner, Approximator
+
+using Flux
+using Functors: @functor
+
+abstract type AbstractLearner end
+
+Base.show(io::IO, m::MIME"text/plain", L::AbstractLearner) = show(io, m, convert(AnnotatedStructTree, L))
+
+# Take Learner and Environment, get state, send to RLCore.forward(Learner, State)
+function forward(L::Le, env::E) where {Le <: AbstractLearner, E <: AbstractEnv}
+    env |> state |> Flux.gpu |> (x -> forward(L, x)) |> Flux.cpu
+end
+
+function RLBase.optimise!(::AbstractLearner, ::AbstractStage, ::Trajectory) end
+
+
+"""
+    Approximator(model, optimiser)
+
+Wraps a Flux trainable model and implements the `RLBase.optimise!(::Approximator, ::Gradient)`
+interface. See the RLCore documentation for more information on proper usage.
+"""
+struct Approximator{M,O} <: AbstractLearner
+    model::M
+    optimiser_state::O
+end
+
+function Approximator(; model, optimiser)
+    optimiser_state = Flux.setup(optimiser, model)
+    Approximator(gpu(model), gpu(optimiser_state)) # Pass model to GPU (if available) upon creation
+end
+
+Base.show(io::IO, m::MIME"text/plain", A::Approximator) = show(io, m, convert(AnnotatedStructTree, A))
+
+@functor Approximator (model,)
+
+function RLBase.plan!(explorer::AbstractExplorer, learner::AbstractLearner, env::AbstractEnv)
+    legal_action_space_ = RLBase.legal_action_space_mask(env)
+    RLBase.plan!(explorer, forward(learner, env), legal_action_space_)
+end
+
+function RLBase.plan!(explorer::AbstractExplorer, learner::AbstractLearner, env::AbstractEnv, player::Symbol)
+    legal_action_space_ = RLBase.legal_action_space_mask(env, player)
+    return RLBase.plan!(explorer, forward(learner, env), legal_action_space_)
+end
+
+forward(A::Approximator, args...; kwargs...) = A.model(args...; kwargs...)
+
+RLBase.optimise!(A::Approximator, grad) =
+    Flux.Optimise.update!(A.optimiser_state, A.model, grad)
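For context, a rough usage sketch of the reworked `Approximator` with explicit optimiser state (the network sizes, optimiser settings, and dummy data are illustrative; `gpu` is a no-op unless a GPU back end such as CUDA.jl is loaded):

using Flux
using ReinforcementLearningBase, ReinforcementLearningCore

approx = Approximator(
    model = Chain(Dense(4 => 32, relu), Dense(32 => 2)),  # any Flux model
    optimiser = Adam(1e-3),                               # Flux.setup is called inside the constructor
)

# Toy supervised step on a CPU-only session, just to show the optimise! path
x, y = rand(Float32, 4, 16), rand(Float32, 2, 16)
grads = Flux.gradient(m -> Flux.mse(m(x), y), approx.model)
RLBase.optimise!(approx, grads[1])  # updates the model via the stored optimiser state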
src/ReinforcementLearningCore/src/policies/learners/learners.jl (new file)
+3

@@ -0,0 +1,3 @@
+include("abstract_learner.jl")
+include("tabular_approximator.jl")
+include("target_network.jl")
src/ReinforcementLearningCore/src/policies/learners/tabular_approximator.jl (new file)
+49

@@ -0,0 +1,49 @@
+export TabularApproximator, TabularVApproximator, TabularQApproximator
+
+using Flux: gpu
+
+"""
+    TabularApproximator(table<:AbstractArray, opt)
+
+For `table` of 1-d, it will serve as a state value approximator.
+For `table` of 2-d, it will serve as a state-action value approximator.
+
+!!! warning
+    For `table` of 2-d, the first dimension is action and the second dimension is state.
+"""
+# TODO: add back missing AbstractApproximator
+struct TabularApproximator{N,A,O} <: AbstractLearner
+    table::A
+    optimizer::O
+    function TabularApproximator(table::A, opt::O) where {A<:AbstractArray,O}
+        n = ndims(table)
+        n <= 2 || throw(ArgumentError("the dimension of table must be <= 2"))
+        new{n,A,O}(table, opt)
+    end
+end
+
+TabularVApproximator(; n_state, init = 0.0, opt = InvDecay(1.0)) =
+    TabularApproximator(fill(init, n_state), opt)
+
+TabularQApproximator(; n_state, n_action, init = 0.0, opt = InvDecay(1.0)) =
+    TabularApproximator(fill(init, n_action, n_state), opt)
+
+# Take Learner and Environment, get state, send to RLCore.forward(Learner, State)
+function forward(L::TabularApproximator, env::E) where {E <: AbstractEnv}
+    env |> state |> (x -> forward(L, x))
+end
+
+RLCore.forward(
+    app::TabularApproximator{1,R,O},
+    s::I,
+) where {R<:AbstractArray,O,I<:Integer} = @views app.table[s]
+
+RLCore.forward(
+    app::TabularApproximator{2,R,O},
+    s::I,
+) where {R<:AbstractArray,O,I<:Integer} = @views app.table[:, s]
+RLCore.forward(
+    app::TabularApproximator{2,R,O},
+    s::I1,
+    a::I2,
+) where {R<:AbstractArray,O,I1<:Integer,I2<:Integer} = @views app.table[a, s]
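For context, a small sketch of the tabular constructors and the unexported `forward` methods defined above (sizes and the explicit optimiser are illustrative; `RLCore` here is just an alias created for qualified access):

using Flux
using ReinforcementLearningCore
import ReinforcementLearningCore as RLCore  # qualified access to the unexported forward

q = TabularQApproximator(n_state = 5, n_action = 3, opt = Flux.InvDecay(1.0))  # 3×5 table of zeros

RLCore.forward(q, 2)     # Q-values of all 3 actions in state 2 (a view of table column 2)
RLCore.forward(q, 2, 1)  # Q-value of action 1 in state 2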

src/ReinforcementLearningCore/src/policies/approximator.jl → src/ReinforcementLearningCore/src/policies/learners/target_network.jl
+7 -23

@@ -3,25 +3,6 @@ export Approximator, TargetNetwork, target, model
 using Flux
 
 
-"""
-    Approximator(model, optimiser)
-
-Wraps a Flux trainable model and implements the `RLBase.optimise!(::Approximator, ::Gradient)`
-interface. See the RLCore documentation for more information on proper usage.
-"""
-Base.@kwdef mutable struct Approximator{M,O}
-    model::M
-    optimiser::O
-end
-
-Base.show(io::IO, m::MIME"text/plain", A::Approximator) = show(io, m, convert(AnnotatedStructTree, A))
-
-@functor Approximator (model,)
-
-forward(A::Approximator, args...; kwargs...) = A.model(args...; kwargs...)
-
-RLBase.optimise!(A::Approximator, gs) = Flux.Optimise.update!(A.optimiser, Flux.params(A), gs)
-
 target(ap::Approximator) = ap.model #see TargetNetwork
 model(ap::Approximator) = ap.model #see TargetNetwork
 

@@ -52,9 +33,11 @@ mutable struct TargetNetwork{M}
     n_optimise::Int
 end
 
-function TargetNetwork(x; sync_freq=1, ρ=0.0f0)
+function TargetNetwork(network; sync_freq = 1, ρ = 0f0)
     @assert 0 <= ρ <= 1 "ρ must in [0,1]"
-    TargetNetwork(x, deepcopy(x.model), sync_freq, ρ, 0)
+    # NOTE: model is pushed to gpu in Approximator, need to transfer to cpu before deepcopy, then push target model to gpu
+    target = gpu(deepcopy(cpu(network.model)))
+    TargetNetwork(network, target, sync_freq, ρ, 0)
 end
 
 @functor TargetNetwork (network, target)

@@ -66,9 +49,10 @@ forward(tn::TargetNetwork, args...) = forward(tn.network, args...)
 model(tn::TargetNetwork) = model(tn.network)
 target(tn::TargetNetwork) = tn.target
 
-function RLBase.optimise!(tn::TargetNetwork, gs)
+function RLBase.optimise!(tn::TargetNetwork, grad)
     A = tn.network
-    Flux.Optimise.update!(A.optimiser, Flux.params(A), gs)
+    optimise!(A, grad)
+
     tn.n_optimise += 1
 
     if tn.n_optimise % tn.sync_freq == 0
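For context, a hedged sketch of the updated `TargetNetwork` constructor (hyperparameters are illustrative; the target copy ends up on the same device as the online model because of the cpu/deepcopy/gpu round trip noted in the diff):

using Flux
using ReinforcementLearningCore

online = Approximator(
    model = Chain(Dense(4 => 32, relu), Dense(32 => 2)),
    optimiser = Adam(1e-3),
)

tn = TargetNetwork(online; sync_freq = 100)  # copy the online weights into the target every 100 optimise! calls

model(tn)   # the trainable online network
target(tn)  # the lagged copy used for bootstrapped targets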
src/ReinforcementLearningCore/src/policies/policies.jl
+1 -2

@@ -1,6 +1,5 @@
 include("agent/agent.jl")
 include("random_policy.jl")
 include("explorers/explorers.jl")
-include("learners.jl")
+include("learners/learners.jl")
 include("q_based_policy.jl")
-include("approximator.jl")

src/ReinforcementLearningCore/src/utils/basic.jl
+2 -7

@@ -113,16 +113,11 @@ function find_all_max(x, mask::AbstractVector{Bool})
     v, [k for (m, k) in zip(mask, keys(x)) if m && x[k] == v]
 end
 
-# !!! watch https://github.com/JuliaLang/julia/pull/35316#issuecomment-622629895
-# Base.findmax(f, domain) = mapfoldl(x -> (f(x), x), _rf_findmax, domain)
-# _rf_findmax((fm, m), (fx, x)) = isless(fm, fx) ? (fx, x) : (fm, m)
 
-# !!! type piracy
-Base.findmax(A::AbstractVector{T}, mask::AbstractVector{Bool}) where {T} =
+findmax_masked(A::AbstractVector{T}, mask::AbstractVector{Bool}) where {T} =
     findmax(ifelse.(mask, A, typemin(T)))
 
-Base.findmax(A::AbstractVector, mask::Trues) = findmax(A)
-
+findmax_masked(A::AbstractVector, mask::Trues) = findmax(A)
 
 const VectorOrMatrix = Union{AbstractMatrix,AbstractVector}
 