Commit bf37d4d

Add SARS tdlearning back to lib (#1050)
1 parent 9e06129 commit bf37d4d

29 files changed: +402 −134 lines changed (only a subset of the changed files is shown below)

.buildkite/pipeline.yml (+1)

@@ -17,6 +17,7 @@ steps:
 Pkg.develop(path="src/ReinforcementLearningBase")
 Pkg.develop(path="src/ReinforcementLearningEnvironments")
 Pkg.develop(path="src/ReinforcementLearningCore")
+Pkg.develop(path="src/ReinforcementLearningFarm")

 println("+++ :julia: Running tests")
 Pkg.test("ReinforcementLearningCore", coverage=true)

.github/workflows/ci.yml (+1)

@@ -95,6 +95,7 @@ jobs:
 Pkg.develop(path="src/ReinforcementLearningBase")
 Pkg.develop(path="src/ReinforcementLearningCore")
 Pkg.develop(path="src/ReinforcementLearningEnvironments")
+Pkg.develop(path="src/ReinforcementLearningFarm")
 Pkg.test("ReinforcementLearningCore", coverage=true)'
 - uses: julia-actions/julia-processcoverage@v1
   with:

src/ReinforcementLearningCore/Project.toml (+3 −1)

@@ -37,6 +37,7 @@ Metal = "1.0"
 ProgressMeter = "1"
 Reexport = "1"
 ReinforcementLearningBase = "0.12"
+ReinforcementLearningFarm = "0.0.1"
 ReinforcementLearningTrajectories = "0.3.7"
 Statistics = "1"
 StatsBase = "0.32, 0.33, 0.34"
@@ -52,9 +53,10 @@ Metal = "dde4c033-4e86-420c-a63e-0dd931031962"
 Preferences = "21216c6a-2e73-6563-6e65-726566657250"
 Random = "9a3f8284-a2c9-5f02-9a11-845980a1fd5c"
 ReinforcementLearningEnvironments = "25e41dd2-4622-11e9-1641-f1adca772921"
+ReinforcementLearningFarm = "14eff660-7080-4cec-bba2-cfb12cd77ac3"
 Test = "8dfed614-e22c-5e08-85e1-65c5234f0b40"
 UUIDs = "cf7118a7-6976-5b1a-9a39-7adc72f591a4"
 cuDNN = "02a925ec-e4fe-4b08-9a7e-0d78e3d38ccd"

 [targets]
-test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "Test", "UUIDs"]
+test = ["CommonRLInterface", "CUDA", "cuDNN", "DomainSets", "Metal", "Preferences", "ReinforcementLearningEnvironments", "ReinforcementLearningFarm", "Test", "UUIDs"]

src/ReinforcementLearningCore/src/policies/agent/agent_base.jl (+1 −1)

@@ -37,7 +37,7 @@ RLBase.optimise!(::SyncTrajectoryStyle, agent::AbstractAgent, stage::S) where {S
 # already spawn a task to optimise inner policy when initializing the agent
 RLBase.optimise!(::AsyncTrajectoryStyle, agent::AbstractAgent, stage::S) where {S<:AbstractStage} = nothing

-#by default, optimise does nothing at all stage
+#by default, optimise does nothing at all stages
 function RLBase.optimise!(policy::AbstractPolicy, stage::AbstractStage, trajectory::Trajectory) end

 Flux.@layer Agent trainable=(policy,)

src/ReinforcementLearningCore/src/policies/explorers/epsilon_greedy_explorer.jl (+8 −7)

@@ -99,31 +99,32 @@ get_ϵ(s::EpsilonGreedyExplorer) = get_ϵ(s, s.step)
 `NaN` will be filtered unless all the values are `NaN`.
 In that case, a random one will be returned.
 """
-function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, values::Vector{I}) where {I<:Real}
+function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, values::A) where {I<:Real, A<:AbstractArray{I}}
     ϵ = get_ϵ(s)
     s.step += 1
     rand(s.rng) >= ϵ ? rand(s.rng, find_all_max(values)[2]) : rand(s.rng, 1:length(values))
 end

-function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, values::Vector{I}) where {I<:Real}
+function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, values::A) where {I<:Real, A<:AbstractArray{I}}
     ϵ = get_ϵ(s)
     s.step += 1
     rand(s.rng) >= ϵ ? findmax(values)[2] : rand(s.rng, 1:length(values))
 end

 #####

-RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, x, mask::Trues) = RLBase.plan!(s, x)
+RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, x::A, mask::Trues) where {I<:Real, A<:AbstractArray{I}} = RLBase.plan!(s, x)

-function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, values::Vector{I}, mask::M) where {I<:Real, M<:Union{BitVector, Vector{Bool}}}
+function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,true}, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}}
     ϵ = get_ϵ(s)
     s.step += 1
     rand(s.rng) >= ϵ ? rand(s.rng, find_all_max(values, mask)[2]) :
     rand(s.rng, findall(mask))
 end

-RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, x::Vector{I}, mask::Trues) where{I<:Real} = RLBase.plan!(s, x)
-function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, values::Vector{I}, mask::M) where {I<:Real, M<:Union{BitVector, Vector{Bool}}}
+RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, x::A, mask::Trues) where{I<:Real, A<:AbstractArray{I}} = RLBase.plan!(s, x)
+
+function RLBase.plan!(s::EpsilonGreedyExplorer{<:Any,false}, values::A, mask::M) where {I<:Real, A<:AbstractArray{I}, M<:Union{BitVector, Vector{Bool}}}
     ϵ = get_ϵ(s)
     s.step += 1
     rand(s.rng) >= ϵ ? findmax_masked(values, mask)[2] : rand(s.rng, findall(mask))
@@ -137,7 +138,7 @@ end

 Return the probability of selecting each action given the estimated `values` of each action.
 """
-function RLBase.prob(s::EpsilonGreedyExplorer{<:Any,true}, values)
+function RLBase.prob(s::EpsilonGreedyExplorer{<:Any,true}, values::A) where {I<:Real, A<:AbstractArray{I}}
     ϵ, n = get_ϵ(s), length(values)
     probs = fill(ϵ / n, n)
     max_val_inds = find_all_max(values)[2]
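
The changes above widen `plan!` from `Vector` to any `AbstractArray` of real values, so views into a Q-table (or GPU-friendly array types) dispatch without copying. A minimal usage sketch, assuming the convenience constructor `EpsilonGreedyExplorer(ϵ)`; the ϵ value, table shape, and mask below are illustrative, not part of this commit:

```julia
using ReinforcementLearningBase, ReinforcementLearningCore

explorer = EpsilonGreedyExplorer(0.1)      # fixed ϵ = 0.1, illustrative
q_table  = zeros(Float32, 4, 10)           # (n_action, n_state)
q_values = @view q_table[:, 3]             # a SubArray, not a Vector

# Dispatches on AbstractArray{<:Real} after this change:
a = RLBase.plan!(explorer, q_values)

# Masked variant: pretend only actions 1 and 3 are legal in this state.
mask = [true, false, true, false]
a_legal = RLBase.plan!(explorer, q_values, mask)
```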

src/ReinforcementLearningCore/src/policies/learners/approximator.jl (−45)

This file was deleted.

src/ReinforcementLearningCore/src/policies/learners/flux_model_approximator.jl (+47)

@@ -0,0 +1,47 @@
+export FluxModelApproximator
+
+using Flux
+
+"""
+    FluxModelApproximator(model, optimiser)
+
+Wraps a Flux trainable model and implements the `RLBase.optimise!(::FluxModelApproximator, ::Gradient)`
+interface. See the RLCore documentation for more information on proper usage.
+"""
+struct FluxModelApproximator{M,O} <: AbstractLearner
+    model::M
+    optimiser_state::O
+end
+
+
+"""
+    FluxModelApproximator(; model, optimiser, use_gpu=false)
+
+Constructs a `FluxModelApproximator` object for reinforcement learning.
+
+# Arguments
+- `model`: The model used for approximation.
+- `optimiser`: The optimiser used for updating the model.
+- `use_gpu`: A boolean indicating whether to use GPU for computation. Default is `false`.
+
+# Returns
+A `FluxModelApproximator` object.
+"""
+function FluxModelApproximator(; model, optimiser, use_gpu=false)
+    optimiser_state = Flux.setup(optimiser, model)
+    if use_gpu # Pass model to GPU (if available) upon creation
+        return FluxModelApproximator(gpu(model), gpu(optimiser_state))
+    else
+        return FluxModelApproximator(model, optimiser_state)
+    end
+end
+
+FluxModelApproximator(model, optimiser::Flux.Optimise.AbstractOptimiser; use_gpu=false) = FluxModelApproximator(model=model, optimiser=optimiser, use_gpu=use_gpu)
+
+Flux.@layer FluxModelApproximator trainable=(model,)
+
+forward(A::FluxModelApproximator, args...; kwargs...) = A.model(args...; kwargs...)
+forward(A::FluxModelApproximator, env::E) where {E <: AbstractEnv} = env |> state |> (x -> forward(A, x))
+
+RLBase.optimise!(A::FluxModelApproximator, grad::NamedTuple) =
+    Flux.Optimise.update!(A.optimiser_state, A.model, grad.model)
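
For orientation, a hedged sketch of how the new wrapper is meant to be used. The network shape, learning rate, and dummy loss are made up for the example, and the `RLCore` module alias is assumed to be exported:

```julia
using Flux
using ReinforcementLearningCore

# Wrap a Flux model together with its optimiser state (Adam here is illustrative).
approx = FluxModelApproximator(
    model = Chain(Dense(4 => 32, relu), Dense(32 => 2)),
    optimiser = Adam(1e-3),
    use_gpu = false,
)

x = rand(Float32, 4)
q = RLCore.forward(approx, x)    # forward pass through the wrapped model

# One optimisation step: gradients are passed as a NamedTuple with a `model`
# field, matching the `RLBase.optimise!(::FluxModelApproximator, ::NamedTuple)`
# method defined above.
grads = Flux.gradient(m -> sum(abs2, m(x)), approx.model)
RLBase.optimise!(approx, (model = grads[1],))
```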

src/ReinforcementLearningCore/src/policies/learners/learners.jl

@@ -1,4 +1,5 @@
 include("abstract_learner.jl")
-include("approximator.jl")
+include("flux_model_approximator.jl")
 include("tabular_approximator.jl")
+include("td_learner.jl")
 include("target_network.jl")

src/ReinforcementLearningCore/src/policies/learners/tabular_approximator.jl

@@ -1,46 +1,49 @@
 export TabularApproximator, TabularVApproximator, TabularQApproximator

-const TabularApproximator = Approximator{A,O} where {A<:AbstractArray,O}
-const TabularQApproximator = Approximator{A,O} where {A<:AbstractArray,O}
-const TabularVApproximator = Approximator{A,O} where {A<:AbstractVector,O}
+struct TabularApproximator{A} <: AbstractLearner where {A<:AbstractArray}
+    model::A
+end
+
+const TabularQApproximator = TabularApproximator{A} where {A<:AbstractMatrix}
+const TabularVApproximator = TabularApproximator{A} where {A<:AbstractVector}

 """
-    TabularApproximator(table<:AbstractArray, opt)
+    TabularApproximator(table<:AbstractArray)

 For `table` of 1-d, it will serve as a state value approximator.
 For `table` of 2-d, it will serve as a state-action value approximator.

 !!! warning
     For `table` of 2-d, the first dimension is action and the second dimension is state.
 """
-function TabularApproximator(table::A, opt::O) where {A<:AbstractArray,O}
+function TabularApproximator(table::A) where {A<:AbstractArray}
     n = ndims(table)
     n <= 2 || throw(ArgumentError("the dimension of table must be <= 2"))
-    TabularApproximator{A,O}(table, opt)
+    TabularApproximator{A}(table)
 end

-TabularVApproximator(; n_state, init = 0.0, opt = InvDecay(1.0)) =
-    TabularApproximator(fill(init, n_state), opt)
+TabularVApproximator(; n_state, init = 0.0) =
+    TabularApproximator(fill(init, n_state))

-TabularQApproximator(; n_state, n_action, init = 0.0, opt = InvDecay(1.0)) =
-    TabularApproximator(fill(init, n_action, n_state), opt)
+TabularQApproximator(; n_state, n_action, init = 0.0) =
+    TabularApproximator(fill(init, n_action, n_state))

 # Take Learner and Environment, get state, send to RLCore.forward(Learner, State)
 forward(L::TabularVApproximator, env::E) where {E <: AbstractEnv} = env |> state |> (x -> forward(L, x))
 forward(L::TabularQApproximator, env::E) where {E <: AbstractEnv} = env |> state |> (x -> forward(L, x))

 RLCore.forward(
-    app::TabularVApproximator{R,O},
+    app::TabularVApproximator{R},
     s::I,
-) where {R<:AbstractVector,O,I} = @views app.model[s]
+) where {R<:AbstractVector,I} = @views app.model[s]

 RLCore.forward(
-    app::TabularQApproximator{R,O},
+    app::TabularQApproximator{R},
     s::I,
-) where {R<:AbstractArray,O,I} = @views app.model[:, s]
+) where {R<:AbstractArray,I} = @views app.model[:, s]

 RLCore.forward(
-    app::TabularQApproximator{R,O},
+    app::TabularQApproximator{R},
     s::I1,
     a::I2,
-) where {R<:AbstractArray,O,I1,I2} = @views app.model[a, s]
+) where {R<:AbstractArray,I1,I2} = @views app.model[a, s]
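
With the optimiser argument gone, a tabular approximator is now just a value table; the update rule itself lives in the TD-learning code added in `td_learner.jl` (included above but not shown in this excerpt). A small sketch of the resulting constructors and lookups, with arbitrary sizes and assuming the `RLCore` alias is available:

```julia
using ReinforcementLearningCore

# Tabular approximators no longer carry an optimiser; they are plain tables of `init` values.
q = TabularQApproximator(n_state = 10, n_action = 4)   # 4×10 table of zeros
v = TabularVApproximator(n_state = 10)                  # length-10 vector of zeros

RLCore.forward(q, 3)       # view of the Q-values of all actions in state 3
RLCore.forward(q, 3, 2)    # Q-value of action 2 in state 3
RLCore.forward(v, 3)       # state value of state 3
```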

src/ReinforcementLearningCore/src/policies/learners/target_network.jl (+10 −10)

@@ -1,14 +1,14 @@
-export Approximator, TargetNetwork, target, model
+export TargetNetwork, target, model

 using Flux

-target(ap::Approximator) = ap.model #see TargetNetwork
-model(ap::Approximator) = ap.model #see TargetNetwork
+target(ap::FluxModelApproximator) = ap.model #see TargetNetwork
+model(ap::FluxModelApproximator) = ap.model #see TargetNetwork

 """
-    TargetNetwork(network::Approximator; sync_freq::Int = 1, ρ::Float32 = 0f0)
+    TargetNetwork(network::FluxModelApproximator; sync_freq::Int = 1, ρ::Float32 = 0f0)

-Wraps an Approximator to hold a target network that is updated towards the model of the
+Wraps a FluxModelApproximator to hold a target network that is updated towards the model of the
 approximator.
 - `sync_freq` is the number of updates of `network` between each update of the `target`.
 - ρ (\rho) is "how much of the target is kept when updating it".
@@ -21,11 +21,11 @@ Implements the `RLBase.optimise!(::TargetNetwork, ::Gradient)` interface to upda
 and the target with weights replacement or Polyak averaging.

 Note to developers: `model(::TargetNetwork)` will return the trainable Flux model
-and `target(::TargetNetwork)` returns the target model and `target(::Approximator)`
+and `target(::TargetNetwork)` returns the target model and `target(::FluxModelApproximator)`
 returns the non-trainable Flux model. See the RLCore documentation.
 """
 mutable struct TargetNetwork{M}
-    network::Approximator{M}
+    network::FluxModelApproximator{M}
     target::M
     sync_freq::Int
     ρ::Float32
@@ -46,13 +46,13 @@ Constructs a target network for reinforcement learning.
 # Returns
 A `TargetNetwork` object.
 """
-function TargetNetwork(network::Approximator; sync_freq = 1, ρ = 0f0, use_gpu = false)
+function TargetNetwork(network::FluxModelApproximator; sync_freq = 1, ρ = 0f0, use_gpu = false)
     @assert 0 <= ρ <= 1 "ρ must in [0,1]"
     ρ = Float32(ρ)

     if use_gpu
-        @assert typeof(gpu(network.model)) == typeof(network.model) "`Approximator` model is not on GPU. Please set `use_gpu=false` or ensure model is on GPU, by setting `use_gpu=true` when constructing `Approximator`."
-        # NOTE: model is pushed to gpu in Approximator, need to transfer to cpu before deepcopy, then push target model to gpu
+        @assert typeof(gpu(network.model)) == typeof(network.model) "`FluxModelApproximator` model is not on GPU. Please set `use_gpu=false` or ensure model is on GPU, by setting `use_gpu=true` when constructing `FluxModelApproximator`."
+        # NOTE: model is pushed to gpu in FluxModelApproximator, need to transfer to cpu before deepcopy, then push target model to gpu
         target = gpu(deepcopy(cpu(network.model)))
     else
         target = deepcopy(network.model)
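
The rename only swaps `Approximator` for `FluxModelApproximator` in the `TargetNetwork` API; usage stays the same. A hedged sketch, with an illustrative network and hyperparameters:

```julia
using Flux
using ReinforcementLearningCore

approx = FluxModelApproximator(
    model = Chain(Dense(4 => 32, relu), Dense(32 => 2)),
    optimiser = Adam(1e-3),
)

# Hard sync of the target every 100 optimisation steps (ρ = 0 keeps none of the old target;
# a ρ > 0 with sync_freq = 1 would give Polyak averaging instead).
tn = TargetNetwork(approx; sync_freq = 100)

model(tn)    # the trainable Flux model (same object as approx.model)
target(tn)   # the non-trainable copy used for bootstrapping targets
```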
