
Commit b1c5c12

Author: Jeremiah Lewis
Commit message: naming
1 parent 87760ff commit b1c5c12

File tree

1 file changed (+16, -16 lines)

src/ReinforcementLearningCore/src/policies/learners/td_learner.jl

Lines changed: 16 additions & 16 deletions
@@ -45,32 +45,32 @@ Update the Q-value of the given state-action pair.
 """
 function bellman_update!(
     approx::TabularApproximator,
-    s::I1,
-    s_plus_one::I2,
-    a::I3,
-    r::F1, # reward
+    state::I1,
+    next_state::I2,
+    action::I3,
+    reward::F1,
     γ::Float64, # discount factor
 ) where {I1<:Integer,I2<:Integer,I3<:Integer,F1<:AbstractFloat}
     # Q-learning formula following https://github.com/JuliaPOMDP/TabularTDLearning.jl/blob/25c4d3888e178c51ed1ff448f36b0fcaf7c1d8e8/src/q_learn.jl#LL63C26-L63C95
     # Terminology following https://en.wikipedia.org/wiki/Q-learning
-    estimate_optimal_future_value = maximum(Q(approx, s_plus_one))
-    current_value = Q(approx, s, a)
-    raw_q_value = (r + γ * estimate_optimal_future_value - current_value) # Discount factor γ is applied here
+    estimate_optimal_future_value = maximum(Q(approx, next_state))
+    current_value = Q(approx, state, action)
+    raw_q_value = (reward + γ * estimate_optimal_future_value - current_value) # Discount factor γ is applied here
     q_value_updated = Flux.Optimise.update!(approx.optimiser_state, :learning, [raw_q_value])[] # adust according to optimiser learning rate
-    approx.model[a, s] += q_value_updated
-    return Q(approx, s, a)
+    approx.model[action, state] += q_value_updated
+    return Q(approx, state, action)
 end
 
 function _optimise!(
     n::I1,
     γ::F,
     approx::Approximator{Ar},
-    s::I2,
-    s_next::I2,
-    a::I3,
-    r::F,
+    state::I2,
+    next_state::I2,
+    action::I3,
+    reward::F,
 ) where {I1<:Number,I2<:Number,I3<:Number,Ar<:AbstractArray,F<:AbstractFloat}
-    bellman_update!(approx, s, s_next, a, r, γ)
+    bellman_update!(approx, state, next_state, action, reward, γ)
 end
 
 function RLBase.optimise!(
@@ -80,12 +80,12 @@ function RLBase.optimise!(
     _optimise!(L.n, L.γ, L.approximator, t.state, t.next_state, t.action, t.reward)
 end
 
-function RLBase.optimise!(learner::TDLearner, stage, trajectory::Trajectory)
+function RLBase.optimise!(learner::TDLearner, stage::AbstractStage, trajectory::Trajectory)
     for batch in trajectory.container
         optimise!(learner, stage, batch)
     end
 end
 
 # TDLearner{:SARS} optimises at the PostActStage
-RLBase.optimise!(L::TDLearner{:SARS}, stage::PostActStage, trace::NamedTuple) = RLBase.optimise!(L, trace)
+RLBase.optimise!(learner::TDLearner{:SARS}, stage::PostActStage, trace::NamedTuple) = RLBase.optimise!(learner, trace)

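For context, the renamed bellman_update! performs the standard tabular Q-learning update: Q(state, action) is moved toward reward + γ * max over actions of Q(next_state, ·). The snippet below is a minimal sketch of that update on a plain matrix; q_update!, Q_table, and the fixed step size α are illustrative names only and not part of the ReinforcementLearningCore API, which instead applies the step size through the approximator's Flux optimiser state.

# Minimal sketch of the tabular Q-learning step that bellman_update! performs.
# Rows of the table index actions and columns index states, matching approx.model[action, state].
function q_update!(Q_table, state, action, reward, next_state; α = 0.1, γ = 0.99)
    estimate_optimal_future_value = maximum(Q_table[:, next_state])
    td_error = reward + γ * estimate_optimal_future_value - Q_table[action, state]
    Q_table[action, state] += α * td_error   # the package applies the step size via the optimiser instead of a fixed α
    return Q_table[action, state]
end

Q_table = zeros(4, 10)             # 4 actions, 10 states
q_update!(Q_table, 2, 1, 1.0, 3)   # state 2, action 1, reward 1.0, next state 3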