
Commit 24578f3

Merge pull request #13 from JuliaReinforcementLearning/improve_doc
improve docs
2 parents e435e20 + efe3668 commit 24578f3

5 files changed (+193 −7 lines)

docs/make.jl (+4 −2)

@@ -7,17 +7,19 @@ makedocs(modules = [ReinforcementLearning],
          linkcheck = !("skiplinks" in ARGS),
          pages = [ "Introduction" => "index.md",
                    "Usage" => "usage.md",
+                   "Tutorial" => "tutorial.md",
                    "Reference" => ["Comparison" => "comparison.md",
                                    "Learning" => "learning.md",
                                    "Learners" => "learners.md",
+                                   "Buffers" => "buffers.md",
                                    "Environments" => "environments.md",
                                    "Stopping Criteria" => "stop.md",
                                    "Preprocessors" => "preprocessors.md",
                                    "Policies" => "policies.md",
                                    "Callbacks" => "callbacks.md",
                                    "Evaluation Metrics" => "metrics.md",
-                                   ],
-                   "API" => "api.md"],
+                                   ]
+                  ],
          html_prettyurls = true
         )

docs/src/buffers.md (new file, +7 lines)

# [Buffers](@id buffers)

```@autodocs
Modules = [ReinforcementLearning]
Pages = ["buffers.jl"]
```

docs/src/tutorial.md (new file, +98 lines)

# Tutorial
Would you like to test existing reinforcement learning methods on your own
environment, or try your own method on existing environments? Extending this
package is a piece of cake. Please consider registering the binding to your own
environment as a new package (see e.g.
[RLEnvAtari](https://github.com/JuliaReinforcementLearning/RLEnvAtari.jl)) and
open a [pull request](https://github.com/JuliaReinforcementLearning/ReinforcementLearning.jl/pulls)
for any other extension.

## Write your own learner

For a new learner you need to implement the functions
```
update!(learner, buffer)                          # returns nothing
selectaction(learner, policy, state)              # returns an action
defaultbuffer(learner, environment, preprocessor) # returns a buffer
```

Let's assume you want to implement plain, simple Q-learning (you don't need to
do this; it is already implemented). Your file `qlearning.jl` could contain
```julia
import ReinforcementLearning: update!, selectaction, defaultbuffer, Buffer

struct MyQLearning
    Q::Array{Float64, 2} # number of actions x number of states
    alpha::Float64       # learning rate
end

function update!(learner::MyQLearning, buffer)
    s = buffer.states[1]
    snext = buffer.states[2]
    r = buffer.rewards[1]
    a = buffer.actions[1]
    Q = learner.Q
    Q[a, s] += learner.alpha * (r + maximum(Q[:, snext]) - Q[a, s])
end

function selectaction(learner::MyQLearning, policy, state)
    selectaction(policy, learner.Q[:, state])
end

function defaultbuffer(learner::MyQLearning, environment, preprocessor)
    state, done = getstate(environment)
    processedstate = preprocessstate(preprocessor, state)
    Buffer(statetype = typeof(processedstate), capacity = 2)
end
```
The function `defaultbuffer` gets called during the construction of an
`RLSetup`. It returns a buffer that is filled with states, actions and rewards
during interaction with the environment. Currently there are three types of
buffers implemented:
```julia
import ReinforcementLearning: Buffer, EpisodeBuffer, ArrayStateBuffer
?Buffer
```
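
As a rough sketch (not from the original docs; the keyword interface is the one
documented in `src/buffers.jl` below), a default `Buffer` can be constructed and
filled by hand:
```julia
import ReinforcementLearning: Buffer

b = Buffer(statetype = Int64, actiontype = Int64, capacity = 2)
push!(b.states, 3)   # states, actions, rewards and done are circular buffers
push!(b.actions, 1)  # and accept push!, as in the package's pushreturn!
```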

## [Bind your own environment](@id api_environments)
For new environments you need to implement the functions
```
interact!(action, environment)  # returns state, reward, done
getstate(environment)           # returns state, done
reset!(environment)             # returns state
```

Optionally you may also implement the function
```
plotenv(environment, state, action, reward, done)
```

Please have a look at the
[cartpole](https://github.com/JuliaReinforcementLearning/RLEnvClassicControl.jl/blob/master/src/cartpole.jl)
for an example.
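
For illustration only (the chain environment below is made up and not part of
any package), a minimal binding that implements the three required functions
could look like:
```julia
import ReinforcementLearning: interact!, getstate, reset!

mutable struct MyChain
    nstates::Int64
    state::Int64
end
MyChain(nstates) = MyChain(nstates, 1)

function interact!(action, env::MyChain)
    # action 1 moves left, action 2 moves right
    env.state = clamp(env.state + (action == 2 ? 1 : -1), 1, env.nstates)
    done = env.state == env.nstates       # reaching the last state ends the episode
    env.state, (done ? 1. : 0.), done     # returns state, reward, done
end

getstate(env::MyChain) = (env.state, env.state == env.nstates)

function reset!(env::MyChain)
    env.state = 1
    env.state                             # returns state
end
```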

## Preprocessors
```
preprocessstate(preprocessor, state) # returns the preprocessed state
```
Optional:
```
preprocess(preprocessor, reward, state, done) # returns a preprocessed (state, reward, done) tuple
```
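
For illustration (the type `StateToOneHot` is made up), a preprocessor that
turns an integer state into a one-hot vector could be written as:
```julia
import ReinforcementLearning: preprocessstate

struct StateToOneHot
    ns::Int64                 # number of discrete states
end

function preprocessstate(p::StateToOneHot, state)
    x = zeros(p.ns)
    x[state] = 1.             # one-hot encoding of the integer state
    x
end
```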

## Policies
```
selectaction(policy, values)            # returns an action
getactionprobabilities(policy, state)   # returns a normalized (1-norm) vector with non-negative entries
```
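
For illustration (the package ships its own policies; `MySoftmaxPolicy` is made
up, and the second argument is assumed here to be the vector of action values,
as in `selectaction`), a softmax policy could look like:
```julia
import ReinforcementLearning: selectaction, getactionprobabilities

struct MySoftmaxPolicy
    beta::Float64                                    # inverse temperature
end

function getactionprobabilities(p::MySoftmaxPolicy, values)
    w = exp.(p.beta .* (values .- maximum(values)))  # shift by max for stability
    w ./ sum(w)                                      # normalized, non-negative
end

function selectaction(p::MySoftmaxPolicy, values)
    probs = getactionprobabilities(p, values)
    r = rand()
    acc = 0.
    for a in 1:length(probs)                         # sample from the distribution
        acc += probs[a]
        r <= acc && return a
    end
    length(probs)
end
```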

## Callbacks
```
callback!(callback, rlsetup, state, action, reward, done) # returns nothing
```
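
For illustration (the type is made up), a callback that accumulates the total
reward could be:
```julia
import ReinforcementLearning: callback!

mutable struct TotalReward
    total::Float64
end
TotalReward() = TotalReward(0.)

function callback!(c::TotalReward, rlsetup, state, action, reward, done)
    c.total += reward         # accumulate the reward of every step
    nothing
end
```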

## Stopping Criteria
```
isbreak!(stoppingcriterion, state, action, reward, done) # returns true or false
```
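
For illustration (the type is made up; the package provides its own stopping
criteria), a criterion that stops after a fixed number of steps could be:
```julia
import ReinforcementLearning: isbreak!

mutable struct StopAfterNSteps
    n::Int64
    counter::Int64
end
StopAfterNSteps(n) = StopAfterNSteps(n, 0)

function isbreak!(c::StopAfterNSteps, state, action, reward, done)
    c.counter += 1
    c.counter >= c.n          # true stops the interaction loop
end
```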

src/buffers.jl (+53)

@@ -1,9 +1,21 @@
+"""
+    struct Buffer{Ts, Ta}
+        states::CircularBuffer{Ts}
+        actions::CircularBuffer{Ta}
+        rewards::CircularBuffer{Float64}
+        done::CircularBuffer{Bool}
+"""
 struct Buffer{Ts, Ta}
     states::CircularBuffer{Ts}
     actions::CircularBuffer{Ta}
     rewards::CircularBuffer{Float64}
     done::CircularBuffer{Bool}
 end
+"""
+    Buffer(; statetype = Int64, actiontype = Int64,
+             capacity = 2, capacitystates = capacity,
+             capacityrewards = capacity - 1)
+"""
 function Buffer(; statetype = Int64, actiontype = Int64,
                   capacity = 2, capacitystates = capacity,
                   capacityrewards = capacity - 1)
@@ -23,12 +35,23 @@ function pushreturn!(b, r, done)
     push!(b.done, done)
 end
 
+"""
+    struct EpisodeBuffer{Ts, Ta}
+        states::Array{Ts, 1}
+        actions::Array{Ta, 1}
+        rewards::Array{Float64, 1}
+        done::Array{Bool, 1}
+"""
 struct EpisodeBuffer{Ts, Ta}
     states::Array{Ts, 1}
     actions::Array{Ta, 1}
     rewards::Array{Float64, 1}
     done::Array{Bool, 1}
 end
+"""
+    EpisodeBuffer(; statetype = Int64, actiontype = Int64) =
+        EpisodeBuffer(statetype[], actiontype[], Float64[], Bool[])
+"""
 EpisodeBuffer(; statetype = Int64, actiontype = Int64) =
     EpisodeBuffer(statetype[], actiontype[], Float64[], Bool[])
 function pushreturn!(b::EpisodeBuffer, r, done)
@@ -42,13 +65,24 @@ function pushreturn!(b::EpisodeBuffer, r, done)
     push!(b.done, done)
 end
 
+"""
+    mutable struct ArrayCircularBuffer{T}
+        data::T
+        capacity::Int64
+        start::Int64
+        counter::Int64
+        full::Bool
+"""
 mutable struct ArrayCircularBuffer{T}
     data::T
     capacity::Int64
     start::Int64
     counter::Int64
     full::Bool
 end
+"""
+    ArrayCircularBuffer(arraytype, datatype, elemshape, capacity)
+"""
 function ArrayCircularBuffer(arraytype, datatype, elemshape, capacity)
     ArrayCircularBuffer(arraytype(zeros(datatype,
                                         convert(Dims, (elemshape..., capacity)))),
@@ -96,12 +130,31 @@ for N in 2:5
 end
 lastindex(a::ArrayCircularBuffer) = a.full ? a.capacity : a.counter
 
+"""
+    struct ArrayStateBuffer{Ts, Ta}
+        states::ArrayCircularBuffer{Ts}
+        actions::CircularBuffer{Ta}
+        rewards::CircularBuffer{Float64}
+        done::CircularBuffer{Bool}
+"""
 struct ArrayStateBuffer{Ts, Ta}
     states::ArrayCircularBuffer{Ts}
     actions::CircularBuffer{Ta}
     rewards::CircularBuffer{Float64}
     done::CircularBuffer{Bool}
 end
+"""
+    ArrayStateBuffer(; arraytype = Array, datatype = Float64,
+                       elemshape = (1), actiontype = Int64,
+                       capacity = 2, capacitystates = capacity,
+                       capacityrewards = capacity - 1)
+
+An `ArrayStateBuffer` is similar to a [`Buffer`](@ref) but the states are stored
+in a preallocated array of size `(elemshape..., capacity)`. `K` consecutive
+states at position `i` in the state buffer can efficiently be retrieved with
+`nmarkovview(buffer.states, i, K)` or `nmarkovgetindex(buffer.states, i, K)`.
+See the implementation of DQN for an example.
+"""
 function ArrayStateBuffer(; arraytype = Array, datatype = Float64,
                             elemshape = (1), actiontype = Int64,
                             capacity = 2, capacitystates = capacity,

src/learner/tdlearning.jl (+31 −5)

@@ -1,3 +1,18 @@
+"""
+    mutable struct TDLearner{T,Tp}
+        ns::Int64 = 10
+        na::Int64 = 4
+        γ::Float64 = .9
+        λ::Float64 = .8
+        α::Float64 = .1
+        nsteps::Int64 = 1
+        initvalue::Float64 = 0.
+        unseenvalue::Float64 = initvalue == Inf64 ? 0. : initvalue
+        params::Array{Float64, 2} = zeros(na, ns) .+ initvalue
+        tracekind::DataType = λ == 0 ? NoTraces : ReplacingTraces
+        traces::T = tracekind == NoTraces ? NoTraces() : tracekind(ns, na, λ, γ)
+        endvaluepolicy::Tp = SarsaEndPolicy()
+"""
 @with_kw mutable struct TDLearner{T,Tp}
     ns::Int64 = 10
     na::Int64 = 4
@@ -17,18 +32,29 @@ struct QLearningEndPolicy end
 struct ExpectedSarsaEndPolicy{Tp}
     policy::Tp
 end
-Sarsa(; kargs...) = TDLearner(; kargs...)
-QLearning(; kargs...) = TDLearner(; endvaluepolicy = QLearningEndPolicy(), kargs...)
-ExpectedSarsa(; kargs...) = TDLearner(; endvaluepolicy = ExpectedSarsaEndPolicy(VeryOptimisticEpsilonGreedyPolicy(.1)), kargs...)
+"""
+    Sarsa(; kargs...) = TDLearner(; kargs...)
+"""
+function Sarsa(; kargs...) TDLearner(; kargs...) end
+"""
+    QLearning(; kargs...) = TDLearner(; endvaluepolicy = QLearningEndPolicy(), kargs...)
+"""
+function QLearning(; kargs...)
+    TDLearner(; endvaluepolicy = QLearningEndPolicy(), kargs...)
+end
+"""
+    ExpectedSarsa(; kargs...) = TDLearner(; endvaluepolicy = ExpectedSarsaEndPolicy(VeryOptimisticEpsilonGreedyPolicy(.1)), kargs...)
+"""
+function ExpectedSarsa(; kargs...)
+    TDLearner(; endvaluepolicy = ExpectedSarsaEndPolicy(VeryOptimisticEpsilonGreedyPolicy(.1)), kargs...)
+end
 export Sarsa, QLearning, ExpectedSarsa
 
 @inline function selectaction(learner::Union{TDLearner, AbstractPolicyGradient},
                               policy,
                               state)
     selectaction(policy, getvalue(learner.params, state))
 end
-params(learner::TDLearner) = learner.params
-reconstructwithparams(learner::TDLearner, w) = reconstruct(learner, params = w)
 
 # td error
 
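As a rough usage sketch (not part of the diff; the keyword names are the
`TDLearner` fields documented in the new docstring, and the values below are
arbitrary), the convenience constructors are called like:
```julia
import ReinforcementLearning: QLearning, Sarsa

qlearner = QLearning(ns = 10, na = 4)           # tabular Q-learning, 10 states, 4 actions
sarsalearner = Sarsa(ns = 10, na = 4, α = .05)  # Sarsa with a smaller learning rate
```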