Commit 4575eca

Merge pull request #113 from cpnota/release/0.3.3
Release/0.3.3
2 parents 09eca2d + c9aa786 commit 4575eca

12 files changed: +177 additions, -60 deletions

.pylintrc

Lines changed: 2 additions & 1 deletion

@@ -437,7 +437,8 @@ good-names=i,
            n,
            t,
            e,
-           kl
+           kl,
+           ax
 
 # Include a hint for the correct naming format with invalid-name.
 include-naming-hint=no

README.md

Lines changed: 31 additions & 23 deletions

@@ -2,54 +2,62 @@
 
 The Autonomous Learning Library (`all`) is an object-oriented deep reinforcement learning library in `pytorch`. The goal of the library is to provide implementations of modern reinforcement learning algorithms that reflect the way that reinforcement learning researchers think about agent design and to provide the components necessary to build and test new ideas with minimal overhead.
 
-## Algorithms
-
-As of today, `all` contains implementations of the following deep RL algorithms:
-
-- [x] Advantage Actor-Critic (A2C)
-- [x] Categorical DQN (C51)
-- [x] Deep Deterministic Policy Gradient (DDPG)
-- [x] Deep Q-Learning (DQN) + extensions
-- [x] Proximal Policy Optimization (PPO)
-- [x] Rainbow (Rainbow)
-- [x] Soft Actor-Critic (SAC)
-
-It also contains implementations of the following "vanilla" agents, which provide useful baselines and perform better than you may expect:
-
-- [x] Vanilla Actor-Critic
-- [x] Vanilla Policy Gradient
-- [x] Vanilla Q-Learning
-- [x] Vanilla Sarsa
-
-We will try to stay up-to-date with advances in the field, but we do not intend to implement every algorithm. Rather, we prefer to maintain a smaller set of high-quality agents that have achieved notoriety in the field.
-
 ## Why use `all`?
 
 The primary reason for using `all` over its many competitors is because it contains components that allow you to *build your own* reinforcement learning agents.
 We provide out-of-the-box modules for:
 
 - [x] Custom Q-Networks, V-Networks, policy networks, and feature networks
 - [x] Generic function approximation
+- [x] Target networks
+- [x] Polyak averaging
 - [x] Experience Replay
 - [x] Prioritized Experience Replay
 - [x] Advantage Estimation
 - [x] Generalized Advantage Estimation (GAE)
-- [x] Target networks
-- [x] Polyak averaging
 - [x] Easy parameter and learning rate scheduling
 - [x] An enhanced `nn` module (includes dueling layers, noisy layers, action bounds, and the coveted `nn.Flatten`)
 - [x] `gym` to `pytorch` wrappers
 - [x] Atari wrappers
 - [x] An `Experiment` API for comparing and evaluating agents
 - [x] A `SlurmExperiment` API for running massive experiments on computing clusters
 - [x] A `Writer` object for easily logging information in `tensorboard`
+- [x] Plotting utilities for generating paper-worthy result plots
 
 Rather than being embedded in the agents, all of these modules are available for use by your own custom agents.
 Additionally, the included agents accept custom versions of any of the above objects.
 Have a new type of replay buffer in mind?
 Code it up and pass it directly to our `DQN` and `DDPG` implementations.
 Additionally, our agents were written with readibility as a primary concern, so they are easy to modify.
 
+## Algorithms
+
+As of today, `all` contains implementations of the following deep RL algorithms:
+
+- [x] Advantage Actor-Critic (A2C)
+- [x] Categorical DQN (C51)
+- [x] Deep Deterministic Policy Gradient (DDPG)
+- [x] Deep Q-Learning (DQN) + extensions
+- [x] Proximal Policy Optimization (PPO)
+- [x] Rainbow (Rainbow)
+- [x] Soft Actor-Critic (SAC)
+
+It also contains implementations of the following "vanilla" agents, which provide useful baselines and perform better than you may expect:
+
+- [x] Vanilla Actor-Critic
+- [x] Vanilla Policy Gradient
+- [x] Vanilla Q-Learning
+- [x] Vanilla Sarsa
+
+We will try to stay up-to-date with advances in the field, but we do not intend to implement every algorithm. Rather, we prefer to maintain a smaller set of high-quality agents that have achieved notoriety in the field.
+
+We have labored to make sure that our implementations produce results comparable to published results.
+Here's a sampling of performance on several Atari games:
+
+![atari40](atari40.png)
+
+These results were generated using the `all.presets.atari` module, the `SlurmExperiment` utility, and the `all.experiments.plots` module.
+
 ## Example
 
 Our agents implement a single method: `action = agent.act(state, reward)`.
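A minimal sketch of how that single-method interface is typically driven, using a plain `gym`-style loop and a placeholder random agent (neither is a preset from the library; the classic 4-tuple `step` API is assumed):

```python
# Sketch only: drives the `action = agent.act(state, reward)` interface
# described in the README with a placeholder agent and a gym environment.
import gym

env = gym.make("CartPole-v0")


class RandomAgent:
    """Placeholder agent: ignores state/reward and samples random actions."""
    def __init__(self, action_space):
        self.action_space = action_space

    def act(self, state, reward):
        return self.action_space.sample()


my_agent = RandomAgent(env.action_space)
state = env.reset()
reward = 0
done = False
while not done:
    action = my_agent.act(state, reward)
    state, reward, done, _ = env.step(action)  # classic gym 4-tuple API
```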

all/experiments/__init__.py

Lines changed: 1 addition & 0 deletions

@@ -1,4 +1,5 @@
 from .experiment import Experiment
+from .plots import plot_returns_100
 from .slurm import SlurmExperiment
 from .watch import GreedyAgent, watch, load_and_watch
 

all/experiments/experiment_test.py

Lines changed: 3 additions & 0 deletions

@@ -32,6 +32,9 @@ def add_schedule(self, name, value, step="frame"):
     def add_evaluation(self, name, value, step="frame"):
         self.add_scalar("evaluation/" + name, value, self._get_step(step))
 
+    def add_summary(self, name, mean, std, step="frame"):
+        pass
+
     def _get_step(self, _type):
         if _type == "frame":
             return self.frames

all/experiments/plots.py

Lines changed: 59 additions & 0 deletions

@@ -0,0 +1,59 @@
+import os
+import numpy as np
+import matplotlib.pyplot as plt
+
+
+def plot_returns_100(runs_dir, timesteps=-1):
+    data = load_returns_100_data(runs_dir)
+    lines = {}
+    fig, axes = plt.subplots(1, len(data))
+    for i, env in enumerate(sorted(data.keys())):
+        ax = axes[i]
+        subplot_returns_100(ax, env, data[env], lines, timesteps=timesteps)
+    fig.legend(list(lines.values()), list(lines.keys()), loc="center right")
+    plt.show()
+
+
+def load_returns_100_data(runs_dir):
+    data = {}
+
+    def add_data(agent, env, file):
+        if not env in data:
+            data[env] = {}
+        data[env][agent] = np.genfromtxt(file, delimiter=",").reshape((-1, 3))
+
+    for agent_dir in os.listdir(runs_dir):
+        agent = agent_dir.split(" ")[0].strip("_")
+        agent_path = os.path.join(runs_dir, agent_dir)
+        if os.path.isdir(agent_path):
+            for env in os.listdir(agent_path):
+                env_path = os.path.join(agent_path, env)
+                if os.path.isdir(env_path):
+                    returns100path = os.path.join(env_path, "returns100.csv")
+                    if os.path.exists(returns100path):
+                        add_data(agent, env, returns100path)
+
+    return data
+
+
+def subplot_returns_100(ax, env, data, lines, timesteps=-1):
+    for agent in data:
+        agent_data = data[agent]
+        x = agent_data[:, 0]
+        mean = agent_data[:, 1]
+        std = agent_data[:, 2]
+
+        if timesteps > 0:
+            x[-1] = timesteps
+
+        if agent in lines:
+            ax.plot(x, mean, label=agent, color=lines[agent].get_color())
+        else:
+            line, = ax.plot(x, mean, label=agent)
+            lines[agent] = line
+        ax.fill_between(
+            x, mean + std, mean - std, alpha=0.2, color=lines[agent].get_color()
+        )
+        ax.set_title(env)
+        ax.set_xlabel("timesteps")
+        ax.ticklabel_format(style='sci', axis='x', scilimits=(0, 5))
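A short usage sketch for the new module: assuming results were logged under the default `runs` directory produced by `ExperimentWriter`, the plot can be generated in two lines (the `timesteps` value here is only an example):

```python
# Load every runs/<agent>/<env>/returns100.csv under the default log
# directory and draw one subplot per environment, one line per agent.
from all.experiments import plot_returns_100

# timesteps=40e6 (optional) overwrites the x-coordinate of each curve's
# final point so all curves end at the intended training length.
plot_returns_100("runs", timesteps=40e6)
```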

all/experiments/runner.py

Lines changed: 3 additions & 2 deletions

@@ -44,8 +44,9 @@ def _log(self, returns, fps):
             self._best_returns = returns
         self._returns100.append(returns)
         if len(self._returns100) == 100:
-            self._writer.add_evaluation('returns100/mean', np.mean(self._returns100), step="frame")
-            self._writer.add_evaluation('returns100/std', np.std(self._returns100), step="frame")
+            mean = np.mean(self._returns100)
+            std = np.std(self._returns100)
+            self._writer.add_summary('returns100', mean, std, step="frame")
             self._returns100 = []
         self._writer.add_evaluation('returns/episode', returns, step="episode")
         self._writer.add_evaluation('returns/frame', returns, step="frame")

all/logging/__init__.py

Lines changed: 29 additions & 8 deletions

@@ -1,12 +1,13 @@
-
+import csv
 import os
 import subprocess
 from abc import ABC, abstractmethod
 from datetime import datetime
 from tensorboardX import SummaryWriter
 
+
 class Writer(ABC):
-    log_dir = 'runs'
+    log_dir = "runs"
 
     @abstractmethod
     def add_loss(self, name, value, step="frame"):
@@ -24,6 +25,10 @@ def add_scalar(self, name, value, step="frame"):
     def add_schedule(self, name, value, step="frame"):
         pass
 
+    @abstractmethod
+    def add_summary(self, name, mean, std, step="frame"):
+        pass
+
 
 class DummyWriter(Writer):
     def add_loss(self, name, value, step="frame"):
@@ -38,13 +43,21 @@ def add_scalar(self, name, value, step="frame"):
     def add_schedule(self, name, value, step="frame"):
         pass
 
+    def add_summary(self, name, mean, std, step="frame"):
+        pass
+
 
 class ExperimentWriter(SummaryWriter, Writer):
     def __init__(self, agent_name, env_name, loss=True):
         self.env_name = env_name
         current_time = str(datetime.now())
+        os.makedirs(
+            os.path.join(
+                "runs", ("%s %s %s" % (agent_name, COMMIT_HASH, current_time)), env_name
+            )
+        )
         self.log_dir = os.path.join(
-            'runs', ("%s %s %s" % (agent_name, COMMIT_HASH, current_time))
+            "runs", ("%s %s %s" % (agent_name, COMMIT_HASH, current_time))
         )
         self._frames = 0
         self._episodes = 1
@@ -56,15 +69,22 @@ def add_loss(self, name, value, step="frame"):
         self.add_scalar("loss/" + name, value, step)
 
     def add_evaluation(self, name, value, step="frame"):
-        self.add_scalar('evaluation/' + name, value, self._get_step(step))
+        self.add_scalar("evaluation/" + name, value, self._get_step(step))
 
     def add_schedule(self, name, value, step="frame"):
         if self._loss:
-            self.add_scalar('schedule' + '/' + name, value, self._get_step(step))
+            self.add_scalar("schedule" + "/" + name, value, self._get_step(step))
 
     def add_scalar(self, name, value, step="frame"):
         super().add_scalar(self.env_name + "/" + name, value, self._get_step(step))
 
+    def add_summary(self, name, mean, std, step="frame"):
+        self.add_evaluation(name + "/mean", mean, step)
+        self.add_evaluation(name + "/std", std, step)
+
+        with open(os.path.join(self.log_dir, self.env_name, name + ".csv"), "a") as csvfile:
+            csv.writer(csvfile).writerow([self._get_step(step), mean, std])
+
     def _get_step(self, _type):
         if _type == "frame":
             return self.frames
@@ -91,13 +111,14 @@ def episodes(self, episodes):
 
 def get_commit_hash():
     result = subprocess.run(
-        ['git', 'rev-parse', '--short', 'HEAD'], stdout=subprocess.PIPE)
-    return result.stdout.decode('utf-8').rstrip()
+        ["git", "rev-parse", "--short", "HEAD"], stdout=subprocess.PIPE
+    )
+    return result.stdout.decode("utf-8").rstrip()
 
 
 COMMIT_HASH = get_commit_hash()
 
 try:
-    os.mkdir('runs')
+    os.mkdir("runs")
 except FileExistsError:
     pass
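For reference, the rows appended by `add_summary` are plain `step, mean, std` triples written to `<log_dir>/<env>/<name>.csv`, which is exactly the format `all/experiments/plots.py` reads back. A small sketch of reading one such file directly (the path below is hypothetical; `ExperimentWriter` names run directories `"<agent> <commit hash> <timestamp>"`):

```python
import numpy as np

# Hypothetical run directory and environment name, for illustration only;
# the real name embeds the commit hash and a datetime string.
path = "runs/a2c abc1234 2019-11-01 12:00:00.000000/Breakout/returns100.csv"

# Each row written by add_summary is [step, mean, std].
rows = np.genfromtxt(path, delimiter=",").reshape((-1, 3))
steps, means, stds = rows[:, 0], rows[:, 1], rows[:, 2]
print(steps[-1], means[-1], stds[-1])  # most recent 100-episode return summary
```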

all/presets/atari/a2c.py

Lines changed: 29 additions & 16 deletions

@@ -1,6 +1,7 @@
 # /Users/cpnota/repos/autonomous-learning-library/all/approximation/value/action/torch.py
 import torch
-from torch.optim import RMSprop
+from torch.optim import Adam
+from torch.optim.lr_scheduler import CosineAnnealingLR
 from all.agents import A2C
 from all.bodies import DeepmindAtariBody
 from all.approximation import VNetwork, FeatureNetwork
@@ -10,42 +11,50 @@
 
 
 def a2c(
-        # taken from stable-baselines
+        # Common settings
+        device=torch.device('cuda'),
         discount_factor=0.99,
-        n_steps=5,
-        value_loss_scaling=0.25,
-        entropy_loss_scaling=0.01,
+        last_frame=40e6,
+        # Adam optimizer settings
+        lr=7e-4,
+        eps=1.5e-4,
+        # Other optimization settings
         clip_grad=0.5,
-        lr=7e-4,  # RMSprop learning rate
-        alpha=0.99,  # RMSprop momentum decay
-        eps=1e-5,  # RMSprop stability
+        entropy_loss_scaling=0.01,
+        value_loss_scaling=0.5,
+        # Batch settings
         n_envs=16,
-        device=torch.device("cuda"),
+        n_steps=5,
 ):
+    final_anneal_step = last_frame / (n_steps * n_envs * 4)
     def _a2c(envs, writer=DummyWriter()):
         env = envs[0]
 
         value_model = nature_value_head().to(device)
         policy_model = nature_policy_head(envs[0]).to(device)
         feature_model = nature_features().to(device)
 
-        feature_optimizer = RMSprop(
-            feature_model.parameters(), alpha=alpha, lr=lr, eps=eps
-        )
-        value_optimizer = RMSprop(value_model.parameters(), alpha=alpha, lr=lr, eps=eps)
-        policy_optimizer = RMSprop(
-            policy_model.parameters(), alpha=alpha, lr=lr, eps=eps
-        )
+        feature_optimizer = Adam(feature_model.parameters(), lr=lr, eps=eps)
+        value_optimizer = Adam(value_model.parameters(), lr=lr, eps=eps)
+        policy_optimizer = Adam(policy_model.parameters(), lr=lr, eps=eps)
 
         features = FeatureNetwork(
             feature_model,
            feature_optimizer,
+            scheduler=CosineAnnealingLR(
+                feature_optimizer,
+                final_anneal_step,
+            ),
             clip_grad=clip_grad,
             writer=writer
         )
         v = VNetwork(
             value_model,
             value_optimizer,
+            scheduler=CosineAnnealingLR(
+                value_optimizer,
+                final_anneal_step,
+            ),
             loss_scaling=value_loss_scaling,
             clip_grad=clip_grad,
             writer=writer
@@ -54,6 +63,10 @@ def _a2c(envs, writer=DummyWriter()):
             policy_model,
             policy_optimizer,
             env.action_space.n,
+            scheduler=CosineAnnealingLR(
+                policy_optimizer,
+                final_anneal_step,
+            ),
             entropy_loss_scaling=entropy_loss_scaling,
             clip_grad=clip_grad,
             writer=writer
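One note on the new scheduling logic: `final_anneal_step` converts the training length in frames into a number of optimizer updates for the cosine schedule. A small sketch of the arithmetic with the defaults above (treating the factor of 4 as the Atari frame skip applied by `DeepmindAtariBody` is an assumption; the diff itself does not say):

```python
# Defaults from the preset above.
last_frame = 40e6   # total environment frames to train for
n_steps = 5         # rollout length per update
n_envs = 16         # parallel environments
frame_skip = 4      # assumption: frames consumed per agent step

# Frames per optimizer update = n_steps * n_envs * frame_skip = 320,
# so the cosine schedule anneals over 40e6 / 320 = 125000 updates.
final_anneal_step = last_frame / (n_steps * n_envs * frame_skip)
print(final_anneal_step)  # 125000.0
```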
