Skip to content

Commit 440aa81

Browse files
authored
[RLlib] Cleanup examples folder #14: Add example script for how to resume a tune.Tuner.fit() experiment from a checkpoint. (ray-project#45681)
1 parent 0e6864a commit 440aa81

File tree

11 files changed

+356
-46
lines changed

11 files changed

+356
-46
lines changed

rllib/BUILD

Lines changed: 36 additions & 9 deletions
Original file line numberDiff line numberDiff line change
@@ -2120,15 +2120,6 @@ py_test(
21202120
# subdirectory: checkpoints/
21212121
# ....................................
21222122

2123-
#@OldAPIStack
2124-
py_test(
2125-
name = "examples/checkpoints/cartpole_dqn_export",
2126-
main = "examples/checkpoints/cartpole_dqn_export.py",
2127-
tags = ["team:rllib", "exclusive", "examples"],
2128-
size = "small",
2129-
srcs = ["examples/checkpoints/cartpole_dqn_export.py"],
2130-
)
2131-
21322123
py_test(
21332124
name = "examples/checkpoints/checkpoint_by_custom_criteria",
21342125
main = "examples/checkpoints/checkpoint_by_custom_criteria.py",
@@ -2138,6 +2129,42 @@ py_test(
21382129
args = ["--enable-new-api-stack", "--stop-reward=150.0", "--num-cpus=8"]
21392130
)
21402131

2132+
py_test(
2133+
name = "examples/checkpoints/continue_training_from_checkpoint",
2134+
main = "examples/checkpoints/continue_training_from_checkpoint.py",
2135+
tags = ["team:rllib", "exclusive", "examples"],
2136+
size = "large",
2137+
srcs = ["examples/checkpoints/continue_training_from_checkpoint.py"],
2138+
args = ["--enable-new-api-stack", "--as-test"]
2139+
)
2140+
2141+
py_test(
2142+
name = "examples/checkpoints/continue_training_from_checkpoint_multi_agent",
2143+
main = "examples/checkpoints/continue_training_from_checkpoint.py",
2144+
tags = ["team:rllib", "exclusive", "examples"],
2145+
size = "large",
2146+
srcs = ["examples/checkpoints/continue_training_from_checkpoint.py"],
2147+
args = ["--enable-new-api-stack", "--as-test", "--num-agents=2", "--stop-reward-crash=400.0", "--stop-reward=900.0"]
2148+
)
2149+
2150+
#@OldAPIStack
2151+
py_test(
2152+
name = "examples/checkpoints/continue_training_from_checkpoint_old_api_stack",
2153+
main = "examples/checkpoints/continue_training_from_checkpoint.py",
2154+
tags = ["team:rllib", "exclusive", "examples"],
2155+
size = "large",
2156+
srcs = ["examples/checkpoints/continue_training_from_checkpoint.py"],
2157+
args = ["--as-test"]
2158+
)
2159+
2160+
py_test(
2161+
name = "examples/checkpoints/cartpole_dqn_export",
2162+
main = "examples/checkpoints/cartpole_dqn_export.py",
2163+
tags = ["team:rllib", "exclusive", "examples"],
2164+
size = "small",
2165+
srcs = ["examples/checkpoints/cartpole_dqn_export.py"],
2166+
)
2167+
21412168
#@OldAPIStack
21422169
py_test(
21432170
name = "examples/checkpoints/onnx_tf2",

rllib/algorithms/dqn/dqn.py

Lines changed: 11 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -630,22 +630,27 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
630630
)
631631

632632
self.metrics.log_dict(
633-
self.metrics.peek(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED, default={}),
633+
self.metrics.peek(
634+
(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED), default={}
635+
),
634636
key=NUM_AGENT_STEPS_SAMPLED_LIFETIME,
635637
reduce="sum",
636638
)
637639
self.metrics.log_value(
638640
NUM_ENV_STEPS_SAMPLED_LIFETIME,
639-
self.metrics.peek(ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED, default=0),
641+
self.metrics.peek((ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED), default=0),
640642
reduce="sum",
641643
)
642644
self.metrics.log_value(
643645
NUM_EPISODES_LIFETIME,
644-
self.metrics.peek(ENV_RUNNER_RESULTS, NUM_EPISODES, default=0),
646+
self.metrics.peek((ENV_RUNNER_RESULTS, NUM_EPISODES), default=0),
645647
reduce="sum",
646648
)
647649
self.metrics.log_dict(
648-
self.metrics.peek(ENV_RUNNER_RESULTS, NUM_MODULE_STEPS_SAMPLED, default={}),
650+
self.metrics.peek(
651+
(ENV_RUNNER_RESULTS, NUM_MODULE_STEPS_SAMPLED),
652+
default={},
653+
),
649654
key=NUM_MODULE_STEPS_SAMPLED_LIFETIME,
650655
reduce="sum",
651656
)
@@ -708,7 +713,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
708713
self.metrics.log_value(
709714
NUM_ENV_STEPS_TRAINED_LIFETIME,
710715
self.metrics.peek(
711-
LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED
716+
(LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED)
712717
),
713718
reduce="sum",
714719
)
@@ -725,7 +730,7 @@ def _training_step_new_api_stack(self, *, with_noise_reset) -> ResultDict:
725730
# TODO (sven): Uncomment this once agent steps are available in the
726731
# Learner stats.
727732
# self.metrics.log_dict(self.metrics.peek(
728-
# LEARNER_RESULTS, NUM_AGENT_STEPS_TRAINED, default={}
733+
# (LEARNER_RESULTS, NUM_AGENT_STEPS_TRAINED), default={}
729734
# ), key=NUM_AGENT_STEPS_TRAINED_LIFETIME, reduce="sum")
730735

731736
# Update replay buffer priorities.

rllib/algorithms/dreamerv3/dreamerv3.py

Lines changed: 3 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -582,13 +582,13 @@ def training_step(self) -> ResultDict:
582582
self.metrics.log_dict(
583583
{
584584
NUM_AGENT_STEPS_SAMPLED_LIFETIME: self.metrics.peek(
585-
ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED
585+
(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED)
586586
),
587587
NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek(
588-
ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED
588+
(ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED)
589589
),
590590
NUM_EPISODES_LIFETIME: self.metrics.peek(
591-
ENV_RUNNER_RESULTS, NUM_EPISODES
591+
(ENV_RUNNER_RESULTS, NUM_EPISODES)
592592
),
593593
},
594594
reduce="sum",

rllib/algorithms/dreamerv3/tf/dreamerv3_tf_learner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -158,7 +158,7 @@ def compute_gradients(
158158
# Take individual loss term from the registered metrics for
159159
# the main module.
160160
self.metrics.peek(
161-
DEFAULT_MODULE_ID, component.upper() + "_L_total"
161+
(DEFAULT_MODULE_ID, component.upper() + "_L_total")
162162
),
163163
self.filter_param_dict_for_optimizer(
164164
self._params, self.get_optimizer(optimizer_name=component)

rllib/algorithms/dreamerv3/utils/summaries.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -217,9 +217,7 @@ def report_dreamed_eval_trajectory_vs_samples(
217217
the report/videos.
218218
"""
219219
dream_data = metrics.peek(
220-
LEARNER_RESULTS,
221-
DEFAULT_MODULE_ID,
222-
"dream_data",
220+
(LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data"),
223221
default={},
224222
)
225223
metrics.delete(LEARNER_RESULTS, DEFAULT_MODULE_ID, "dream_data", key_error=False)

rllib/algorithms/ppo/ppo.py

Lines changed: 8 additions & 6 deletions
Original file line numberDiff line numberDiff line change
@@ -463,13 +463,13 @@ def _training_step_new_api_stack(self) -> ResultDict:
463463
self.metrics.log_dict(
464464
{
465465
NUM_AGENT_STEPS_SAMPLED_LIFETIME: self.metrics.peek(
466-
ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED
466+
(ENV_RUNNER_RESULTS, NUM_AGENT_STEPS_SAMPLED)
467467
),
468468
NUM_ENV_STEPS_SAMPLED_LIFETIME: self.metrics.peek(
469-
ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED
469+
(ENV_RUNNER_RESULTS, NUM_ENV_STEPS_SAMPLED)
470470
),
471471
NUM_EPISODES_LIFETIME: self.metrics.peek(
472-
ENV_RUNNER_RESULTS, NUM_EPISODES
472+
(ENV_RUNNER_RESULTS, NUM_EPISODES)
473473
),
474474
},
475475
reduce="sum",
@@ -494,10 +494,10 @@ def _training_step_new_api_stack(self) -> ResultDict:
494494
self.metrics.log_dict(
495495
{
496496
NUM_ENV_STEPS_TRAINED_LIFETIME: self.metrics.peek(
497-
LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED
497+
(LEARNER_RESULTS, ALL_MODULES, NUM_ENV_STEPS_TRAINED)
498498
),
499499
# NUM_MODULE_STEPS_TRAINED_LIFETIME: self.metrics.peek(
500-
# LEARNER_RESULTS, NUM_MODULE_STEPS_TRAINED
500+
# (LEARNER_RESULTS, NUM_MODULE_STEPS_TRAINED)
501501
# ),
502502
},
503503
reduce="sum",
@@ -531,7 +531,9 @@ def _training_step_new_api_stack(self) -> ResultDict:
531531
if self.config.use_kl_loss:
532532
for mid in modules_to_update:
533533
kl = convert_to_numpy(
534-
self.metrics.peek(LEARNER_RESULTS, mid, LEARNER_RESULTS_KL_KEY)
534+
self.metrics.peek(
535+
(LEARNER_RESULTS, mid, LEARNER_RESULTS_KL_KEY)
536+
)
535537
)
536538
if np.isnan(kl):
537539
logger.warning(

rllib/algorithms/sac/torch/sac_torch_learner.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -314,7 +314,7 @@ def compute_gradients(
314314
for component in (
315315
["qf", "policy", "alpha"] + ["qf_twin"] if config.twin_q else []
316316
):
317-
self.metrics.peek(module_id, component + "_loss").backward(
317+
self.metrics.peek((module_id, component + "_loss")).backward(
318318
retain_graph=True
319319
)
320320
grads.update(

rllib/examples/checkpoints/checkpoint_by_custom_criteria.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,7 +1,7 @@
11
"""Example extracting a checkpoint from n trials using one or more custom criteria.
22
33
This example:
4-
- runs a simple CartPole experiment with three different learning rates (three tune
4+
- runs a CartPole experiment with three different learning rates (three tune
55
"trials"). During the experiment, for each trial, we create a checkpoint at each
66
iteration.
77
- at the end of the experiment, we compare the trials and pick the one that performed

0 commit comments

Comments (0)