[RLlib] Add LSTM option to run_connector_policy example (old API stack; w/ manual Connector.reset() call). (ray-project#45829)

sven1977 · web-flow · commit e48dba9770ee · 2024-06-11T09:25:18.000Z
diff --git a/rllib/BUILD b/rllib/BUILD
@@ -2052,6 +2052,15 @@ py_test(
     srcs = ["examples/_old_api_stack/connectors/run_connector_policy.py"],
 )
 
+py_test(
+    name = "examples/_old_api_stack/connectors/run_connector_policy_w_lstm",
+    main = "examples/_old_api_stack/connectors/run_connector_policy.py",
+    tags = ["team:rllib", "exclusive", "examples", "old_api_stack"],
+    size = "small",
+    srcs = ["examples/_old_api_stack/connectors/run_connector_policy.py"],
+    args = ["--use-lstm"],
+)
+
 py_test(
     name = "examples/_old_api_stack/connectors/adapt_connector_policy",
     main = "examples/_old_api_stack/connectors/adapt_connector_policy.py",
diff --git a/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py b/rllib/examples/_old_api_stack/connectors/prepare_checkpoint.py
@@ -4,9 +4,14 @@
 from ray.rllib.algorithms.sac import SACConfig
 
 
-def create_appo_cartpole_checkpoint(output_dir):
+def create_appo_cartpole_checkpoint(output_dir, use_lstm=False):
     # enable_connectors defaults to True. Just trying to be explicit here.
-    config = APPOConfig().environment("CartPole-v1").env_runners(enable_connectors=True)
+    config = (
+        APPOConfig()
+        .environment("CartPole-v1")
+        .env_runners(enable_connectors=True)
+        .training(model={"use_lstm": use_lstm})
+    )
     # Build algorithm object.
     algo = config.build()
     algo.save(checkpoint_dir=output_dir)
diff --git a/rllib/examples/_old_api_stack/connectors/run_connector_policy.py b/rllib/examples/_old_api_stack/connectors/run_connector_policy.py
@@ -2,6 +2,7 @@
 and use it in a serving/inference setting.
 """
 
+import argparse
 import gymnasium as gym
 import os
 import tempfile
@@ -13,6 +14,9 @@
 from ray.rllib.policy.policy import Policy
 from ray.rllib.utils.policy import local_policy_inference
 
+parser = argparse.ArgumentParser()
+parser.add_argument("--use-lstm", action="store_true", help="Add LSTM to the setup.")
+
 
 def run(checkpoint_path, policy_id):
     # __sphinx_doc_begin__
@@ -24,34 +28,45 @@ def run(checkpoint_path, policy_id):
 
     # Run CartPole.
     env = gym.make("CartPole-v1")
+    env_id = "env_1"
     obs, info = env.reset()
-    terminated = truncated = False
-    step = 0
-    while not terminated and not truncated:
-        step += 1
-
+    # Run for 2 episodes.
+    episodes = step = 0
+    while episodes < 2:
         # Use local_policy_inference() to run inference, so we do not have to
         # provide policy states or extra fetch dictionaries.
         # "env_1" and "agent_1" are dummy env and agent IDs to run connectors with.
         policy_outputs = local_policy_inference(
-            policy, "env_1", "agent_1", obs, explore=False
+            policy, env_id, "agent_1", obs, explore=False
         )
         assert len(policy_outputs) == 1
         action, _, _ = policy_outputs[0]
-        print(f"step {step}", obs, action)
+        print(f"episode {episodes} step {step}", obs, action)
 
         # Step environment forward one more step.
         obs, _, terminated, truncated, _ = env.step(action)
+        step += 1
+
+        # If the episode is done, reset the env and our connectors and start a new
+        # episode.
+        if terminated or truncated:
+            episodes += 1
+            step = 0
+            obs, info = env.reset()
+            policy.agent_connectors.reset(env_id)
+
     # __sphinx_doc_end__
 
 
 if __name__ == "__main__":
+    args = parser.parse_args()
+
     with tempfile.TemporaryDirectory() as tmpdir:
         policy_id = "default_policy"
 
         # Note, this is just for demo purpose.
         # Normally, you would use a policy checkpoint from a real training run.
-        create_appo_cartpole_checkpoint(tmpdir)
+        create_appo_cartpole_checkpoint(tmpdir, args.use_lstm)
         policy_checkpoint_path = os.path.join(
             tmpdir,
             "policies",