diff --git a/examples/rl/actor_critic_cartpole.py b/examples/rl/actor_critic_cartpole.py index c44552446c..a4e1496ba0 100644 --- a/examples/rl/actor_critic_cartpole.py +++ b/examples/rl/actor_critic_cartpole.py @@ -2,16 +2,16 @@ Title: Actor Critic Method Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) Date created: 2020/05/13 -Last modified: 2024/02/22 +Last modified: 2025/01/07 Description: Implement Actor Critic Method in CartPole environment. -Accelerator: NONE +Accelerator: None Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) """ """ ## Introduction -This script shows an implementation of Actor Critic method on CartPole-V0 environment. +This script shows an implementation of Actor Critic method on CartPole-V1 environment. ### Actor Critic Method @@ -26,7 +26,7 @@ Agent and Critic learn to perform their tasks, such that the recommended actions from the actor maximize the rewards. -### CartPole-V0 +### CartPole-V1 A pole is attached to a cart placed on a frictionless track. The agent has to apply force to move the cart. It is rewarded for every time step the pole @@ -45,7 +45,7 @@ import os os.environ["KERAS_BACKEND"] = "tensorflow" -import gym +import gymnasium as gym import numpy as np import keras from keras import ops @@ -57,7 +57,7 @@ gamma = 0.99 # Discount factor for past rewards max_steps_per_episode = 10000 # Adding `render_mode='human'` will show the attempts of the agent -env = gym.make("CartPole-v0") # Create the environment +env = gym.make("CartPole-v1") # Create the environment env.reset(seed=seed) eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0 @@ -98,12 +98,12 @@ episode_count = 0 while True: # Run until solved - state = env.reset()[0] + obs, _ = env.reset() episode_reward = 0 with tf.GradientTape() as tape: for timestep in range(1, max_steps_per_episode): - state = ops.convert_to_tensor(state) + state = ops.convert_to_tensor(obs) state = ops.expand_dims(state, 0) # Predict action probabilities and estimated future rewards @@ -116,10 +116,11 @@ action_probs_history.append(ops.log(action_probs[0, action])) # Apply the sampled action in our environment - state, reward, done, *_ = env.step(action) + obs, reward, terminated, truncated, _ = env.step(action) rewards_history.append(reward) episode_reward += reward + done = terminated or truncated if done: break diff --git a/examples/rl/ipynb/actor_critic_cartpole.ipynb b/examples/rl/ipynb/actor_critic_cartpole.ipynb index 47d03a83fc..df7711cfb1 100644 --- a/examples/rl/ipynb/actor_critic_cartpole.ipynb +++ b/examples/rl/ipynb/actor_critic_cartpole.ipynb @@ -10,7 +10,7 @@ "\n", "**Author:** [Apoorv Nandan](https://twitter.com/NandanApoorv)
\n", "**Date created:** 2020/05/13
\n", - "**Last modified:** 2024/02/22
\n", + "**Last modified:** 2025/01/07
\n", "**Description:** Implement Actor Critic Method in CartPole environment." ] }, @@ -22,7 +22,7 @@ "source": [ "## Introduction\n", "\n", - "This script shows an implementation of Actor Critic method on CartPole-V0 environment.\n", + "This script shows an implementation of Actor Critic method on CartPole-V1 environment.\n", "\n", "### Actor Critic Method\n", "\n", @@ -37,7 +37,7 @@ "Agent and Critic learn to perform their tasks, such that the recommended actions\n", "from the actor maximize the rewards.\n", "\n", - "### CartPole-V0\n", + "### CartPole-V1\n", "\n", "A pole is attached to a cart placed on a frictionless track. The agent has to apply\n", "force to move the cart. It is rewarded for every time step the pole\n", @@ -47,7 +47,7 @@ "\n", "- [Environment documentation](https://gymnasium.farama.org/environments/classic_control/cart_pole/)\n", "- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n", - "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n" + "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)" ] }, { @@ -56,12 +56,12 @@ "colab_type": "text" }, "source": [ - "## Setup\n" + "## Setup" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -70,7 +70,7 @@ "import os\n", "\n", "os.environ[\"KERAS_BACKEND\"] = \"tensorflow\"\n", - "import gym\n", + "import gymnasium as gym\n", "import numpy as np\n", "import keras\n", "from keras import ops\n", @@ -82,7 +82,7 @@ "gamma = 0.99 # Discount factor for past rewards\n", "max_steps_per_episode = 10000\n", "# Adding `render_mode='human'` will show the attempts of the agent\n", - "env = gym.make(\"CartPole-v0\") # Create the environment\n", + "env = gym.make(\"CartPole-v1\") # Create the environment\n", "env.reset(seed=seed)\n", "eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0" ] @@ -102,12 +102,12 @@ "2. Critic: This takes as input the state of our environment and returns\n", "an estimate of total rewards in the future.\n", "\n", - "In our implementation, they share the initial layer.\n" + "In our implementation, they share the initial layer." ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -131,12 +131,12 @@ "colab_type": "text" }, "source": [ - "## Train\n" + "## Train" ] }, { "cell_type": "code", - "execution_count": null, + "execution_count": 0, "metadata": { "colab_type": "code" }, @@ -151,12 +151,12 @@ "episode_count = 0\n", "\n", "while True: # Run until solved\n", - " state = env.reset()[0]\n", + " obs, _ = env.reset()\n", " episode_reward = 0\n", " with tf.GradientTape() as tape:\n", " for timestep in range(1, max_steps_per_episode):\n", "\n", - " state = ops.convert_to_tensor(state)\n", + " state = ops.convert_to_tensor(obs)\n", " state = ops.expand_dims(state, 0)\n", "\n", " # Predict action probabilities and estimated future rewards\n", @@ -169,10 +169,11 @@ " action_probs_history.append(ops.log(action_probs[0, action]))\n", "\n", " # Apply the sampled action in our environment\n", - " state, reward, done, *_ = env.step(action)\n", + " obs, reward, terminated, truncated, _ = env.step(action)\n", " rewards_history.append(reward)\n", " episode_reward += reward\n", "\n", + " done = terminated or truncated\n", " if done:\n", " break\n", "\n", @@ -245,12 +246,12 @@ "![Imgur](https://i.imgur.com/5gCs5kH.gif)\n", "\n", "In later stages of training:\n", - "![Imgur](https://i.imgur.com/5ziiZUD.gif)\n" + "![Imgur](https://i.imgur.com/5ziiZUD.gif)" ] } ], "metadata": { - "accelerator": "GPU", + "accelerator": "None", "colab": { "collapsed_sections": [], "name": "actor_critic_cartpole", @@ -273,9 +274,9 @@ "name": "python", "nbconvert_exporter": "python", "pygments_lexer": "ipython3", - "version": "3.12.8" + "version": "3.7.0" } }, "nbformat": 4, "nbformat_minor": 0 -} +} \ No newline at end of file diff --git a/examples/rl/md/actor_critic_cartpole.md b/examples/rl/md/actor_critic_cartpole.md index 5dbef18efe..c8383cc241 100644 --- a/examples/rl/md/actor_critic_cartpole.md +++ b/examples/rl/md/actor_critic_cartpole.md @@ -2,7 +2,7 @@ **Author:** [Apoorv Nandan](https://twitter.com/NandanApoorv)
**Date created:** 2020/05/13
-**Last modified:** 2024/02/22
+**Last modified:** 2025/01/07
**Description:** Implement Actor Critic Method in CartPole environment. @@ -13,7 +13,7 @@ --- ## Introduction -This script shows an implementation of Actor Critic method on CartPole-V0 environment. +This script shows an implementation of Actor Critic method on CartPole-V1 environment. ### Actor Critic Method @@ -28,7 +28,7 @@ the observed state of the environment to two possible outputs: Agent and Critic learn to perform their tasks, such that the recommended actions from the actor maximize the rewards. -### CartPole-V0 +### CartPole-V1 A pole is attached to a cart placed on a frictionless track. The agent has to apply force to move the cart. It is rewarded for every time step the pole @@ -40,16 +40,15 @@ remains upright. The agent, therefore, must learn to keep the pole from falling - [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf) - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document) - --- ## Setup - ```python import os + os.environ["KERAS_BACKEND"] = "tensorflow" -import gym +import gymnasium as gym import numpy as np import keras from keras import ops @@ -61,12 +60,10 @@ seed = 42 gamma = 0.99 # Discount factor for past rewards max_steps_per_episode = 10000 # Adding `render_mode='human'` will show the attempts of the agent -env = gym.make("CartPole-v0") # Create the environment +env = gym.make("CartPole-v1") # Create the environment env.reset(seed=seed) eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0 - ``` - --- ## Implement Actor Critic network @@ -80,7 +77,6 @@ an estimate of total rewards in the future. In our implementation, they share the initial layer. - ```python num_inputs = 4 num_actions = 2 @@ -92,14 +88,12 @@ action = layers.Dense(num_actions, activation="softmax")(common) critic = layers.Dense(1)(common) model = keras.Model(inputs=inputs, outputs=[action, critic]) - ``` --- ## Train - ```python optimizer = keras.optimizers.Adam(learning_rate=0.01) huber_loss = keras.losses.Huber() @@ -110,12 +104,12 @@ running_reward = 0 episode_count = 0 while True: # Run until solved - state = env.reset()[0] + obs, _ = env.reset() episode_reward = 0 with tf.GradientTape() as tape: for timestep in range(1, max_steps_per_episode): - state = ops.convert_to_tensor(state) + state = ops.convert_to_tensor(obs) state = ops.expand_dims(state, 0) # Predict action probabilities and estimated future rewards @@ -128,10 +122,11 @@ while True: # Run until solved action_probs_history.append(ops.log(action_probs[0, action])) # Apply the sampled action in our environment - state, reward, done, *_ = env.step(action) + obs, reward, terminated, truncated, _ = env.step(action) rewards_history.append(reward) episode_reward += reward + done = terminated or truncated if done: break @@ -191,47 +186,31 @@ while True: # Run until solved if running_reward > 195: # Condition to consider the task solved print("Solved at episode {}!".format(episode_count)) break - ```
``` -running reward: 8.82 at episode 10 -running reward: 23.04 at episode 20 -running reward: 28.41 at episode 30 -running reward: 53.59 at episode 40 -running reward: 53.71 at episode 50 -running reward: 77.35 at episode 60 -running reward: 74.76 at episode 70 -running reward: 57.89 at episode 80 -running reward: 46.59 at episode 90 -running reward: 43.48 at episode 100 -running reward: 63.77 at episode 110 -running reward: 111.13 at episode 120 -running reward: 142.77 at episode 130 -running reward: 127.96 at episode 140 -running reward: 113.92 at episode 150 -running reward: 128.57 at episode 160 -running reward: 139.95 at episode 170 -running reward: 154.95 at episode 180 -running reward: 171.45 at episode 190 -running reward: 171.33 at episode 200 -running reward: 177.74 at episode 210 -running reward: 184.76 at episode 220 -running reward: 190.88 at episode 230 -running reward: 154.78 at episode 240 -running reward: 114.38 at episode 250 -running reward: 107.51 at episode 260 -running reward: 128.99 at episode 270 -running reward: 157.48 at episode 280 -running reward: 174.54 at episode 290 -running reward: 184.76 at episode 300 -running reward: 190.87 at episode 310 -running reward: 194.54 at episode 320 -Solved at episode 322! +running reward: 13.73 at episode 10 +running reward: 22.93 at episode 20 +running reward: 20.96 at episode 30 +running reward: 18.73 at episode 40 +running reward: 28.80 at episode 50 +running reward: 27.52 at episode 60 +running reward: 29.73 at episode 70 +running reward: 45.53 at episode 80 +running reward: 60.19 at episode 90 +running reward: 78.66 at episode 100 +running reward: 112.70 at episode 110 +running reward: 91.89 at episode 120 +running reward: 91.08 at episode 130 +running reward: 77.85 at episode 140 +running reward: 121.86 at episode 150 +running reward: 173.82 at episode 160 +Solved at episode 163! ```
+ --- ## Visualizations In early stages of training: @@ -239,4 +218,3 @@ In early stages of training: In later stages of training: ![Imgur](https://i.imgur.com/5ziiZUD.gif) -