keras-team · hertschuh · Mar 3, 2025 · Feb 15, 2025 · Mar 1, 2025
diff --git a/examples/rl/actor_critic_cartpole.py b/examples/rl/actor_critic_cartpole.py
@@ -34,7 +34,8 @@
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 """
 """
@@ -56,6 +57,7 @@
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
 env = gym.make("CartPole-v0")  # Create the environment
+# Pass `render_mode='human'` to `gym.make` to show the agent's attempts
 env.seed(seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
@@ -96,12 +98,10 @@
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -116,7 +116,7 @@
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, _, _ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward
 

diff --git a/examples/rl/ipynb/actor_critic_cartpole.ipynb b/examples/rl/ipynb/actor_critic_cartpole.ipynb
@@ -45,7 +45,8 @@
     "\n",
     "### References\n",
     "\n",
-    "- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
+    "- [Environment documentation](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)\n",
+    "- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)\n",
     "- [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)\n"
    ]
   },
@@ -80,6 +81,7 @@
     "gamma = 0.99  # Discount factor for past rewards\n",
     "max_steps_per_episode = 10000\n",
     "env = gym.make(\"CartPole-v0\")  # Create the environment\n",
+    "# Pass `render_mode='human'` to `gym.make` to show the agent's attempts\n",
     "env.seed(seed)\n",
     "eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0\n"
    ]
@@ -148,13 +150,11 @@
     "episode_count = 0\n",
     "\n",
     "while True:  # Run until solved\n",
-    "    state = env.reset()\n",
+    "    state = env.reset()[0]\n",
     "    episode_reward = 0\n",
     "    with tf.GradientTape() as tape:\n",
     "        for timestep in range(1, max_steps_per_episode):\n",
-    "            # env.render(); Adding this line would show the attempts\n",
-    "            # of the agent in a pop up window.\n",
-    "\n",
+    "            \n",
     "            state = ops.convert_to_tensor(state)\n",
     "            state = ops.expand_dims(state, 0)\n",
     "\n",
@@ -168,7 +168,7 @@
     "            action_probs_history.append(ops.log(action_probs[0, action]))\n",
     "\n",
     "            # Apply the sampled action in our environment\n",
-    "            state, reward, done, _ = env.step(action)\n",
+    "            state, reward, done, _, _ = env.step(action)\n",
     "            rewards_history.append(reward)\n",
     "            episode_reward += reward\n",
     "\n",

diff --git a/examples/rl/md/actor_critic_cartpole.md b/examples/rl/md/actor_critic_cartpole.md
@@ -36,7 +36,8 @@ remains upright. The agent, therefore, must learn to keep the pole from falling
 
 ### References
 
-- [CartPole](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
+- [Environment documentation](https://www.gymlibrary.dev/environments/classic_control/cart_pole/)
+- [CartPole paper](http://www.derongliu.org/adp/adp-cdrom/Barto1983.pdf)
 - [Actor Critic Method](https://hal.inria.fr/hal-00840470/document)
 
 
@@ -60,6 +61,7 @@ seed = 42
 gamma = 0.99  # Discount factor for past rewards
 max_steps_per_episode = 10000
 env = gym.make("CartPole-v0")  # Create the environment
+# Pass `render_mode='human'` to `gym.make` to show the agent's attempts
 env.seed(seed)
 eps = np.finfo(np.float32).eps.item()  # Smallest number such that 1.0 + eps != 1.0
 
@@ -108,12 +110,10 @@ running_reward = 0
 episode_count = 0
 
 while True:  # Run until solved
-    state = env.reset()
+    state = env.reset()[0]
     episode_reward = 0
     with tf.GradientTape() as tape:
         for timestep in range(1, max_steps_per_episode):
-            # env.render(); Adding this line would show the attempts
-            # of the agent in a pop up window.
 
             state = ops.convert_to_tensor(state)
             state = ops.expand_dims(state, 0)
@@ -128,7 +128,7 @@ while True:  # Run until solved
             action_probs_history.append(ops.log(action_probs[0, action]))
 
             # Apply the sampled action in our environment
-            state, reward, done, _ = env.step(action)
+            state, reward, done, _, _ = env.step(action)
             rewards_history.append(reward)
             episode_reward += reward