|
2 | 2 | Title: Actor Critic Method |
3 | 3 | Author: [Apoorv Nandan](https://twitter.com/NandanApoorv) |
4 | 4 | Date created: 2020/05/13 |
5 | | -Last modified: 2024/02/22 |
| 5 | +Last modified: 2026/02/28 |
6 | 6 | Description: Implement Actor Critic Method in CartPole environment. |
7 | 7 | Accelerator: NONE |
8 | 8 | Converted to Keras 3 by: [Sitam Meur](https://github.com/sitamgithub-MSIT) |
|
11 | 11 | """ |
12 | 12 | ## Introduction |
13 | 13 |
|
14 | | -This script shows an implementation of Actor Critic method on CartPole-V0 environment. |
| 14 | +This script shows an implementation of the Actor Critic method on the CartPole-V1 environment. |
15 | 15 |
|
16 | 16 | ### Actor Critic Method |
17 | 17 |
|
|
26 | 26 | Agent and Critic learn to perform their tasks, such that the recommended actions |
27 | 27 | from the actor maximize the rewards. |
28 | 28 |
|
29 | | -### CartPole-V0 |
| 29 | +### CartPole-V1 |
30 | 30 |
|
31 | 31 | A pole is attached to a cart placed on a frictionless track. The agent has to apply |
32 | 32 | force to move the cart. It is rewarded for every time step the pole |
|
45 | 45 | import os |
46 | 46 |
|
47 | 47 | os.environ["KERAS_BACKEND"] = "tensorflow" |
48 | | -import gym |
| 48 | +import gymnasium as gym |
49 | 49 | import numpy as np |
50 | 50 | import keras |
51 | 51 | from keras import ops |
|
57 | 57 | gamma = 0.99 # Discount factor for past rewards |
58 | 58 | max_steps_per_episode = 10000 |
59 | 59 | # Adding `render_mode='human'` will show the attempts of the agent |
60 | | -env = gym.make("CartPole-v0") # Create the environment |
61 | | -env.reset(seed=seed) |
| 60 | +env = gym.make("CartPole-v1") # Create the environment |
62 | 61 | eps = np.finfo(np.float32).eps.item() # Smallest number such that 1.0 + eps != 1.0 |
63 | 62 |
|
64 | 63 | """ |
|
98 | 97 | episode_count = 0 |
99 | 98 |
|
100 | 99 | while True: # Run until solved |
101 | | - state = env.reset()[0] |
| 100 | + state, _ = env.reset(seed=seed) |
102 | 101 | episode_reward = 0 |
103 | 102 | with tf.GradientTape() as tape: |
104 | 103 | for timestep in range(1, max_steps_per_episode): |
105 | 104 |
|
106 | | - state = ops.convert_to_tensor(state) |
| 105 | + state = ops.convert_to_tensor(np.array(state, dtype=np.float32)) |
107 | 106 | state = ops.expand_dims(state, 0) |
108 | 107 |
|
109 | 108 | # Predict action probabilities and estimated future rewards |
|
116 | 115 | action_probs_history.append(ops.log(action_probs[0, action])) |
117 | 116 |
|
118 | 117 | # Apply the sampled action in our environment |
119 | | - state, reward, done, *_ = env.step(action) |
| 118 | + state, reward, terminated, truncated, _ = env.step(action) |
| 119 | + done = terminated or truncated |
120 | 120 | rewards_history.append(reward) |
121 | 121 | episode_reward += reward |
122 | 122 |
|
|
176 | 176 | template = "running reward: {:.2f} at episode {}" |
177 | 177 | print(template.format(running_reward, episode_count)) |
178 | 178 |
|
179 | | - if running_reward > 195: # Condition to consider the task solved |
| 179 | + if running_reward > 475: # Condition to consider the task solved |
180 | 180 | print("Solved at episode {}!".format(episode_count)) |
181 | 181 | break |
182 | 182 | """ |
|
0 commit comments