-
Notifications
You must be signed in to change notification settings - Fork 15
Expand file tree
/
Copy path12_half_cheetah_vel.py
More file actions
165 lines (147 loc) · 6.07 KB
/
12_half_cheetah_vel.py
File metadata and controls
165 lines (147 loc) · 6.07 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
from argparse import ArgumentParser
import wandb
import amago
from amago.envs import AMAGOEnv
from amago.envs.builtin.half_cheetah_v4_vel import HalfCheetahV4_MetaVelocity
from amago import cli_utils
def add_cli(parser):
    """Register this example's command-line options on ``parser``.

    The options cover the policy sequence length, the number of validation
    episodes per actor, the target-velocity range of the task distribution,
    and the meta-trial layout (steps per inner episode, inner episodes per
    trial for training and for evaluation).

    Args:
        parser: An ``argparse.ArgumentParser`` (or compatible object exposing
            ``add_argument``) to extend in place.

    Returns:
        The same ``parser`` object, so calls can be chained.
    """
    # (flag, add_argument keyword arguments), registered in this order.
    option_specs = (
        (
            "--policy_seq_len",
            {"type": int, "default": 32, "help": "Policy sequence length."},
        ),
        (
            "--eval_episodes_per_actor",
            {
                "type": int,
                "default": 1,
                "help": "Validation episodes per parallel actor.",
            },
        ),
        (
            "--task_min_velocity",
            {
                "type": float,
                "default": 0.0,
                "help": "Min running velocity the cheetah needs to be capable of to solve the meta-learning problem. Original benchmark used 0.",
            },
        ),
        (
            "--task_max_velocity",
            {
                "type": float,
                "default": 3.0,
                "help": "Max running velocity the cheetah needs to be capable of to solve the meta-learning problem. Original benchmark used 3. Agents in the default locomotion env (no reward randomization) reach > 10.",
            },
        ),
        (
            "--inner_episode_steps",
            {
                "type": int,
                "default": 200,
                "help": "Step horizon of each inner episode. Default 200 (combined with the default --k_train_episodes=3) keeps total trial length at 600 steps. Set 1000 with --k_train_episodes=1 to recover the unwrapped task.",
            },
        ),
        (
            "--k_train_episodes",
            {
                "type": int,
                "default": 3,
                "help": "Inner episodes per meta-trial during training. Default 3 makes the env a true meta-RL trial: a single hidden target velocity persists across 3 inner episodes (soft resets between). Set 1 to recover the unwrapped task.",
            },
        ),
        (
            # None means "not given"; the script falls back to --k_train_episodes.
            "--k_eval_episodes",
            {
                "type": int,
                "default": None,
                "help": "Inner episodes per meta-trial at eval time. Defaults to --k_train_episodes. Larger values (e.g. 10) probe how well the agent keeps adapting beyond its training horizon.",
            },
        ),
    )
    for flag, spec in option_specs:
        parser.add_argument(flag, **spec)
    return parser
"""
Because this task is so similar to the other gymnasium examples, this example script is deliberately
verbose in showing how you could customize the environment and create a train/test split.
If you don't edit anything, it is just a longer way to train/test on the default task
distribution (a target velocity sampled uniformly from [args.task_min_velocity, args.task_max_velocity]).
"""
class MyCustomHalfCheetahTrain(HalfCheetahV4_MetaVelocity):
    """Training environment; a hook for customizing which target velocities are sampled.

    As written it defers entirely to the parent class, so the training task
    distribution is the parent's default.
    """

    def sample_target_velocity(self) -> float:
        """Return the target velocity for the next task (parent's default draw)."""
        # If you replace this, stick to `random` (or be careful with np
        # default_rng) so tasks differ across async parallel actors.
        # Parent default: random.uniform(min_vel, max_vel).
        return super().sample_target_velocity()
class MyCustomHalfCheetahEval(HalfCheetahV4_MetaVelocity):
    """Evaluation environment; a hook for building a held-out / OOD task split.

    As written it defers entirely to the parent class, so evaluation uses the
    same task distribution as the parent's default.
    """

    def sample_target_velocity(self) -> float:
        """Return the target velocity for the next eval task (parent's default draw)."""
        # To create OOD eval tasks, return one of these instead, e.g.:
        #   random.uniform(self.task_min_velocity, self.task_max_velocity * 10.0)
        #   random.choice([0., 1., self.task_max_velocity * 1.2]), etc.
        # (requires `import random` if the file does not already import it)
        return super().sample_target_velocity()
if __name__ == "__main__":
    # Build the CLI: the shared AMAGO options plus this example's options.
    parser = ArgumentParser()
    cli_utils.add_common_cli(parser)
    add_cli(parser)
    args = parser.parse_args()

    # Eval trials reuse the training episode count unless explicitly overridden.
    k_eval = (
        args.k_eval_episodes
        if args.k_eval_episodes is not None
        else args.k_train_episodes
    )
    # Environment steps in one complete evaluation meta-trial.
    eval_trial_steps = args.inner_episode_steps * k_eval

    def make_train_env():
        """Create one training environment from the parsed CLI arguments."""
        return AMAGOEnv(
            MyCustomHalfCheetahTrain(
                task_min_velocity=args.task_min_velocity,
                task_max_velocity=args.task_max_velocity,
                max_episode_steps=args.inner_episode_steps,
                k_episodes=args.k_train_episodes,
            ),
            env_name="HalfCheetahV4Velocity",
        )

    def make_val_env():
        """Create one validation/test environment (uses ``k_eval`` inner episodes)."""
        return AMAGOEnv(
            MyCustomHalfCheetahEval(
                task_min_velocity=args.task_min_velocity,
                task_max_velocity=args.task_max_velocity,
                max_episode_steps=args.inner_episode_steps,
                k_episodes=k_eval,
            ),
            env_name="HalfCheetahV4Velocity",
        )

    config = {
        "amago.nets.traj_encoders.TformerTrajEncoder.pos_emb": "rope",
    }
    # switch sequence model
    traj_encoder_type = cli_utils.switch_traj_encoder(
        config,
        arch=args.traj_encoder,
        memory_size=args.memory_size,
        layers=args.memory_layers,
    )
    # switch agent
    agent_type = cli_utils.switch_agent(
        config,
        args.agent_type,
        reward_multiplier=1.0,  # gym locomotion returns are already large
        gamma=0.99,  # locomotion policies don't need long horizons - fall back to the default
        tau=0.005,
    )
    # "egreedy" exploration in continuous control is just the epsilon-scheduled random (normal)
    # noise from most TD3/DPPG implementations.
    exploration_type = cli_utils.switch_exploration(
        config, "egreedy", steps_anneal=500_000
    )
    cli_utils.use_config(config, args.configs)

    group_name = args.run_name
    for trial in range(args.trials):
        run_name = group_name + f"_trial_{trial}"
        experiment = cli_utils.create_experiment_from_cli(
            args,
            make_train_env=make_train_env,  # different train/val envs
            make_val_env=make_val_env,
            max_seq_len=args.policy_seq_len,
            traj_save_len=args.policy_seq_len * 6,
            run_name=run_name,
            tstep_encoder_type=amago.nets.tstep_encoders.FFTstepEncoder,
            traj_encoder_type=traj_encoder_type,
            exploration_wrapper_type=exploration_type,
            agent_type=agent_type,
            group_name=group_name,
            # one extra step per trial on top of the full trial length
            val_timesteps_per_epoch=args.eval_episodes_per_actor
            * (eval_trial_steps + 1),
            grad_clip=2.0,
            learning_rate=3e-4,
        )
        experiment.start()
        if args.ckpt is not None:
            experiment.load_checkpoint(args.ckpt)
        experiment.learn()
        # The trial length is CLI-configurable, so a fixed 10_000-step budget can be
        # shorter than a single eval trial (e.g. --inner_episode_steps 1000 with
        # --k_eval_episodes 10). Keep 10_000 as the floor but always allow at least
        # one full trial, using the same "+ 1" margin as the validation budget above.
        # NOTE(review): assumes results are only reported for completed trials - confirm.
        experiment.evaluate_test(
            make_val_env,
            timesteps=max(10_000, eval_trial_steps + 1),
            render=False,
        )
        experiment.delete_buffer_from_disk()
        wandb.finish()