mujoco_ppo_hl.py

#!/usr/bin/env python3

import os
from typing import Literal

import torch
from sensai.util import logging

from examples.mujoco.mujoco_env import MujocoEnvFactory
from tianshou.highlevel.config import OnPolicyTrainingConfig
from tianshou.highlevel.experiment import (
    ExperimentConfig,
    PPOExperimentBuilder,
)
from tianshou.highlevel.params.algorithm_params import PPOParams
from tianshou.highlevel.params.lr_scheduler import LRSchedulerFactoryFactoryLinear

def main(
    task: str = "Ant-v4",
    persistence_base_dir: str = "log",
    num_experiments: int = 1,
    experiment_launcher: Literal["sequential", "joblib"] = "sequential",
    max_epochs: int = 100,
    epoch_num_steps: int = 30000,
) -> None:
    """Train an agent using PPO on a specified MuJoCo task, potentially running multiple
    experiments with different seeds and evaluating the results using rliable.

    :param task: the MuJoCo task to train on.
    :param persistence_base_dir: the base directory for logging and saving experiment data;
        the task name will be appended to it.
    :param num_experiments: the number of experiments to run. The experiments differ
        exclusively in their seeds.
    :param experiment_launcher: the type of experiment launcher to use; only has an effect
        if `num_experiments > 1`. Use "joblib" for parallel execution of whole experiments.
    :param max_epochs: the maximum number of training epochs.
    :param epoch_num_steps: the number of environment steps per epoch.
    """
    persistence_base_dir = os.path.abspath(os.path.join(persistence_base_dir, task))
    experiment_config = ExperimentConfig(persistence_base_dir=persistence_base_dir, watch=False)

    training_config = OnPolicyTrainingConfig(
        max_epochs=max_epochs,
        epoch_num_steps=epoch_num_steps,
        batch_size=64,
        num_training_envs=64,
        num_test_envs=10,
        buffer_size=4096,
        collection_step_num_env_steps=2048,
        update_step_num_repetitions=1,
    )
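
    # The factory below creates the vectorized training and test environments;
    # with obs_norm=True, observations are normalized (running mean/std), the
    # usual setup for MuJoCo PPO benchmarks.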
    env_factory = MujocoEnvFactory(task, obs_norm=True)

    hidden_sizes = (64, 64)
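
    # The builder assembles the complete experiment from the configs above: PPO
    # hyperparameters plus default tanh-MLP actor and critic networks, each with
    # two hidden layers of 64 units.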
    experiment_builder = (
        PPOExperimentBuilder(env_factory, experiment_config, training_config)
        .with_ppo_params(
            PPOParams(
                gamma=0.99,
                gae_lambda=0.95,
                action_bound_method="clip",
                return_scaling=True,
                ent_coef=0.0,
                vf_coef=0.25,
                max_grad_norm=0.5,
                value_clip=False,
                advantage_normalization=False,
                eps_clip=0.2,
                dual_clip=None,
                recompute_advantage=True,
                lr=3e-4,
                lr_scheduler=LRSchedulerFactoryFactoryLinear(training_config),
            ),
        )
        .with_actor_factory_default(hidden_sizes, torch.nn.Tanh, continuous_unbounded=True)
        .with_critic_factory_default(hidden_sizes, torch.nn.Tanh)
    )
    experiment_builder.build_and_run(num_experiments=num_experiments, launcher=experiment_launcher)


if __name__ == "__main__":
    result = logging.run_cli(main, level=logging.INFO)
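
# Example invocation (a sketch, assuming `logging.run_cli` exposes the parameters of
# `main` as command-line flags in the usual jsonargparse style):
#
#   python mujoco_ppo_hl.py --task Ant-v4 --num_experiments 5 --experiment_launcher joblib
#
# This would run five PPO experiments on Ant-v4 that differ only in their seeds, launch
# them in parallel via joblib, and persist logs and results under log/Ant-v4.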