-
Notifications
You must be signed in to change notification settings - Fork 1.3k
Expand file tree
/
Copy pathmujoco_ddpg_hl.py
More file actions
78 lines (66 loc) · 2.79 KB
/
mujoco_ddpg_hl.py
File metadata and controls
78 lines (66 loc) · 2.79 KB
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
#!/usr/bin/env python3
import os
from typing import Literal
from sensai.util import logging
from examples.mujoco.mujoco_env import MujocoEnvFactory
from tianshou.highlevel.config import OffPolicyTrainingConfig
from tianshou.highlevel.experiment import (
DDPGExperimentBuilder,
ExperimentConfig,
)
from tianshou.highlevel.params.algorithm_params import DDPGParams
from tianshou.highlevel.params.noise import MaxActionScaledGaussian
def main(
    task: str = "Ant-v4",
    persistence_base_dir: str = "log",
    num_experiments: int = 1,
    experiment_launcher: Literal["sequential", "joblib"] = "joblib",
    max_epochs: int = 50,
    epoch_num_steps: int = 5000,
) -> None:
    """
    Run DDPG training on a MuJoCo task via the high-level experiment API.

    The experiment may be repeated several times with different seeds, with the results
    evaluated using rliable.

    :param task: name of the MuJoCo task to train on.
    :param persistence_base_dir: root directory for logs and saved experiment data;
        the task name is appended to it.
    :param num_experiments: how many experiments to run; they differ only in their seeds.
    :param experiment_launcher: which launcher runs the experiments; only has an effect if
        `num_experiments>1`. "joblib" executes whole experiments in parallel.
    :param max_epochs: upper bound on the number of training epochs.
    :param epoch_num_steps: number of environment steps in one epoch.
    """
    # Everything is persisted under an absolute, task-specific directory:
    # <persistence_base_dir>/<task>.
    task_dir = os.path.join(persistence_base_dir, task)
    task_dir = os.path.abspath(task_dir)
    exp_cfg = ExperimentConfig(persistence_base_dir=task_dir, watch=False)

    # Off-policy training setup: a single training environment, ten test environments.
    train_cfg = OffPolicyTrainingConfig(
        max_epochs=max_epochs,
        epoch_num_steps=epoch_num_steps,
        num_training_envs=1,
        num_test_envs=10,
        buffer_size=1_000_000,
        batch_size=256,
        collection_step_num_env_steps=1,
        update_step_num_gradient_steps_per_sample=1,
        start_timesteps=25000,
        start_timesteps_random=True,
    )

    mujoco_envs = MujocoEnvFactory(task, obs_norm=False)

    # Actor and critic use the same hidden layer sizes.
    net_widths = (256, 256)

    # Assemble the experiment step by step; each `with_*` call returns the builder to use next.
    builder = DDPGExperimentBuilder(mujoco_envs, exp_cfg, train_cfg)
    action_noise = MaxActionScaledGaussian(0.1)
    ddpg_params = DDPGParams(
        actor_lr=1e-3,
        critic_lr=1e-3,
        gamma=0.99,
        tau=0.005,
        exploration_noise=action_noise,
        n_step_return_horizon=1,
    )
    builder = builder.with_ddpg_params(ddpg_params)
    builder = builder.with_actor_factory_default(net_widths)
    builder = builder.with_critic_factory_default(net_widths)

    builder.build_and_run(num_experiments=num_experiments, launcher=experiment_launcher)
if __name__ == "__main__":
    # Script entry point: hand `main` to the CLI runner of the imported `logging`
    # module (sensai.util), with the log level set to INFO.
    result = logging.run_cli(
        main,
        level=logging.INFO,
    )