# Source: tianshou/examples/mujoco/mujoco_ddpg_hl.py (thu-ml/tianshou, branch master)
#!/usr/bin/env python3

import os
from typing import Literal

from sensai.util import logging

from examples.mujoco.mujoco_env import MujocoEnvFactory
from tianshou.highlevel.config import OffPolicyTrainingConfig
from tianshou.highlevel.experiment import (
    DDPGExperimentBuilder,
    ExperimentConfig,
)
from tianshou.highlevel.params.algorithm_params import DDPGParams
from tianshou.highlevel.params.noise import MaxActionScaledGaussian


def main(
    task: str = "Ant-v4",
    persistence_base_dir: str = "log",
    num_experiments: int = 1,
    experiment_launcher: Literal["sequential", "joblib"] = "joblib",
    max_epochs: int = 50,
    epoch_num_steps: int = 5000,
) -> None:
    """
    Set up and run DDPG training on a MuJoCo task via the high-level experiment API.

    The function assembles the experiment, training and algorithm configuration, hands them to a
    :class:`DDPGExperimentBuilder` and then builds and runs the requested number of experiments.

    NOTE(review): no rliable-based evaluation is invoked directly in this function; whether
    ``build_and_run`` performs one should be confirmed against the builder's documentation.

    :param task: the MuJoCo task to train on.
    :param persistence_base_dir: the base directory for logging and saving experiment data;
        the task name is appended to it and the resulting path is made absolute.
    :param num_experiments: the number of experiments to run. The experiments differ exclusively in the seeds.
    :param experiment_launcher: the type of experiment launcher to use, only has an effect if `num_experiments>1`.
        You can use "joblib" for parallel execution of whole experiments.
    :param max_epochs: the maximum number of training epochs.
    :param epoch_num_steps: the number of environment steps per epoch.
    """
    # Persist everything for this run below <persistence_base_dir>/<task>, as an absolute path.
    task_dir = os.path.abspath(os.path.join(persistence_base_dir, task))
    experiment_config = ExperimentConfig(persistence_base_dir=task_dir, watch=False)

    training_config = OffPolicyTrainingConfig(
        max_epochs=max_epochs,
        epoch_num_steps=epoch_num_steps,
        num_training_envs=1,
        num_test_envs=10,
        buffer_size=1_000_000,
        batch_size=256,
        collection_step_num_env_steps=1,
        update_step_num_gradient_steps_per_sample=1,
        start_timesteps=25000,
        start_timesteps_random=True,
    )

    env_factory = MujocoEnvFactory(task, obs_norm=False)

    # The same hidden layer widths are used for both the actor and the critic network.
    layer_widths = (256, 256)

    builder = DDPGExperimentBuilder(env_factory, experiment_config, training_config)

    ddpg_params = DDPGParams(
        actor_lr=1e-3,
        critic_lr=1e-3,
        gamma=0.99,
        tau=0.005,
        exploration_noise=MaxActionScaledGaussian(0.1),
        n_step_return_horizon=1,
    )

    # Apply the configuration steps one at a time, always continuing with the returned builder.
    builder = builder.with_ddpg_params(ddpg_params)
    builder = builder.with_actor_factory_default(layer_widths)
    builder = builder.with_critic_factory_default(layer_widths)

    builder.build_and_run(num_experiments=num_experiments, launcher=experiment_launcher)


if __name__ == "__main__":
    # Expose `main`'s parameters on the command line via sensAI's CLI runner, logging at INFO level.
    # `main` is annotated as returning None, so the runner's return value is intentionally not kept.
    logging.run_cli(main, level=logging.INFO)