ROLL/examples/qwen2.5-0.5B-agentic/agent_val_frozen_lake_gigpo.yaml at main · aoshen524/ROLL · GitHub

1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
defaults:
  - ../config/step_envs@_here_
  - ../config/deepspeed_zero@_here_
  - ../config/deepspeed_zero2@_here_
  - ../config/deepspeed_zero3@_here_
  - ../config/deepspeed_zero3_cpuoffload@_here_

hydra:
  run:
    dir: .
  output_subdir: null

exp_name: "agentic_pipeline"
seed: 42
logging_dir: ./output/logs
output_dir: ./output
render_save_dir: ./output/render
system_envs:
  USE_MODELSCOPE: '1'

#track_with: wandb
#tracker_kwargs:
#  api_key:
#  project: roll-agentic
#  name: ${exp_name}_sokoban
#  notes: "agentic_pipeline"
#  tags:
#    - agentic
#    - roll
#    - baseline

track_with: tensorboard
tracker_kwargs:
  log_dir: /data/oss_bucket_0/yali/llm/tensorboard/roll_exp/agentic_frozen_lake

checkpoint_config:
  type: file_system
  output_dir: /data/cpfs_0/rl_examples/models/${exp_name}

num_gpus_per_node: 8

max_steps: 10240
save_steps: 10000
logging_steps: 1
eval_steps: 10
resume_from_checkpoint: false

rollout_batch_size: 1024
val_batch_size: 1024
sequence_length: 1024

advantage_clip: 20
ppo_epochs: 1

# gigpo
adv_estimator: "gigpo"
batch_adjust_mode: "copy"
step_reward_weight: 1.0
episode_reward_weight: 1.0
step_reward_gamma: 0.95

# pg_clip: 0.1
#dual_clip_loss: True
init_kl_coef: 0.0
whiten_advantages: false
entropy_loss_coef: 0
max_grad_norm: 1.0
use_kl_loss: true
kl_loss_coef: 0.01

pretrain: Qwen/Qwen2.5-0.5B-Instruct
reward_pretrain: Qwen/Qwen2.5-0.5B-Instruct

actor_train:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: false
    dtype: bf16
    model_type: ~
  training_args:
    learning_rate: 1.0e-6
    weight_decay: 0
    per_device_train_batch_size: 16
    gradient_accumulation_steps: 8
    warmup_steps: 100
    lr_scheduler_type: cosine
  data_args:
    template: qwen2_5
  strategy_args:
#    strategy_name: deepspeed_train
#    strategy_config: ${deepspeed_zero3}
    strategy_name: megatron_train
    strategy_config:
      tensor_model_parallel_size: 1
      pipeline_model_parallel_size: 1
      expert_model_parallel_size: 1
      use_distributed_optimizer: true
      recompute_granularity: full
  device_mapping: list(range(0,8))
  infer_batch_size: 16

actor_infer:
  model_args:
    disable_gradient_checkpointing: true
    dtype: bf16
  generating_args:
    max_new_tokens: 128 # single-turn response length
    top_p: 0.99
    top_k: 100
    num_beams: 1
    temperature: 0.99
    num_return_sequences: 1
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: vllm
    strategy_config:
      gpu_memory_utilization: 0.8
      block_size: 16
      load_format: auto
  device_mapping: list(range(0,8))

reference:
  model_args:
    attn_implementation: fa2
    disable_gradient_checkpointing: true
    dtype: bf16
    model_type: ~
  data_args:
    template: qwen2_5
  strategy_args:
    strategy_name: hf_infer
    strategy_config: ~
  device_mapping: list(range(0,8))
  infer_batch_size: 16

reward_normalization:
  grouping: traj_group_id # 可以tags(env_type)/traj_group_id(group)/batch(rollout_batch)... group_by计算reward/adv
  method: mean # asym_clip / identity / mean_std / mean

train_env_manager:
  format_penalty: -0.1 # sokoban env penalty_for_step=-0.1
  max_env_num_per_worker: 16
  num_env_groups: 128
  # under the same group, the env config and env seed are ensured to be equal
  group_size: 8
  tags: [FrozenLake]
  num_groups_partition: [128] # If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

val_env_manager:
  max_env_num_per_worker: 32
  num_env_groups: 1024
  group_size: 1 # should be set to 1 because val temperature is set to 0 and same prompt leads to same output
  tags: [SimpleSokoban, LargerSokoban, SokobanDifferentGridVocab, FrozenLake]
  num_groups_partition: [256, 256, 256, 256] # TODO: If not set, all env names divide nums equally. Under the same group, the env config and env seed (prompt) are equal in each generation

# Here, you can override variables defined in the imported envs. max_tokens_per_step: 128 in custom_env.SimpleSokoban, here replaced by 64
max_tokens_per_step: 64

custom_envs:
  SimpleSokoban:
    ${custom_env.SimpleSokoban}
  LargerSokoban:
    ${custom_env.LargerSokoban}
  SokobanDifferentGridVocab:
    ${custom_env.SokobanDifferentGridVocab}
  FrozenLake:
    ${custom_env.FrozenLake}
  FrozenLakeThink:
    ${custom_env.FrozenLakeThink}