
Commit 8a4863b

pytorchbot authored and committed
2025-12-13 nightly release (445bf59)
1 parent 6e1dbc7 commit 8a4863b

File tree: 7 files changed, +553 -6 lines changed


apps/grpo/slurm/qwen3_30b_a3b.yaml

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
# Grouped Relative Policy Optimization (GRPO)
# NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability.
# ./apps/grpo/slurm/submit.sh qwen3_30b_a3b

# Global configuration
group_size: 4
local_batch_size: 1 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 1024
model: "Qwen/Qwen3-30B-A3B"
off_by_n: 1 # Off by one by default

provisioner:
  launcher: slurm
  memMB: 2047962
  cpu: 192
  account: agentic-models
  qos: h200_capabilities_shared

# Main loop configuration
rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well

# Observability configuration
metric_logging:
  wandb:
    entity: agentic-models
    project: grpo-training
    group: grpo_exp_${oc.env:USER}
    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
  console:
    logging_mode: global_reduce

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
    model: ${model}
    tensor_parallel_size: 4
    pipeline_parallel_size: 1
    enforce_eager: false
  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 30B-A3B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${local_batch_size}
    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    expert_tensor_parallel_degree: 1
    disable_loss_parallel: true
  checkpoint:
    enable: true
    folder: ./checkpoint # The folder to save checkpoints to.
    initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: full

# Replay buffer configuration
replay_buffer:
  batch_size: ${local_batch_size}
  max_policy_age: ${off_by_n}
  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
  dp_size: 4

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 30B-A3B
    hf_assets_path: hf://${model}
  training:
    seq_len: ${trainer.training.seq_len}
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: false
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: -1
    tensor_parallel_degree: 1
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true

# All resource allocations
services:
  policy:
    procs: ${policy.engine_args.tensor_parallel_size}
    num_replicas: 1
    hosts: 1
    with_gpus: true
    mesh_name: policy
  ref_model:
    procs: 4
    num_replicas: 1
    with_gpus: true
    mesh_name: ref_model
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false
    mesh_name: reward_actor

actors:
  dataset:
    procs: 1
    with_gpus: false
    mesh_name: dataset
  trainer:
    procs: 4
    hosts: 1
    with_gpus: true
    mesh_name: trainer
  replay_buffer:
    procs: 1
    with_gpus: false
    mesh_name: replay_buffer
  compute_advantages:
    procs: 1
    with_gpus: false
    mesh_name: compute_advantages
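
Both configs compute seq_len through a custom `sum` interpolation resolver, which is not built into OmegaConf, so the launcher presumably registers one before the file is resolved. A minimal sketch of how such a resolver could be registered and exercised is below; only the resolver name `sum` and the two token fields come from the YAML above, while the registration call site and the demo config are illustrative assumptions, not the repo's actual code.

from omegaconf import OmegaConf

# Assumed registration: makes ${sum:${max_req_tokens},${max_res_tokens}}
# resolve to max_req_tokens + max_res_tokens when the config is read.
OmegaConf.register_new_resolver("sum", lambda *xs: sum(int(x) for x in xs))

cfg = OmegaConf.create({
    "max_req_tokens": 1024,
    "max_res_tokens": 1024,
    "seq_len": "${sum:${max_req_tokens},${max_res_tokens}}",
})
print(cfg.seq_len)  # 2048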

apps/grpo/slurm/qwen3_32b.yaml

Lines changed: 162 additions & 0 deletions
@@ -0,0 +1,162 @@
# Grouped Relative Policy Optimization (GRPO)
# NOTE - This has not been tested for correctness yet! All testing so far has been only for infrastructure stability.
# ./apps/grpo/slurm/submit.sh qwen3_32b

# Global configuration
group_size: 16
local_batch_size: 2 # per-device batch size
max_req_tokens: 1024
max_res_tokens: 1024
model: "Qwen/Qwen3-32B"
off_by_n: 1 # Off by one by default
compile: true # Enable torch.compile for trainer/ref_model, and CUDA graphs for vLLM

provisioner:
  launcher: slurm
  memMB: 2047962
  cpu: 192
  account: agentic-models
  qos: h200_capabilities_shared

# Main loop configuration
rollout_threads: 32 # setting this to 4x the number of policy replicas seems to work well

# Observability configuration
metric_logging:
  wandb:
    entity: agentic-models
    project: grpo-training
    group: grpo_exp_${oc.env:USER}
    logging_mode: global_reduce # global_reduce, per_rank_reduce, per_rank_no_reduce
  console:
    logging_mode: global_reduce

# Dataset configuration
dataset:
  path: "openai/gsm8k"
  revision: "main"
  data_split: "train"
  streaming: true
  model: ${model}

# Policy configuration
policy:
  engine_args: # https://docs.vllm.ai/en/v0.10.0/api/vllm/engine/arg_utils.html#vllm.engine.arg_utils.EngineArgs
    model: ${model}
    tensor_parallel_size: 4
    pipeline_parallel_size: 1
    enforce_eager: ${not:${compile}}
  sampling_params: # https://docs.vllm.ai/en/v0.10.0/api/vllm/sampling_params.html#vllm.sampling_params.SamplingParams
    n: ${group_size}
    max_tokens: ${max_res_tokens}
    temperature: 1.0
    top_p: 1.0

# Trainer configuration
trainer:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  optimizer:
    name: AdamW
    lr: 1e-5
    eps: 1e-8
  lr_scheduler:
    warmup_steps: 1
  training:
    local_batch_size: ${local_batch_size}
    seq_len: ${sum:${max_req_tokens},${max_res_tokens}} # seq_len >= max_req_tokens + max_res_tokens
    max_norm: 1.0
    steps: 1000000
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: ${compile}
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 8
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
    disable_loss_parallel: true
  checkpoint:
    enable: true
    folder: ./checkpoint # The folder to save checkpoints to.
    initial_load_path: hf://${model} # The path to load the initial checkpoint from. Ignored if `folder` exists.
    initial_load_in_hf: true # If true, interpret initial_load_path as a HuggingFace model repo
    last_save_in_hf: true
    interval: 500
    async_mode: "disabled"
  activation_checkpoint:
    mode: full

# Replay buffer configuration
replay_buffer:
  batch_size: ${local_batch_size}
  max_policy_age: ${off_by_n}
  # dp_size: ${trainer.parallelism.data_parallel_shard_degree} # Must equal trainer DP degree
  dp_size: 1

# Reference model configuration
ref_model:
  model:
    name: qwen3
    flavor: 32B
    hf_assets_path: hf://${model}
  training:
    seq_len: ${trainer.training.seq_len}
    dtype: bfloat16
    gc_freq: 1
  compile:
    enable: ${compile}
  parallelism:
    data_parallel_replicate_degree: 1
    data_parallel_shard_degree: 1
    tensor_parallel_degree: 4
    pipeline_parallel_degree: 1
    context_parallel_degree: 1
    expert_parallel_degree: 1
  checkpoint:
    enable: true
    initial_load_path: hf://${model}
    initial_load_in_hf: true

# All resource allocations
services:
  policy:
    procs: ${policy.engine_args.tensor_parallel_size}
    num_replicas: 4
    hosts: 1
    with_gpus: true
    mesh_name: policy
  ref_model:
    procs: ${ref_model.parallelism.tensor_parallel_degree}
    num_replicas: 1
    with_gpus: true
    mesh_name: ref_model
  reward_actor:
    procs: 1
    num_replicas: 1
    with_gpus: false
    mesh_name: reward_actor

actors:
  dataset:
    procs: 1
    with_gpus: false
    mesh_name: dataset
  trainer:
    procs: 8
    hosts: 1
    with_gpus: true
    mesh_name: trainer
  replay_buffer:
    procs: 1
    with_gpus: false
    mesh_name: replay_buffer
  compute_advantages:
    procs: 1
    with_gpus: false
    mesh_name: compute_advantages
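
The 32B variant additionally drives `enforce_eager` from the global `compile` flag through a `not` resolver, and the commented-out `dp_size` line records an invariant: `replay_buffer.dp_size` must equal the trainer's data-parallel shard degree. Below is a hedged sketch of loading this file and sanity-checking both points; it assumes OmegaConf with the `sum` and `not` resolvers registered as in the earlier sketch, and the validation code itself is illustrative rather than the repo's actual launcher.

from omegaconf import OmegaConf

OmegaConf.register_new_resolver("sum", lambda *xs: sum(int(x) for x in xs))
OmegaConf.register_new_resolver("not", lambda x: not x)

cfg = OmegaConf.load("apps/grpo/slurm/qwen3_32b.yaml")

# enforce_eager should be the inverse of the global compile flag
# (compile: true -> CUDA graphs on -> enforce_eager: false).
assert cfg.policy.engine_args.enforce_eager == (not cfg.compile)

# Replay buffer sharding must line up with the trainer's DP shard degree,
# per the commented-out line in the config. Here both are 1; the 30B-A3B
# config uses shard_degree -1 ("fill"), where this check would need the
# resolved world size instead.
dp = cfg.trainer.parallelism.data_parallel_shard_degree
assert cfg.replay_buffer.dp_size == dp, "dp_size must equal trainer DP degree"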
