Add WIP code GRPO configs #593

Draft: wants to merge 8 commits into main

67 changes: 67 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
@@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem

system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v03.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10
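
For context on how the two reward functions above interact: in a TRL-style GRPO setup, each entry in `reward_funcs` scores every completion and the scores are combined as a weighted sum using `reward_weights`, so `binary_code` dominates (weight 1.0) while `code_format` acts as a small formatting bonus (weight 0.1). A minimal sketch of that combination, with placeholder reward functions standing in for the real code-execution rewards:

```python
from typing import Callable, List

def combine_rewards(
    completions: List[str],
    reward_funcs: List[Callable[[List[str]], List[float]]],
    reward_weights: List[float],
) -> List[float]:
    """Weighted sum of per-completion rewards, mirroring the reward_weights config."""
    totals = [0.0] * len(completions)
    for func, weight in zip(reward_funcs, reward_weights):
        scores = func(completions)  # one score per completion
        totals = [t + weight * s for t, s in zip(totals, scores)]
    return totals

# Placeholder stand-ins for the real binary_code / code_format rewards:
# the actual ones execute the generated code against tests / check formatting.
def binary_code(completions):
    return [1.0 for _ in completions]   # pretend every completion passed its tests

def code_format(completions):
    return [0.0 for _ in completions]   # pretend none matched the expected format

print(combine_rewards(["..."], [binary_code, code_format], [1.0, 0.1]))  # [1.0]
```
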
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 128
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v04.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
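
`loss_type: dr_grpo` switches to the Dr. GRPO objective which, as far as I understand it, keeps the group-relative reward centering but drops the per-group standard-deviation normalization, and normalizes the token loss by a constant rather than each sequence's own length. A rough sketch of the advantage computation under that assumption:

```python
from statistics import mean, stdev

def grpo_advantages(group_rewards, use_std_norm=True, eps=1e-4):
    """Group-relative advantages for one prompt's num_generations completions.

    Standard GRPO (assumed): (r - mean) / (std + eps).
    dr_grpo (assumed):       r - mean, with no std division; the token loss is
    additionally normalized by a constant length instead of each sequence's length.
    """
    mu = mean(group_rewards)
    centered = [r - mu for r in group_rewards]
    if not use_std_norm:  # dr_grpo-style: centering only
        return centered
    sigma = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    return [c / (sigma + eps) for c in centered]

# num_generations: 8 completions for the same prompt, binary rewards
rewards = [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]
print(grpo_advantages(rewards, use_std_norm=False))  # centered rewards only
print(grpo_advantages(rewards, use_std_norm=True))   # standard GRPO scaling
```
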
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v05.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
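
The v05 run bumps `num_generations` to 16 while cutting `gradient_accumulation_steps` to 16. As I understand TRL's GRPO trainer, the effective batch (per_device_train_batch_size × num_processes × gradient_accumulation_steps) has to be divisible by `num_generations` so each prompt's group of completions stays together; a quick sanity check, assuming an 8-GPU node (the GPU count is not stated in the recipe):

```python
# All values from config_v05.00.yaml except num_processes, which is an assumption.
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
num_generations = 16
num_processes = 8  # assumed: one 8-GPU node; not stated in the config

global_batch = per_device_train_batch_size * num_processes * gradient_accumulation_steps
assert global_batch % num_generations == 0, "groups of generations must divide the batch"
prompts_per_step = global_batch // num_generations
print(f"{global_batch} completions per optimizer step -> {prompts_per_step} unique prompts")
```
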
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v06.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
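
`mask_truncated_completions: true` (set in v04 through v06) should drop completions that hit the 24000-token `max_completion_length` from the loss, so answers that were cut off mid-thought are not reinforced. A minimal sketch of that masking, assuming truncation is detected as "reached the length cap without emitting EOS":

```python
def completion_loss_mask(completion_lengths, max_completion_length, eos_reached):
    """Mask out completions that were cut off at the generation length limit.

    Assumed convention: a completion counts as truncated when it reaches
    max_completion_length without having produced an end-of-sequence token.
    """
    mask = []
    for length, finished in zip(completion_lengths, eos_reached):
        truncated = length >= max_completion_length and not finished
        mask.append(0.0 if truncated else 1.0)
    return mask

# Three completions; the second one ran into the 24000-token cap
print(completion_loss_mask([812, 24000, 1530], 24000, [True, False, True]))  # [1.0, 0.0, 1.0]
```
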
10 changes: 5 additions & 5 deletions recipes/OlympicCoder-7B/sft/config_v00.00.yaml
@@ -14,7 +14,7 @@ dataset_num_proc: 48
bf16: true
do_eval: false
eval_strategy: 'no'
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
@@ -27,20 +27,20 @@ logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
-packing: false
+packing: true
max_grad_norm: 0.2
-max_length: 32768
+max_length: 16000
max_steps: -1
num_train_epochs: 10
output_dir: data/OlympicCoder-7B
overwrite_output_dir: true
per_device_eval_batch_size: 1
-per_device_train_batch_size: 2
+per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
save_strategy: epoch
save_total_limit: 1
seed: 42
-use_liger_kernel: true
+use_liger_kernel: false
warmup_ratio: 0.03
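
The OlympicCoder SFT change flips `packing` on while halving `max_length`, trading per-sample context for denser batches. Packing here means concatenating multiple tokenized examples into fixed-length blocks of `max_length` tokens; a simplified greedy sketch (the real SFT trainer's packing, e.g. separator and EOS handling, may differ):

```python
from typing import List

def pack_sequences(tokenized_examples: List[List[int]], max_length: int) -> List[List[int]]:
    """Greedily concatenate tokenized examples into blocks of at most max_length tokens.

    Simplified: examples longer than max_length are truncated, and no special
    separator/EOS handling is shown (the actual trainer takes care of that).
    """
    packed, current = [], []
    for tokens in tokenized_examples:
        tokens = tokens[:max_length]
        if len(current) + len(tokens) > max_length:
            packed.append(current)
            current = []
        current.extend(tokens)
    if current:
        packed.append(current)
    return packed

# Three short "examples" packed into blocks of 10 tokens
print(pack_sequences([[1] * 4, [2] * 5, [3] * 6], max_length=10))
```
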
58 changes: 58 additions & 0 deletions recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
@@ -0,0 +1,58 @@

# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true

gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.05
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 8
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
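
The Qwen2.5 recipe pairs the `<think>/<answer>` system prompt with a `format` reward. Presumably that reward just checks the tag structure of each completion; a hedged sketch with an assumed regex (the repository's actual pattern may differ):

```python
import re

# Assumed pattern: the whole completion is a <think> block followed by an <answer> block.
FORMAT_RE = re.compile(r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$", re.DOTALL)

def format_reward(completions):
    """1.0 if a completion matches the expected <think>/<answer> layout, else 0.0."""
    return [1.0 if FORMAT_RE.match(c) else 0.0 for c in completions]

good = "<think>\nreasoning...\n</think>\n<answer>\n42\n</answer>"
bad = "The answer is 42."
print(format_reward([good, bad]))  # [1.0, 0.0]
```
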