Add WIP code GRPO configs #593

Draft: wants to merge 8 commits into main

67 changes: 67 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v03.00.yaml
@@ -0,0 +1,67 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested
dataset_prompt_column: problem

system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v03.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 4
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v03.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
- code_format
e2b_router_url: "ip-10-53-85-124:8000"
reward_weights:
- 1.0
- 0.1
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
parallel_code_exec_per_proc: 10
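
For context on how the two reward functions above interact: in a TRL-style GRPO setup, each entry in `reward_funcs` scores every completion and the scores are combined as a weighted sum using `reward_weights`, so `binary_code` dominates (weight 1.0) while `code_format` acts as a small formatting bonus (weight 0.1). A minimal sketch of that combination, with placeholder reward functions standing in for the real code-execution rewards:

```python
from typing import Callable, List

def combine_rewards(
    completions: List[str],
    reward_funcs: List[Callable[[List[str]], List[float]]],
    reward_weights: List[float],
) -> List[float]:
    """Weighted sum of per-completion rewards, mirroring the reward_weights config."""
    totals = [0.0] * len(completions)
    for func, weight in zip(reward_funcs, reward_weights):
        scores = func(completions)  # one score per completion
        totals = [t + weight * s for t, s in zip(totals, scores)]
    return totals

# Placeholder stand-ins for the real binary_code / code_format rewards:
# the actual ones execute the generated code against tests / check formatting.
def binary_code(completions):
    return [1.0 for _ in completions]   # pretend every completion passed its tests

def code_format(completions):
    return [0.0 for _ in completions]   # pretend none matched the expected format

print(combine_rewards(["..."], [binary_code, code_format], [1.0, 0.1]))  # [1.0]
```
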
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v04.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 128
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v04.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 8
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v04.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
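
`loss_type: dr_grpo` switches to the Dr. GRPO objective which, as far as I understand it, keeps the group-relative reward centering but drops the per-group standard-deviation normalization, and normalizes the token loss by a constant rather than each sequence's own length. A rough sketch of the advantage computation under that assumption:

```python
from statistics import mean, stdev

def grpo_advantages(group_rewards, use_std_norm=True, eps=1e-4):
    """Group-relative advantages for one prompt's num_generations completions.

    Standard GRPO (assumed): (r - mean) / (std + eps).
    dr_grpo (assumed):       r - mean, with no std division; the token loss is
    additionally normalized by a constant length instead of each sequence's length.
    """
    mu = mean(group_rewards)
    centered = [r - mu for r in group_rewards]
    if not use_std_norm:  # dr_grpo-style: centering only
        return centered
    sigma = stdev(group_rewards) if len(group_rewards) > 1 else 0.0
    return [c / (sigma + eps) for c in centered]

# num_generations: 8 completions for the same prompt, binary rewards
rewards = [1.0, 0.0, 0.0, 1.0, 1.0, 0.0, 0.0, 0.0]
print(grpo_advantages(rewards, use_std_norm=False))  # centered rewards only
print(grpo_advantages(rewards, use_std_norm=True))   # standard GRPO scaling
```
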
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v05.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v05.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v05.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- binary_code
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
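
The v05 run bumps `num_generations` to 16 while cutting `gradient_accumulation_steps` to 16. As I understand TRL's GRPO trainer, the effective batch (per_device_train_batch_size × num_processes × gradient_accumulation_steps) has to be divisible by `num_generations` so each prompt's group of completions stays together; a quick sanity check, assuming an 8-GPU node (the GPU count is not stated in the recipe):

```python
# All values from config_v05.00.yaml except num_processes, which is an assumption.
per_device_train_batch_size = 1
gradient_accumulation_steps = 16
num_generations = 16
num_processes = 8  # assumed: one 8-GPU node; not stated in the config

global_batch = per_device_train_batch_size * num_processes * gradient_accumulation_steps
assert global_batch % num_generations == 0, "groups of generations must divide the batch"
prompts_per_step = global_batch // num_generations
print(f"{global_batch} completions per optimizer step -> {prompts_per_step} unique prompts")
```
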
66 changes: 66 additions & 0 deletions recipes/DeepSeek-R1-Distill-Qwen-7B/grpo/config_v06.00.yaml
@@ -0,0 +1,66 @@
# Model arguments
model_name_or_path: deepseek-ai/DeepSeek-R1-Distill-Qwen-7B
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2
# Data training arguments
dataset_name: open-r1/verifiable-coding-problems-python_decontaminated-tested-shuffled
dataset_prompt_column: problem

# system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
callbacks:
- push_to_hub_revision
benchmarks:
- lcb_v4
beta: 0.000
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true
vllm_device: auto
vllm_gpu_memory_utilization: 0.7
gradient_accumulation_steps: 16
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: open-r1/DeepSeek-R1-Distill-Qwen-7B-GRPO
hub_model_revision: v06.00
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 24000
max_steps: -1
num_generations: 16
num_iterations: 1
num_train_epochs: 1.0
output_dir: data/DeepSeek-R1-Distill-Qwen-7B_v06.00
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
reward_funcs:
- weighted_binary_code_reward
e2b_router_url: ip-10-53-86-47:8000
reward_weights:
- 1.0
save_strategy: "steps"
save_steps: 0.05
save_total_limit: 1
seed: 42
temperature: 0.7
wandb_entity: huggingface
wandb_project: open-r1
warmup_ratio: 0.1
mask_truncated_completions: true
loss_type: dr_grpo
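
`mask_truncated_completions: true` (set in v04 through v06) should drop completions that hit the 24000-token `max_completion_length` from the loss, so answers that were cut off mid-thought are not reinforced. A minimal sketch of that masking, assuming truncation is detected as "reached the length cap without emitting EOS":

```python
def completion_loss_mask(completion_lengths, max_completion_length, eos_reached):
    """Mask out completions that were cut off at the generation length limit.

    Assumed convention: a completion counts as truncated when it reaches
    max_completion_length without having produced an end-of-sequence token.
    """
    mask = []
    for length, finished in zip(completion_lengths, eos_reached):
        truncated = length >= max_completion_length and not finished
        mask.append(0.0 if truncated else 1.0)
    return mask

# Three completions; the second one ran into the 24000-token cap
print(completion_loss_mask([812, 24000, 1530], 24000, [True, False, True]))  # [1.0, 0.0, 1.0]
```
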
10 changes: 5 additions & 5 deletions recipes/OlympicCoder-7B/sft/config_v00.00.yaml
@@ -14,7 +14,7 @@ dataset_num_proc: 48
bf16: true
do_eval: false
eval_strategy: 'no'
-gradient_accumulation_steps: 8
+gradient_accumulation_steps: 2
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
@@ -27,20 +27,20 @@ logging_strategy: steps
lr_scheduler_type: cosine_with_min_lr
lr_scheduler_kwargs:
  min_lr_rate: 0.1
-packing: false
+packing: true
max_grad_norm: 0.2
-max_length: 32768
+max_length: 16000
max_steps: -1
num_train_epochs: 10
output_dir: data/OlympicCoder-7B
overwrite_output_dir: true
per_device_eval_batch_size: 1
-per_device_train_batch_size: 2
+per_device_train_batch_size: 1
push_to_hub: true
report_to:
- wandb
save_strategy: epoch
save_total_limit: 1
seed: 42
-use_liger_kernel: true
+use_liger_kernel: false
warmup_ratio: 0.03
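
The OlympicCoder SFT change flips `packing` on while halving `max_length`, trading per-sample context for denser batches. Packing here means concatenating multiple tokenized examples into fixed-length blocks of `max_length` tokens; a simplified greedy sketch (the real SFT trainer's packing, e.g. separator and EOS handling, may differ):

```python
from typing import List

def pack_sequences(tokenized_examples: List[List[int]], max_length: int) -> List[List[int]]:
    """Greedily concatenate tokenized examples into blocks of at most max_length tokens.

    Simplified: examples longer than max_length are truncated, and no special
    separator/EOS handling is shown (the actual trainer takes care of that).
    """
    packed, current = [], []
    for tokens in tokenized_examples:
        tokens = tokens[:max_length]
        if len(current) + len(tokens) > max_length:
            packed.append(current)
            current = []
        current.extend(tokens)
    if current:
        packed.append(current)
    return packed

# Three short "examples" packed into blocks of 10 tokens
print(pack_sequences([[1] * 4, [2] * 5, [3] * 6], max_length=10))
```
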
58 changes: 58 additions & 0 deletions recipes/Qwen2.5-7B-Instruct/grpo/config_v01.05.yaml
@@ -0,0 +1,58 @@

# Model arguments
model_name_or_path: Qwen/Qwen2.5-7B-Instruct
model_revision: main
torch_dtype: bfloat16
attn_implementation: flash_attention_2

# Data training arguments
dataset_name: open-r1/OpenR1-Math-cn_k12-86k
system_prompt: "You are a helpful AI Assistant that provides well-reasoned and detailed responses. You first think about the reasoning process as an internal monologue and then provide the user with the answer. Respond in the following format: <think>\n...\n</think>\n<answer>\n...\n</answer>"

# GRPO trainer config
beta: 0.001
bf16: true
do_eval: false
eval_strategy: "no"
use_vllm: true

gradient_accumulation_steps: 64
gradient_checkpointing: true
gradient_checkpointing_kwargs:
  use_reentrant: false
hub_model_id: Qwen2.5-7B-Instruct-GRPO
hub_model_revision: v01.05
hub_strategy: every_save
learning_rate: 1.0e-06
log_completions: true
log_level: info
logging_first_step: true
logging_steps: 1
logging_strategy: steps
lr_scheduler_type: constant_with_warmup
max_grad_norm: 0.2
max_prompt_length: 1024
max_completion_length: 4096
max_steps: -1
num_generations: 8
num_train_epochs: 1
output_dir: data/Qwen2.5-7B-Instruct-GRPO_v01.05
overwrite_output_dir: true
per_device_train_batch_size: 1
push_to_hub: true
use_liger_kernel: true
report_to:
- wandb
reward_funcs:
- accuracy
- format
reward_weights:
- 1.0
- 0.2
save_strategy: "steps"
save_steps: 0.1
save_total_limit: 1
seed: 42
temperature: 0.7
warmup_ratio: 0.1
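
The Qwen2.5 recipe pairs the `<think>/<answer>` system prompt with a `format` reward. Presumably that reward just checks the tag structure of each completion; a hedged sketch with an assumed regex (the repository's actual pattern may differ):

```python
import re

# Assumed pattern: the whole completion is a <think> block followed by an <answer> block.
FORMAT_RE = re.compile(r"^<think>\n.*?\n</think>\n<answer>\n.*?\n</answer>$", re.DOTALL)

def format_reward(completions):
    """1.0 if a completion matches the expected <think>/<answer> layout, else 0.0."""
    return [1.0 if FORMAT_RE.match(c) else 0.0 for c in completions]

good = "<think>\nreasoning...\n</think>\n<answer>\n42\n</answer>"
bad = "The answer is 42."
print(format_reward([good, bad]))  # [1.0, 0.0]
```
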