Skip to content

Commit b418d34

Browse files
SumanthRH and claude authored
[fix] use token_mean_legacy loss reduction for DAPO scripts (#1637)
# What does this PR do? Switches back to `token_mean_legacy` for DAPO scripts. For reasoning RL, the legacy token mean reduction has better performance because it biases towards larger response lengths, leading to longer reasoning chains. This is also the reduction used for the original DAPO runs on VeRL. (And [our own](https://wandb.ai/sky-posttraining-uc-berkeley/skyrl-train-dapo-aime/reports/SkyRL-Train-DAPO--VmlldzoxNDkyOTc0MQ?accessToken=vxzk3zov4kehgbfe7j897syz1vv5u01wdfnzajnr6hymeh1ne4hrnav4q9d0h2kk) reproduction runs from January) --------- Signed-off-by: SumanthRH <sumanthrh99@gmail.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 2df8f51 commit b418d34

8 files changed

Lines changed: 10 additions & 15 deletions

examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_gsm8k.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ EPS_CLIP_LOW=0.2
1515
EPS_CLIP_HIGH=0.28
1616
DYNAMIC_SAMPLING_TYPE=filter
1717
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
18-
LOSS_REDUCTION="token_mean"
18+
LOSS_REDUCTION="token_mean_legacy"
1919
# applies overlong filtering (but not soft overlong punishment)
2020
APPLY_OVERLONG_FILTERING=true
2121
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ EPS_CLIP_HIGH=0.28
2020
# dynamic sampling parameters - off by default, since this greatly slows down inference
2121
DYNAMIC_SAMPLING_TYPE=null
2222
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
23-
LOSS_REDUCTION="token_mean"
23+
LOSS_REDUCTION="token_mean_legacy"
2424
# applies overlong filtering (but not soft overlong punishment)
2525
APPLY_OVERLONG_FILTERING=true
2626
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ EPS_CLIP_HIGH=0.28
2121
# dynamic sampling parameters - off by default, since this greatly slows down inference
2222
DYNAMIC_SAMPLING_TYPE=null
2323
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
24-
LOSS_REDUCTION="token_mean"
24+
LOSS_REDUCTION="token_mean_legacy"
2525
# applies overlong filtering (but not soft overlong punishment)
2626
APPLY_OVERLONG_FILTERING=true
2727
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo_fully_async.py

examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1818

1919
CLIP_RATIO_LOW=0.2
2020
CLIP_RATIO_HIGH=0.28
21-
# use token mean loss reduction
22-
LOSS_REDUCTION="token_mean"
21+
LOSS_REDUCTION="token_mean_legacy"
2322
# applies overlong filtering (but not soft overlong punishment)
2423
APPLY_OVERLONG_FILTERING=true
2524
# apply soft overlong punishment with custom trainer impl in main_dapo.py
@@ -126,4 +125,4 @@ uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \
126125
trainer.resume_mode=latest \
127126
trainer.max_ckpts_to_keep=3 \
128127
trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_30b_a3b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \
129-
$@
128+
$@

examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1818

1919
CLIP_RATIO_LOW=0.2
2020
CLIP_RATIO_HIGH=0.28
21-
# use token mean loss reduction
22-
LOSS_REDUCTION="token_mean"
21+
LOSS_REDUCTION="token_mean_legacy"
2322
# applies overlong filtering (but not soft overlong punishment)
2423
APPLY_OVERLONG_FILTERING=true
2524
# apply soft overlong punishment with custom trainer impl in main_dapo.py
@@ -121,4 +120,4 @@ uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \
121120
trainer.resume_mode=latest \
122121
trainer.max_ckpts_to_keep=3 \
123122
trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_30b_a3b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \
124-
$@
123+
$@

0 commit comments

Comments (0)