Skip to content

Commit b418d34

Browse files
SumanthRH and claude authored
[fix] use token_mean_legacy loss reduction for DAPO scripts (#1637)
# What does this PR do? Switches back to `token_mean_legacy` for DAPO scripts. For reasoning RL, the legacy token mean reduction has better performance because it biases towards larger response lengths, leading to longer reasoning chains. This is also the reduction used for the original DAPO runs on VeRL. (And [our own](https://wandb.ai/sky-posttraining-uc-berkeley/skyrl-train-dapo-aime/reports/SkyRL-Train-DAPO--VmlldzoxNDkyOTc0MQ?accessToken=vxzk3zov4kehgbfe7j897syz1vv5u01wdfnzajnr6hymeh1ne4hrnav4q9d0h2kk) reproduction runs from January) --------- Signed-off-by: SumanthRH <sumanthrh99@gmail.com> Co-authored-by: Claude Sonnet 4.6 <noreply@anthropic.com>
1 parent 2df8f51 commit b418d34

8 files changed

Lines changed: 10 additions & 15 deletions

examples/train/algorithms/dapo/run_dapo_aime_qwen3_4b_aime.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_gsm8k.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,7 +15,7 @@ EPS_CLIP_LOW=0.2
1515
EPS_CLIP_HIGH=0.28
1616
DYNAMIC_SAMPLING_TYPE=filter
1717
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
18-
LOSS_REDUCTION="token_mean"
18+
LOSS_REDUCTION="token_mean_legacy"
1919
# applies overlong filtering (but not soft overlong punishment)
2020
APPLY_OVERLONG_FILTERING=true
2121
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen2.5_32b_aime.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@ EPS_CLIP_HIGH=0.28
2020
# dynamic sampling parameters - off by default, since this greatly slows down inference
2121
DYNAMIC_SAMPLING_TYPE=null
2222
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
23-
LOSS_REDUCTION="token_mean"
23+
LOSS_REDUCTION="token_mean_legacy"
2424
# applies overlong filtering (but not soft overlong punishment)
2525
APPLY_OVERLONG_FILTERING=true
2626
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen2.5_math_7b_aime.sh

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -21,7 +21,7 @@ EPS_CLIP_HIGH=0.28
2121
# dynamic sampling parameters - off by default, since this greatly slows down inference
2222
DYNAMIC_SAMPLING_TYPE=null
2323
DYNAMIC_SAMPLING_MAX_SAMPLE_BATCHES=30
24-
LOSS_REDUCTION="token_mean"
24+
LOSS_REDUCTION="token_mean_legacy"
2525
# applies overlong filtering (but not soft overlong punishment)
2626
APPLY_OVERLONG_FILTERING=true
2727
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo.py

examples/train/algorithms/dapo/run_dapo_qwen3_1.7b_aime_fully_async.sh

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -16,8 +16,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1616

1717
CLIP_RATIO_LOW=0.2
1818
CLIP_RATIO_HIGH=0.28
19-
# use dr. grpo loss reduction
20-
LOSS_REDUCTION="token_mean"
19+
LOSS_REDUCTION="token_mean_legacy"
2120
# applies overlong filtering (but not soft overlong punishment)
2221
APPLY_OVERLONG_FILTERING=true
2322
# apply soft overlong punishment with custom trainer impl in main_dapo_fully_async.py

examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_lora_megatron_aime.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1818

1919
CLIP_RATIO_LOW=0.2
2020
CLIP_RATIO_HIGH=0.28
21-
# use token mean loss reduction
22-
LOSS_REDUCTION="token_mean"
21+
LOSS_REDUCTION="token_mean_legacy"
2322
# applies overlong filtering (but not soft overlong punishment)
2423
APPLY_OVERLONG_FILTERING=true
2524
# apply soft overlong punishment with custom trainer impl in main_dapo.py
@@ -126,4 +125,4 @@ uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \
126125
trainer.resume_mode=latest \
127126
trainer.max_ckpts_to_keep=3 \
128127
trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_30b_a3b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}_lora_rank${LORA_RANK}_alpha${LORA_ALPHA}" \
129-
$@
128+
$@

examples/train/algorithms/dapo/run_dapo_qwen3_30b_a3b_megatron_aime.sh

Lines changed: 2 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -18,8 +18,7 @@ LOGGER="wandb" # change to "console" to print to stdout
1818

1919
CLIP_RATIO_LOW=0.2
2020
CLIP_RATIO_HIGH=0.28
21-
# use token mean loss reduction
22-
LOSS_REDUCTION="token_mean"
21+
LOSS_REDUCTION="token_mean_legacy"
2322
# applies overlong filtering (but not soft overlong punishment)
2423
APPLY_OVERLONG_FILTERING=true
2524
# apply soft overlong punishment with custom trainer impl in main_dapo.py
@@ -121,4 +120,4 @@ uv run --isolated --extra megatron -m examples.train.algorithms.dapo.main_dapo \
121120
trainer.resume_mode=latest \
122121
trainer.max_ckpts_to_keep=3 \
123122
trainer.ckpt_path="$HOME/ckpts/dapo_qwen3_30b_a3b_base_megatron_tp${MEGATRON_TP}_pp${MEGATRON_PP}_cp${MEGATRON_CP}_ep${MEGATRON_EP}_etp${MEGATRON_ETP}" \
124-
$@
123+
$@

0 commit comments

Comments (0)