Skip to content

Commit 018f8dc

Browse files
authored
[reward] fix: backward compatibility with old reward config (verl-project#5287)
### What does this PR do? add backward compatibility with old reward config ### Checklist Before Starting - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test > For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. ```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). - [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). 
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).) - [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.
1 parent a4ae296 commit 018f8dc

File tree

10 files changed

+215
-7
lines changed

10 files changed

+215
-7
lines changed

tests/trainer/config/test_legacy_config_on_cpu.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,8 @@ class TestConfigComparison(unittest.TestCase):
4949
"profile_steps",
5050
"worker_nsight_options",
5151
"controller_nsight_options",
52-
"reward_model",
53-
"custom_reward_function",
5452
]
55-
ignored_paths = ["reward_model"]
53+
ignored_paths = ["reward_model", "custom_reward_function"]
5654

5755
def _compare_configs_recursively(
5856
self, current_config, legacy_config, path="", legacy_allow_missing=True, current_allow_missing=False

verl/experimental/reward_loop/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from .reward_loop import RewardLoopManager, RewardLoopWorker
15+
from .reward_loop import RewardLoopManager, RewardLoopWorker, migrate_legacy_reward_impl
1616
from .reward_model import RewardModelManager
1717

18-
__all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager"]
18+
__all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager", "migrate_legacy_reward_impl"]

verl/experimental/reward_loop/reward_loop.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121
import ray
2222
import torch
23-
from omegaconf import DictConfig
23+
from omegaconf import DictConfig, open_dict
2424
from tensordict import TensorDict
2525

2626
from verl.protocol import DataProto
@@ -35,6 +35,51 @@
3535
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
3636

3737

38+
def migrate_legacy_reward_impl(config):
    """
    Migrate legacy reward settings into the new ``config.reward`` layout.

    Copies values from the deprecated top-level sections
    (``config.reward_model``, ``config.custom_reward_function``,
    ``config.sandbox_fusion``) into their new homes under ``config.reward``,
    but only when the legacy value is actually set (non-``None``), so a
    config already written in the new style is left untouched.

    This is a temporary backward-compatibility shim; a more robust
    migration will be added later.

    Args:
        config: The full trainer config (an OmegaConf ``DictConfig`` —
            attribute access, ``.get()`` and ``open_dict`` are relied on).

    Returns:
        The same ``config`` object, mutated in place.
    """
    # 1. reward workers migration
    # config.reward_model.num_workers -> config.reward.num_workers
    if config.reward_model.num_workers is not None:
        config.reward.num_workers = config.reward_model.num_workers

    # 2. reward manager migration
    # config.reward_model.reward_manager -> config.reward.reward_manager
    if config.reward_model.reward_manager is not None:
        config.reward.reward_manager.name = config.reward_model.reward_manager
    if config.reward_model.get("reward_loop_source") is not None:
        # NOTE(review): module path/name are copied unconditionally once a
        # source is set — assumes they are always provided together; confirm.
        config.reward.reward_manager.source = config.reward_model.reward_loop_source
        config.reward.reward_manager.module.path = config.reward_model.reward_loop_module_path
        config.reward.reward_manager.module.name = config.reward_model.reward_loop_class_name

    # 3. custom reward function migration
    # config.custom_reward_function -> config.reward.custom_reward_function
    if not all(v is None for v in config.custom_reward_function.values()):
        config.reward.custom_reward_function = config.custom_reward_function

    # 4. reward model migration
    # config.reward_model -> config.reward.reward_model
    for key in ["enable", "enable_resource_pool", "n_gpus_per_node", "nnodes"]:
        if config.reward_model.get(key) is not None:
            config.reward.reward_model[key] = config.reward_model[key]
    # for dapo reward kwargs: the target schema has no such key, so the
    # struct guard must be lifted with open_dict before inserting it
    if config.reward_model.get("reward_kwargs") is not None:
        with open_dict(config.reward.reward_model):
            config.reward.reward_model["reward_kwargs"] = config.reward_model["reward_kwargs"]
    legacy_rollout = config.reward_model.rollout
    if not all(v is None for v in legacy_rollout.values()):
        config.reward.reward_model.rollout = legacy_rollout

    # 5. sandbox_fusion migration
    # config.sandbox_fusion -> config.reward.sandbox_fusion
    if not all(v is None for v in config.sandbox_fusion.values()):
        config.reward.sandbox_fusion = config.sandbox_fusion

    return config
81+
82+
3883
class RewardLoopWorker:
3984
"""
4085
RewardLoopWork can tackle reward computation:

verl/trainer/config/_generated_ppo_megatron_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,45 @@ critic:
572572
stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
573573
nccl_timeout: 600
574574
load_weight: true
575+
custom_reward_function:
576+
path: null
577+
name: null
578+
reward_model:
579+
num_workers: null
580+
reward_manager: null
581+
enable: null
582+
enable_resource_pool: null
583+
n_gpus_per_node: null
584+
nnodes: null
585+
reward_loop_source: null
586+
reward_loop_module_path: null
587+
reward_loop_class_name: null
588+
rollout:
589+
name: null
590+
dtype: null
591+
gpu_memory_utilization: null
592+
enforce_eager: null
593+
cudagraph_capture_sizes: null
594+
free_cache_engine: null
595+
data_parallel_size: null
596+
expert_parallel_size: null
597+
tensor_model_parallel_size: null
598+
max_num_batched_tokens: null
599+
max_model_len: null
600+
max_num_seqs: null
601+
load_format: null
602+
engine_kwargs: null
603+
limit_images: null
604+
enable_chunked_prefill: null
605+
enable_prefix_caching: null
606+
disable_log_stats: null
607+
skip_tokenizer_init: null
608+
prompt_length: null
609+
response_length: null
610+
sandbox_fusion:
611+
url: null
612+
max_concurrent: null
613+
memory_limit_mb: null
575614
reward:
576615
num_workers: 8
577616
custom_reward_function:

verl/trainer/config/_generated_ppo_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,45 @@ critic:
492492
forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
493493
ulysses_sequence_parallel_size: 1
494494
grad_clip: 1.0
495+
custom_reward_function:
496+
path: null
497+
name: null
498+
reward_model:
499+
num_workers: null
500+
reward_manager: null
501+
enable: null
502+
enable_resource_pool: null
503+
n_gpus_per_node: null
504+
nnodes: null
505+
reward_loop_source: null
506+
reward_loop_module_path: null
507+
reward_loop_class_name: null
508+
rollout:
509+
name: null
510+
dtype: null
511+
gpu_memory_utilization: null
512+
enforce_eager: null
513+
cudagraph_capture_sizes: null
514+
free_cache_engine: null
515+
data_parallel_size: null
516+
expert_parallel_size: null
517+
tensor_model_parallel_size: null
518+
max_num_batched_tokens: null
519+
max_model_len: null
520+
max_num_seqs: null
521+
load_format: null
522+
engine_kwargs: null
523+
limit_images: null
524+
enable_chunked_prefill: null
525+
enable_prefix_caching: null
526+
disable_log_stats: null
527+
skip_tokenizer_init: null
528+
prompt_length: null
529+
response_length: null
530+
sandbox_fusion:
531+
url: null
532+
max_concurrent: null
533+
memory_limit_mb: null
495534
reward:
496535
num_workers: 8
497536
custom_reward_function:

verl/trainer/config/_generated_ppo_veomni_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,45 @@ critic:
487487
_target_: verl.utils.profiler.config.TorchMemoryToolConfig
488488
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
489489
stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
490+
custom_reward_function:
491+
path: null
492+
name: null
493+
reward_model:
494+
num_workers: null
495+
reward_manager: null
496+
enable: null
497+
enable_resource_pool: null
498+
n_gpus_per_node: null
499+
nnodes: null
500+
reward_loop_source: null
501+
reward_loop_module_path: null
502+
reward_loop_class_name: null
503+
rollout:
504+
name: null
505+
dtype: null
506+
gpu_memory_utilization: null
507+
enforce_eager: null
508+
cudagraph_capture_sizes: null
509+
free_cache_engine: null
510+
data_parallel_size: null
511+
expert_parallel_size: null
512+
tensor_model_parallel_size: null
513+
max_num_batched_tokens: null
514+
max_model_len: null
515+
max_num_seqs: null
516+
load_format: null
517+
engine_kwargs: null
518+
limit_images: null
519+
enable_chunked_prefill: null
520+
enable_prefix_caching: null
521+
disable_log_stats: null
522+
skip_tokenizer_init: null
523+
prompt_length: null
524+
response_length: null
525+
sandbox_fusion:
526+
url: null
527+
max_concurrent: null
528+
memory_limit_mb: null
490529
reward:
491530
num_workers: 8
492531
custom_reward_function:
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
custom_reward_function:
2+
path: null
3+
name: null
4+
5+
reward_model:
6+
num_workers: null
7+
reward_manager: null
8+
enable: null
9+
enable_resource_pool: null
10+
n_gpus_per_node: null
11+
nnodes: null
12+
reward_loop_source: null
13+
reward_loop_module_path: null
14+
reward_loop_class_name: null
15+
rollout:
16+
name: null
17+
dtype: null
18+
gpu_memory_utilization: null
19+
enforce_eager: null
20+
cudagraph_capture_sizes: null
21+
free_cache_engine: null
22+
data_parallel_size: null
23+
expert_parallel_size: null
24+
tensor_model_parallel_size: null
25+
max_num_batched_tokens: null
26+
max_model_len: null
27+
max_num_seqs: null
28+
load_format: null
29+
engine_kwargs: null
30+
limit_images: null
31+
enable_chunked_prefill: null
32+
enable_prefix_caching: null
33+
disable_log_stats: null
34+
skip_tokenizer_init: null
35+
36+
prompt_length: null
37+
response_length: null
38+
39+
sandbox_fusion:
40+
url: null
41+
max_concurrent: null
42+
memory_limit_mb: null

verl/trainer/config/ppo_megatron_trainer.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ defaults:
1515
- model@actor_rollout_ref.model: hf_model
1616
# Critic model config.
1717
- critic@critic: megatron_critic
18+
# legacy reward impl config, for backward compatibility
19+
- legacy_reward_impl
1820
# Reward model config.
1921
- reward@reward: reward
2022
# Rollout correction config.

verl/trainer/config/ppo_trainer.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ defaults:
2929
# Critic model config.
3030
- critic@critic: ${model_engine}_critic
3131

32+
# legacy reward impl config, for backward compatibility
33+
- legacy_reward_impl
34+
3235
# Reward config.
3336
- reward@reward: reward
3437

verl/trainer/main_ppo.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from omegaconf import OmegaConf
2424

2525
from verl.experimental.dataset.sampler import AbstractSampler
26+
from verl.experimental.reward_loop import migrate_legacy_reward_impl
2627
from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
2728
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
2829
from verl.trainer.ppo.utils import need_critic, need_reference_policy
@@ -40,7 +41,7 @@ def main(config):
4041
"""
4142
# Automatically set `config.trainer.device = npu` when running on Ascend NPU.
4243
auto_set_device(config)
43-
44+
config = migrate_legacy_reward_impl(config)
4445
run_ppo(config)
4546

4647

0 commit comments

Comments
 (0)