Skip to content

Commit 018f8dc

Browse files
authored
[reward] fix: backward compatibility with old reward config (verl-project#5287)
### What does this PR do? add backward compatibility with old reward config ### Checklist Before Starting - [ ] Search for similar PRs. Paste at least one query link here: ... - [ ] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI) - `{modules}` include `fsdp`, `megatron`, `veomni`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`, `cfg`, `reward` - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]` - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test` - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title. - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching` ### Test > For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc. ### API and Usage Example > Demonstrate how the API changes if any, and provide usage example(s) if possible. ```python # Add code snippet or script demonstrating how to use this ``` ### Design & Code Changes > Demonstrate the high-level design if this PR is complex, and list the specific changes. ### Checklist Before Submitting > [!IMPORTANT] > Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review. - [ ] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md). - [ ] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always` - [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs). 
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ... - [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).) - [ ] If your PR is related to the `recipe` submodule, please also update the reference to the submodule commit via `git submodule update --remote` or `cd recipe && git pull origin main`.
1 parent a4ae296 commit 018f8dc

File tree

10 files changed

+215
-7
lines changed

10 files changed

+215
-7
lines changed

tests/trainer/config/test_legacy_config_on_cpu.py

Lines changed: 1 addition & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -49,10 +49,8 @@ class TestConfigComparison(unittest.TestCase):
4949
"profile_steps",
5050
"worker_nsight_options",
5151
"controller_nsight_options",
52-
"reward_model",
53-
"custom_reward_function",
5452
]
55-
ignored_paths = ["reward_model"]
53+
ignored_paths = ["reward_model", "custom_reward_function"]
5654

5755
def _compare_configs_recursively(
5856
self, current_config, legacy_config, path="", legacy_allow_missing=True, current_allow_missing=False

verl/experimental/reward_loop/__init__.py

Lines changed: 2 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -12,7 +12,7 @@
1212
# See the License for the specific language governing permissions and
1313
# limitations under the License.
1414

15-
from .reward_loop import RewardLoopManager, RewardLoopWorker
15+
from .reward_loop import RewardLoopManager, RewardLoopWorker, migrate_legacy_reward_impl
1616
from .reward_model import RewardModelManager
1717

18-
__all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager"]
18+
__all__ = ["RewardModelManager", "RewardLoopWorker", "RewardLoopManager", "migrate_legacy_reward_impl"]

verl/experimental/reward_loop/reward_loop.py

Lines changed: 46 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -20,7 +20,7 @@
2020
import numpy as np
2121
import ray
2222
import torch
23-
from omegaconf import DictConfig
23+
from omegaconf import DictConfig, open_dict
2424
from tensordict import TensorDict
2525

2626
from verl.protocol import DataProto
@@ -35,6 +35,51 @@
3535
logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
3636

3737

38+
def migrate_legacy_reward_impl(config):
    """
    Migrate legacy reward settings into the new ``config.reward`` layout.

    Copies values from the deprecated top-level sections
    (``config.reward_model``, ``config.custom_reward_function``,
    ``config.sandbox_fusion``) into their new homes under ``config.reward``,
    but only when the legacy value is actually set (non-``None``), so a
    config already written in the new style is left untouched.

    This is a temporary backward-compatibility shim; a more robust
    migration will be added later.

    Args:
        config: The full trainer config (an OmegaConf ``DictConfig`` —
            attribute access, ``.get()`` and ``open_dict`` are relied on).

    Returns:
        The same ``config`` object, mutated in place.
    """
    # 1. reward workers migration
    # config.reward_model.num_workers -> config.reward.num_workers
    if config.reward_model.num_workers is not None:
        config.reward.num_workers = config.reward_model.num_workers

    # 2. reward manager migration
    # config.reward_model.reward_manager -> config.reward.reward_manager
    if config.reward_model.reward_manager is not None:
        config.reward.reward_manager.name = config.reward_model.reward_manager
    if config.reward_model.get("reward_loop_source") is not None:
        # NOTE(review): module path/name are copied unconditionally once a
        # source is set — assumes they are always provided together; confirm.
        config.reward.reward_manager.source = config.reward_model.reward_loop_source
        config.reward.reward_manager.module.path = config.reward_model.reward_loop_module_path
        config.reward.reward_manager.module.name = config.reward_model.reward_loop_class_name

    # 3. custom reward function migration
    # config.custom_reward_function -> config.reward.custom_reward_function
    if not all(v is None for v in config.custom_reward_function.values()):
        config.reward.custom_reward_function = config.custom_reward_function

    # 4. reward model migration
    # config.reward_model -> config.reward.reward_model
    for key in ["enable", "enable_resource_pool", "n_gpus_per_node", "nnodes"]:
        if config.reward_model.get(key) is not None:
            config.reward.reward_model[key] = config.reward_model[key]
    # for dapo reward kwargs: the target schema has no such key, so the
    # struct guard must be lifted with open_dict before inserting it
    if config.reward_model.get("reward_kwargs") is not None:
        with open_dict(config.reward.reward_model):
            config.reward.reward_model["reward_kwargs"] = config.reward_model["reward_kwargs"]
    legacy_rollout = config.reward_model.rollout
    if not all(v is None for v in legacy_rollout.values()):
        config.reward.reward_model.rollout = legacy_rollout

    # 5. sandbox_fusion migration
    # config.sandbox_fusion -> config.reward.sandbox_fusion
    if not all(v is None for v in config.sandbox_fusion.values()):
        config.reward.sandbox_fusion = config.sandbox_fusion

    return config
81+
82+
3883
class RewardLoopWorker:
3984
"""
4085
RewardLoopWork can tackle reward computation:

verl/trainer/config/_generated_ppo_megatron_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -572,6 +572,45 @@ critic:
572572
stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
573573
nccl_timeout: 600
574574
load_weight: true
575+
custom_reward_function:
576+
path: null
577+
name: null
578+
reward_model:
579+
num_workers: null
580+
reward_manager: null
581+
enable: null
582+
enable_resource_pool: null
583+
n_gpus_per_node: null
584+
nnodes: null
585+
reward_loop_source: null
586+
reward_loop_module_path: null
587+
reward_loop_class_name: null
588+
rollout:
589+
name: null
590+
dtype: null
591+
gpu_memory_utilization: null
592+
enforce_eager: null
593+
cudagraph_capture_sizes: null
594+
free_cache_engine: null
595+
data_parallel_size: null
596+
expert_parallel_size: null
597+
tensor_model_parallel_size: null
598+
max_num_batched_tokens: null
599+
max_model_len: null
600+
max_num_seqs: null
601+
load_format: null
602+
engine_kwargs: null
603+
limit_images: null
604+
enable_chunked_prefill: null
605+
enable_prefix_caching: null
606+
disable_log_stats: null
607+
skip_tokenizer_init: null
608+
prompt_length: null
609+
response_length: null
610+
sandbox_fusion:
611+
url: null
612+
max_concurrent: null
613+
memory_limit_mb: null
575614
reward:
576615
num_workers: 8
577616
custom_reward_function:

verl/trainer/config/_generated_ppo_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -492,6 +492,45 @@ critic:
492492
forward_micro_batch_size_per_gpu: ${oc.select:.ppo_micro_batch_size_per_gpu,null}
493493
ulysses_sequence_parallel_size: 1
494494
grad_clip: 1.0
495+
custom_reward_function:
496+
path: null
497+
name: null
498+
reward_model:
499+
num_workers: null
500+
reward_manager: null
501+
enable: null
502+
enable_resource_pool: null
503+
n_gpus_per_node: null
504+
nnodes: null
505+
reward_loop_source: null
506+
reward_loop_module_path: null
507+
reward_loop_class_name: null
508+
rollout:
509+
name: null
510+
dtype: null
511+
gpu_memory_utilization: null
512+
enforce_eager: null
513+
cudagraph_capture_sizes: null
514+
free_cache_engine: null
515+
data_parallel_size: null
516+
expert_parallel_size: null
517+
tensor_model_parallel_size: null
518+
max_num_batched_tokens: null
519+
max_model_len: null
520+
max_num_seqs: null
521+
load_format: null
522+
engine_kwargs: null
523+
limit_images: null
524+
enable_chunked_prefill: null
525+
enable_prefix_caching: null
526+
disable_log_stats: null
527+
skip_tokenizer_init: null
528+
prompt_length: null
529+
response_length: null
530+
sandbox_fusion:
531+
url: null
532+
max_concurrent: null
533+
memory_limit_mb: null
495534
reward:
496535
num_workers: 8
497536
custom_reward_function:

verl/trainer/config/_generated_ppo_veomni_trainer.yaml

Lines changed: 39 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -487,6 +487,45 @@ critic:
487487
_target_: verl.utils.profiler.config.TorchMemoryToolConfig
488488
trace_alloc_max_entries: ${oc.select:global_profiler.global_tool_config.torch_memory.trace_alloc_max_entries,100000}
489489
stack_depth: ${oc.select:global_profiler.global_tool_config.torch_memory.stack_depth,32}
490+
custom_reward_function:
491+
path: null
492+
name: null
493+
reward_model:
494+
num_workers: null
495+
reward_manager: null
496+
enable: null
497+
enable_resource_pool: null
498+
n_gpus_per_node: null
499+
nnodes: null
500+
reward_loop_source: null
501+
reward_loop_module_path: null
502+
reward_loop_class_name: null
503+
rollout:
504+
name: null
505+
dtype: null
506+
gpu_memory_utilization: null
507+
enforce_eager: null
508+
cudagraph_capture_sizes: null
509+
free_cache_engine: null
510+
data_parallel_size: null
511+
expert_parallel_size: null
512+
tensor_model_parallel_size: null
513+
max_num_batched_tokens: null
514+
max_model_len: null
515+
max_num_seqs: null
516+
load_format: null
517+
engine_kwargs: null
518+
limit_images: null
519+
enable_chunked_prefill: null
520+
enable_prefix_caching: null
521+
disable_log_stats: null
522+
skip_tokenizer_init: null
523+
prompt_length: null
524+
response_length: null
525+
sandbox_fusion:
526+
url: null
527+
max_concurrent: null
528+
memory_limit_mb: null
490529
reward:
491530
num_workers: 8
492531
custom_reward_function:
Lines changed: 42 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,42 @@
1+
custom_reward_function:
2+
path: null
3+
name: null
4+
5+
reward_model:
6+
num_workers: null
7+
reward_manager: null
8+
enable: null
9+
enable_resource_pool: null
10+
n_gpus_per_node: null
11+
nnodes: null
12+
reward_loop_source: null
13+
reward_loop_module_path: null
14+
reward_loop_class_name: null
15+
rollout:
16+
name: null
17+
dtype: null
18+
gpu_memory_utilization: null
19+
enforce_eager: null
20+
cudagraph_capture_sizes: null
21+
free_cache_engine: null
22+
data_parallel_size: null
23+
expert_parallel_size: null
24+
tensor_model_parallel_size: null
25+
max_num_batched_tokens: null
26+
max_model_len: null
27+
max_num_seqs: null
28+
load_format: null
29+
engine_kwargs: null
30+
limit_images: null
31+
enable_chunked_prefill: null
32+
enable_prefix_caching: null
33+
disable_log_stats: null
34+
skip_tokenizer_init: null
35+
36+
prompt_length: null
37+
response_length: null
38+
39+
sandbox_fusion:
40+
url: null
41+
max_concurrent: null
42+
memory_limit_mb: null

verl/trainer/config/ppo_megatron_trainer.yaml

Lines changed: 2 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -15,6 +15,8 @@ defaults:
1515
- model@actor_rollout_ref.model: hf_model
1616
# Critic model config.
1717
- critic@critic: megatron_critic
18+
# legacy reward impl config, for backward compatibility
19+
- legacy_reward_impl
1820
# Reward model config.
1921
- reward@reward: reward
2022
# Rollout correction config.

verl/trainer/config/ppo_trainer.yaml

Lines changed: 3 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -29,6 +29,9 @@ defaults:
2929
# Critic model config.
3030
- critic@critic: ${model_engine}_critic
3131

32+
# legacy reward impl config, for backward compatibility
33+
- legacy_reward_impl
34+
3235
# Reward config.
3336
- reward@reward: reward
3437

verl/trainer/main_ppo.py

Lines changed: 2 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -23,6 +23,7 @@
2323
from omegaconf import OmegaConf
2424

2525
from verl.experimental.dataset.sampler import AbstractSampler
26+
from verl.experimental.reward_loop import migrate_legacy_reward_impl
2627
from verl.trainer.constants_ppo import get_ppo_ray_runtime_env
2728
from verl.trainer.ppo.ray_trainer import RayPPOTrainer
2829
from verl.trainer.ppo.utils import need_critic, need_reference_policy
@@ -40,7 +41,7 @@ def main(config):
4041
"""
4142
# Automatically set `config.trainer.device = npu` when running on Ascend NPU.
4243
auto_set_device(config)
43-
44+
config = migrate_legacy_reward_impl(config)
4445
run_ppo(config)
4546

4647

0 commit comments

Comments
 (0)