Skip to content

Commit 956675c

Browse files
committed
refactor
1 parent 2fbb485 commit 956675c

File tree

8 files changed

+82
-52
lines changed

8 files changed

+82
-52
lines changed

examples/train/grpo/external/vllm_gym.sh

Lines changed: 0 additions & 1 deletion
Original file line number | Diff line number | Diff line change
@@ -3,7 +3,6 @@
33
# CUDA_VISIBLE_DEVICES=7 \
44
# swift rollout \
55
# --model Qwen/Qwen2.5-3B-Instruct \
6-
# --model_type qwen2_5\
76
# --max_turns 3\
87
# --multi_turn_scheduler gym_scheduler \
98
# --use_gym_env true \

examples/train/grpo/plugin/deepeyes/deepeyes_plugin.py

Lines changed: 2 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -187,8 +187,8 @@ def rule_math_verify(ground_truth, model_answer):
187187

188188
class DeepEyesReward(ORM):
189189

190-
def __init__(self):
191-
super().__init__()
190+
def __init__(self, args, **kwargs):
191+
super().__init__(args)
192192
try:
193193
self.client = OpenAI(
194194
api_key='EMPTY',

examples/train/grpo/plugin/plugin.py

Lines changed: 14 additions & 7 deletions
Original file line number | Diff line number | Diff line change
@@ -154,7 +154,8 @@ class MultiTurnThinkingTips(ORM):
154154
function **must return an identical reward for every fragment**
155155
"""
156156

157-
def __init__(self):
157+
def __init__(self, args=None, **kwargs):
158+
super().__init__(args)
158159
from swift.rewards.orm import MathAccuracy
159160
self.acc_func = MathAccuracy()
160161

@@ -183,7 +184,8 @@ def __call__(self, completions, **kwargs) -> List[float]:
183184
# ref implementation: https://github.com/huggingface/open-r1/blob/main/src/open_r1/rewards.py
184185
class CodeReward(ORM):
185186

186-
def __init__(self):
187+
def __init__(self, args=None, **kwargs):
188+
super().__init__(args)
187189
import importlib.util
188190
assert importlib.util.find_spec('e2b') is not None, (
189191
"The e2b package is required but not installed. Please install it using 'pip install e2b-code-interpreter'."
@@ -368,7 +370,8 @@ class CodeRewardByJudge0(ORM):
368370
}
369371
PYTHON_ID = 71
370372

371-
def __init__(self):
373+
def __init__(self, args, **kwargs):
374+
super().__init__(args)
372375
self.endpoint = os.getenv('JUDGE0_ENDPOINT')
373376
assert self.endpoint is not None, (
374377
'Judge0 endpoint is not set. Please set the JUDGE0_ENDPOINT environment variable.')
@@ -488,7 +491,8 @@ class AsyncGenRMReward(AsyncORM):
488491
```
489492
"""
490493

491-
def __init__(self):
494+
def __init__(self, args, **kwargs):
495+
super().__init__(args)
492496
from openai import OpenAI
493497
self.api_base = os.getenv('GENRM_API_BASE', 'http://localhost:8000/v1')
494498
self.temperature = float(os.getenv('GENRM_TEMPERATURE', '0.3'))
@@ -637,7 +641,8 @@ async def __call__(self, completions, messages, **kwargs) -> List[float]:
637641
# COARSEREWARD -> Coarse, INTERMEDIATEREWARD -> Intermediate, REFINEDREWARD -> Finegrained
638642
class ToolUseFormatReward(ORM):
639643

640-
def __init__(self):
644+
def __init__(self, args=None, **kwargs):
645+
super().__init__(args)
641646
self.format_max_possible = 1.0
642647
self.format_min_possible = 0.0
643648

@@ -700,7 +705,8 @@ def __call__(self, completions, solution, **kwargs) -> List[float]:
700705

701706
class ToolUseLengthReward(ORM):
702707

703-
def __init__(self):
708+
def __init__(self, args=None, **kwargs):
709+
super().__init__(args)
704710
self.length_max_possible = 1.0
705711
self.length_min_possible = 0.0
706712

@@ -739,7 +745,8 @@ def __call__(self, completions, solution, **kwargs):
739745

740746
class ToolUseCorrectnessReward(ORM):
741747

742-
def __init__(self):
748+
def __init__(self, args=None, **kwargs):
749+
super().__init__(args)
743750
if str(os.getenv('CORRECTMAX1', 0)) == '1':
744751
self.tool_max_possible = 1.0
745752
self.tool_min_possible = -1.0

swift/rewards/orm.py

Lines changed: 30 additions & 22 deletions
Original file line number | Diff line number | Diff line change
@@ -4,10 +4,13 @@
44
import json
55
import os
66
import re
7-
from typing import Dict, List, Union
7+
from typing import TYPE_CHECKING, Dict, List, Optional, Union
88

99
from swift.infer_engine import InferRequest
1010

11+
if TYPE_CHECKING:
12+
from swift.rlhf_trainers import GRPOConfig
13+
1114

1215
class ORM:
1316
"""Base class for synchronous outcome reward models (ORM).
@@ -20,6 +23,9 @@ def __call__(self, completions, **kwargs) -> List[float]:
2023
return [1.0 if len(c) > 100 else 0.0 for c in completions]
2124
"""
2225

26+
def __init__(self, args: Optional['GRPOConfig'] = None, **kwargs):
27+
self.args = args
28+
2329
def __call__(self, **kwargs) -> List[float]:
2430
raise NotImplementedError
2531

@@ -52,13 +58,17 @@ async def score_single(session, text):
5258
return list(rewards)
5359
"""
5460

61+
def __init__(self, args: Optional['GRPOConfig'] = None, **kwargs):
62+
self.args = args
63+
5564
async def __call__(self, **kwargs) -> List[float]:
5665
raise NotImplementedError
5766

5867

5968
class MathAccuracy(ORM):
6069

61-
def __init__(self):
70+
def __init__(self, args=None, **kwargs):
71+
super().__init__(args, **kwargs)
6272
import importlib.util
6373
assert importlib.util.find_spec('math_verify') is not None, (
6474
'The math_verify package is required but not installed. '
@@ -129,18 +139,13 @@ def __call__(self, completions, **kwargs) -> List[float]:
129139

130140
class CosineReward(ORM):
131141
# https://arxiv.org/abs/2502.03373
132-
def __init__(self,
133-
cosine_min_len_value_wrong: float = -0.5,
134-
cosine_max_len_value_wrong: float = 0.0,
135-
cosine_min_len_value_correct: float = 1.0,
136-
cosine_max_len_value_correct: float = 0.5,
137-
cosine_max_len: int = 1000,
138-
accuracy_orm=None):
139-
self.min_len_value_wrong = cosine_min_len_value_wrong
140-
self.max_len_value_wrong = cosine_max_len_value_wrong
141-
self.min_len_value_correct = cosine_min_len_value_correct
142-
self.max_len_value_correct = cosine_max_len_value_correct
143-
self.max_len = cosine_max_len
142+
def __init__(self, args: Optional['GRPOConfig'] = None, accuracy_orm=None):
143+
super().__init__(args)
144+
self.min_len_value_wrong = args.cosine_min_len_value_wrong
145+
self.max_len_value_wrong = args.cosine_max_len_value_wrong
146+
self.min_len_value_correct = args.cosine_min_len_value_correct
147+
self.max_len_value_correct = args.cosine_max_len_value_correct
148+
self.max_len = args.cosine_max_len
144149
self.accuracy_orm = accuracy_orm or MathAccuracy()
145150

146151
@staticmethod
@@ -169,9 +174,10 @@ def __call__(self, completions, solution, **kwargs) -> List[float]:
169174

170175
class RepetitionPenalty(ORM):
171176
# https://arxiv.org/abs/2502.03373
172-
def __init__(self, repetition_n_grams: int = 3, repetition_max_penalty: float = -1.0):
173-
self.ngram_size = repetition_n_grams
174-
self.max_penalty = repetition_max_penalty
177+
def __init__(self, args: Optional['GRPOConfig'] = None, **kwargs):
178+
super().__init__(args)
179+
self.ngram_size = args.repetition_n_grams
180+
self.max_penalty = args.repetition_max_penalty
175181

176182
@staticmethod
177183
def zipngram(text: str, ngram_size: int):
@@ -208,10 +214,11 @@ def __call__(self, completions, **kwargs) -> List[float]:
208214

209215
class SoftOverlong(ORM):
210216

211-
def __init__(self, soft_max_length, soft_cache_length):
212-
assert soft_cache_length < soft_max_length
213-
self.soft_max_length = soft_max_length
214-
self.soft_cache_length = soft_cache_length
217+
def __init__(self, args: Optional['GRPOConfig'] = None, **kwargs):
218+
super().__init__(args)
219+
assert args.soft_cache_length < args.soft_max_length
220+
self.soft_max_length = args.soft_max_length
221+
self.soft_cache_length = args.soft_cache_length
215222

216223
def __call__(self, completions, **kwargs) -> List[float]:
217224
rewards = []
@@ -369,7 +376,8 @@ def evaluate_rougel(cand_list: list, ref_list: list):
369376

370377
class MathORM(ORM):
371378

372-
def __init__(self):
379+
def __init__(self, args=None, **kwargs):
380+
super().__init__(args)
373381
from transformers.utils import strtobool
374382
self.use_opencompass = strtobool(os.environ.get('USE_OPENCOMPASS_EVALUATOR', 'False'))
375383
if self.use_opencompass:

swift/rlhf_trainers/arguments.py

Lines changed: 14 additions & 4 deletions
Original file line number | Diff line number | Diff line change
@@ -1,13 +1,23 @@
11
# Copyright (c) ModelScope Contributors. All rights reserved.
2+
import trl
23
from dataclasses import dataclass
4+
from packaging import version
35
from transformers.utils.versions import require_version
4-
from trl import CPOConfig as HfCPOConfig
6+
7+
if version.parse(trl.__version__) <= version.parse('0.28'):
8+
from trl import CPOConfig as HfCPOConfig
9+
from trl import GKDConfig as HfGKDConfig
10+
from trl import ORPOConfig as HfORPOConfig
11+
from trl import PPOConfig as HfPPOConfig
12+
else:
13+
from trl.experimental.cpo import CPOConfig as HfCPOConfig
14+
from trl.experimental.gkd import GKDConfig as HfGKDConfig
15+
from trl.experimental.orpo import ORPOConfig as HfORPOConfig
16+
from trl.experimental.ppo import PPOConfig as HfPPOConfig
17+
518
from trl import DPOConfig as HfDPOConfig
6-
from trl import GKDConfig as HfGKDConfig
719
from trl import GRPOConfig as HfGRPOConfig
820
from trl import KTOConfig as HfKTOConfig
9-
from trl import ORPOConfig as HfORPOConfig
10-
from trl import PPOConfig as HfPPOConfig
1121
from trl import RewardConfig as HfRewardConfig
1222
from typing import Optional
1323

swift/rlhf_trainers/grpo_trainer.py

Lines changed: 8 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -943,7 +943,10 @@ def _prepare_batch_inputs(self, inputs: DataType) -> List[DataType]:
943943
# rollout_logprobs is List[List[float]] - nested list where each inner list corresponds to
944944
# one assistant response turn. We need to align these with completion_mask positions.
945945
batch_encoded_inputs['rollout_per_token_logps'] = None
946-
if self.use_fast_infer:
946+
should_compute_rollout_logprobs = (
947+
self.rollout_importance_sampling_mode is not None or self.log_rollout_offpolicy_metrics)
948+
949+
if self.use_fast_infer and should_compute_rollout_logprobs:
947950
rollout_logprobs_list = []
948951
for data in batch:
949952
if 'rollout_logprobs' in data and data['rollout_logprobs']:
@@ -2206,14 +2209,7 @@ def _prepare_rewards(self, reward_funcs, reward_model=None, reward_templates=Non
22062209
for i, reward_func in enumerate(reward_funcs):
22072210
if reward_func in orms:
22082211
reward_func_class = orms[reward_func]
2209-
reward_func_args = list(inspect.signature(reward_func_class.__init__).parameters)
2210-
reward_func_kwargs = {
2211-
key: getattr(args, key)
2212-
for key in reward_func_args if key not in ['self', 'args', 'kwargs'] and hasattr(args, key)
2213-
}
2214-
if 'tokenizer' in reward_func_args:
2215-
reward_func_kwargs['tokenizer'] = self.processing_class
2216-
reward_funcs[i] = reward_func_class(**reward_func_kwargs)
2212+
reward_funcs[i] = reward_func_class(args=args)
22172213
elif not callable(reward_func):
22182214
raise ValueError(f'reward_function {reward_func} is not implemented in swift.rewards')
22192215

@@ -2247,6 +2243,9 @@ def _prepare_rewards(self, reward_funcs, reward_model=None, reward_templates=Non
22472243
self.reward_funcs.append(rm)
22482244
self.reward_func_names.append(rm.config._name_or_path.split('/')[-1])
22492245

2246+
if self.use_gym_env and not self.reward_func_names:
2247+
self.reward_func_names = ['gym_reward']
2248+
22502249
# Reward weights
22512250
if args.reward_weights is not None:
22522251
if len(args.reward_weights) != len(reward_funcs):

swift/rlhf_trainers/rollout_mixin.py

Lines changed: 0 additions & 2 deletions
Original file line number | Diff line number | Diff line change
@@ -167,8 +167,6 @@ def _prepare_vllm(self):
167167
self.use_gym_env = broadcast_object_list(use_gym_env, from_process=0)[0]
168168
self.enable_server_multi_turn = broadcast_object_list(enable_multi_turn, from_process=0)[0]
169169
self.rollout_enable_lora = broadcast_object_list(enable_lora, from_process=0)[0]
170-
if self.use_gym_env:
171-
self.reward_func_names = ['gym_reward']
172170

173171
elif self.vllm_mode == 'colocate':
174172
if not self.accelerator.num_processes % self.vllm_tensor_parallel_size == 0:

swift/rlhf_trainers/utils.py

Lines changed: 14 additions & 5 deletions
Original file line number | Diff line number | Diff line change
@@ -206,7 +206,9 @@ def _patched_stateless_pg_create(
206206
patch_stateless_process_group_for_ipv6()
207207

208208

209-
def nanstd(tensor: torch.Tensor, dim: Optional[int] = None, keepdim: bool = False) -> torch.Tensor:
209+
def nanstd(tensor: torch.Tensor,
210+
dim: Optional[Union[int, tuple[int, ...]]] = None,
211+
keepdim: bool = False) -> torch.Tensor:
210212
"""
211213
Compute the standard deviation of a tensor, ignoring NaNs.
212214
@@ -215,7 +217,7 @@ def nanstd(tensor: torch.Tensor, dim: Optional[int] = None, keepdim: bool = Fals
215217
Args:
216218
tensor (`torch.Tensor`):
217219
Input tensor.
218-
dim (`int`, *optional*):
220+
dim (`int` or `tuple[int, ...]`, *optional*):
219221
Dimension to reduce. Defaults to all dimensions.
220222
keepdim (`bool`, *optional*, defaults to `False`):
221223
Whether to keep reduced dimensions.
@@ -227,13 +229,20 @@ def nanstd(tensor: torch.Tensor, dim: Optional[int] = None, keepdim: bool = Fals
227229
mean = torch.nanmean(tensor, dim=dim, keepdim=True)
228230
variance = torch.nanmean((tensor - mean)**2, dim=dim, keepdim=True)
229231
count = torch.sum(~torch.isnan(tensor), dim=dim, keepdim=True)
230-
correction = torch.where(count > 1, count / (count - 1), torch.full_like(count, float('nan')))
231-
std = torch.sqrt(variance * correction)
232+
correction = count / (count - 1)
233+
correction = torch.where(count > 1, correction, torch.full_like(correction, float('nan')))
234+
variance *= correction # Bessel's correction
235+
std = torch.sqrt(variance)
232236
if keepdim:
233237
return std
234238
if dim is None:
235239
return std.squeeze()
236-
return std.squeeze(dim)
240+
if isinstance(dim, int):
241+
return std.squeeze(dim)
242+
dims = [(d if d >= 0 else d + std.ndim) for d in dim]
243+
for d in sorted(dims, reverse=True):
244+
std = std.squeeze(d)
245+
return std
237246

238247

239248
# code borrowed from verl/verl/utils/memory_utils.py

0 commit comments

Comments (0)