4 changes: 2 additions & 2 deletions examples/config.yaml
@@ -24,7 +24,7 @@ algorithm:
   use_kl_loss: true
   kl_penalty: low_var_kl
   kl_coef: 1.0e-2
-  online_filtering: false
+  online_filtering: false # dapo filter groups
 
 worker:
   actor:
@@ -85,6 +85,7 @@ trainer:
   logger: ["console", "wandb"]
   nnodes: 1
   n_gpus_per_node: 8
+  max_try_make_batch: 20 # -1 means no limit
   val_freq: 5 # -1 to disable
   val_before_train: true
   val_only: false
@@ -94,4 +95,3 @@ trainer:
   save_model_only: false
   save_checkpoint_path: null
   load_checkpoint_path: null
-  max_try_make_batch: 10 # -1 means no limit
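
For context: with online filtering (DAPO-style dynamic sampling), prompt groups that give no learning signal are dropped, so the trainer may need several generation rounds to fill one training batch; `max_try_make_batch` caps those rounds. Below is a minimal sketch of such a loop under that assumption; the helper names (`generate_fn`, `keep_group`, `make_filtered_batch`) are illustrative, not this repo's actual API.

from typing import Callable, Dict, List


def make_filtered_batch(
    generate_fn: Callable[[], List[Dict]],  # one mini rollout round
    keep_group: Callable[[Dict], bool],     # e.g. rewards in group not all identical
    batch_size: int,
    max_try_make_batch: int = 20,           # -1 means no limit
) -> List[Dict]:
    kept: List[Dict] = []
    num_try = 0
    while len(kept) < batch_size:
        if max_try_make_batch != -1 and num_try >= max_try_make_batch:
            raise RuntimeError("could not fill a batch within max_try_make_batch rounds")
        num_try += 1
        # keep only groups that pass the online filter
        kept.extend(group for group in generate_fn() if keep_group(group))
    return kept[:batch_size]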
20 changes: 20 additions & 0 deletions examples/qwen2_5_vl_7b_geo3k_dapo.sh
@@ -0,0 +1,20 @@
+#!/bin/bash
+
+set -x
+
+export PYTHONUNBUFFERED=1
+
+MODEL_PATH=Qwen/Qwen2.5-VL-7B-Instruct # replace it with your local file path
+
+python3 -m verl.trainer.main \
+    config=examples/config.yaml \
+    data.train_files=hiyouga/geometry3k@train \
+    data.val_files=hiyouga/geometry3k@test \
+    data.mini_rollout_batch_size=128 \
+    worker.actor.model.model_path=${MODEL_PATH} \
+    worker.actor.clip_ratio_low=0.2 \
+    worker.actor.clip_ratio_high=0.28 \
+    algorithm.disable_kl=True \
+    algorithm.online_filtering=True \
+    trainer.experiment_name=qwen2_5_vl_7b_geo_dapo \
+    trainer.n_gpus_per_node=8
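
Note the asymmetric pair `clip_ratio_low=0.2` / `clip_ratio_high=0.28`: this is the "clip-higher" trick from the DAPO paper, which raises only the upper clipping bound so low-probability tokens can gain probability mass faster. A minimal sketch of the standard asymmetric PPO objective, for illustration only (not verl's exact implementation):

import torch

def clip_higher_loss(log_probs, old_log_probs, advantages, clip_low=0.2, clip_high=0.28):
    ratio = torch.exp(log_probs - old_log_probs)                    # importance ratio
    clipped = torch.clamp(ratio, 1.0 - clip_low, 1.0 + clip_high)   # asymmetric clip
    return -torch.minimum(ratio * advantages, clipped * advantages).mean()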
24 changes: 14 additions & 10 deletions examples/reward_function/math.py
@@ -13,28 +13,32 @@
 # limitations under the License.
 
 import re
-from typing import Dict, List
+from typing import Any, Dict, List
 
 from mathruler.grader import extract_boxed_content, grade_answer
 
 
-def format_reward(predict: str) -> float:
+def format_reward(response: str) -> float:
     pattern = re.compile(r"<think>.*</think>.*\\boxed\{.*\}.*", re.DOTALL)
-    format_match = re.fullmatch(pattern, predict)
+    format_match = re.fullmatch(pattern, response)
     return 1.0 if format_match else 0.0
 
 
-def accuracy_reward(predict: str, ground_truth: str) -> float:
-    answer = extract_boxed_content(predict)
+def accuracy_reward(response: str, ground_truth: str) -> float:
+    answer = extract_boxed_content(response)
     return 1.0 if grade_answer(answer, ground_truth) else 0.0
 
 
-def compute_score(predicts: List[str], ground_truths: List[str], format_weight: float = 0.1) -> List[Dict[str, float]]:
+def compute_score(reward_inputs: List[Dict[str, Any]]) -> List[Dict[str, float]]:
+    if not isinstance(reward_inputs, list):
+        raise ValueError("Please use `reward_type=batch` for math reward function.")
+
     scores = []
-    for predict, ground_truth in zip(predicts, ground_truths):
-        predict = re.sub(r"\s*(<|>|/)\s*", r"\1", predict)  # handle qwen2.5vl-32b format
-        format_score = format_reward(predict)
-        accuracy_score = accuracy_reward(predict, ground_truth)
+    for reward_input in reward_inputs:
+        response = re.sub(r"\s*(<|>|/)\s*", r"\1", reward_input["response"])  # handle qwen2.5vl-32b format
+        format_score = format_reward(response)
+        accuracy_score = accuracy_reward(response, reward_input["ground_truth"])
+        format_weight = reward_input.get("format_weight", 0.1)
         scores.append(
             {
                 "overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
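
After this change the math reward runs in batch mode: the caller passes one list of dicts instead of the old parallel `predicts`/`ground_truths` lists. A hypothetical call is sketched below; the printed values assume mathruler grades "42" against "42" as correct and "7" against "8" as wrong.

reward_inputs = [
    {"response": "<think>2 * 21 = 42</think> \\boxed{42}", "ground_truth": "42"},
    {"response": "<think>guessing</think> \\boxed{7}", "ground_truth": "8"},
]
scores = compute_score(reward_inputs)
print(scores[0]["overall"])  # 1.0: correct answer in the expected format
print(scores[1]["overall"])  # 0.1: format matches (weight 0.1) but the answer is wrong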
22 changes: 13 additions & 9 deletions examples/reward_function/r1v.py
@@ -13,21 +13,21 @@
 # limitations under the License.
 
 import re
-from typing import Dict
+from typing import Any, Dict
 
 from mathruler.grader import grade_answer
 
 
-def format_reward(predict: str) -> float:
+def format_reward(response: str) -> float:
     pattern = re.compile(r"<think>.*?</think>\s*<answer>.*?</answer>", re.DOTALL)
-    format_match = re.fullmatch(pattern, predict)
+    format_match = re.fullmatch(pattern, response)
     return 1.0 if format_match else 0.0
 
 
-def accuracy_reward(predict: str, ground_truth: str) -> float:
+def accuracy_reward(response: str, ground_truth: str) -> float:
     try:
-        content_match = re.search(r"<answer>(.*?)</answer>", predict)
-        given_answer = content_match.group(1).strip() if content_match else predict.strip()
+        content_match = re.search(r"<answer>(.*?)</answer>", response)
+        given_answer = content_match.group(1).strip() if content_match else response.strip()
         if grade_answer(given_answer, ground_truth.strip()):
             return 1.0
 
@@ -37,9 +37,13 @@ def accuracy_reward(predict: str, ground_truth: str) -> float:
         return 0.0
 
 
-def compute_score(predict: str, ground_truth: str, format_weight: float = 0.5) -> Dict[str, float]:
-    format_score = format_reward(predict)
-    accuracy_score = accuracy_reward(predict, ground_truth)
+def compute_score(reward_input: Dict[str, Any]) -> Dict[str, float]:
+    if not isinstance(reward_input, dict):
+        raise ValueError("Please use `reward_type=sequential` for r1v reward function.")
+
+    format_score = format_reward(reward_input["response"])
+    accuracy_score = accuracy_reward(reward_input["response"], reward_input["ground_truth"])
+    format_weight = reward_input.get("format_weight", 0.5)
     return {
         "overall": (1 - format_weight) * accuracy_score + format_weight * format_score,
         "format": format_score,
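
The r1v reward, by contrast, stays sequential: `compute_score` receives a single dict per response. A hypothetical call, with the printed value again assuming mathruler grades "42" against "42" as correct:

reward_input = {
    "response": "<think>2 * 21 = 42</think> <answer>42</answer>",
    "ground_truth": "42",
}
score = compute_score(reward_input)
print(score["overall"])  # 1.0: both the format and the accuracy checks pass
print(score["format"])   # 1.0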
4 changes: 2 additions & 2 deletions verl/trainer/config.py
@@ -110,6 +110,8 @@ class TrainerConfig:
     """number of nodes for training"""
     n_gpus_per_node: int = 8
     """number of gpus per node for training"""
+    max_try_make_batch: int = 20
+    """max number of generations for online filtering, -1 means no limit"""
     critic_warmup: int = 0
     """critic warmup steps"""
     val_freq: int = -1
@@ -130,8 +132,6 @@
     """save checkpoint path, if not specified, use `checkpoints/project_name/experiment_name`"""
     load_checkpoint_path: Optional[str] = None
     """load checkpoint path"""
-    max_try_make_batch: int = 10
-    """max number of generations for online filter, -1 means no limit"""
 
     def post_init(self):
         if self.save_checkpoint_path is None:
8 changes: 6 additions & 2 deletions verl/trainer/core_algos.py
@@ -453,15 +453,19 @@ def compute_value_loss(
     return vf_loss, vf_clipfrac
 
 
-def compute_kl(log_probs: torch.FloatTensor, ref_log_probs: torch.FloatTensor, kl_penalty: str) -> torch.Tensor:
+def compute_kl(
+    log_probs: torch.FloatTensor,
+    ref_log_probs: torch.FloatTensor,
+    kl_penalty: Literal["kl", "abs", "mse", "low_var_kl", "full"],
+) -> torch.Tensor:
     """Compute KL divergence given log_probs and ref_log_probs.
 
     Adapted from https://github.com/huggingface/trl/blob/v0.11.0/trl/trainer/ppo_trainer.py#L1150
 
     Args:
         log_probs: torch.Tensor
         ref_log_probs: torch.Tensor
-        kl_penalty: str
+        kl_penalty: str ("kl", "abs", "mse", "low_var_kl", "full")
 
     Returns:
         kl_div: torch.Tensor
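
For reference, the five `kl_penalty` options correspond to standard per-token KL estimators. The sketch below shows their conventional definitions (the k1/k2/k3 naming follows Schulman's KL-approximation note); verl's exact code may differ in details such as the clamping constants.

import torch

def kl_sketch(log_probs: torch.Tensor, ref_log_probs: torch.Tensor, kl_penalty: str) -> torch.Tensor:
    if kl_penalty == "kl":           # k1: plain log-ratio
        return log_probs - ref_log_probs
    if kl_penalty == "abs":          # absolute log-ratio
        return (log_probs - ref_log_probs).abs()
    if kl_penalty == "mse":          # k2: half squared log-ratio
        return 0.5 * (log_probs - ref_log_probs).square()
    if kl_penalty == "low_var_kl":   # k3: non-negative, low-variance estimator
        kl = ref_log_probs - log_probs
        return torch.clamp(kl.exp() - kl - 1.0, min=-10.0, max=10.0)
    if kl_penalty == "full":         # needs full logits, not per-token log-probs
        raise NotImplementedError("'full' requires the complete distributions")
    raise ValueError(f"unknown kl_penalty: {kl_penalty}")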