
Commit 0615352

Authored by HJSang (Hejian Sang) and gemini-code-assist[bot]
[recipe] feat: Add example for gpt-oss training using agent loop (verl-project#3774)
### What does this PR do?

> Add **concise** overview of what this PR aims to achieve or accomplish. Reference related GitHub issues and PRs that help with the review.

### Checklist Before Starting

- [x] Search for similar PRs. Paste at least one query link here: ...
- [x] Format the PR title as `[{modules}] {type}: {description}` (This will be checked by the CI)
  - `{modules}` include `fsdp`, `megatron`, `sglang`, `vllm`, `rollout`, `trainer`, `ci`, `training_utils`, `recipe`, `hardware`, `deployment`, `ray`, `worker`, `single_controller`, `misc`, `perf`, `model`, `algo`, `env`, `tool`, `ckpt`, `doc`, `data`
  - If this PR involves multiple modules, separate them with `,` like `[megatron, fsdp, doc]`
  - `{type}` is in `feat`, `fix`, `refactor`, `chore`, `test`
  - If this PR breaks any API (CLI arguments, config, function signature, etc.), add `[BREAKING]` to the beginning of the title.
  - Example: `[BREAKING][fsdp, megatron] feat: dynamic batching`

### Test

TODO: run training test

> For changes that can not be tested by CI (e.g., algorithm implementation, new model support), validate by experiment(s) and show results like training curve plots, evaluation results, etc.

### API and Usage Example

> Demonstrate how the API changes if any, and provide usage example(s) if possible.

```python
# Add code snippet or script demonstrating how to use this
```

### Design & Code Changes

> Demonstrate the high-level design if this PR is complex, and list the specific changes.

### Checklist Before Submitting

> [!IMPORTANT]
> Please check all the following items before requesting a review, otherwise the reviewer might deprioritize this PR for review.

- [x] Read the [Contribute Guide](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md).
- [x] Apply [pre-commit checks](https://github.com/volcengine/verl/blob/main/CONTRIBUTING.md#code-linting-and-formatting): `pre-commit install && pre-commit run --all-files --show-diff-on-failure --color=always`
- [ ] Add / Update [the documentation](https://github.com/volcengine/verl/tree/main/docs).
- [ ] Add unit or end-to-end test(s) to [the CI workflow](https://github.com/volcengine/verl/tree/main/.github/workflows) to cover all the code. If not feasible, explain why: ...
- [ ] Once your PR is ready for CI, send a message in [the `ci-request` channel](https://verl-project.slack.com/archives/C091TCESWB1) in [the `verl` Slack workspace](https://join.slack.com/t/verl-project/shared_invite/zt-3855yhg8g-CTkqXu~hKojPCmo7k_yXTQ). (If not accessible, please try [the Feishu group (飞书群)](https://applink.larkoffice.com/client/chat/chatter/add_by_link?link_token=772jd4f1-cd91-441e-a820-498c6614126a).)

---

Co-authored-by: Hejian Sang <hsang@linkedin.com>
Co-authored-by: gemini-code-assist[bot] <176961590+gemini-code-assist[bot]@users.noreply.github.com>
1 parent 55f651c commit 0615352

File tree

2 files changed, +190 −7 lines


recipe/langgraph_agent/chat_model.py

Lines changed: 47 additions & 7 deletions
```diff
@@ -43,6 +43,20 @@
 logger.setLevel(os.getenv("VERL_LOGGING_LEVEL", "WARN"))
 
 
+def format_tool_response_manually(tool_message: dict, tool_call_name: str) -> str:
+    """Manually format tool response without using tokenizer template.
+
+    Args:
+        tool_message: Tool message dictionary with 'content' field
+        tool_call_name: Name of the tool that was called
+
+    Returns:
+        Formatted tool response string
+    """
+    content = tool_message["content"]
+    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{content}<|end|>"
+
+
 class MaxTokenExceededError(Exception):
     """Indicate that history chat messages + tool message exceeds LLM max_tokens."""
```
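For reference, the helper's output can be sketched standalone (the function body mirrors the diff above; the tool name and payload below are invented for illustration):

```python
def format_tool_response_manually(tool_message: dict, tool_call_name: str) -> str:
    """Wrap a tool result in the gpt-oss special-token format (mirrors the diff)."""
    content = tool_message["content"]
    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{content}<|end|>"

# Hypothetical tool message, for illustration only
msg = {"role": "tool", "name": "calculator", "content": '{"result": 4}'}
print(format_tool_response_manually(msg, msg["name"]))
# -> <|start|>functions.calculator to=assistant<|channel|>commentary<|message|>{"result": 4}<|end|>
```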

```diff
@@ -202,13 +216,39 @@ async def _preprocess(self, messages: list[BaseMessage], **kwargs: Any) -> tuple
 
             # encode tool response
             tool_responses = convert_to_openai_messages(messages[i + 1 :])
-            tool_response_ids = await loop.run_in_executor(
-                None,
-                lambda messages=tool_responses: self.tokenizer.apply_chat_template(
-                    messages, add_generation_prompt=True, tokenize=True
-                ),
-            )
-            tool_response_ids = tool_response_ids[len(kwargs["system_prompt"]) :]
+            if self.tool_parser == "hermes":
+                tool_response_ids = await loop.run_in_executor(
+                    None,
+                    lambda messages=tool_responses: self.tokenizer.apply_chat_template(
+                        messages, add_generation_prompt=True, tokenize=True
+                    ),
+                )
+                tool_response_ids = tool_response_ids[len(kwargs["system_prompt"]) :]
+            elif self.tool_parser == "gpt-oss":
+                # Format tool responses manually: the gpt-oss chat template requires
+                # tool call messages in order to parse tool response messages, so we
+                # cannot run apply_chat_template on the tool responses alone.
+                tool_response_texts = []
+                for tool_msg in tool_responses:
+                    if tool_msg["role"] == "tool":
+                        # Use the tool message's name if available (for multiple tool calls)
+                        actual_tool_name = tool_msg.get("name", "unknown")
+                        if actual_tool_name == "unknown":
+                            logger.error(f"actual_tool_name: {actual_tool_name}")
+                        formatted = format_tool_response_manually(tool_msg, actual_tool_name)
+                        tool_response_texts.append(formatted)
+                # Append the generation prompt manually, since this path encodes raw
+                # text and cannot rely on add_generation_prompt=True.
+                tool_response_texts.append("<|start|>assistant")
+
+                # Tokenize the manually formatted tool responses
+                tool_response_text = "".join(tool_response_texts)
+                logger.debug(f"tool_response_text: {tool_response_text}")
+
+                tool_response_ids = await loop.run_in_executor(
+                    None, lambda: self.tokenizer.encode(tool_response_text, add_special_tokens=False)
+                )
+            else:
+                raise ValueError(f"Unsupported tool parser: {self.tool_parser}")
 
             # stop generation if response length exceeds max response length
             if len(messages[i].response_metadata["response_mask"]) + len(tool_response_ids) >= self.max_tokens:
```
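The gpt-oss branch can be sketched in isolation as follows (the helper is redefined so the snippet is self-contained; the tool names and contents are invented):

```python
def format_tool_response_manually(tool_message: dict, tool_call_name: str) -> str:
    """Wrap a tool result in gpt-oss special tokens (same shape as the diff's helper)."""
    content = tool_message["content"]
    return f"<|start|>functions.{tool_call_name} to=assistant<|channel|>commentary<|message|>{content}<|end|>"

# Invented tool responses standing in for convert_to_openai_messages(...) output
tool_responses = [
    {"role": "tool", "name": "search", "content": "result A"},
    {"role": "tool", "name": "search", "content": "result B"},
]

texts = [
    format_tool_response_manually(m, m.get("name", "unknown"))
    for m in tool_responses
    if m["role"] == "tool"
]
# Stand-in for add_generation_prompt=True: open the next assistant turn manually
texts.append("<|start|>assistant")
tool_response_text = "".join(texts)

print(tool_response_text.endswith("<|start|>assistant"))  # -> True
```

The resulting string is what gets tokenized with `add_special_tokens=False`, since the special tokens are already spelled out in the text.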
Lines changed: 143 additions & 0 deletions
@@ -0,0 +1,143 @@
```bash
#!/usr/bin/env bash
#SBATCH --job-name=rl-langgraph-3B
#SBATCH --partition=main
#SBATCH --nodes=1
#SBATCH --ntasks-per-node=1
#SBATCH --cpus-per-task=64
#SBATCH --gres=gpu:4
#SBATCH --mem=0
#SBATCH --time=10:00:00
#SBATCH --output=%x_%j.out
#SBATCH --error=%x_%j.err

set -xeuo pipefail

# ================= cluster topology =================
export GPUS_PER_NODE=${SLURM_GPUS_ON_NODE:-${GPUS_PER_NODE:-2}}  # GPUs on this node
NNODES=${SLURM_JOB_NUM_NODES:-${NNODES:-1}}
export NNODES
export RAY_NUM_NODES=$NNODES

# Require at least 2 GPUs
TOTAL_GPUS=$((GPUS_PER_NODE * NNODES))
if [ "$TOTAL_GPUS" -lt 2 ]; then
    echo "Error: at least 2 GPUs are required, detected $TOTAL_GPUS." >&2
    exit 1
fi

echo "Using $NNODES nodes and $GPUS_PER_NODE GPUs per node..."

# ================= data/model/tool =================
HDFS_ROOT=${HDFS_ROOT:-$PWD}
DATA_ROOT=${DATA_ROOT:-$PWD}

# Prefer local model if present, otherwise fall back to HF hub path
model_path="lmsys/gpt-oss-20b-bf16"

# Use the default output directory produced by create_dataset.py
train_files=$DATA_ROOT/data/math_expression_tool/train.parquet
test_files=$DATA_ROOT/data/math_expression_tool/test.parquet

# Agent config
agent_loop_config_path=recipe/langgraph_agent/example/agent.yaml

# =================== wandb ===================
project_name=math_expression_tool
experiment_name=gpt-oss-20b-bf16
default_local_dir=$DATA_ROOT/checkpoint/$experiment_name

# ================= algorithm =================
adv_estimator=grpo

use_kl_in_reward=false
kl_coef=0.0
use_kl_loss=false
kl_loss_coef=0.0

clip_ratio_low=0.2
clip_ratio_high=0.28

max_turns=8
max_prompt_length=1024
max_response_length=8192
actor_lr=1e-6

train_batch_size=128
ppo_mini_batch_size=16
n_resp_per_prompt=8
n_resp_per_prompt_val=1

# =================== logging ===================
export RAY_LOGGING_LEVEL=DEBUG
export HYDRA_FULL_ERROR=1

# ================= performance =================
export NCCL_IBEXT_DISABLE=1
export NCCL_NVLS_ENABLE=1
export NCCL_IB_HCA=mlx5
export UCX_NET_DEVICES=mlx5_0:1,mlx5_1:1,mlx5_2:1,mlx5_3:1,mlx5_4:1,mlx5_5:1,mlx5_6:1,mlx5_7:1
export VLLM_USE_V1=1
export VLLM_ATTENTION_BACKEND=FLASH_ATTN

infer_tp=2   # vLLM tensor parallel size
train_sp=4   # Ulysses sequence parallel size for actor
offload=true

actor_max_token_len_per_gpu=$(( (max_prompt_length + max_response_length) * 4 ))
log_prob_max_token_len_per_gpu=$(( actor_max_token_len_per_gpu * 2 ))

train_files="['$train_files']"
test_files="['$test_files']"

python3 -m verl.trainer.main_ppo \
    algorithm.adv_estimator=$adv_estimator \
    algorithm.use_kl_in_reward=$use_kl_in_reward \
    algorithm.kl_ctrl.kl_coef=$kl_coef \
    data.train_files="$train_files" \
    data.val_files="$test_files" \
    data.return_raw_chat=true \
    data.train_batch_size=$train_batch_size \
    data.max_prompt_length=$max_prompt_length \
    data.max_response_length=$max_response_length \
    data.filter_overlong_prompts=true \
    data.truncation='error' \
    actor_rollout_ref.model.path="$model_path" \
    actor_rollout_ref.model.use_remove_padding=true \
    actor_rollout_ref.model.enable_gradient_checkpointing=true \
    actor_rollout_ref.actor.use_kl_loss=$use_kl_loss \
    actor_rollout_ref.actor.kl_loss_coef=$kl_loss_coef \
    actor_rollout_ref.actor.clip_ratio_low=$clip_ratio_low \
    actor_rollout_ref.actor.clip_ratio_high=$clip_ratio_high \
    actor_rollout_ref.actor.clip_ratio_c=10.0 \
    actor_rollout_ref.actor.optim.lr=$actor_lr \
    actor_rollout_ref.actor.use_dynamic_bsz=true \
    actor_rollout_ref.actor.ppo_mini_batch_size=$ppo_mini_batch_size \
    actor_rollout_ref.actor.ppo_max_token_len_per_gpu=$actor_max_token_len_per_gpu \
    actor_rollout_ref.actor.ulysses_sequence_parallel_size=$train_sp \
    actor_rollout_ref.actor.fsdp_config.param_offload=$offload \
    actor_rollout_ref.actor.fsdp_config.optimizer_offload=$offload \
    actor_rollout_ref.ref.log_prob_max_token_len_per_gpu=$log_prob_max_token_len_per_gpu \
    actor_rollout_ref.rollout.name=sglang \
    actor_rollout_ref.rollout.mode=async \
    actor_rollout_ref.rollout.tensor_model_parallel_size=$infer_tp \
    actor_rollout_ref.rollout.multi_turn.max_user_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.max_assistant_turns=$max_turns \
    actor_rollout_ref.rollout.multi_turn.format=gpt-oss \
    actor_rollout_ref.rollout.agent.tool_parser=gpt-oss \
    actor_rollout_ref.rollout.agent.agent_loop_config_path=$agent_loop_config_path \
    actor_rollout_ref.rollout.gpu_memory_utilization=0.7 \
    actor_rollout_ref.rollout.n=$n_resp_per_prompt \
    actor_rollout_ref.rollout.val_kwargs.top_p=1.0 \
    actor_rollout_ref.rollout.val_kwargs.temperature=1.0 \
    actor_rollout_ref.rollout.val_kwargs.n=$n_resp_per_prompt_val \
    trainer.logger='["console","wandb"]' \
    trainer.project_name=$project_name \
    trainer.experiment_name=$experiment_name \
    trainer.n_gpus_per_node="$GPUS_PER_NODE" \
    trainer.val_before_train=true \
    trainer.log_val_generations=50 \
    trainer.nnodes="$NNODES" \
    trainer.save_freq=-1 \
    trainer.default_local_dir="$default_local_dir" \
    trainer.test_freq=5 \
    trainer.total_epochs=1 "$@"
```
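As a quick sanity check on the dynamic-batching budgets in the script, the arithmetic works out as follows (plain reproduction of the shell arithmetic, not part of the recipe):

```python
# Reproduce the script's token-budget arithmetic.
max_prompt_length = 1024
max_response_length = 8192

# ppo_max_token_len_per_gpu: 4x the max sequence length per GPU
actor_max_token_len_per_gpu = (max_prompt_length + max_response_length) * 4
# log-prob recomputation gets twice the actor budget
log_prob_max_token_len_per_gpu = actor_max_token_len_per_gpu * 2

print(actor_max_token_len_per_gpu)     # -> 36864
print(log_prob_max_token_len_per_gpu)  # -> 73728
```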
