Skip to content
Open
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
24 changes: 24 additions & 0 deletions trl/scripts/vllm_serve.py
Original file line number Diff line number Diff line change
Expand Up @@ -218,6 +218,14 @@ class ScriptArguments:
speculative_config (`str`, *optional*):
JSON string for vLLM speculative decoding config, forwarded to `LLM(speculative_config=...)`. When unset,
speculative decoding is disabled. Example: `'{"method": "qwen3_next_mtp", "num_speculative_tokens": 5}'`.
reasoning_parser (`str`, *optional*):
Reasoning parser for thinking models (e.g., `"qwen3"`, `"deepseek_r1"`). When set, vLLM configures a
`ReasoningConfig` on the engine so that `thinking_token_budget` can be used in `generation_kwargs`.
reasoning_config (`str`, *optional*):
JSON string for vLLM reasoning config specifying thinking delimiters, forwarded to
`LLM(reasoning_config=...)`. Example:
`'{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}'`. Optional when `reasoning_parser` is set:
in that case vLLM infers the delimiters from the parser.
"""

model: str = field(
Expand Down Expand Up @@ -329,6 +337,20 @@ class ScriptArguments:
'Example: \'{"method": "qwen3_next_mtp", "num_speculative_tokens": 5}\''
},
)
reasoning_parser: str | None = field(
default=None,
metadata={
"help": "Reasoning parser for thinking models (e.g., 'qwen3', 'deepseek_r1'). "
"Enables vLLM's ReasoningConfig so that thinking_token_budget works in generation_kwargs."
},
)
reasoning_config: str | None = field(
default=None,
metadata={
"help": "JSON string for vLLM reasoning config. "
'Example: \'{"reasoning_start_str": "<think>", "reasoning_end_str": "</think>"}\''
},
)


def llm_worker(
Expand Down Expand Up @@ -362,6 +384,8 @@ def llm_worker(
# Important so temperature scaling/logit tweaking affects the TIS log probs
logprobs_mode="processed_logprobs",
speculative_config=json.loads(script_args.speculative_config) if script_args.speculative_config else None,
reasoning_config=json.loads(script_args.reasoning_config) if script_args.reasoning_config else None,
reasoning_parser=script_args.reasoning_parser,
)

# Send ready signal to parent process
Expand Down
Loading