Skip to content

Commit 327590e

Browse files
committed
Enhance Gemma 4 documentation for thinking mode and function calling configuration
1 parent f3ceb0d commit 327590e

File tree

1 file changed

+39
-13
lines changed

1 file changed

+39
-13
lines changed

Google/Gemma4.md

Lines changed: 39 additions & 13 deletions
Original file line numberDiff line numberDiff line change
@@ -478,7 +478,11 @@ print(outputs[0].outputs[0].text)
478478

479479
## Thinking / Reasoning Mode
480480

481-
Gemma 4 supports structured thinking, where the model can reason step-by-step before producing a final answer. The reasoning process is exposed via the `reasoning_content` field in the API response.
481+
Gemma 4 supports structured thinking, where the model can reason step-by-step before producing a final answer. The reasoning process is exposed via the `reasoning` field in the API response (vLLM 0.18+) or `reasoning_content` in older vLLM versions.
482+
483+
> ⚠️ **Important: Required Configuration**
484+
>
485+
> When using thinking mode, you **must** include `"skip_special_tokens": False` in the `extra_body` parameter. Without this setting, the reasoning special tokens will be stripped and the thinking output will not be properly captured.
482486
483487
### Launch Server with Thinking Support
484488

@@ -507,16 +511,18 @@ response = client.chat.completions.create(
507511
],
508512
max_tokens=4096,
509513
extra_body={
510-
"chat_template_kwargs": {"enable_thinking": True}
514+
"chat_template_kwargs": {"enable_thinking": True},
515+
"skip_special_tokens": False
511516
}
512517
)
513518

514519
message = response.choices[0].message
515520

516-
# The thinking process is in reasoning_content
517-
if hasattr(message, "reasoning_content") and message.reasoning_content:
521+
# Get reasoning content (vLLM 0.18+ uses 'reasoning', older versions use 'reasoning_content')
522+
reasoning = getattr(message, "reasoning", None) or getattr(message, "reasoning_content", None)
523+
if reasoning:
518524
print("=== Thinking ===")
519-
print(message.reasoning_content)
525+
print(reasoning)
520526

521527
print("\n=== Answer ===")
522528
print(message.content)
@@ -533,7 +539,10 @@ curl http://localhost:8000/v1/chat/completions \
533539
{"role": "user", "content": "What is the derivative of x^3 * ln(x)?"}
534540
],
535541
"max_tokens": 4096,
536-
"chat_template_kwargs": {"enable_thinking": true}
542+
"chat_template_kwargs": {
543+
"enable_thinking": true
544+
},
545+
"skip_special_tokens": false
537546
}'
538547
```
539548

@@ -545,6 +554,10 @@ curl http://localhost:8000/v1/chat/completions \
545554

546555
Gemma 4 supports function calling with a dedicated tool-call protocol using custom special tokens (`<|tool_call|>`, `<|/tool_call|>`, etc.).
547556

557+
> ⚠️ **Important: Required Configuration**
558+
>
559+
> When using function calling, you **must** include `"skip_special_tokens": False` in the `extra_body` parameter. Without this setting, the tool-call special tokens will be stripped and function calling will not work properly.
560+
548561
### Launch Server with Tool Calling
549562

550563
```bash
@@ -599,7 +612,10 @@ response = client.chat.completions.create(
599612
{"role": "user", "content": "What is the weather in Tokyo today?"}
600613
],
601614
tools=tools,
602-
max_tokens=1024
615+
max_tokens=1024,
616+
extra_body={
617+
"skip_special_tokens": False
618+
}
603619
)
604620

605621
message = response.choices[0].message
@@ -623,7 +639,10 @@ if message.tool_calls:
623639
}
624640
],
625641
tools=tools,
626-
max_tokens=1024
642+
max_tokens=1024,
643+
extra_body={
644+
"skip_special_tokens": False
645+
}
627646
)
628647

629648
print(f"\nFinal answer: {response.choices[0].message.content}")
@@ -642,7 +661,8 @@ response = client.chat.completions.create(
642661
tools=tools,
643662
max_tokens=4096,
644663
extra_body={
645-
"chat_template_kwargs": {"enable_thinking": True}
664+
"chat_template_kwargs": {"enable_thinking": True},
665+
"skip_special_tokens": False
646666
}
647667
)
648668
```
@@ -671,7 +691,10 @@ response = client.chat.completions.create(
671691
}
672692
],
673693
tools=tools,
674-
max_tokens=1024
694+
max_tokens=1024,
695+
extra_body={
696+
"skip_special_tokens": False
697+
}
675698
)
676699
```
677700

@@ -826,15 +849,18 @@ response = client.chat.completions.create(
826849
},
827850
max_tokens=4096,
828851
extra_body={
829-
"chat_template_kwargs": {"enable_thinking": True}
852+
"chat_template_kwargs": {"enable_thinking": True},
853+
"skip_special_tokens": False
830854
}
831855
)
832856

833857
message = response.choices[0].message
834858

835-
if hasattr(message, "reasoning_content") and message.reasoning_content:
859+
# Get reasoning content (vLLM 0.18+ uses 'reasoning', older versions use 'reasoning_content')
860+
reasoning = getattr(message, "reasoning", None) or getattr(message, "reasoning_content", None)
861+
if reasoning:
836862
print("=== Thinking ===")
837-
print(message.reasoning_content)
863+
print(reasoning)
838864

839865
print("\n=== Structured Output ===")
840866
print(message.content)

0 commit comments

Comments (0)