# Conversation fixture used by the reproduction requests below.
# Key order inside each message is kept exactly as written, because it
# determines the order in which the fields are serialized.
m = [
    # System turn: a short instruction followed by 1000 repetitions of "hi "
    # to make the prompt long.
    {
        "role": "system",
        "content": f"You are a helpful assistant.{'hi ' * 1000}",
    },
    # User turn.
    {
        "role": "user",
        "content": "Issue Description.",
    },
    # Assistant turn that carries text content plus a single tool call.
    {
        "content": "Reflection.",
        "role": "assistant",
        "tool_calls": [
            {
                "function": {
                    # JSON-encoded arguments for the "bash" tool.
                    "arguments": '{"command": "ls -la"}',
                    "name": "bash",
                },
                "id": "chatcmpl-tool-81320909e20aa185",
                "type": "function",
            },
        ],
    },
]
import json
from pyexpat.errors import messages
from litellm import completion
# Model identifier and endpoint shared by both requests.
MODEL = "huggingface/Qwen/Qwen3-30B-A3B-Instruct-2507"
API_BASE = "http://localhost:32511/v1"  # Use your LLM-d endpoint

# Send two requests and print the reported token usage after each:
#   1. only the first two messages of `m` (system + user),
#   2. the full conversation, including the assistant tool-call turn.
for conversation in (m[0:2], m):
    response = completion(
        model=MODEL,
        api_base=API_BASE,
        messages=conversation,
        # max_tokens=1,
        temperature=0,
    )
    print(response.get("usage", {}))
# Extract the log lines that mention token IDs from each component's log
# into separate files, so the two can be compared afterwards.
# -F treats each pattern as a fixed string: neither pattern needs regex
# features, and the double quotes can be written literally inside single
# quotes instead of relying on a backslash escape.
grep -F 'prompt_token_ids' modelserving_pods.log > vllm.txt
grep -F '"tokens"' epp_pods.log > epp.txt
from transformers import AutoTokenizer
# Load the tokenizer for the model under test.
model_name = "Qwen/Qwen3-30B-A3B-Instruct-2507"
# NOTE(review): trust_remote_code=True permits running code shipped in the
# model repository — confirm it is actually required for this tokenizer.
tokenizer = AutoTokenizer.from_pretrained(model_name, trust_remote_code=True)

# Decode each token-ID sequence back to text and print it on a single line
# (real newlines are shown as the two characters "\n") so the two outputs
# can be compared.
# NOTE(review): epp_tokens_int and vllm_tokens_int are not defined anywhere
# in the visible snippet — presumably they are lists of integer token IDs
# parsed from epp.txt and vllm.txt; define them before running this.
epp_text = tokenizer.decode(epp_tokens_int)
print(epp_text.replace("\n", "\\n"))
vllm_text = tokenizer.decode(vllm_tokens_int)
print(vllm_text.replace("\n", "\\n"))
My scenario
https://gist.github.com/shashwatj07/55c00e0b1de3adc5d8f498b054a6f5a0
Script to reproduce
Observe the anomaly:
I have a sample here: https://www.diffchecker.com/xwBgEXXF/