diff --git a/src/llmperf/ray_clients/openai_chat_completions_client.py b/src/llmperf/ray_clients/openai_chat_completions_client.py
index f2e0a91..1798995 100644
--- a/src/llmperf/ray_clients/openai_chat_completions_client.py
+++ b/src/llmperf/ray_clients/openai_chat_completions_client.py
@@ -89,7 +89,8 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
                         raise RuntimeError(data["error"]["message"])
 
                     delta = data["choices"][0]["delta"]
-                    if delta.get("content", None):
+                    content = delta.get("content") or delta.get("reasoning_content")
+                    if content:
                         if not ttft:
                             ttft = time.monotonic() - start_time
                             time_to_next_token.append(ttft)
@@ -98,7 +99,7 @@ def llm_request(self, request_config: RequestConfig) -> Dict[str, Any]:
                                 time.monotonic() - most_recent_received_token_time
                             )
                         most_recent_received_token_time = time.monotonic()
-                        generated_text += delta["content"]
+                        generated_text += content
 
         total_request_time = time.monotonic() - start_time
         output_throughput = tokens_received / total_request_time
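
For context, OpenAI-compatible reasoning endpoints (e.g. DeepSeek-R1 served behind vLLM) stream the model's chain of thought in a separate `reasoning_content` delta field before any `content` arrives, so the old `delta.get("content", None)` check skipped those chunks and inflated TTFT and inter-token latencies. Below is a minimal sketch of the behavior the patched check handles; the delta payloads are illustrative, not captured from a real server:

```python
import json

# Illustrative stream from an OpenAI-compatible reasoning endpoint:
# the chain of thought arrives in "reasoning_content" deltas before
# any "content" delta. These payloads are made up for the sketch.
chunks = [
    b'{"choices": [{"delta": {"reasoning_content": "Let me think."}}]}',
    b'{"choices": [{"delta": {"reasoning_content": " 2 + 2 = 4."}}]}',
    b'{"choices": [{"delta": {"content": "The answer is 4."}}]}',
]

generated_text = ""
for chunk in chunks:
    delta = json.loads(chunk)["choices"][0]["delta"]
    # Old check: delta.get("content", None) is falsy for the first two
    # chunks, so TTFT would only be recorded on the final delta.
    content = delta.get("content") or delta.get("reasoning_content")
    if content:
        generated_text += content

print(generated_text)
# Let me think. 2 + 2 = 4.The answer is 4.
```

Using `or` assumes a given chunk carries one field at a time, which matches how these servers stream; a chunk with an empty `content` string also falls through to `reasoning_content` rather than being dropped.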