From 73ee36b90313e361e187c3b6b2877dc75e245fec Mon Sep 17 00:00:00 2001
From: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
Date: Thu, 5 Feb 2026 19:21:46 +0800
Subject: [PATCH] concatenate the data in audio_data

Signed-off-by: Junhong Liu <98734602+LJH-LBJ@users.noreply.github.com>
---
 vllm_omni/entrypoints/openai/serving_chat.py | 9 +++------
 1 file changed, 3 insertions(+), 6 deletions(-)

diff --git a/vllm_omni/entrypoints/openai/serving_chat.py b/vllm_omni/entrypoints/openai/serving_chat.py
index cfee79157e..08147da6ec 100644
--- a/vllm_omni/entrypoints/openai/serving_chat.py
+++ b/vllm_omni/entrypoints/openai/serving_chat.py
@@ -1679,12 +1679,9 @@ def _create_audio_choice(
         # OMNI: Access multimodal_output from CompletionOutput (outputs[0]), not from RequestOutput
         # Reference: examples/offline_inference/qwen3_omni/end2end.py line 421
         audio_data = final_res.outputs[0].multimodal_output.get("audio")
-        if stream:
-            audio_tensor = audio_data[-1].float().detach().cpu().numpy()
-        else:
-            if isinstance(audio_data, list):
-                audio_data = torch.cat(audio_data, dim=-1)
-            audio_tensor = audio_data.float().detach().cpu().numpy()
+        if isinstance(audio_data, list):
+            audio_data = torch.cat(audio_data, dim=-1)
+        audio_tensor = audio_data.float().detach().cpu().numpy()
 
         # Ensure audio is 1D (flatten if needed)
         if audio_tensor.ndim > 1:
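
Note (not part of the patch): a minimal sketch of the unified conversion path
this change introduces, assuming audio_data is either a single torch.Tensor or
a list of per-chunk tensors shaped (channels, num_samples). The helper name
to_audio_array and the 24 kHz chunk sizes are illustrative only, not names
from the patched file.

import torch

def to_audio_array(audio_data):
    # A list of chunk tensors is concatenated along the sample axis;
    # a single tensor passes through unchanged. This mirrors the new
    # branch-free logic, which also covers the streaming case that
    # previously kept only the last chunk (audio_data[-1]).
    if isinstance(audio_data, list):
        audio_data = torch.cat(audio_data, dim=-1)
    audio_tensor = audio_data.float().detach().cpu().numpy()
    # Mirror the subsequent "Ensure audio is 1D" step in the handler.
    if audio_tensor.ndim > 1:
        audio_tensor = audio_tensor.reshape(-1)
    return audio_tensor

# Usage: three 1-second mono chunks at 24 kHz become one 3-second waveform.
chunks = [torch.randn(1, 24000) for _ in range(3)]
print(to_audio_array(chunks).shape)  # (72000,)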