When running Qwen3-Omni-30B-A3B-Instruct with vLLM, inference fails on some samples with the error: "Failed to apply prompt replacement for mm_items['audio'][0]". I believe the inference pipeline below is the same as the one in the repo demo.
import torch
import json
import os
import re
import tqdm
import argparse
from typing import List, Dict, Any
from vllm import LLM, SamplingParams
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info
def extract_answer(decoded_text):
    """Extract the chosen option letter from raw model output.

    Args:
        decoded_text: The generated text, or a list whose first element is
            the generated text. ``None`` and empty input are tolerated.

    Returns:
        The option letter ``"A"``-``"D"`` when one is found; otherwise the
        upper-cased first character of the stripped text; ``"N/A"`` when
        there is no text at all.
    """
    if isinstance(decoded_text, list):
        decoded_text = decoded_text[0] if len(decoded_text) > 0 else ""
    if decoded_text is None:
        return "N/A"

    clean_text = decoded_text.strip()
    if not clean_text:
        return "N/A"

    # Prefer a standalone letter so that words which merely start with a
    # capital A-D ("Answer: B", "Based on ... C") do not shadow the choice.
    match = re.search(r'\b([A-D])\b', clean_text)
    if match is None:
        # Fall back to the first capital A-D anywhere in the text.
        match = re.search(r'([A-D])', clean_text)
    if match:
        return match.group(1)

    # No capital A-D at all (e.g. a lowercase "b"): use the first character.
    return clean_text[:1].upper()
def evaluate_answer(model_answer, correct_answer):
    """Return whether the model's answer matches the reference answer.

    The comparison ignores surrounding whitespace and letter case.

    Args:
        model_answer: The option extracted from the model output.
        correct_answer: The reference option from the dataset; may be
            ``None`` when the dataset item has no answer field.

    Returns:
        ``True`` when both answers are present and equal after
        normalisation, ``False`` otherwise.
    """
    # A missing answer on either side can never count as correct; guarding
    # here avoids calling .strip() on None.
    if not model_answer or not correct_answer:
        return False
    return model_answer.strip().upper() == correct_answer.strip().upper()
def _build_prompt(question, choices):
    """Return the multiple-choice instruction text for one sample."""
    return (
        "Your task is to accurately answer multiple-choice questions based on the given video.\n"
        "Select the single most accurate answer from the given choices.\n"
        f"Question: {question}\n"
        f"Choices: {choices}\n"
        "Your answer should be a capital letter representing your choice: A, B, C, or D. "
        "Don't generate any other text."
    )


def _update_stat(metrics, key, val, is_correct):
    """Add one sample to the count/correct tallies of ``metrics[key]``."""
    if val is None:
        val = "N/A"
    tally = metrics[key]
    tally["count"][val] = tally["count"].get(val, 0) + 1
    tally["correct"][val] = tally["correct"].get(val, 0) + (1 if is_correct else 0)


def _print_report(metrics, total_questions, failed, overall_correct):
    """Print overall accuracy followed by the per-category breakdown."""
    print("\n" + "="*30 + " EVALUATION REPORT " + "="*30)
    valid_count = total_questions - failed
    if valid_count > 0:
        print(f"Overall Accuracy: {overall_correct}/{valid_count} ({overall_correct/valid_count:.2%})")
    for category in ["qa_type", "video_cat", "duration"]:
        print(f"\n--- Accuracy by {category} ---")
        counts = metrics[category]["count"]
        for k in sorted(counts, key=str):
            cnt = counts[k]
            corr = metrics[category]["correct"][k]
            # Keep this a fraction: the ".2%" format already multiplies by 100.
            acc = (corr / cnt) if cnt > 0 else 0
            print(f"{k}: {corr}/{cnt} ({acc:.2%})")


def run_evaluation(llm, processor, sampling_params, args):
    """Run the multiple-choice video benchmark and print an accuracy report.

    Loads the question list from ``args.json_file_path``, queries ``llm``
    once per question, writes one JSON line per attempted sample to
    ``args.output_path``, and prints overall and per-category accuracy.

    Args:
        llm: Engine object exposing ``generate(inputs, sampling_params=...)``.
        processor: Object exposing ``apply_chat_template``.
        sampling_params: Sampling configuration forwarded to ``llm.generate``.
        args: Namespace providing ``json_file_path``, ``output_path``,
            ``video_base_dir`` and ``use_audio_in_video``.

    Returns:
        None. Results are written to the output file and printed.
    """
    metrics = {
        "qa_type": {"count": {}, "correct": {}},
        "video_cat": {"count": {}, "correct": {}},
        "duration": {"count": {}, "correct": {}}
    }
    with open(args.json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    output_results_path = args.output_path
    total_questions = len(data)
    overall_correct = 0
    failed = 0
    print(f"Starting evaluation on {total_questions} samples using Qwen3-Omni...")

    with open(output_results_path, 'w', encoding='utf-8') as f_out:
        for item in tqdm.tqdm(data):
            video_id = item.get('video_id')
            question = item.get('Question')
            choices = item.get('Choice')
            correct_answer = item.get('Answer')
            qa_type = item.get('Type', 'unknown')
            video_cat = item.get('video_category', 'unknown')
            duration = item.get('video_duration', 'unknown')

            video_path = os.path.join(args.video_base_dir, video_id, f"{video_id}_video.mp4")
            if not os.path.exists(video_path):
                # Missing videos are counted as failed and get no log line.
                print(f"Warning: Video not found at {video_path}")
                failed += 1
                continue

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": video_path},
                        {"type": "text", "text": _build_prompt(question, choices)},
                    ],
                }
            ]
            log_entry = {
                "video_id": video_id,
                "question": question,
                "correct_answer": correct_answer,
                "model_raw_output": "",
                "model_extracted": "",
                "is_correct": False,
                "status": "success"
            }

            # Any per-sample failure is recorded in the log entry's "status"
            # field and counted, so one bad sample does not abort the run.
            try:
                text_prompt = processor.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )
                audios, images, videos = process_mm_info(
                    messages,
                    use_audio_in_video=args.use_audio_in_video
                )
                mm_input = {
                    "prompt": text_prompt,
                    "multi_modal_data": {},
                    "mm_processor_kwargs": {
                        "use_audio_in_video": args.use_audio_in_video,
                    },
                }
                if images is not None:
                    mm_input["multi_modal_data"]["image"] = images
                if videos is not None:
                    mm_input["multi_modal_data"]["video"] = videos
                if audios is not None:
                    mm_input["multi_modal_data"]["audio"] = audios

                outputs = llm.generate([mm_input], sampling_params=sampling_params)
                generated_text = outputs[0].outputs[0].text
                model_answer = extract_answer(generated_text)
                is_correct = evaluate_answer(model_answer, correct_answer)

                log_entry["model_raw_output"] = generated_text
                log_entry["model_extracted"] = model_answer
                log_entry["is_correct"] = is_correct

                _update_stat(metrics, "qa_type", qa_type, is_correct)
                _update_stat(metrics, "video_cat", video_cat, is_correct)
                _update_stat(metrics, "duration", duration, is_correct)
                # Incremented last so a failure in the stat updates cannot
                # count the sample as both correct and failed.
                if is_correct:
                    overall_correct += 1
            except Exception as e:
                log_entry["status"] = f"error: {str(e)}"
                failed += 1

            f_out.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
            # Flush per sample so partial results survive an interrupted run.
            f_out.flush()

    _print_report(metrics, total_questions, failed, overall_correct)
if __name__ == "__main__":
    def _parse_bool(value):
        """Convert a command-line string such as "true"/"False"/"1" to bool.

        ``type=bool`` would treat every non-empty string, including
        "False", as True.
        """
        lowered = str(value).strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        if lowered in ("false", "f", "no", "n", "0"):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    # Command-line interface: input/output locations plus the audio switch.
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_base_dir', type=str, required=True)
    parser.add_argument('--json_file_path', type=str, required=True)
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, default='eval_results_qwen3.jsonl')
    parser.add_argument('--use_audio_in_video', type=_parse_bool, default=True)
    args = parser.parse_args()

    # Build the vLLM engine, sharded across every visible CUDA device.
    llm = LLM(
        model=args.model_path,
        trust_remote_code=True,
        gpu_memory_utilization=0.90,
        tensor_parallel_size=torch.cuda.device_count(),
        limit_mm_per_prompt={'image': 3, 'video': 3, 'audio': 3},
        max_num_seqs=1,
        max_model_len=32768,
        seed=1234,
    )
    # Zero temperature with a short token budget: only a letter is expected.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        top_k=-1,
        max_tokens=15,
    )
    processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)
    run_evaluation(llm, processor, sampling_params, args)
{"video_id": "TwsSrD8g3HA", "question": "What was the chronological order of these events: (1) Fish splash, (2) '4 pounder' spoken, (3) 'good times' spoken, (4) Boat movement?", "correct_answer": "D", "model_raw_output": "A", "model_extracted": "A", "is_correct": false, "status": "success"}
{"video_id": "4U5u7OEcHFs", "question": "What visual shift occurs between the 0-10s and 10-20s segments to reflect the audio's emphasis on 'empathy, agility, insights'?", "correct_answer": "B", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}
{"video_id": "LScZ6CXS5M8", "question": "What visual element appeared precisely after the speaker began discussing 'linear translation'?", "correct_answer": "A", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}
Description
When running Qwen3-Omni-30B-A3B-Instruct with vLLM, inference fails on some samples with the error: "Failed to apply prompt replacement for mm_items['audio'][0]". I believe the inference pipeline is the same as the one in the repo demo.
Reproduction
Logs
Environment Information
Linux
Python 3.10.0
CUDA 12.8
accelerate==1.12.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.71.0
anyio==4.12.1
apache-tvm-ffi==0.1.8.post2
astor==0.8.1
async-timeout==5.0.1
attrs==25.4.0
audioread==3.1.0
av==16.1.0
blake3==1.0.8
cachetools==7.0.0
cbor2==5.8.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
compressed-tensors==0.12.2
cryptography==46.0.4
cuda-bindings==13.1.1
cuda-pathfinder==1.3.3
cuda-python==13.1.1
cupy-cuda12x==13.6.0
decorator==5.2.1
depyf==0.20.0
dill==0.4.1
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.17.0
einops==0.8.2
email-validator==2.3.0
exceptiongroup==1.3.1
fastapi==0.128.1
fastapi-cli==0.0.20
fastapi-cloud-cli==0.11.0
fastar==0.8.0
fastrlock==0.8.3
filelock==3.20.3
flash_attn==2.8.3
flashinfer-python==0.5.3
frozenlist==1.8.0
fsspec==2026.1.0
gguf==0.17.1
h11==0.16.0
hf-xet==1.2.0
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==0.36.1
idna==3.11
ijson==3.4.0.post0
ImageIO==2.37.2
imageio-ffmpeg==0.6.0
interegular==0.3.3
Jinja2==3.1.6
jiter==0.13.0
jmespath==1.1.0
joblib==1.5.3
jq==1.10.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
lark==1.2.2
lazy_loader==0.4
librosa==0.11.0
llguidance==1.3.0
llvmlite==0.44.0
lm-format-enforcer==0.11.3
loguru==0.7.3
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mcp==1.26.0
mdurl==0.1.2
mistral_common==1.9.0
model-hosting-container-standards==0.1.13
moviepy==2.2.1
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.20.0
multidict==6.7.1
networkx==3.4.2
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cudnn-frontend==1.18.0
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-cutlass-dsl==4.3.5
nvidia-ml-py==13.590.48
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
openai==2.16.0
openai-harmony==0.0.8
opencv-python-headless==4.13.0.92
outlines_core==0.2.11
packaging @ file:///home/task_176104877067765/conda-bld/packaging_1761049113113/work
partial-json-parser==0.2.1.1.post7
pillow==11.3.0
platformdirs==4.5.1
pooch==1.9.0
proglog==0.1.12
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.24.1
propcache==0.4.1
protobuf==6.33.5
psutil==7.2.2
py-cpuinfo==9.0.0
pybase64==1.4.3
pycountry==24.6.1
pycparser==3.0
pydantic==2.12.5
pydantic-extra-types==2.11.0
pydantic-settings==2.12.0
pydantic_core==2.41.5
Pygments==2.19.2
PyJWT==2.11.0
python-dotenv==1.2.1
python-json-logger==4.0.0
python-multipart==0.0.22
PyYAML==6.0.3
pyzmq==27.1.0
qwen-omni-utils==0.0.8
ray==2.53.0
referencing==0.37.0
regex==2026.1.15
requests==2.32.5
resampy==0.4.3
rich==14.3.2
rich-toolkit==0.18.1
rignore==0.7.6
rpds-py==0.30.0
safetensors==0.7.0
scikit-learn==1.7.2
scipy==1.15.3
sentencepiece==0.2.1
sentry-sdk==2.52.0
setproctitle==1.3.7
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
soxr==1.0.0
sse-starlette==3.2.0
starlette==0.50.0
supervisor==4.3.0
sympy==1.14.0
tabulate==0.9.0
threadpoolctl==3.6.0
tiktoken==0.12.0
tokenizers==0.22.2
tomli==2.4.0
torch==2.9.0
torchaudio==2.9.0
torchvision==0.24.0
tqdm==4.67.3
transformers==4.57.3
triton==3.5.0
typer==0.21.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.40.0
uvloop==0.22.1
vllm==0.13.0
watchfiles==1.1.1
websockets==16.0
xgrammar==0.1.27
yarl==1.22.0
Known Issue