Skip to content

using vllm on Qwen3-Omni-30B-A3B-Instruct: error: Failed to apply prompt replacement for mm_items['audio'][0] #166

@katie312

Description

@katie312

Description

When running vLLM inference with Qwen3-Omni-30B-A3B-Instruct, some samples fail with the error: Failed to apply prompt replacement for mm_items['audio'][0]. I believe the inference pipeline is the same as the one in the repo demo.

Reproduction

import torch
import json
import os
import re
import tqdm
import argparse
from typing import List, Dict, Any
from vllm import LLM, SamplingParams
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

def extract_answer(decoded_text):
    """Extract the chosen option letter (A-D) from the model's raw output.

    Args:
        decoded_text: The generated text, or a list whose first element is the
            generated text. ``None`` and an empty list are treated as empty
            output.

    Returns:
        The option letter when one is found. A letter that stands alone
        (e.g. "B", "(C)", "Answer: D") takes precedence over a capital A-D
        that is merely part of a word (e.g. the "C" in "Choice"). If the text
        contains no capital A-D at all, the upper-cased first character of the
        stripped text is returned, or "N/A" when the text is empty.
    """
    if isinstance(decoded_text, list):
        decoded_text = decoded_text[0] if decoded_text else ""
    if decoded_text is None:
        return "N/A"

    clean_text = decoded_text.strip()
    # Prefer a standalone letter so that "Choice: D" yields "D" rather than the
    # "C" of "Choice"; fall back to the first capital A-D anywhere in the text.
    match = re.search(r'\b[A-D]\b', clean_text) or re.search(r'[A-D]', clean_text)
    if match:
        return match.group(0)
    return clean_text[:1].upper() if clean_text else "N/A"

def evaluate_answer(model_answer, correct_answer):
    """Return True when the model's answer matches the reference answer.

    Both answers are compared after stripping surrounding whitespace and
    upper-casing. A falsy ``model_answer`` (``None`` or empty string) never
    matches and yields False.
    """
    if model_answer:
        predicted = model_answer.strip().upper()
        expected = correct_answer.strip().upper()
        return predicted == expected
    return False

def run_evaluation(llm, processor, sampling_params, args):
    """Run the multiple-choice video QA evaluation and print an accuracy report.

    Loads the question list from ``args.json_file_path``, queries the model
    once per question, writes one JSON line per attempted question to
    ``args.output_path`` and finally prints overall and per-category accuracy.

    Questions whose video file is missing are counted as failed and are not
    written to the output file. Any exception raised while building the inputs
    or generating is recorded in the ``status`` field of that question's log
    entry and counted as failed; the loop then continues with the next item.

    Args:
        llm: Object exposing ``generate(inputs, sampling_params=...)``; the
            text is read from ``outputs[0].outputs[0].text``.
        processor: Object exposing ``apply_chat_template(messages,
            tokenize=..., add_generation_prompt=...)``.
        sampling_params: Passed through unchanged to ``llm.generate``.
        args: Namespace providing ``json_file_path``, ``output_path``,
            ``video_base_dir`` and ``use_audio_in_video``.
    """
    metrics = {
        "qa_type": {"count": {}, "correct": {}},
        "video_cat": {"count": {}, "correct": {}},
        "duration": {"count": {}, "correct": {}}
    }

    def update_stat(key, val, is_correct):
        # Tally one answered question under category ``key`` / bucket ``val``.
        if val is None:
            val = "N/A"
        bucket = metrics[key]
        bucket["count"][val] = bucket["count"].get(val, 0) + 1
        bucket["correct"][val] = bucket["correct"].get(val, 0) + (1 if is_correct else 0)

    with open(args.json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    output_results_path = args.output_path
    total_questions = len(data)
    overall_correct = 0
    failed = 0

    print(f"Starting evaluation on {total_questions} samples using Qwen3-Omni...")

    with open(output_results_path, 'w', encoding='utf-8') as f_out:
        for item in tqdm.tqdm(data):
            video_id = item.get('video_id')
            question = item.get('Question')
            choices = item.get('Choice')
            correct_answer = item.get('Answer')

            qa_type = item.get('Type', 'unknown')
            video_cat = item.get('video_category', 'unknown')
            duration = item.get('video_duration', 'unknown')

            video_path = os.path.join(args.video_base_dir, video_id, f"{video_id}_video.mp4")

            if not os.path.exists(video_path):
                print(f"Warning: Video not found at {video_path}")
                failed += 1
                continue

            prompt = f"""Your task is to accurately answer multiple-choice questions based on the given video.
Select the single most accurate answer from the given choices.
Question: {question}
Choices: {choices}
Your answer should be a capital letter representing your choice: A, B, C, or D. Don't generate any other text."""

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": video_path},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            log_entry = {
                "video_id": video_id,
                "question": question,
                "correct_answer": correct_answer,
                "model_raw_output": "",
                "model_extracted": "",
                "is_correct": False,
                "status": "success"
            }

            try:
                text_prompt = processor.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                audios, images, videos = process_mm_info(
                    messages,
                    use_audio_in_video=args.use_audio_in_video
                )

                mm_input = {
                    "prompt": text_prompt,
                    "multi_modal_data": {},
                    "mm_processor_kwargs": {
                        "use_audio_in_video": args.use_audio_in_video,
                    },
                }

                # Only attach the modalities that process_mm_info returned.
                if images is not None:
                    mm_input["multi_modal_data"]["image"] = images
                if videos is not None:
                    mm_input["multi_modal_data"]["video"] = videos
                if audios is not None:
                    mm_input["multi_modal_data"]["audio"] = audios

                outputs = llm.generate([mm_input], sampling_params=sampling_params)
                generated_text = outputs[0].outputs[0].text

                model_answer = extract_answer(generated_text)
                is_correct = evaluate_answer(model_answer, correct_answer)

                log_entry["model_raw_output"] = generated_text
                log_entry["model_extracted"] = model_answer
                log_entry["is_correct"] = is_correct

                if is_correct:
                    overall_correct += 1

                update_stat("qa_type", qa_type, is_correct)
                update_stat("video_cat", video_cat, is_correct)
                update_stat("duration", duration, is_correct)

            except Exception as e:
                # Per-sample boundary: record the failure and keep evaluating.
                log_entry["status"] = f"error: {str(e)}"
                failed += 1

            # Flush after every sample so partial results survive a crash.
            f_out.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
            f_out.flush()

    print("\n" + "="*30 + " EVALUATION REPORT " + "="*30)
    valid_count = total_questions - failed
    if valid_count > 0:
        print(f"Overall Accuracy: {overall_correct}/{valid_count} ({overall_correct/valid_count:.2%})")

    for category in ["qa_type", "video_cat", "duration"]:
        print(f"\n--- Accuracy by {category} ---")
        for k in sorted(metrics[category]["count"], key=str):
            cnt = metrics[category]["count"][k]
            corr = metrics[category]["correct"][k]
            # Keep this a ratio: the ".2%" format already multiplies by 100.
            acc = corr / cnt if cnt > 0 else 0
            print(f"{k}: {corr}/{cnt} ({acc:.2%})")

if __name__ == "__main__":
    def _parse_bool(value):
        """Convert a command-line string such as "true", "False" or "0" to a bool."""
        lowered = str(value).strip().lower()
        if lowered in ("true", "t", "yes", "y", "1"):
            return True
        # The empty string stays False, as it was under ``type=bool``.
        if lowered in ("false", "f", "no", "n", "0", ""):
            return False
        raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")

    parser = argparse.ArgumentParser()
    parser.add_argument('--video_base_dir', type=str, required=True)
    parser.add_argument('--json_file_path', type=str, required=True)
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, default='eval_results_qwen3.jsonl')
    # ``type=bool`` would turn every non-empty string (including "False") into
    # True, so the value is parsed explicitly instead.
    parser.add_argument('--use_audio_in_video', type=_parse_bool, default=True)
    args = parser.parse_args()

    llm = LLM(
        model=args.model_path,
        trust_remote_code=True,
        gpu_memory_utilization=0.90,
        tensor_parallel_size=torch.cuda.device_count(),
        limit_mm_per_prompt={'image': 3, 'video': 3, 'audio': 3},
        max_num_seqs=1,
        max_model_len=32768,
        seed=1234,
    )

    # temperature=0.0 for reproducible answers; max_tokens is small because
    # the prompt asks for a single option letter.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        top_k=-1,
        max_tokens=15,
    )

    processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)

    run_evaluation(llm, processor, sampling_params, args)

Logs

"video_id": "TwsSrD8g3HA", "question": "What was the chronological order of these events: (1) Fish splash, (2) '4 pounder' spoken, (3) 'good times' spoken, (4) Boat movement?", "correct_answer": "D", "model_raw_output": "A", "model_extracted": "A", "is_correct": false, "status": "success"}
{"video_id": "4U5u7OEcHFs", "question": "What visual shift occurs between the 0-10s and 10-20s segments to reflect the audio's emphasis on 'empathy, agility, insights'?", "correct_answer": "B", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}
{"video_id": "LScZ6CXS5M8", "question": "What visual element appeared precisely after the speaker began discussing 'linear translation'?", "correct_answer": "A", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}

Environment Information

Linux
Python 3.10.0
Cuda 12.8

accelerate==1.12.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.71.0
anyio==4.12.1
apache-tvm-ffi==0.1.8.post2
astor==0.8.1
async-timeout==5.0.1
attrs==25.4.0
audioread==3.1.0
av==16.1.0
blake3==1.0.8
cachetools==7.0.0
cbor2==5.8.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
compressed-tensors==0.12.2
cryptography==46.0.4
cuda-bindings==13.1.1
cuda-pathfinder==1.3.3
cuda-python==13.1.1
cupy-cuda12x==13.6.0
decorator==5.2.1
depyf==0.20.0
dill==0.4.1
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.17.0
einops==0.8.2
email-validator==2.3.0
exceptiongroup==1.3.1
fastapi==0.128.1
fastapi-cli==0.0.20
fastapi-cloud-cli==0.11.0
fastar==0.8.0
fastrlock==0.8.3
filelock==3.20.3
flash_attn==2.8.3
flashinfer-python==0.5.3
frozenlist==1.8.0
fsspec==2026.1.0
gguf==0.17.1
h11==0.16.0
hf-xet==1.2.0
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==0.36.1
idna==3.11
ijson==3.4.0.post0
ImageIO==2.37.2
imageio-ffmpeg==0.6.0
interegular==0.3.3
Jinja2==3.1.6
jiter==0.13.0
jmespath==1.1.0
joblib==1.5.3
jq==1.10.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
lark==1.2.2
lazy_loader==0.4
librosa==0.11.0
llguidance==1.3.0
llvmlite==0.44.0
lm-format-enforcer==0.11.3
loguru==0.7.3
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mcp==1.26.0
mdurl==0.1.2
mistral_common==1.9.0
model-hosting-container-standards==0.1.13
moviepy==2.2.1
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.20.0
multidict==6.7.1
networkx==3.4.2
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cudnn-frontend==1.18.0
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-cutlass-dsl==4.3.5
nvidia-ml-py==13.590.48
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
openai==2.16.0
openai-harmony==0.0.8
opencv-python-headless==4.13.0.92
outlines_core==0.2.11
packaging @ file:///home/task_176104877067765/conda-bld/packaging_1761049113113/work
partial-json-parser==0.2.1.1.post7
pillow==11.3.0
platformdirs==4.5.1
pooch==1.9.0
proglog==0.1.12
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.24.1
propcache==0.4.1
protobuf==6.33.5
psutil==7.2.2
py-cpuinfo==9.0.0
pybase64==1.4.3
pycountry==24.6.1
pycparser==3.0
pydantic==2.12.5
pydantic-extra-types==2.11.0
pydantic-settings==2.12.0
pydantic_core==2.41.5
Pygments==2.19.2
PyJWT==2.11.0
python-dotenv==1.2.1
python-json-logger==4.0.0
python-multipart==0.0.22
PyYAML==6.0.3
pyzmq==27.1.0
qwen-omni-utils==0.0.8
ray==2.53.0
referencing==0.37.0
regex==2026.1.15
requests==2.32.5
resampy==0.4.3
rich==14.3.2
rich-toolkit==0.18.1
rignore==0.7.6
rpds-py==0.30.0
safetensors==0.7.0
scikit-learn==1.7.2
scipy==1.15.3
sentencepiece==0.2.1
sentry-sdk==2.52.0
setproctitle==1.3.7
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
soxr==1.0.0
sse-starlette==3.2.0
starlette==0.50.0
supervisor==4.3.0
sympy==1.14.0
tabulate==0.9.0
threadpoolctl==3.6.0
tiktoken==0.12.0
tokenizers==0.22.2
tomli==2.4.0
torch==2.9.0
torchaudio==2.9.0
torchvision==0.24.0
tqdm==4.67.3
transformers==4.57.3
triton==3.5.0
typer==0.21.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.40.0
uvloop==0.22.1
vllm==0.13.0
watchfiles==1.1.1
websockets==16.0
xgrammar==0.1.27
yarl==1.22.0

Known Issue

  • The issue hasn't already been addressed in the Documentation, Issues, or Discussions.

Metadata

Metadata

Assignees

No one assigned

    Labels

    No labels
    No labels

    Type

    No type
    No fields configured for issues without a type.

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions