using vllm on Qwen3-Omni-30B-A3B-Instruct: error: Failed to apply prompt replacement for mm_items['audio'][0]

### Description

When using vLLM with Qwen3-Omni-30B-A3B-Instruct, some samples fail with the error `Failed to apply prompt replacement for mm_items['audio'][0]`. I believe the inference pipeline is the same as the one in the repo demo.

### Reproduction

```python
import torch
import json
import os
import re
import tqdm
import argparse
from typing import List, Dict, Any
from vllm import LLM, SamplingParams
from transformers import Qwen3OmniMoeProcessor
from qwen_omni_utils import process_mm_info

def extract_answer(decoded_text):
    """Extract the chosen option letter from the model's raw output.

    Args:
        decoded_text: The generated text, or a list of texts (only the first
            element is used; an empty list is treated as an empty string).

    Returns:
        The option letter ``"A"``-``"D"`` if one is found; otherwise the
        upper-cased first character of the stripped text, or ``"N/A"`` when
        the text is empty.
    """
    if isinstance(decoded_text, list):
        decoded_text = decoded_text[0] if len(decoded_text) > 0 else ""

    clean_text = decoded_text.strip()

    # Prefer a standalone letter so that the capital "A" in "Answer: B" is not
    # mistaken for the choice; fall back to the first A-D anywhere in the text
    # to keep the previous behaviour for outputs such as "ABC".
    match = re.search(r'\b([A-D])\b', clean_text) or re.search(r'([A-D])', clean_text)
    if match:
        return match.group(1)
    return clean_text[:1].upper() if clean_text else "N/A"

def evaluate_answer(model_answer, correct_answer):
    """Return whether the model's answer matches the reference answer.

    The comparison ignores surrounding whitespace and letter case. A falsy
    ``model_answer`` (``None`` or an empty string) is never correct.
    """
    if model_answer:
        predicted = model_answer.strip().upper()
        expected = correct_answer.strip().upper()
        return predicted == expected
    return False

def _record_stat(metrics, key, val, is_correct):
    """Add one evaluated sample to the count/correct tallies of ``metrics[key]``."""
    if val is None:
        val = "N/A"
    bucket = metrics[key]
    bucket["count"][val] = bucket["count"].get(val, 0) + 1
    bucket["correct"][val] = bucket["correct"].get(val, 0) + (1 if is_correct else 0)


def run_evaluation(llm, processor, sampling_params, args):
    """Run the multiple-choice video QA evaluation and print an accuracy report.

    For every item in the JSON file, a chat prompt containing the video and the
    question is built, passed to ``llm.generate``, and the extracted answer is
    compared with the reference. One JSON line per processed item is written
    to the output file; items whose video file is missing are skipped without
    a log line.

    Args:
        llm: Object exposing ``generate(inputs, sampling_params=...)``.
        processor: Object exposing ``apply_chat_template``.
        sampling_params: Passed through to ``llm.generate``.
        args: Namespace providing ``json_file_path``, ``output_path``,
            ``video_base_dir`` and ``use_audio_in_video``.

    Returns:
        None. Results are written to ``args.output_path`` and printed.
    """
    metrics = {
        "qa_type": {"count": {}, "correct": {}},
        "video_cat": {"count": {}, "correct": {}},
        "duration": {"count": {}, "correct": {}}
    }

    with open(args.json_file_path, 'r', encoding='utf-8') as f:
        data = json.load(f)

    output_results_path = args.output_path
    total_questions = len(data)
    overall_correct = 0
    failed = 0

    print(f"Starting evaluation on {total_questions} samples using Qwen3-Omni...")

    with open(output_results_path, 'w', encoding='utf-8') as f_out:
        for item in tqdm.tqdm(data):
            video_id = item.get('video_id')
            question = item.get('Question')
            choices = item.get('Choice')
            correct_answer = item.get('Answer')

            qa_type = item.get('Type', 'unknown')
            video_cat = item.get('video_category', 'unknown')
            duration = item.get('video_duration', 'unknown')

            video_path = os.path.join(args.video_base_dir, video_id, f"{video_id}_video.mp4")

            if not os.path.exists(video_path):
                print(f"Warning: Video not found at {video_path}")
                failed += 1
                continue

            prompt = f"""Your task is to accurately answer multiple-choice questions based on the given video.
Select the single most accurate answer from the given choices.
Question: {question}
Choices: {choices}
Your answer should be a capital letter representing your choice: A, B, C, or D. Don't generate any other text."""

            messages = [
                {
                    "role": "user",
                    "content": [
                        {"type": "video", "video": video_path},
                        {"type": "text", "text": prompt},
                    ],
                }
            ]

            log_entry = {
                "video_id": video_id,
                "question": question,
                "correct_answer": correct_answer,
                "model_raw_output": "",
                "model_extracted": "",
                "is_correct": False,
                "status": "success"
            }

            try:
                text_prompt = processor.apply_chat_template(
                    messages,
                    tokenize=False,
                    add_generation_prompt=True
                )

                audios, images, videos = process_mm_info(
                    messages,
                    use_audio_in_video=args.use_audio_in_video
                )

                mm_input = {
                    "prompt": text_prompt,
                    "multi_modal_data": {},
                    "mm_processor_kwargs": {
                        "use_audio_in_video": args.use_audio_in_video,
                    },
                }

                # Only attach the modalities that process_mm_info returned.
                if images is not None:
                    mm_input["multi_modal_data"]["image"] = images
                if videos is not None:
                    mm_input["multi_modal_data"]["video"] = videos
                if audios is not None:
                    mm_input["multi_modal_data"]["audio"] = audios

                outputs = llm.generate([mm_input], sampling_params=sampling_params)
                generated_text = outputs[0].outputs[0].text

                model_answer = extract_answer(generated_text)
                is_correct = evaluate_answer(model_answer, correct_answer)

                log_entry["model_raw_output"] = generated_text
                log_entry["model_extracted"] = model_answer
                log_entry["is_correct"] = is_correct

                if is_correct:
                    overall_correct += 1

                _record_stat(metrics, "qa_type", qa_type, is_correct)
                _record_stat(metrics, "video_cat", video_cat, is_correct)
                _record_stat(metrics, "duration", duration, is_correct)

            except Exception as e:
                # Best-effort: record the failure for this sample and continue
                # with the next one; failed samples are excluded from accuracy.
                log_entry["status"] = f"error: {str(e)}"
                failed += 1

            f_out.write(json.dumps(log_entry, ensure_ascii=False) + "\n")
            # Flush per item so partial results survive an interrupted run.
            f_out.flush()

    print("\n" + "="*30 + " EVALUATION REPORT " + "="*30)
    valid_count = total_questions - failed
    if valid_count > 0:
        print(f"Overall Accuracy: {overall_correct}/{valid_count} ({overall_correct/valid_count:.2%})")

    for category in ["qa_type", "video_cat", "duration"]:
        print(f"\n--- Accuracy by {category} ---")
        for k in sorted(metrics[category]["count"].keys(), key=lambda x: str(x)):
            cnt = metrics[category]["count"][k]
            corr = metrics[category]["correct"][k]
            # Keep acc as a fraction: the ".2%" format already multiplies by
            # 100, so pre-scaling would print e.g. 5000.00% for 50%.
            acc = (corr / cnt) if cnt > 0 else 0.0
            print(f"{k}: {corr}/{cnt} ({acc:.2%})")

def parse_bool_flag(value):
    """Convert a command-line string to a bool for use as an argparse ``type``.

    ``type=bool`` treats every non-empty string as True, so
    ``--use_audio_in_video False`` would still enable audio. This converter
    accepts the usual spellings (case-insensitive) instead.

    Args:
        value: The raw argument string (a bool is returned unchanged).

    Returns:
        The parsed boolean.

    Raises:
        argparse.ArgumentTypeError: If the value is not a recognised boolean.
    """
    if isinstance(value, bool):
        return value
    normalized = str(value).strip().lower()
    if normalized in ("true", "t", "yes", "y", "1"):
        return True
    # The empty string is kept as False to match what bool("") returned before.
    if normalized in ("false", "f", "no", "n", "0", ""):
        return False
    raise argparse.ArgumentTypeError(f"expected a boolean value, got {value!r}")


if __name__ == "__main__":
    parser = argparse.ArgumentParser()
    parser.add_argument('--video_base_dir', type=str, required=True)
    parser.add_argument('--json_file_path', type=str, required=True)
    parser.add_argument('--model_path', type=str, required=True)
    parser.add_argument('--output_path', type=str, default='eval_results_qwen3.jsonl')
    parser.add_argument('--use_audio_in_video', type=parse_bool_flag, default=True)
    args = parser.parse_args()

    # One sequence at a time, tensor-parallel across all visible GPUs.
    llm = LLM(
        model=args.model_path,
        trust_remote_code=True,
        gpu_memory_utilization=0.90,
        tensor_parallel_size=torch.cuda.device_count(),
        limit_mm_per_prompt={'image': 3, 'video': 3, 'audio': 3},
        max_num_seqs=1,
        max_model_len=32768,
        seed=1234,
    )

    # Greedy decoding with a short budget: only a single letter is expected.
    sampling_params = SamplingParams(
        temperature=0.0,
        top_p=1.0,
        top_k=-1,
        max_tokens=15,
    )

    processor = Qwen3OmniMoeProcessor.from_pretrained(args.model_path)

    run_evaluation(llm, processor, sampling_params, args)
```

### Logs

```shell
"video_id": "TwsSrD8g3HA", "question": "What was the chronological order of these events: (1) Fish splash, (2) '4 pounder' spoken, (3) 'good times' spoken, (4) Boat movement?", "correct_answer": "D", "model_raw_output": "A", "model_extracted": "A", "is_correct": false, "status": "success"}
{"video_id": "4U5u7OEcHFs", "question": "What visual shift occurs between the 0-10s and 10-20s segments to reflect the audio's emphasis on 'empathy, agility, insights'?", "correct_answer": "B", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}
{"video_id": "LScZ6CXS5M8", "question": "What visual element appeared precisely after the speaker began discussing 'linear translation'?", "correct_answer": "A", "model_raw_output": "", "model_extracted": "", "is_correct": false, "status": "error: Failed to apply prompt replacement for mm_items['audio'][0]"}
```

### Environment Information

Linux 
Python 3.10.0 
CUDA 12.8

accelerate==1.12.0
aiohappyeyeballs==2.6.1
aiohttp==3.13.3
aiosignal==1.4.0
annotated-doc==0.0.4
annotated-types==0.7.0
anthropic==0.71.0
anyio==4.12.1
apache-tvm-ffi==0.1.8.post2
astor==0.8.1
async-timeout==5.0.1
attrs==25.4.0
audioread==3.1.0
av==16.1.0
blake3==1.0.8
cachetools==7.0.0
cbor2==5.8.0
certifi==2026.1.4
cffi==2.0.0
charset-normalizer==3.4.4
click==8.3.1
cloudpickle==3.1.2
compressed-tensors==0.12.2
cryptography==46.0.4
cuda-bindings==13.1.1
cuda-pathfinder==1.3.3
cuda-python==13.1.1
cupy-cuda12x==13.6.0
decorator==5.2.1
depyf==0.20.0
dill==0.4.1
diskcache==5.6.3
distro==1.9.0
dnspython==2.8.0
docstring_parser==0.17.0
einops==0.8.2
email-validator==2.3.0
exceptiongroup==1.3.1
fastapi==0.128.1
fastapi-cli==0.0.20
fastapi-cloud-cli==0.11.0
fastar==0.8.0
fastrlock==0.8.3
filelock==3.20.3
flash_attn==2.8.3
flashinfer-python==0.5.3
frozenlist==1.8.0
fsspec==2026.1.0
gguf==0.17.1
h11==0.16.0
hf-xet==1.2.0
httpcore==1.0.9
httptools==0.7.1
httpx==0.28.1
httpx-sse==0.4.3
huggingface_hub==0.36.1
idna==3.11
ijson==3.4.0.post0
ImageIO==2.37.2
imageio-ffmpeg==0.6.0
interegular==0.3.3
Jinja2==3.1.6
jiter==0.13.0
jmespath==1.1.0
joblib==1.5.3
jq==1.10.0
jsonschema==4.26.0
jsonschema-specifications==2025.9.1
lark==1.2.2
lazy_loader==0.4
librosa==0.11.0
llguidance==1.3.0
llvmlite==0.44.0
lm-format-enforcer==0.11.3
loguru==0.7.3
markdown-it-py==4.0.0
MarkupSafe==3.0.3
mcp==1.26.0
mdurl==0.1.2
mistral_common==1.9.0
model-hosting-container-standards==0.1.13
moviepy==2.2.1
mpmath==1.3.0
msgpack==1.1.2
msgspec==0.20.0
multidict==6.7.1
networkx==3.4.2
ninja==1.13.0
numba==0.61.2
numpy==2.2.6
nvidia-cublas-cu12==12.8.4.1
nvidia-cuda-cupti-cu12==12.8.90
nvidia-cuda-nvrtc-cu12==12.8.93
nvidia-cuda-runtime-cu12==12.8.90
nvidia-cudnn-cu12==9.10.2.21
nvidia-cudnn-frontend==1.18.0
nvidia-cufft-cu12==11.3.3.83
nvidia-cufile-cu12==1.13.1.3
nvidia-curand-cu12==10.3.9.90
nvidia-cusolver-cu12==11.7.3.90
nvidia-cusparse-cu12==12.5.8.93
nvidia-cusparselt-cu12==0.7.1
nvidia-cutlass-dsl==4.3.5
nvidia-ml-py==13.590.48
nvidia-nccl-cu12==2.27.5
nvidia-nvjitlink-cu12==12.8.93
nvidia-nvshmem-cu12==3.3.20
nvidia-nvtx-cu12==12.8.90
openai==2.16.0
openai-harmony==0.0.8
opencv-python-headless==4.13.0.92
outlines_core==0.2.11
packaging @ file:///home/task_176104877067765/conda-bld/packaging_1761049113113/work
partial-json-parser==0.2.1.1.post7
pillow==11.3.0
platformdirs==4.5.1
pooch==1.9.0
proglog==0.1.12
prometheus-fastapi-instrumentator==7.1.0
prometheus_client==0.24.1
propcache==0.4.1
protobuf==6.33.5
psutil==7.2.2
py-cpuinfo==9.0.0
pybase64==1.4.3
pycountry==24.6.1
pycparser==3.0
pydantic==2.12.5
pydantic-extra-types==2.11.0
pydantic-settings==2.12.0
pydantic_core==2.41.5
Pygments==2.19.2
PyJWT==2.11.0
python-dotenv==1.2.1
python-json-logger==4.0.0
python-multipart==0.0.22
PyYAML==6.0.3
pyzmq==27.1.0
qwen-omni-utils==0.0.8
ray==2.53.0
referencing==0.37.0
regex==2026.1.15
requests==2.32.5
resampy==0.4.3
rich==14.3.2
rich-toolkit==0.18.1
rignore==0.7.6
rpds-py==0.30.0
safetensors==0.7.0
scikit-learn==1.7.2
scipy==1.15.3
sentencepiece==0.2.1
sentry-sdk==2.52.0
setproctitle==1.3.7
shellingham==1.5.4
sniffio==1.3.1
soundfile==0.13.1
soxr==1.0.0
sse-starlette==3.2.0
starlette==0.50.0
supervisor==4.3.0
sympy==1.14.0
tabulate==0.9.0
threadpoolctl==3.6.0
tiktoken==0.12.0
tokenizers==0.22.2
tomli==2.4.0
torch==2.9.0
torchaudio==2.9.0
torchvision==0.24.0
tqdm==4.67.3
transformers==4.57.3
triton==3.5.0
typer==0.21.1
typing-inspection==0.4.2
typing_extensions==4.15.0
urllib3==2.6.3
uvicorn==0.40.0
uvloop==0.22.1
vllm==0.13.0
watchfiles==1.1.1
websockets==16.0
xgrammar==0.1.27
yarl==1.22.0


### Known Issue

- [x] The issue hasn't already been addressed in Documentation, Issues, and Discussions.

Provide feedback

Saved searches

Use saved searches to filter your results more quickly

using vllm on Qwen3-Omni-30B-A3B-Instruct: error: Failed to apply prompt replacement for mm_items['audio'][0] #166

Description

Reproduction

Logs

Environment Information

Known Issue

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

using vllm on Qwen3-Omni-30B-A3B-Instruct: error: Failed to apply prompt replacement for mm_items['audio'][0] #166

Description

Description

Reproduction

Logs

Environment Information

Known Issue

Metadata

Metadata

Assignees

Labels

Type

Fields

Projects

Milestone

Relationships

Development

Issue actions