100 changes: 100 additions & 0 deletions tools/qwen3/qwen3_chat/README.md
@@ -0,0 +1,100 @@
# Qwen3-Omni CLI Chat

An interactive CLI chat for the Qwen3-Omni-4B model, built on PyTorch.
It accepts text, image, audio, and video inputs, and generates text and audio outputs.

## Requirements

- Python 3.10+
- PyTorch
- transformers: `pip install git+https://github.com/huggingface/transformers@3d1a4f5e34753e51cb85052539c6ef10cab9a5c1`
- qwen-omni-utils: `pip install qwen-omni-utils -U`
- numpy

## Model Download

```bash
wget https://multimodal-dialog.oss-cn-hangzhou.aliyuncs.com/yiru/Qwen3-Omni-Release/Qwen3-Omni-4B-Instruct-multilingual.tar \
-P temp/llm_cache/hf_models/

tar xf temp/llm_cache/hf_models/Qwen3-Omni-4B-Instruct-multilingual.tar \
-C temp/llm_cache/hf_models/
```

## Usage

### Interactive Chat

```bash
python -m tools.qwen3.qwen3_chat \
--model_path temp/llm_cache/hf_models/Qwen3-Omni-4B-Instruct-multilingual
```

### Text-only (no audio output, saves ~2GB GPU memory)

```bash
python -m tools.qwen3.qwen3_chat \
--model_path temp/llm_cache/hf_models/Qwen3-Omni-4B-Instruct-multilingual \
--no_audio
```

### Demo Mode (smoke test)

Runs 3 predefined text prompts and exits:

```bash
python -m tools.qwen3.qwen3_chat \
--model_path temp/llm_cache/hf_models/Qwen3-Omni-4B-Instruct-multilingual \
--demo
```

### Select Speaker Voice

```bash
python -m tools.qwen3.qwen3_chat \
--model_path temp/llm_cache/hf_models/Qwen3-Omni-4B-Instruct-multilingual \
--speaker Chelsie
```

## CLI Arguments

| Argument | Default | Description |
|----------------|---------|------------------------------------------------------|
| `--model_path` | — | Path to the model directory (required) |
| `--device` | `auto` | Device map for model loading |
| `--no_audio` | `false` | Disable audio output at load time (saves GPU memory) |
| `--output_dir` | `output`| Directory for saved audio files |
| `--speaker` | — | Speaker voice name (model default if not set) |
| `--demo` | `false` | Run text-only smoke test and exit |

## Chat Commands

| Command | Description |
|------------------|-----------------------------------------------------|
| `/image <path>` | Attach an image (use quotes for paths with spaces) |
| `/audio <path>` | Attach an audio file |
| `/video <path>` | Attach a video file |
| `/clear` | Clear chat history |
| `/help` | Show available commands |
| `/quit` | Exit chat |

### Example Session

```
You: Hello!
Qwen: Hello! How can I help you today?

You: /image photos/cat.jpg What do you see in this image?
Qwen: I can see a cute cat sitting on a windowsill...
[Audio saved to output/turn_001.wav]

You: /audio recording.wav What is being said?
Qwen: The audio contains someone saying...
[Audio saved to output/turn_002.wav]

You: /clear
Chat history cleared.

You: /quit
Goodbye!
```
Empty file.
3 changes: 3 additions & 0 deletions tools/qwen3/qwen3_chat/__main__.py
@@ -0,0 +1,3 @@
from .chat import main

main()
116 changes: 116 additions & 0 deletions tools/qwen3/qwen3_chat/chat.py
@@ -0,0 +1,116 @@
from typing import Any
import argparse

import torch
Review comment (Copilot AI, Mar 11, 2026) on lines +1 to +4:

> This new Python module lacks the standard `Copyright ...` / `SPDX-License-Identifier: Apache-2.0` header used across `tools/` (for example `tools/cacheviz/cacheviz.py`). Please add the license header here (and in the rest of the new `tools/qwen3/qwen3_chat/*.py` files) for consistency.

from .model import load_model
from .generate import init_history, generate_response
from .io_utils import parse_user_input, save_audio

HELP_TEXT = """Commands:
/image <path> - Attach an image file (use quotes for paths with spaces)
/audio <path> - Attach an audio file
/video <path> - Attach a video file
/clear - Clear chat history
/help - Show this help
/quit - Exit chat
"""


def _handle_command(
user_input: str,
history: list[dict[str, Any]],
) -> tuple[str | None, list[dict[str, Any]]]:
if user_input in ("/quit", "/exit"):
print("Goodbye!")
return "quit", history
if user_input == "/help":
print(HELP_TEXT)
return "continue", history
if user_input == "/clear":
print("Chat history cleared.\n")
return "continue", init_history()
return None, history


def chat_loop(
model: Any,
processor: Any,
enable_audio: bool,
speaker: str | None,
output_dir: str,
) -> None:
history = init_history()
turn = 0

while True:
try:
user_input = input("You: ").strip()
except (KeyboardInterrupt, EOFError):
print("\nGoodbye!")
break

if not user_input:
continue

action, history = _handle_command(user_input, history)
if action == "quit":
break
if action == "continue":
continue

content = parse_user_input(user_input)
if not content:
continue

history.append({"role": "user", "content": content})

print("Qwen: ", end="", flush=True)
with torch.inference_mode():
text, audio = generate_response(
model,
processor,
history,
enable_audio,
speaker,
)

if audio is not None:
turn += 1
path = save_audio(audio, output_dir, turn)
print(f"[Audio saved to {path}]")

history.append(
{"role": "assistant", "content": [{"type": "text", "text": text}]},
)
print()


def main() -> None:
parser = argparse.ArgumentParser(description="Qwen3-Omni CLI Chat")
parser.add_argument("--model_path", type=str, required=True)
parser.add_argument("--device", type=str, default="auto")
parser.add_argument("--no_audio", action="store_true")
parser.add_argument("--output_dir", type=str, default="output")
parser.add_argument("--speaker", type=str, default=None)
parser.add_argument("--demo", action="store_true")
args = parser.parse_args()

enable_audio = not args.no_audio

print(f"Loading model from {args.model_path}...")
model, processor = load_model(args.model_path, args.device, enable_audio)
print("Model loaded.\n")

if args.demo:
from .demo import run_demo

run_demo(model, processor)
return

print("Type /help for commands.\n")
chat_loop(model, processor, enable_audio, args.speaker, args.output_dir)


if __name__ == "__main__":
main()
35 changes: 35 additions & 0 deletions tools/qwen3/qwen3_chat/demo.py
@@ -0,0 +1,35 @@
from typing import Any

import torch

from .generate import init_history, generate_response

DEMO_PROMPTS = [
"Hello, who are you?",
"What is 2+2? Answer in one sentence.",
"Tell me a one-sentence joke.",
]


def run_demo(model: Any, processor: Any) -> None:
print("=== Demo mode: text-only smoke test ===\n")

for prompt in DEMO_PROMPTS:
history = init_history()
history.append(
{"role": "user", "content": [{"type": "text", "text": prompt}]},
)

with torch.no_grad():
text, _ = generate_response(
model,
processor,
history,
enable_audio=False,
speaker=None,
)

print(f"You: {prompt}")
print(f"Qwen: {text}\n")
Review comment (Copilot AI, Mar 11, 2026) on lines +23 to +33:

> `generate_response()` always streams tokens to stdout via `TextStreamer`, and then `run_demo()` prints the fully decoded text again (`print(f"Qwen: {text}")`). This leads to duplicated output in demo mode. Consider adding a flag to disable streaming for demo, or avoid printing the decoded text when streaming is enabled.

print("=== Demo complete ===")
66 changes: 66 additions & 0 deletions tools/qwen3/qwen3_chat/generate.py
@@ -0,0 +1,66 @@
from typing import Any

import torch
import transformers

from .model import SYSTEM_PROMPT


def init_history() -> list[dict[str, Any]]:
return [{"role": "system", "content": [{"type": "text", "text": SYSTEM_PROMPT}]}]


def _extract_text_ids(output: Any) -> torch.Tensor:
if isinstance(output, torch.Tensor):
return output
if hasattr(output, "sequences"):
return output.sequences
return _extract_text_ids(output[0])


def generate_response(
model: Any,
processor: Any,
history: list[dict[str, Any]],
enable_audio: bool,
speaker: str | None,
) -> tuple[str, torch.Tensor | None]:
inputs = processor.apply_chat_template(
history,
add_generation_prompt=True,
tokenize=True,
return_dict=True,
return_tensors="pt",
padding=True,
).to(model.device)

gen_kwargs: dict[str, Any] = {
"streamer": transformers.TextStreamer(
processor,
skip_prompt=True,
skip_special_tokens=False,
clean_up_tokenization_spaces=False,
),
"thinker_do_sample": False,
}
Review comment (Copilot AI, Mar 11, 2026) on lines +37 to +45:

> `TextStreamer` is configured with `skip_special_tokens=False`, but the final `batch_decode()` uses `skip_special_tokens=True`. This can cause the streamed text shown to the user to include special tokens that won't appear in the final decoded string. Consider setting `skip_special_tokens=True` for the streamer (or otherwise keeping streaming and final decoding consistent).
if speaker:
gen_kwargs["speaker"] = speaker

input_len = inputs["input_ids"].shape[-1]

if enable_audio:
gen_kwargs["return_audio"] = True
gen_kwargs["talker_do_sample"] = True
text_ids, audio = model.generate(**inputs, **gen_kwargs)
text_ids = _extract_text_ids(text_ids)
else:
gen_kwargs["return_audio"] = False
output = model.generate(**inputs, **gen_kwargs)
text_ids = _extract_text_ids(output)
audio = None

generated_ids = text_ids[:, input_len:]
text = processor.batch_decode(
generated_ids, skip_special_tokens=True, clean_up_tokenization_spaces=False,
)[0]
return text, audio
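The shape handling in `_extract_text_ids` above (a plain tensor, a `GenerateOutput`-style object with `.sequences`, or a tuple wrapping either) can be sketched without torch, using plain lists as a stand-in for tensors:

```python
from types import SimpleNamespace


def extract_text_ids(output):
    # Mirrors _extract_text_ids, with list standing in for torch.Tensor.
    if isinstance(output, list):
        return output
    if hasattr(output, "sequences"):
        return output.sequences
    return extract_text_ids(output[0])


print(extract_text_ids([1, 2, 3]))                       # plain "tensor"
print(extract_text_ids(SimpleNamespace(sequences=[4])))  # output object
print(extract_text_ids(([5, 6],)))                       # tuple-wrapped
```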
45 changes: 45 additions & 0 deletions tools/qwen3/qwen3_chat/io_utils.py
@@ -0,0 +1,45 @@
from pathlib import Path
import re
import wave

import numpy as np
import torch

AUDIO_SAMPLE_RATE = 24000

MEDIA_PATTERN = re.compile(r'/(image|audio|video)\s+(?:"([^"]+)"|(\S+))')


def parse_user_input(raw_input: str) -> list[dict[str, str]]:
content: list[dict[str, str]] = []
remaining = raw_input

for match in MEDIA_PATTERN.finditer(remaining):
media_type = match.group(1)
path = match.group(2) or match.group(3)
content.append({"type": media_type, media_type: path})

remaining = MEDIA_PATTERN.sub("", remaining).strip()
if remaining:
content.append({"type": "text", "text": remaining})
Review comment (Copilot AI, Mar 11, 2026) on lines +15 to +24:

> `parse_user_input()` collects all media attachments first and then appends the remaining text at the end, so it does not preserve the original ordering when a user interleaves text and `/image`/`/audio`/`/video` commands. If message order matters for the processor/chat template, parse sequentially (capture text segments between matches) so the returned content list reflects the user's original order.

Suggested change (replace the collect-then-append loop with a sequential parse):

```python
last_end: int = 0
for match in MEDIA_PATTERN.finditer(raw_input):
    if match.start() > last_end:
        text_segment = raw_input[last_end:match.start()].strip()
        if text_segment:
            content.append({"type": "text", "text": text_segment})
    media_type = match.group(1)
    path = match.group(2) or match.group(3)
    content.append({"type": media_type, media_type: path})
    last_end = match.end()
if last_end < len(raw_input):
    text_segment = raw_input[last_end:].strip()
    if text_segment:
        content.append({"type": "text", "text": text_segment})
```
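A self-contained check of the sequential parse proposed in the review above (standalone copies of the regex and loop, so it can run outside the package) shows that interleaved text and media keep their original order:

```python
import re

MEDIA_PATTERN = re.compile(r'/(image|audio|video)\s+(?:"([^"]+)"|(\S+))')


def parse_sequential(raw_input: str) -> list[dict[str, str]]:
    # Walk matches left to right, emitting text segments between them.
    content: list[dict[str, str]] = []
    last_end = 0
    for match in MEDIA_PATTERN.finditer(raw_input):
        text_segment = raw_input[last_end:match.start()].strip()
        if text_segment:
            content.append({"type": "text", "text": text_segment})
        media_type = match.group(1)
        path = match.group(2) or match.group(3)
        content.append({"type": media_type, media_type: path})
        last_end = match.end()
    tail = raw_input[last_end:].strip()
    if tail:
        content.append({"type": "text", "text": tail})
    return content


parts = parse_sequential('Compare /image "my pics/cat.jpg" with /image dog.png please')
print([p["type"] for p in parts])  # ['text', 'image', 'text', 'image', 'text']
```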

return content


def save_audio(audio_tensor: torch.Tensor, output_dir: str | Path, turn_number: int) -> Path:
output_dir = Path(output_dir)
output_dir.mkdir(parents=True, exist_ok=True)

filename = output_dir / f"turn_{turn_number:03d}.wav"
audio_np = audio_tensor.reshape(-1).detach().cpu().float().numpy()

pcm_data = np.clip(audio_np, -1.0, 1.0)
pcm_int16 = (pcm_data * 32767).astype(np.int16)

with wave.open(str(filename), "wb") as wf:
wf.setnchannels(1)
wf.setsampwidth(2)
wf.setframerate(AUDIO_SAMPLE_RATE)
wf.writeframes(pcm_int16.tobytes())

return filename
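The float-to-PCM conversion in `save_audio` can be sketched with the standard library alone (no torch or numpy); the clip-then-scale step is what keeps samples inside the 16-bit range. The output filename `tone_demo.wav` is arbitrary for the example:

```python
import array
import math
import wave

SAMPLE_RATE = 24000  # matches AUDIO_SAMPLE_RATE above


def float_to_pcm16(samples) -> array.array:
    # Clip to [-1, 1], then scale to the int16 range, as save_audio does.
    return array.array("h", (int(max(-1.0, min(1.0, s)) * 32767) for s in samples))


# 0.1 s of a 440 Hz tone, written as a 24 kHz mono 16-bit WAV.
tone = [math.sin(2 * math.pi * 440 * n / SAMPLE_RATE) for n in range(SAMPLE_RATE // 10)]
with wave.open("tone_demo.wav", "wb") as wf:
    wf.setnchannels(1)   # mono
    wf.setsampwidth(2)   # 16-bit
    wf.setframerate(SAMPLE_RATE)
    wf.writeframes(float_to_pcm16(tone).tobytes())
```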