Added Qwen3-TTS VoiceDesign vLLM-Omni launcher

yfchoco208 · yfchoco208 · commit b7f21f44248f · 2026-05-20T07:06:39.000+02:00
diff --git a/examples/clariden/cli/qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign-vllm-omni.sh b/examples/clariden/cli/qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign-vllm-omni.sh
@@ -0,0 +1,22 @@
+#!/bin/bash
+# Launch Qwen3-TTS 12Hz 1.7B VoiceDesign with vLLM-Omni on one Clariden GH200 node.
+#
+# Model: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign
+
+sml advanced \
+  --firecrest-system clariden \
+  --partition normal \
+  --slurm-nodes 1 \
+  --slurm-time 6:00:00 \
+  --serving-framework vllm-omni \
+  --served-model-name Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \
+  --worker-port 8080 \
+  --slurm-environment src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml \
+  --framework-args "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign \
+    --deploy-config /opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml \
+    --omni \
+    --trust-remote-code \
+    --enforce-eager \
+    --max-model-len 8192 \
+    --gpu-memory-utilization 0.40" \
+  --no-tui
diff --git a/images/vllm_qwen3_tts_cuda13/Dockerfile b/images/vllm_qwen3_tts_cuda13/Dockerfile
@@ -0,0 +1,84 @@
+# hadolint global ignore=DL3008,DL3059
+
+# Image for serving Qwen3-TTS with vLLM + vLLM-Omni.
+#
+# The base image is parameterised through build args; with the defaults below
+# it resolves to docker.io/nvidia/cuda:13.0.0-cudnn-devel-ubuntu24.04.
+ARG UBUNTU_VERSION=24.04
+ARG CUDA_VERSION=13.0.0
+ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}
+
+FROM ${DEVEL_BASE_IMAGE} AS build
+
+WORKDIR /opt
+
+# Keep apt from prompting during the build.
+ENV DEBIAN_FRONTEND=noninteractive
+ENV CUDA_HOME=/usr/local/cuda
+# /opt/venv/bin comes first so that plain `python` / `uv` resolve to the
+# virtualenv created further down in this file.
+ENV PATH=/opt/venv/bin:/usr/local/cuda/bin:$PATH
+ENV LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH
+# The same two values are also set in the [env] table of
+# src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml.
+# NOTE(review): presumably these turn off vLLM's DeepGEMM path and TorchDynamo
+# compilation -- confirm against the vLLM / PyTorch docs.
+ENV VLLM_USE_DEEP_GEMM=0
+ENV TORCHDYNAMO_DISABLE=1
+# NOTE(review): presumably makes uv copy files into the venv instead of
+# hard-linking them from its cache -- confirm against the uv docs.
+ENV UV_LINK_MODE=copy
+
+# System packages: Python 3.12 with headers and venv support, a native build
+# toolchain, and ffmpeg / libsndfile1 (presumably for the audio Python
+# packages installed later -- confirm). Versions are left unpinned, which is
+# why DL3008 is ignored in the hadolint pragma at the top of the file.
+RUN apt-get update && apt-get install -y --no-install-recommends \
+    python3.12 \
+    python3.12-dev \
+    python3.12-venv \
+    python3-pip \
+    build-essential \
+    git \
+    curl \
+    ca-certificates \
+    cmake \
+    ninja-build \
+    ffmpeg \
+    libsndfile1 \
+    && rm -rf /var/lib/apt/lists/*
+
+# Create the virtualenv at /opt/venv and install a pinned packaging toolchain
+# into it. `python` here is the venv interpreter because /opt/venv/bin is
+# first on PATH.
+RUN python3.12 -m venv /opt/venv && \
+    python -m pip install --upgrade --no-cache-dir \
+      pip==26.1.1 \
+      wheel==0.47.0 \
+      packaging==26.2 \
+      setuptools==69.5.1 \
+      ninja==1.13.0 \
+      uv==0.11.7
+
+# Install vLLM CUDA13 stack.
+# NOTE(review): `--torch-backend=auto` lets uv choose the PyTorch index from
+# what it detects on the build host, so the resulting torch build may depend
+# on where the image is built. Consider pinning an explicit backend so the
+# build is reproducible -- confirm against the uv PyTorch documentation.
+RUN uv pip install --python /opt/venv/bin/python vllm==0.20.2 --torch-backend=auto
+
+# Install vLLM-Omni and Qwen3-TTS runtime deps.
+# All versions are pinned; this runs as a separate layer from the vLLM install
+# above (consecutive RUN instructions are allowed via the DL3059 ignore).
+RUN uv pip install --python /opt/venv/bin/python \
+      vllm-omni==0.20.0 \
+      transformers==5.8.0 \
+      gradio==6.14.0 \
+      gradio-client==2.5.0 \
+      soundfile==0.13.1 \
+      pydub==0.25.1 \
+      librosa==0.11.0 \
+      requests==2.34.2 \
+      numpy==2.3.5
+
+# Sanity check: fail the image build early if the stack the launchers rely on
+# is broken. It imports the key packages, prints their versions, and verifies
+# that the deploy config exists at the exact path the launchers pass to
+# --deploy-config.
+RUN python - <<'PY'
+from pathlib import Path
+import torch
+import vllm
+import vllm_omni
+import transformers
+import soundfile  # import-only check: fails the build if the module cannot be loaded
+from vllm.entrypoints.openai.realtime.serving import OpenAIServingRealtime  # import-only check
+
+# Exact file hard-coded as --deploy-config in the example launcher script and
+# in src/swiss_ai_model_launch/assets/models.json. Finding a qwen3_tts.yaml
+# somewhere else in the package is not enough: the launch would still fail.
+deploy_config = Path("/opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml")
+
+root = Path(vllm_omni.__file__).resolve().parent
+# Sorted so the reported candidate is deterministic across builds.
+matches = sorted(root.rglob("qwen3_tts.yaml"))
+
+print("torch:", torch.__version__)
+print("cuda:", torch.version.cuda)
+print("cuda available:", torch.cuda.is_available())
+print("vllm:", vllm.__version__)
+print("transformers:", transformers.__version__)
+print("vllm_omni:", vllm_omni.__file__)
+print("realtime import OK")
+print("qwen3_tts.yaml:", matches[0] if matches else "NOT FOUND")
+print("deploy config:", deploy_config, "OK" if deploy_config.is_file() else "MISSING")
+
+# torch.version.cuda is None for CPU-only builds; such an image cannot serve
+# the model on a GPU node, so reject it here instead of at launch time.
+# (torch.cuda.is_available() is only printed: no GPU is expected during build.)
+if torch.version.cuda is None:
+    raise RuntimeError("torch was installed without CUDA support")
+
+if not deploy_config.is_file():
+    found = ", ".join(str(m) for m in matches) or "none"
+    raise RuntimeError(f"{deploy_config} not found (qwen3_tts.yaml candidates: {found})")
+PY
+
+WORKDIR /opt
diff --git a/src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml b/src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml
@@ -0,0 +1,36 @@
+# Container environment used by the Qwen3-TTS vLLM-Omni launchers.
+# NOTE(review): the squashfs image is presumably built from
+# images/vllm_qwen3_tts_cuda13/Dockerfile -- confirm.
+image = "/capstor/store/cscs/swissai/infra01/container-images/vllm-qwen3-tts-cuda13.sqsh"
+
+# Mounts. Entries with a ':' are presumably "host_path:container_path"; the
+# last four map individual shared libraries from /usr/lib64 to /usr/lib
+# (TODO confirm the mount syntax against the container engine docs).
+mounts = [
+  "/capstor/store/cscs/swissai/infra01/ocf-share:/ocfbin",
+  "/capstor",
+  "/iopsstor",
+  "/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
+  "/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
+  "/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
+  "/usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1",
+]
+
+# Same working directory as the Dockerfile's WORKDIR.
+workdir = "/opt"
+
+[env]
+# NCCL settings.
+NCCL_NET = "AWS Libfabric"
+NCCL_CROSS_NIC = "1"
+NCCL_NET_GDR_LEVEL = "PHB"
+NCCL_SOCKET_IFNAME = "hsn"
+NCCL_PROTO = "^LL128"
+# libfabric (FI_*) / CXI provider and OFI-NCCL plugin settings.
+FI_CXI_COMPAT = "0"
+FI_MR_CACHE_MONITOR = "userfaultfd"
+FI_CXI_RX_MATCH_MODE = "software"
+FI_CXI_DEFAULT_CQ_SIZE = "131072"
+FI_CXI_DEFAULT_TX_SIZE = "32768"
+FI_CXI_DISABLE_HOST_REGISTER = "1"
+OFI_NCCL_DISABLE_DMABUF = "1"
+VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
+
+# These two repeat the ENV values set in images/vllm_qwen3_tts_cuda13/Dockerfile.
+VLLM_USE_DEEP_GEMM = "0"
+TORCHDYNAMO_DISABLE = "1"
+
+# Container hooks: aws_ofi_nccl (cuda13 variant) and cxi are switched on.
+[annotations]
+com.hooks.aws_ofi_nccl.enabled = "true"
+com.hooks.aws_ofi_nccl.variant = "cuda13"
+com.hooks.cxi.enabled = "true"
diff --git a/src/swiss_ai_model_launch/assets/models.json b/src/swiss_ai_model_launch/assets/models.json
@@ -202,5 +202,12 @@
     "environment": null,
     "nodes_per_worker": 1,
     "framework_args": "--tp-size 4 --reasoning-parser deepseek-r1 --enable-metrics"
+  },
+  {
+    "model": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
+    "framework": "vllm-omni",
+    "environment": "src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml",
+    "nodes_per_worker": 1,
+    "framework_args": "--deploy-config /opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml --omni --trust-remote-code --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.40"
   }
 ]
diff --git a/src/swiss_ai_model_launch/assets/template.jinja b/src/swiss_ai_model_launch/assets/template.jinja
@@ -83,6 +83,10 @@ case "$FRAMEWORK" in
         FRAMEWORK_ENV_SETUP="export RAY_CGRAPH_get_timeout=1800; export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
         FRAMEWORK_LAUNCH="python3 -m vllm.entrypoints.openai.api_server"
         ;;
+    vllm-omni)
+        # Environment setup is identical to the branch directly above; only the
+        # entry point differs: vllm-omni is started through the `vllm serve`
+        # CLI instead of `python3 -m vllm.entrypoints.openai.api_server`.
+        # NOTE(review): presumably required because the vllm-omni launchers
+        # pass `--omni` in their framework args -- confirm.
+        FRAMEWORK_ENV_SETUP="export RAY_CGRAPH_get_timeout=1800; export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
+        FRAMEWORK_LAUNCH="vllm serve"
+        ;;
 esac
 
 # Router always uses sglang_router (works with any OpenAI-compatible backend)
diff --git a/src/swiss_ai_model_launch/launchers/launch_request.py b/src/swiss_ai_model_launch/launchers/launch_request.py
@@ -9,7 +9,7 @@ class LaunchRequest(BaseModel):
     """A fully-specified launch request — catalogue fields plus user-supplied runtime parameters."""
 
     model: str
-    framework: Literal["sglang", "vllm"]
+    framework: Literal["sglang", "vllm", "vllm-omni"]
     environment: str | None = None
     nodes_per_worker: int
     workers: int
diff --git a/src/swiss_ai_model_launch/launchers/model_catalog_entry.py b/src/swiss_ai_model_launch/launchers/model_catalog_entry.py
@@ -7,7 +7,7 @@ class ModelCatalogEntry(BaseModel):
     """A model entry from the catalogue — describes what the model needs, not how to run it."""
 
     model: str
-    framework: Literal["sglang", "vllm"]
+    framework: Literal["sglang", "vllm", "vllm-omni"]
     environment: str | None = None
     nodes_per_worker: int = 1
     framework_args: str | None = None
diff --git a/src/swiss_ai_model_launch/mcp/server.py b/src/swiss_ai_model_launch/mcp/server.py
@@ -246,7 +246,7 @@ async def launch_preconfigured_model(
         "Model in 'vendor/name' format (e.g. 'swiss-ai/Apertus-70B'). "
         "Use `list_preconfigured_models` to see available models.",
     ],
-    framework: Annotated[Literal["sglang", "vllm"], "Inference framework."],
+    framework: Annotated[Literal["sglang", "vllm", "vllm-omni"], "Inference framework."],
     workers: Annotated[int, "Number of workers."] = 1,
     time: Annotated[str, "Job time limit in HH:MM:SS format (e.g. '03:00:00')."] = "03:00:00",
     use_router: Annotated[bool, "Enable router for load balancing across workers."] = False,

Original file line number	Diff line number	Diff line change
`@@ -202,5 +202,12 @@`
`202`	`202`	`"environment": null,`
`203`	`203`	`"nodes_per_worker": 1,`
`204`	`204`	`"framework_args": "--tp-size 4 --reasoning-parser deepseek-r1 --enable-metrics"`
	`205`	`+ },`
	`206`	`+ {`
	`207`	`+ "model": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",`
	`208`	`+ "framework": "vllm-omni",`
	`209`	`+ "environment": "src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml",`
	`210`	`+ "nodes_per_worker": 1,`
	`211`	`+ "framework_args": "--deploy-config /opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml --omni --trust-remote-code --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.40"`
`205`	`212`	`}`
`206`	`213`	`]`