Skip to content

Commit b7f21f4

Browse files
committed
Added Qwen3-TTS VoiceDesign vLLM-Omni launcher
1 parent 4d67436 commit b7f21f4

8 files changed

Lines changed: 156 additions & 3 deletions

File tree

Lines changed: 22 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,22 @@
1+
#!/bin/bash
# Launch Qwen3-TTS 12Hz 1.7B VoiceDesign with vLLM-Omni on one Clariden GH200 node.
#
# Model: Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign

# The model id is used twice below: as the served model name and as the first
# word of the --framework-args string.
model="Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign"

# Words of the --framework-args value. They are joined with single spaces
# ("${framework_args[*]}") so the whole list reaches `sml` as ONE argument.
framework_args=(
    "${model}"
    --deploy-config /opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml
    --omni
    --trust-remote-code
    --enforce-eager
    --max-model-len 8192
    --gpu-memory-utilization 0.40
)

# Options for `sml advanced`, one flag per line.
# NOTE(review): the --slurm-environment path is relative, so this presumably
# has to be run from the repository root -- confirm.
sml_args=(
    --firecrest-system clariden
    --partition normal
    --slurm-nodes 1
    --slurm-time 6:00:00
    --serving-framework vllm-omni
    --served-model-name "${model}"
    --worker-port 8080
    --slurm-environment src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml
    --framework-args "${framework_args[*]}"
    --no-tui
)

sml advanced "${sml_args[@]}"
Lines changed: 84 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,84 @@
1+
# hadolint global ignore=DL3008,DL3059

# Base image coordinates; each ARG can be overridden with --build-arg.
ARG UBUNTU_VERSION=24.04
ARG CUDA_VERSION=13.0.0
ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}

FROM ${DEVEL_BASE_IMAGE} AS build

WORKDIR /opt

# /opt/venv/bin goes first on PATH so the bare `python` and `uv` commands in
# the RUN steps below resolve to the virtualenv created in this stage.
ENV DEBIAN_FRONTEND=noninteractive \
    CUDA_HOME=/usr/local/cuda \
    PATH=/opt/venv/bin:/usr/local/cuda/bin:$PATH \
    LD_LIBRARY_PATH=/usr/local/cuda/lib64:$LD_LIBRARY_PATH \
    VLLM_USE_DEEP_GEMM=0 \
    TORCHDYNAMO_DISABLE=1 \
    UV_LINK_MODE=copy

# System packages: Python 3.12 toolchain, native build tools, and the audio
# libraries (ffmpeg, libsndfile1). The apt lists are removed in the same layer.
RUN apt-get update && apt-get install -y --no-install-recommends \
        build-essential \
        ca-certificates \
        cmake \
        curl \
        ffmpeg \
        git \
        libsndfile1 \
        ninja-build \
        python3-pip \
        python3.12 \
        python3.12-dev \
        python3.12-venv \
    && rm -rf /var/lib/apt/lists/*

# Create the virtualenv and pin the packaging toolchain, including uv, which
# performs the remaining installs.
RUN python3.12 -m venv /opt/venv && \
    python -m pip install --upgrade --no-cache-dir \
        pip==26.1.1 \
        wheel==0.47.0 \
        packaging==26.2 \
        setuptools==69.5.1 \
        ninja==1.13.0 \
        uv==0.11.7

# Install vLLM CUDA13 stack.
# NOTE(review): --torch-backend=auto picks the PyTorch index from the GPU
# driver uv detects at install time; confirm a CUDA 13 build is still selected
# when this image is built on a host without a GPU.
RUN uv pip install --python /opt/venv/bin/python vllm==0.20.2 --torch-backend=auto

# Install vLLM-Omni and Qwen3-TTS runtime deps.
RUN uv pip install --python /opt/venv/bin/python \
        vllm-omni==0.20.0 \
        transformers==5.8.0 \
        gradio==6.14.0 \
        gradio-client==2.5.0 \
        soundfile==0.13.1 \
        pydub==0.25.1 \
        librosa==0.11.0 \
        requests==2.34.2 \
        numpy==2.3.5
57+
58+
# Sanity check: fail the build unless the core packages import AND the deploy
# config sits at the exact location that is passed to --deploy-config at launch
# (<site-packages>/vllm_omni/deploy/qwen3_tts.yaml). Finding the file somewhere
# else under vllm_omni is not enough, because the launch arguments hard-code
# that path.
RUN python - <<'PY'
from pathlib import Path
import torch
import vllm
import vllm_omni
import transformers
import soundfile
from vllm.entrypoints.openai.realtime.serving import OpenAIServingRealtime

root = Path(vllm_omni.__file__).resolve().parent
expected = root / "deploy" / "qwen3_tts.yaml"
# Every copy shipped in the package; only used to make the error message useful.
matches = sorted(root.rglob("qwen3_tts.yaml"))

print("torch:", torch.__version__)
print("cuda:", torch.version.cuda)
# Informational only: the build does not fail on this value.
print("cuda available:", torch.cuda.is_available())
print("vllm:", vllm.__version__)
print("transformers:", transformers.__version__)
print("vllm_omni:", vllm_omni.__file__)
print("realtime import OK")
print("qwen3_tts.yaml:", expected if expected.is_file() else "NOT FOUND")

if not expected.is_file():
    found = ", ".join(str(p) for p in matches) or "none"
    raise RuntimeError(
        f"qwen3_tts.yaml not found at {expected} (copies found elsewhere: {found})"
    )
PY
83+
84+
WORKDIR /opt
Lines changed: 36 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,36 @@
1+
# Container environment definition for the vLLM Qwen3-TTS CUDA 13 image.
image   = "/capstor/store/cscs/swissai/infra01/container-images/vllm-qwen3-tts-cuda13.sqsh"
workdir = "/opt"

# Paths made available inside the container. Entries of the form "a:b" are
# presumably "host path:container path" -- confirm against the runtime's docs.
mounts = [
    "/capstor/store/cscs/swissai/infra01/ocf-share:/ocfbin",
    "/capstor",
    "/iopsstor",
    "/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
    "/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
    "/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
    "/usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1",
]

[env]
# NCCL settings.
NCCL_NET           = "AWS Libfabric"
NCCL_CROSS_NIC     = "1"
NCCL_NET_GDR_LEVEL = "PHB"
NCCL_SOCKET_IFNAME = "hsn"
NCCL_PROTO         = "^LL128"

# libfabric (FI_*) and OFI-NCCL plugin settings.
FI_CXI_COMPAT                = "0"
FI_MR_CACHE_MONITOR          = "userfaultfd"
FI_CXI_RX_MATCH_MODE         = "software"
FI_CXI_DEFAULT_CQ_SIZE       = "131072"
FI_CXI_DEFAULT_TX_SIZE       = "32768"
FI_CXI_DISABLE_HOST_REGISTER = "1"
OFI_NCCL_DISABLE_DMABUF      = "1"

# vLLM / PyTorch switches.
VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
VLLM_USE_DEEP_GEMM          = "0"
TORCHDYNAMO_DISABLE         = "1"

[annotations]
# Hook switches; the aws_ofi_nccl variant is set to "cuda13".
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda13"
com.hooks.cxi.enabled          = "true"

src/swiss_ai_model_launch/assets/models.json

Lines changed: 7 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -202,5 +202,12 @@
202202
"environment": null,
203203
"nodes_per_worker": 1,
204204
"framework_args": "--tp-size 4 --reasoning-parser deepseek-r1 --enable-metrics"
205+
},
206+
{
207+
"model": "Qwen/Qwen3-TTS-12Hz-1.7B-VoiceDesign",
208+
"framework": "vllm-omni",
209+
"environment": "src/swiss_ai_model_launch/assets/envs/vllm_qwen3_tts_cuda13.toml",
210+
"nodes_per_worker": 1,
211+
"framework_args": "--deploy-config /opt/venv/lib/python3.12/site-packages/vllm_omni/deploy/qwen3_tts.yaml --omni --trust-remote-code --enforce-eager --max-model-len 8192 --gpu-memory-utilization 0.40"
205212
}
206213
]

src/swiss_ai_model_launch/assets/template.jinja

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -83,6 +83,10 @@ case "$FRAMEWORK" in
8383
FRAMEWORK_ENV_SETUP="export RAY_CGRAPH_get_timeout=1800; export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
8484
FRAMEWORK_LAUNCH="python3 -m vllm.entrypoints.openai.api_server"
8585
;;
86+
vllm-omni)
87+
FRAMEWORK_ENV_SETUP="export RAY_CGRAPH_get_timeout=1800; export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
88+
FRAMEWORK_LAUNCH="vllm serve"
89+
;;
8690
esac
8791

8892
# Router always uses sglang_router (works with any OpenAI-compatible backend)

src/swiss_ai_model_launch/launchers/launch_request.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -9,7 +9,7 @@ class LaunchRequest(BaseModel):
99
"""A fully-specified launch request — catalogue fields plus user-supplied runtime parameters."""
1010

1111
model: str
12-
framework: Literal["sglang", "vllm"]
12+
framework: Literal["sglang", "vllm", "vllm-omni"]
1313
environment: str | None = None
1414
nodes_per_worker: int
1515
workers: int

src/swiss_ai_model_launch/launchers/model_catalog_entry.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -7,7 +7,7 @@ class ModelCatalogEntry(BaseModel):
77
"""A model entry from the catalogue — describes what the model needs, not how to run it."""
88

99
model: str
10-
framework: Literal["sglang", "vllm"]
10+
framework: Literal["sglang", "vllm", "vllm-omni"]
1111
environment: str | None = None
1212
nodes_per_worker: int = 1
1313
framework_args: str | None = None

src/swiss_ai_model_launch/mcp/server.py

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -246,7 +246,7 @@ async def launch_preconfigured_model(
246246
"Model in 'vendor/name' format (e.g. 'swiss-ai/Apertus-70B'). "
247247
"Use `list_preconfigured_models` to see available models.",
248248
],
249-
framework: Annotated[Literal["sglang", "vllm"], "Inference framework."],
249+
framework: Annotated[Literal["sglang", "vllm", "vllm-omni"], "Inference framework."],
250250
workers: Annotated[int, "Number of workers."] = 1,
251251
time: Annotated[str, "Job time limit in HH:MM:SS format (e.g. '03:00:00')."] = "03:00:00",
252252
use_router: Annotated[bool, "Enable router for load balancing across workers."] = False,

0 commit comments

Comments
 (0)