Add Qwen/Qwen3-ASR-1.7B (vLLM) (#106)

Alvorecer721 · web-flow · commit ee1466a1ef7d · 2026-04-29T11:13:46.000+02:00
diff --git a/examples/clariden/cli/qwen/Qwen3-ASR-1.7B-vllm.sh b/examples/clariden/cli/qwen/Qwen3-ASR-1.7B-vllm.sh
@@ -0,0 +1,35 @@
+#!/bin/bash
+# Launch Qwen/Qwen3-ASR-1.7B (1.7B multilingual ASR, 52 langs incl.
+# 22 Chinese dialects, built on Qwen3-Omni foundation) on one Clariden
+# GH200 node with vLLM, DP=4 TP=1 (4 independent replicas, one per GPU).
+# Suitable for high-throughput batch / streaming ASR over many audio clips.
+#
+# Qwen3-ASR uses the Qwen3ASRForConditionalGeneration architecture, which
+# is registered in stock vLLM 0.19+ (no vllm-omni needed). The generic
+# `vllm.toml` env points at the ci/vllm_cuda13 image (vLLM 0.19.1rc1,
+# transformers 5.5.4, torchaudio 2.11), which includes the full set of
+# audio architectures and the newer Qwen3ASRConfig schema (with
+# thinker_config). The image is missing librosa/audioread (vLLM's audio
+# file loader), so we install them at launch via --pre-launch-cmds.
+#
+# Model weights (downloaded separately):
+#   /capstor/store/cscs/swissai/infra01/MLLM/audio_asr/Qwen3-ASR-1.7B/
+#
+sml advanced \
+  --firecrest-system clariden \
+  --partition normal \
+  --slurm-nodes 1 \
+  --slurm-time 6:00:00 \
+  --serving-framework vllm \
+  --worker-port 8080 \
+  --slurm-environment src/swiss_ai_model_launch/assets/envs/vllm.toml \
+  --pre-launch-cmds "pip install librosa audioread" \
+  --framework-args "--model /capstor/store/cscs/swissai/infra01/MLLM/audio_asr/Qwen3-ASR-1.7B \
+    --served-model-name Qwen/Qwen3-ASR-1.7B-$(whoami) \
+    --data-parallel-size 4 \
+    --tensor-parallel-size 1 \
+    --host 0.0.0.0 \
+    --port 8080 \
+    --dtype bfloat16 \
+    --max-model-len 32768 \
+    --trust-remote-code"
diff --git a/src/swiss_ai_model_launch/assets/models.json b/src/swiss_ai_model_launch/assets/models.json
@@ -145,5 +145,13 @@
     "environment": null,
     "nodes_per_worker": 1,
     "framework_args": "--dp-size 4 --tp-size 1 --trust-remote-code --context-length 16384 --mem-fraction-static 0.85 --enable-metrics"
+  },
+  {
+    "model": "Qwen/Qwen3-ASR-1.7B",
+    "framework": "vllm",
+    "environment": null,
+    "nodes_per_worker": 1,
+    "framework_args": "--data-parallel-size 4 --tensor-parallel-size 1 --dtype bfloat16 --max-model-len 32768 --trust-remote-code",
+    "pre_launch_cmds": "pip install librosa audioread"
   }
 ]

Original file line number	Diff line number	Diff line change
`@@ -145,5 +145,13 @@`
`145`	`145`	`"environment": null,`
`146`	`146`	`"nodes_per_worker": 1,`
`147`	`147`	`"framework_args": "--dp-size 4 --tp-size 1 --trust-remote-code --context-length 16384 --mem-fraction-static 0.85 --enable-metrics"`
	`148`	`+ },`
	`149`	`+ {`
	`150`	`+ "model": "Qwen/Qwen3-ASR-1.7B",`
	`151`	`+ "framework": "vllm",`
	`152`	`+ "environment": null,`
	`153`	`+ "nodes_per_worker": 1,`
	`154`	`+ "framework_args": "--data-parallel-size 4 --tensor-parallel-size 1 --dtype bfloat16 --max-model-len 32768 --trust-remote-code",`
	`155`	`+ "pre_launch_cmds": "pip install librosa audioread"`
`148`	`156`	`}`
`149`	`157`	`]`