File tree Expand file tree Collapse file tree
#!/bin/bash
#
# Launch a vLLM server for Qwen3-Omni-30B-A3B-Captioner through `sml advanced`.
#
# The --slurm-environment path below is relative, so invoke this script from
# the directory that contains src/ (presumably the repository root).
set -euo pipefail

# The served name carries the invoking user's login as a suffix, so the name
# differs per user.
served_model_name="Qwen/Qwen3-Omni-30B-A3B-Captioner-$(whoami)"

# Arguments forwarded to the serving framework. Kept as an array for
# readability and joined with single spaces below, because --framework-args
# takes them as one string.
framework_args=(
  --model /capstor/store/cscs/swissai/infra01/hf_models/models/swiss-ai/Qwen/Qwen3-Omni-30B-A3B-Captioner
  --served-model-name "${served_model_name}"
  --tensor-parallel-size 4
  --host 0.0.0.0
  --port 8080
  --dtype bfloat16
  --max-model-len 32768
  --trust-remote-code
)

sml advanced \
  --slurm-nodes 1 \
  --slurm-time 6:00:00 \
  --serving-framework vllm \
  --slurm-environment src/swiss_ai_model_launch/assets/envs/vllm_qwen3_omni.toml \
  --framework-args "${framework_args[*]}"
# Container environment definition: image, mounts, working directory,
# environment variables and hook annotations.
# NOTE(review): presumably this is the vllm_qwen3_omni.toml file passed to
# `--slurm-environment` by the launcher script -- confirm the file name.
image = "/capstor/store/cscs/swissai/infra01/container-images/vllm-qwen-omni-13.0.sqsh"

# Entries are either a bare path or a "path:path" pair.
# NOTE(review): the pair form looks like source:destination -- confirm.
mounts = [
    "/capstor/store/cscs/swissai/infra01/ocf-share:/ocfbin",
    "/capstor",
    "/iopsstor",
    "/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
    "/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
    "/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
    "/usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1",
]

workdir = "/opt"

[env]
# NCCL_DEBUG = "INFO" # uncomment for debugging
# NCCL_DEBUG_SUBSYS = "INIT,NET" # uncomment for debugging
NCCL_NET = "AWS Libfabric"
NCCL_CROSS_NIC = "1"
NCCL_NET_GDR_LEVEL = "PHB"
NCCL_SOCKET_IFNAME = "hsn"
NCCL_PROTO = "^LL128"
FI_CXI_COMPAT = "0"
FI_MR_CACHE_MONITOR = "userfaultfd"
FI_CXI_RX_MATCH_MODE = "software"
FI_CXI_DEFAULT_CQ_SIZE = "131072"
FI_CXI_DEFAULT_TX_SIZE = "32768"
FI_CXI_DISABLE_HOST_REGISTER = "1"
OFI_NCCL_DISABLE_DMABUF = "1"
VLLM_ALLREDUCE_USE_SYMM_MEM = "0"

[annotations]
com.hooks.aws_ofi_nccl.enabled = "true"
com.hooks.aws_ofi_nccl.variant = "cuda13"
com.hooks.cxi.enabled = "true"
You can’t perform that action at this time.
0 commit comments