Skip to content

Commit 0733d1d

Browse files
Add CUDA 13 Support (#20)
1 parent 7bcd347 commit 0733d1d

6 files changed

Lines changed: 14 additions & 117 deletions

File tree

src/swiss_ai_model_launch/assets/envs/sglang.toml

Lines changed: 5 additions & 6 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
image = "/iopsstor/scratch/cscs/edebened/ce-images/large_model_serving.sqsh"
1+
image = "/capstor/store/cscs/swissai/infra01/container-images/sglang_cuda13.sqsh"
22

33
mounts = [
44
"/iopsstor/store/cscs/swissai/a09/xyao/bin:/ocfbin",
@@ -7,16 +7,14 @@ mounts = [
77
"/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
88
"/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
99
"/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
10-
"/opt/cray/libfabric/1.22.0/lib64:/opt/cray/libfabric/lib64",
11-
"/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cscs/aws-ofi-ccl-plugin/cuda12",
10+
"/usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1",
1211
]
1312

14-
workdir = "/opt"
13+
workdir = "/opt"
1514

1615
[env]
1716
# NCCL_DEBUG = "INFO" # uncomment for debugging
1817
# NCCL_DEBUG_SUBSYS = "INIT,NET" # uncomment for debugging
19-
NCCL_NET_PLUGIN = "/opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so"
2018
NCCL_NET = "AWS Libfabric"
2119
NCCL_CROSS_NIC = "1"
2220
NCCL_NET_GDR_LEVEL = "PHB"
@@ -34,4 +32,5 @@ VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
3432

3533
[annotations]
3634
com.hooks.aws_ofi_nccl.enabled = "true"
37-
com.hooks.aws_ofi_nccl.variant = "cuda12"
35+
com.hooks.aws_ofi_nccl.variant = "cuda13"
36+
com.hooks.cxi.enabled = "true"

src/swiss_ai_model_launch/assets/envs/sglang_glm.toml

Lines changed: 0 additions & 30 deletions
This file was deleted.

src/swiss_ai_model_launch/assets/envs/sglang_kimi.toml

Lines changed: 0 additions & 35 deletions
This file was deleted.
Lines changed: 7 additions & 9 deletions
Original file line number | Diff line number | Diff line change
@@ -1,4 +1,4 @@
1-
image = "vllm/vllm-openai:latest"
1+
image = "/capstor/store/cscs/swissai/infra01/container-images/vllm_cuda13.sqsh"
22

33
mounts = [
44
"/iopsstor/store/cscs/swissai/a09/xyao/bin:/ocfbin",
@@ -7,22 +7,19 @@ mounts = [
77
"/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
88
"/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
99
"/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
10-
"/opt/cray/libfabric/1.22.0/lib64:/opt/cray/libfabric/lib64",
11-
"/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cscs/aws-ofi-ccl-plugin/cuda12",
10+
"/usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1",
1211
]
1312

14-
workdir = "/opt"
13+
workdir = "/opt"
1514

1615
[env]
17-
LD_LIBRARY_PATH = "/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cray/libfabric/lib64:/usr/lib:${LD_LIBRARY_PATH:-}"
18-
NCCL_NET_PLUGIN = "/opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so"
16+
# NCCL_DEBUG = "INFO" # uncomment for debugging
17+
# NCCL_DEBUG_SUBSYS = "INIT,NET" # uncomment for debugging
1918
NCCL_NET = "AWS Libfabric"
2019
NCCL_CROSS_NIC = "1"
2120
NCCL_NET_GDR_LEVEL = "PHB"
2221
NCCL_SOCKET_IFNAME = "hsn"
2322
NCCL_PROTO = "^LL128"
24-
NCCL_DEBUG = "INFO"
25-
NCCL_DEBUG_SUBSYS = "INIT,NET"
2623
FI_CXI_COMPAT = "0"
2724
FI_MR_CACHE_MONITOR = "userfaultfd"
2825
FI_CXI_RX_MATCH_MODE = "software"
@@ -34,4 +31,5 @@ VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
3431

3532
[annotations]
3633
com.hooks.aws_ofi_nccl.enabled = "true"
37-
com.hooks.aws_ofi_nccl.variant = "cuda12"
34+
com.hooks.aws_ofi_nccl.variant = "cuda13"
35+
com.hooks.cxi.enabled = "true"

src/swiss_ai_model_launch/assets/envs/vllm_qwen35.toml

Lines changed: 0 additions & 37 deletions
This file was deleted.

src/swiss_ai_model_launch/assets/template.jinja

Lines changed: 2 additions & 0 deletions
Original file line number | Diff line number | Diff line change
@@ -35,6 +35,8 @@ OCF_BOOTSTRAP_ADDR=$(curl -sf http://148.187.108.172:8092/v1/dnt/bootstraps \
3535
|| echo "/ip4/148.187.108.172/tcp/43905/p2p/QmbUKJkCfotDzbFE5uoTsXD4GRyPHjzZC1f2yAGLoeBMn9")
3636

3737

38+
unset SLURM_CPU_BIND SLURM_CPU_BIND_TYPE SLURM_CPU_BIND_LIST SLURM_CPU_BIND_VERBOSE
39+
3840
# Architecture detection
3941
ARCH=$(uname -m)
4042
if [[ "$ARCH" == "aarch64" ]]; then

0 commit comments

Comments
 (0)