1- image = " vllm/vllm-openai:latest "
1+ image = " /capstor/store/cscs/swissai/infra01/container-images/vllm_cuda13.sqsh "
22
33mounts = [
44 " /iopsstor/store/cscs/swissai/a09/xyao/bin:/ocfbin" ,
@@ -7,22 +7,19 @@ mounts = [
77 " /usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15" ,
88 " /usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0" ,
99 " /usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2" ,
10- " /opt/cray/libfabric/1.22.0/lib64:/opt/cray/libfabric/lib64" ,
11- " /opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cscs/aws-ofi-ccl-plugin/cuda12" ,
10+ " /usr/lib64/libnuma.so.1:/usr/lib/libnuma.so.1" ,
1211]
1312
14- workdir = " /opt"
13+ workdir = " /opt"
1514
1615[env ]
17- LD_LIBRARY_PATH = " /opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cray/libfabric/lib64:/usr/lib:${LD_LIBRARY_PATH:-} "
18- NCCL_NET_PLUGIN = " /opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so "
16+ # NCCL_DEBUG = "INFO" # uncomment for debugging
17+ # NCCL_DEBUG_SUBSYS = "INIT,NET" # uncomment for debugging
1918NCCL_NET = " AWS Libfabric"
2019NCCL_CROSS_NIC = " 1"
2120NCCL_NET_GDR_LEVEL = " PHB"
2221NCCL_SOCKET_IFNAME = " hsn"
2322NCCL_PROTO = " ^LL128"
24- NCCL_DEBUG = " INFO"
25- NCCL_DEBUG_SUBSYS = " INIT,NET"
2623FI_CXI_COMPAT = " 0"
2724FI_MR_CACHE_MONITOR = " userfaultfd"
2825FI_CXI_RX_MATCH_MODE = " software"
@@ -34,4 +31,5 @@ VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
3431
3532[annotations ]
3633com.hooks.aws_ofi_nccl.enabled = " true"
37- com.hooks.aws_ofi_nccl.variant = " cuda12"
34+ com.hooks.aws_ofi_nccl.variant = " cuda13"
35+ com.hooks.cxi.enabled = " true"
0 commit comments