Skip to content

Commit b70a1f7

Browse files
authored
Merge pull request #13 from swiss-ai/nccl_fix
NCCL Libfabric fix. Added new model configurations
2 parents 4c9e90a + 4e1d612 commit b70a1f7

6 files changed

Lines changed: 198 additions & 6 deletions

File tree

serving/README.md

Lines changed: 151 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -126,6 +126,71 @@ python serving/submit_job.py \
126126

127127
</details>
128128

129+
#### `Mistral-Small-24B-Instruct-2501`
130+
131+
<details>
132+
133+
<summary>SGLang, vLLM (tested ✅)</summary>
134+
135+
```bash
136+
python serving/submit_job.py \
137+
--slurm-nodes 1 \
138+
--serving-framework sglang \
139+
--worker-port 8080 \
140+
--slurm-environment $(pwd)/serving/envs/sglang.toml \
141+
--framework-args "--model-path mistralai/Mistral-Small-24B-Instruct-2501 \
142+
--host 0.0.0.0 \
143+
--port 8080 \
144+
--served-model-name mistralai/Mistral-Small-24B-Instruct-2501-$(whoami) \
145+
--dp-size 4"
146+
```
147+
148+
</details>
149+
150+
#### `Mistral-Large-3-675B-Instruct-2512`
151+
152+
<details>
153+
154+
<summary>vLLM (tested ✅)</summary>
155+
156+
```bash
157+
python serving/submit_job.py \
158+
--slurm-nodes 4 \
159+
--serving-framework vllm \
160+
--worker-port 8080 \
161+
--slurm-environment $(pwd)/serving/envs/vllm.toml \
162+
--disable-ocf \
163+
--framework-args "--model mistralai/Mistral-Large-3-675B-Instruct-2512 \
164+
--host 0.0.0.0 \
165+
--port 8080 \
166+
--served-model-name mistralai/Mistral-Large-3-675B-Instruct-2512-$(whoami) \
167+
--tensor-parallel-size 16"
168+
```
169+
170+
</details>
171+
172+
#### `Mixtral-8x22B-Instruct-v0.1`
173+
174+
<details>
175+
176+
<summary>SGLang, vLLM (tested ✅)</summary>
177+
178+
```bash
179+
python serving/submit_job.py \
180+
--slurm-nodes 2 \
181+
--serving-framework sglang \
182+
--disable-ocf \
183+
--worker-port 8080 \
184+
--slurm-environment $(pwd)/serving/envs/sglang.toml \
185+
--framework-args "--model mistralai/Mixtral-8x22B-Instruct-v0.1 \
186+
--host 0.0.0.0 \
187+
--port 8080 \
188+
--tp-size 8 \
189+
--served-model-name mistralai/Mixtral-8x22B-Instruct-v0.1-$(whoami)"
190+
```
191+
192+
</details>
193+
129194
### Snowflake
130195

131196
#### `snowflake-arctic-embed-l-v2.0`
@@ -149,6 +214,48 @@ python serving/submit_job.py \
149214

150215
### Qwen
151216

217+
#### `Qwen3-8B`
218+
219+
<details>
220+
221+
<summary>SGLang, vLLM (tested ✅)</summary>
222+
223+
```bash
224+
python serving/submit_job.py \
225+
--slurm-nodes 1 \
226+
--serving-framework sglang \
227+
--worker-port 8080 \
228+
--slurm-environment $(pwd)/serving/envs/sglang.toml \
229+
--framework-args "--model-path Qwen/Qwen3-8B \
230+
--host 0.0.0.0 \
231+
--port 8080 \
232+
--served-model-name Qwen/Qwen3-8B-$(whoami) \
233+
--dp-size 4"
234+
```
235+
236+
</details>
237+
238+
#### `Qwen3-32B`
239+
240+
<details>
241+
242+
<summary>SGLang, vLLM (tested ✅)</summary>
243+
244+
```bash
245+
python serving/submit_job.py \
246+
--slurm-nodes 1 \
247+
--serving-framework sglang \
248+
--worker-port 8080 \
249+
--slurm-environment $(pwd)/serving/envs/sglang.toml \
250+
--framework-args "--model-path Qwen/Qwen3-32B \
251+
--host 0.0.0.0 \
252+
--port 8080 \
253+
--served-model-name Qwen/Qwen3-32B-$(whoami) \
254+
--dp-size 4"
255+
```
256+
257+
</details>
258+
152259
#### `Qwen3-Next-80B-A3B-Instruct`
153260

154261
<details>
@@ -168,6 +275,50 @@ python serving/submit_job.py \
168275

169276
</details>
170277

278+
#### `Qwen3-235B-A22B-Instruct-2507`
279+
280+
<details>
281+
282+
<summary>SGLang, vLLM (tested ✅)</summary>
283+
284+
```bash
285+
python serving/submit_job.py \
286+
--slurm-nodes 2 \
287+
--serving-framework sglang \
288+
--worker-port 8080 \
289+
--slurm-environment $(pwd)/serving/envs/sglang.toml \
290+
--disable-ocf \
291+
--framework-args "--model-path Qwen/Qwen3-235B-A22B-Instruct-2507 \
292+
--host 0.0.0.0 \
293+
--port 8080 \
294+
--served-model-name Qwen/Qwen3-235B-A22B-Instruct-2507-$(whoami) \
295+
--tp-size 8"
296+
```
297+
298+
</details>
299+
300+
#### `Qwen3.5-397B-A17B`
301+
302+
<details>
303+
304+
<summary>vLLM (tested ✅)</summary>
305+
306+
```bash
307+
python serving/submit_job.py \
308+
--slurm-nodes 4 \
309+
--serving-framework vllm \
310+
--disable-ocf \
311+
--worker-port 8080 \
312+
--slurm-environment $(pwd)/serving/envs/vllm_qwen35.toml \
313+
--framework-args "--model Qwen/Qwen3.5-397B-A17B \
314+
--host 0.0.0.0 \
315+
--port 8080 \
316+
--tensor-parallel-size 16 \
317+
--served-model-name Qwen/Qwen3.5-397B-A17B-$(whoami)"
318+
```
319+
320+
</details>
321+
171322
### DeepSeek
172323

173324
#### `DeepSeek-V3.1`

serving/envs/sglang.toml

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -16,7 +16,6 @@ workdir = "/opt"
1616
[env]
1717
# NCCL_DEBUG = "INFO" # uncomment for debugging
1818
# NCCL_DEBUG_SUBSYS = "INIT,NET" # uncomment for debugging
19-
LD_LIBRARY_PATH = "/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cray/libfabric/lib64:/usr/lib:${LD_LIBRARY_PATH:-}"
2019
NCCL_NET_PLUGIN = "/opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so"
2120
NCCL_NET = "AWS Libfabric"
2221
NCCL_CROSS_NIC = "1"

serving/envs/sglang_glm.toml

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,4 @@
1-
image = "/iopsstor/scratch/cscs/ahadinia/sglang_glm.sqsh"
1+
image = "/capstor/store/cscs/swissai/infra01/container-images/sglang_glm5_nightly.sqsh"
22

33
# Each "src_path:trg_path" entry mounts the host path src_path inside the container at trg_path.
44
mounts = [

serving/envs/sglang_kimi.toml

Lines changed: 8 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -8,14 +8,15 @@ mounts = [
88
"/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
99
"/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
1010
"/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
11+
"/opt/cray/libfabric/1.22.0/lib64:/opt/cray/libfabric/lib64",
12+
"/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cscs/aws-ofi-ccl-plugin/cuda12",
1113
]
1214

1315
workdir = "/opt"
1416

1517
[env]
16-
# NCCL_DEBUG = "info" # uncomment for debugging
17-
NCCL_NET_PLUGIN = "ofi"
18-
NCCL_NET = "Socket"
18+
NCCL_NET_PLUGIN = "/opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so"
19+
NCCL_NET = "AWS Libfabric"
1920
NCCL_CROSS_NIC = "1"
2021
NCCL_NET_GDR_LEVEL = "PHB"
2122
NCCL_SOCKET_IFNAME = "hsn"
@@ -28,3 +29,7 @@ FI_CXI_DEFAULT_TX_SIZE = "32768"
2829
FI_CXI_DISABLE_HOST_REGISTER = "1"
2930
OFI_NCCL_DISABLE_DMABUF = "1"
3031
SGL_ENABLE_JIT_DEEPGEMM = "0"
32+
33+
[annotations]
34+
com.hooks.aws_ofi_nccl.enabled = "true"
35+
com.hooks.aws_ofi_nccl.variant = "cuda12"

serving/envs/vllm_qwen35.toml

Lines changed: 37 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,37 @@
1+
image = "vllm/vllm-openai:nightly"
2+
3+
mounts = [
4+
"/iopsstor/store/cscs/swissai/a09/xyao/bin:/ocfbin",
5+
"/capstor",
6+
"/iopsstor",
7+
"/usr/lib64/libhwloc.so.15:/usr/lib/libhwloc.so.15",
8+
"/usr/lib64/libpciaccess.so.0:/usr/lib/libpciaccess.so.0",
9+
"/usr/lib64/libxml2.so.2:/usr/lib/libxml2.so.2",
10+
"/opt/cray/libfabric/1.22.0/lib64:/opt/cray/libfabric/lib64",
11+
"/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cscs/aws-ofi-ccl-plugin/cuda12",
12+
]
13+
14+
workdir = "/opt"
15+
16+
[env]
17+
LD_LIBRARY_PATH = "/opt/cscs/aws-ofi-ccl-plugin/cuda12:/opt/cray/libfabric/lib64:/usr/lib:${LD_LIBRARY_PATH:-}"
18+
NCCL_NET_PLUGIN = "/opt/cscs/aws-ofi-ccl-plugin/cuda12/libnccl-net.so"
19+
NCCL_NET = "AWS Libfabric"
20+
NCCL_CROSS_NIC = "1"
21+
NCCL_NET_GDR_LEVEL = "PHB"
22+
NCCL_SOCKET_IFNAME = "hsn"
23+
NCCL_PROTO = "^LL128"
24+
NCCL_DEBUG = "INFO"
25+
NCCL_DEBUG_SUBSYS = "INIT,NET"
26+
FI_CXI_COMPAT = "0"
27+
FI_MR_CACHE_MONITOR = "userfaultfd"
28+
FI_CXI_RX_MATCH_MODE = "software"
29+
FI_CXI_DEFAULT_CQ_SIZE = "131072"
30+
FI_CXI_DEFAULT_TX_SIZE = "32768"
31+
FI_CXI_DISABLE_HOST_REGISTER = "1"
32+
OFI_NCCL_DISABLE_DMABUF = "1"
33+
VLLM_ALLREDUCE_USE_SYMM_MEM = "0"
34+
35+
[annotations]
36+
com.hooks.aws_ofi_nccl.enabled = "true"
37+
com.hooks.aws_ofi_nccl.variant = "cuda12"

serving/template.jinja

Lines changed: 1 addition & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -59,7 +59,7 @@ case "$FRAMEWORK" in
5959
FRAMEWORK_LAUNCH="python3 -m sglang.launch_server"
6060
;;
6161
vllm)
62-
FRAMEWORK_ENV_SETUP="export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
62+
FRAMEWORK_ENV_SETUP="export RAY_CGRAPH_get_timeout=1800; export no_proxy=\"0.0.0.0,\$no_proxy\"; export NO_PROXY=\"0.0.0.0,\$NO_PROXY\""
6363
FRAMEWORK_LAUNCH="python3 -m vllm.entrypoints.openai.api_server"
6464
;;
6565
esac

0 commit comments

Comments
 (0)