Skip to content

Commit e893a15

Browse files
authored
Cuda13 fix final (#21)
1 parent 0733d1d commit e893a15

11 files changed

Lines changed: 368 additions & 73 deletions

File tree

Lines changed: 86 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,86 @@
1+
ARG UBUNTU_VERSION=24.04
2+
ARG TARGET_PLATFORM=aarch64
3+
ARG CUDA_VERSION=13.0.0
4+
ARG CUDA_VERSION_PATH=cu130
5+
ARG PYTHON_VERSION=3.12
6+
ARG BASE_IMAGE=docker.io/library/ubuntu:${UBUNTU_VERSION}
7+
ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}
8+
9+
#########################################################################
10+
# Build image
11+
#########################################################################
12+
13+
FROM ${DEVEL_BASE_IMAGE} AS build
14+
15+
# NOTE: libnuma1 is mounted from the host via the toml file.
16+
17+
WORKDIR /app/build
18+
19+
# Install Miniforge (conda-forge's conda distribution), Python, and Python build dependencies.
20+
ARG TARGET_PLATFORM
21+
ARG PYTHON_VERSION
22+
ENV PATH=/opt/conda/bin:$PATH
23+
ADD "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${TARGET_PLATFORM}.sh" /root/miniconda.sh
24+
RUN chmod +x /root/miniconda.sh && \
25+
bash /root/miniconda.sh -b -p /opt/conda && \
26+
rm /root/miniconda.sh && \
27+
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython git compilers make && \
28+
/opt/conda/bin/python -m pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja && \
29+
/opt/conda/bin/conda clean -ya
30+
31+
# Install PyTorch for CUDA 13.
32+
ARG CUDA_VERSION_PATH
33+
ARG TORCH_VERSION=2.9.1
34+
ARG INSTALL_CHANNEL=whl
35+
RUN pip install --no-cache-dir --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH}/ \
36+
torch==${TORCH_VERSION} torchvision torchaudio
37+
38+
# Symlink cuDNN and NCCL headers into conda include path.
39+
RUN CUDNN_DIR=$(dirname $(find /usr -name 'cudnn.h' -print -quit)) && \
40+
ln -sf ${CUDNN_DIR}/cudnn*.h /opt/conda/include/ && \
41+
ln -sf $(find /opt/conda/lib -path '*/nvidia/nccl/include/nccl.h' -print -quit) /opt/conda/include/nccl.h
42+
43+
# Install flash-attn 3.
44+
ARG FLASH_ATTN_3_SHA="92ca9da8d66f7b34ff50dc080ec0fef9661260d6"
45+
ARG FA3_MAX_JOBS=32
46+
RUN git clone --depth 1 --recurse-submodules --shallow-submodules https://github.com/Dao-AILab/flash-attention.git && \
47+
cd flash-attention && \
48+
git fetch --depth 1 origin ${FLASH_ATTN_3_SHA} && \
49+
git checkout ${FLASH_ATTN_3_SHA} && \
50+
git submodule update --init --depth 1 && \
51+
cd hopper && \
52+
FLASH_ATTENTION_DISABLE_FP16=TRUE FLASH_ATTENTION_DISABLE_SM80=TRUE MAX_JOBS=${FA3_MAX_JOBS} python setup.py install && \
53+
cd /app/build && \
54+
rm -rf flash-attention
55+
56+
# Fix flash_attn_3 package structure for imports.
57+
RUN cd /opt/conda/lib/python${PYTHON_VERSION}/site-packages/ && \
58+
mkdir -p flash_attn_3 && \
59+
mv flash_attn_3-*.egg/flash_attn_3/* flash_attn_3/ && \
60+
mv flash_attn_3-*.egg/flash_attn_interface.py flash_attn_3/ && \
61+
touch flash_attn_3/__init__.py && \
62+
rm -rf flash_attn_3-*
63+
64+
# Install sglang with all dependencies.
65+
# NOTE: sglang's dependencies pull in the cuda12 torch wheels, so we install
66+
# sglang with all of its dependencies here and force-reinstall the cu130 torch afterwards.
67+
RUN pip install --no-cache-dir "sglang[all]"
68+
69+
# Re-install PyTorch cu130 — sglang downgrades to cuda12 torch via its deps.
70+
ARG CUDA_VERSION_PATH2=${CUDA_VERSION_PATH}
71+
RUN pip install --no-cache-dir --force-reinstall --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH2}/ \
72+
torch==${TORCH_VERSION} torchvision torchaudio
73+
74+
# Re-install sgl-kernel with cu130 wheels (default pulls cuda12 version).
75+
# Download wheel directly to avoid hash mismatch from the index.
76+
RUN pip install --no-cache-dir --no-deps --force-reinstall \
77+
"sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.21/sgl_kernel-0.3.21+cu130-cp312-abi3-manylinux2014_aarch64.whl"
78+
79+
# Install nvidia-cublas for torch compile compatibility.
80+
RUN pip install --no-cache-dir nvidia-cublas
81+
82+
# Install curl for router health checks and sglang-router for load balancing.
83+
RUN conda install -y curl && conda clean -ya
84+
RUN pip install --no-cache-dir sglang-router
85+
86+
WORKDIR /opt
Lines changed: 91 additions & 3 deletions
Original file line numberDiff line numberDiff line change
@@ -1,4 +1,92 @@
1-
FROM docker.io/lmsysorg/sglang:nightly-dev-20260212-5875ef0a
1+
ARG UBUNTU_VERSION=24.04
2+
ARG TARGET_PLATFORM=aarch64
3+
ARG CUDA_VERSION=13.0.0
4+
ARG CUDA_VERSION_PATH=cu130
5+
ARG PYTHON_VERSION=3.12
6+
ARG BASE_IMAGE=docker.io/library/ubuntu:${UBUNTU_VERSION}
7+
ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}
28

3-
RUN apt-get update && apt-get install git python3-pip python3-venv -y
4-
RUN pip install git+https://github.com/huggingface/transformers.git@c9ea365a7b56326418769a4ba4682864d407ed63
9+
#########################################################################
10+
# Build image
11+
#########################################################################
12+
13+
FROM ${DEVEL_BASE_IMAGE} AS build
14+
15+
# NOTE: libnuma1 is mounted from the host via the toml file.
16+
17+
WORKDIR /app/build
18+
19+
# Install Miniforge (conda-forge's conda distribution), Python, and Python build dependencies.
20+
ARG TARGET_PLATFORM
21+
ARG PYTHON_VERSION
22+
ENV PATH=/opt/conda/bin:$PATH
23+
ADD "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${TARGET_PLATFORM}.sh" /root/miniconda.sh
24+
RUN chmod +x /root/miniconda.sh && \
25+
bash /root/miniconda.sh -b -p /opt/conda && \
26+
rm /root/miniconda.sh && \
27+
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython git compilers make && \
28+
/opt/conda/bin/python -m pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja && \
29+
/opt/conda/bin/conda clean -ya
30+
31+
# Install PyTorch for CUDA 13.
32+
ARG CUDA_VERSION_PATH
33+
ARG TORCH_VERSION=2.9.1
34+
ARG INSTALL_CHANNEL=whl
35+
RUN pip install --no-cache-dir --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH}/ \
36+
torch==${TORCH_VERSION} torchvision torchaudio
37+
38+
# Symlink cuDNN and NCCL headers into conda include path.
39+
RUN CUDNN_DIR=$(dirname $(find /usr -name 'cudnn.h' -print -quit)) && \
40+
ln -sf ${CUDNN_DIR}/cudnn*.h /opt/conda/include/ && \
41+
ln -sf $(find /opt/conda/lib -path '*/nvidia/nccl/include/nccl.h' -print -quit) /opt/conda/include/nccl.h
42+
43+
# Install flash-attn 3.
44+
ARG FLASH_ATTN_3_SHA="92ca9da8d66f7b34ff50dc080ec0fef9661260d6"
45+
ARG FA3_MAX_JOBS=32
46+
RUN git clone --depth 1 --recurse-submodules --shallow-submodules https://github.com/Dao-AILab/flash-attention.git && \
47+
cd flash-attention && \
48+
git fetch --depth 1 origin ${FLASH_ATTN_3_SHA} && \
49+
git checkout ${FLASH_ATTN_3_SHA} && \
50+
git submodule update --init --depth 1 && \
51+
cd hopper && \
52+
FLASH_ATTENTION_DISABLE_FP16=TRUE FLASH_ATTENTION_DISABLE_SM80=TRUE MAX_JOBS=${FA3_MAX_JOBS} python setup.py install && \
53+
cd /app/build && \
54+
rm -rf flash-attention
55+
56+
# Fix flash_attn_3 package structure for imports.
57+
RUN cd /opt/conda/lib/python${PYTHON_VERSION}/site-packages/ && \
58+
mkdir -p flash_attn_3 && \
59+
mv flash_attn_3-*.egg/flash_attn_3/* flash_attn_3/ && \
60+
mv flash_attn_3-*.egg/flash_attn_interface.py flash_attn_3/ && \
61+
touch flash_attn_3/__init__.py && \
62+
rm -rf flash_attn_3-*
63+
64+
# Install sglang with all dependencies.
65+
# NOTE: sglang's dependencies pull in the cuda12 torch wheels, so we install
66+
# sglang with all of its dependencies here and force-reinstall the cu130 torch afterwards.
67+
RUN pip install --no-cache-dir "sglang[all]"
68+
69+
# Re-install PyTorch cu130 — sglang downgrades to cuda12 torch via its deps.
70+
ARG CUDA_VERSION_PATH2=${CUDA_VERSION_PATH}
71+
RUN pip install --no-cache-dir --force-reinstall --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH2}/ \
72+
torch==${TORCH_VERSION} torchvision torchaudio
73+
74+
# Re-install sgl-kernel with cu130 wheels (default pulls cuda12 version).
75+
# Download wheel directly to avoid hash mismatch from the index.
76+
RUN pip install --no-cache-dir --no-deps --force-reinstall \
77+
"sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.21/sgl_kernel-0.3.21+cu130-cp312-abi3-manylinux2014_aarch64.whl"
78+
79+
# Install nvidia-cublas for torch compile compatibility.
80+
RUN pip install --no-cache-dir nvidia-cublas
81+
82+
# Upgrade CuDNN to 9.16 — SGLang requires >= 9.15 with PyTorch 2.9.1.
83+
RUN pip install --no-cache-dir nvidia-cudnn-cu12==9.16.0.29
84+
85+
# Install curl for router health checks and sglang-router for load balancing.
86+
RUN conda install -y curl && conda clean -ya
87+
RUN pip install --no-cache-dir sglang-router
88+
89+
# Install GLM5-specific transformers version.
90+
RUN pip install --no-cache-dir git+https://github.com/huggingface/transformers.git@c9ea365a7b56326418769a4ba4682864d407ed63
91+
92+
WORKDIR /opt
Lines changed: 88 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -1,5 +1,89 @@
1-
FROM lmsysorg/sglang:nightly-dev-20260216-d3bae71e
1+
ARG UBUNTU_VERSION=24.04
2+
ARG TARGET_PLATFORM=aarch64
3+
ARG CUDA_VERSION=13.0.0
4+
ARG CUDA_VERSION_PATH=cu130
5+
ARG PYTHON_VERSION=3.12
6+
ARG BASE_IMAGE=docker.io/library/ubuntu:${UBUNTU_VERSION}
7+
ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}
28

3-
RUN apt-get update && apt-get install git python3-pip python3-venv -y
4-
RUN pip install "sglang @ git+https://github.com/sgl-project/sglang.git#subdirectory=python"
5-
RUN pip install nvidia-cudnn-cu12==9.16.0.29
9+
#########################################################################
10+
# Build image
11+
#########################################################################
12+
13+
FROM ${DEVEL_BASE_IMAGE} AS build
14+
15+
# NOTE: libnuma1 is mounted from the host via the toml file.
16+
17+
WORKDIR /app/build
18+
19+
# Install Miniforge (conda-forge's conda distribution), Python, and Python build dependencies.
20+
ARG TARGET_PLATFORM
21+
ARG PYTHON_VERSION
22+
ENV PATH=/opt/conda/bin:$PATH
23+
ADD "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${TARGET_PLATFORM}.sh" /root/miniconda.sh
24+
RUN chmod +x /root/miniconda.sh && \
25+
bash /root/miniconda.sh -b -p /opt/conda && \
26+
rm /root/miniconda.sh && \
27+
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython git compilers make && \
28+
/opt/conda/bin/python -m pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja && \
29+
/opt/conda/bin/conda clean -ya
30+
31+
# Install PyTorch for CUDA 13.
32+
ARG CUDA_VERSION_PATH
33+
ARG TORCH_VERSION=2.9.1
34+
ARG INSTALL_CHANNEL=whl
35+
RUN pip install --no-cache-dir --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH}/ \
36+
torch==${TORCH_VERSION} torchvision torchaudio
37+
38+
# Symlink cuDNN and NCCL headers into conda include path.
39+
RUN CUDNN_DIR=$(dirname $(find /usr -name 'cudnn.h' -print -quit)) && \
40+
ln -sf ${CUDNN_DIR}/cudnn*.h /opt/conda/include/ && \
41+
ln -sf $(find /opt/conda/lib -path '*/nvidia/nccl/include/nccl.h' -print -quit) /opt/conda/include/nccl.h
42+
43+
# Install flash-attn 3.
44+
ARG FLASH_ATTN_3_SHA="92ca9da8d66f7b34ff50dc080ec0fef9661260d6"
45+
ARG FA3_MAX_JOBS=32
46+
RUN git clone --depth 1 --recurse-submodules --shallow-submodules https://github.com/Dao-AILab/flash-attention.git && \
47+
cd flash-attention && \
48+
git fetch --depth 1 origin ${FLASH_ATTN_3_SHA} && \
49+
git checkout ${FLASH_ATTN_3_SHA} && \
50+
git submodule update --init --depth 1 && \
51+
cd hopper && \
52+
FLASH_ATTENTION_DISABLE_FP16=TRUE FLASH_ATTENTION_DISABLE_SM80=TRUE MAX_JOBS=${FA3_MAX_JOBS} python setup.py install && \
53+
cd /app/build && \
54+
rm -rf flash-attention
55+
56+
# Fix flash_attn_3 package structure for imports.
57+
RUN cd /opt/conda/lib/python${PYTHON_VERSION}/site-packages/ && \
58+
mkdir -p flash_attn_3 && \
59+
mv flash_attn_3-*.egg/flash_attn_3/* flash_attn_3/ && \
60+
mv flash_attn_3-*.egg/flash_attn_interface.py flash_attn_3/ && \
61+
touch flash_attn_3/__init__.py && \
62+
rm -rf flash_attn_3-*
63+
64+
# Install sglang with all dependencies.
65+
# NOTE: sglang's dependencies pull in the cuda12 torch wheels, so we install
66+
# sglang with all of its dependencies here and force-reinstall the cu130 torch afterwards.
67+
RUN pip install --no-cache-dir "sglang[all]"
68+
69+
# Re-install PyTorch cu130 — sglang downgrades to cuda12 torch via its deps.
70+
ARG CUDA_VERSION_PATH2=${CUDA_VERSION_PATH}
71+
RUN pip install --no-cache-dir --force-reinstall --index-url https://download.pytorch.org/${INSTALL_CHANNEL}/${CUDA_VERSION_PATH2}/ \
72+
torch==${TORCH_VERSION} torchvision torchaudio
73+
74+
# Re-install sgl-kernel with cu130 wheels (default pulls cuda12 version).
75+
# Download wheel directly to avoid hash mismatch from the index.
76+
RUN pip install --no-cache-dir --no-deps --force-reinstall \
77+
"sgl-kernel @ https://github.com/sgl-project/whl/releases/download/v0.3.21/sgl_kernel-0.3.21+cu130-cp312-abi3-manylinux2014_aarch64.whl"
78+
79+
# Install nvidia-cublas for torch compile compatibility.
80+
RUN pip install --no-cache-dir nvidia-cublas
81+
82+
# Upgrade CuDNN to 9.16 — SGLang requires >= 9.15 with PyTorch 2.9.1.
83+
RUN pip install --no-cache-dir nvidia-cudnn-cu12==9.16.0.29
84+
85+
# Install curl for router health checks and sglang-router for load balancing.
86+
RUN conda install -y curl && conda clean -ya
87+
RUN pip install --no-cache-dir sglang-router
88+
89+
WORKDIR /opt
Lines changed: 49 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,49 @@
1+
ARG UBUNTU_VERSION=24.04
2+
ARG TARGET_PLATFORM=aarch64
3+
ARG CUDA_VERSION=13.0.0
4+
ARG CUDA_VERSION_PATH=cu130
5+
ARG PYTHON_VERSION=3.12
6+
ARG BASE_IMAGE=docker.io/library/ubuntu:${UBUNTU_VERSION}
7+
ARG DEVEL_BASE_IMAGE=docker.io/nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu${UBUNTU_VERSION}
8+
9+
#########################################################################
10+
# Build image
11+
#########################################################################
12+
13+
FROM ${DEVEL_BASE_IMAGE} AS build
14+
15+
# NOTE: libnuma1 is mounted from the host via the toml file.
16+
17+
WORKDIR /app/build
18+
19+
# Install Miniforge (conda-forge's conda distribution), Python, and Python build dependencies.
20+
ARG TARGET_PLATFORM
21+
ARG PYTHON_VERSION
22+
ENV PATH=/opt/conda/bin:$PATH
23+
ADD "https://github.com/conda-forge/miniforge/releases/latest/download/Miniforge3-Linux-${TARGET_PLATFORM}.sh" /root/miniconda.sh
24+
RUN chmod +x /root/miniconda.sh && \
25+
bash /root/miniconda.sh -b -p /opt/conda && \
26+
rm /root/miniconda.sh && \
27+
/opt/conda/bin/conda install -y python=${PYTHON_VERSION} cmake conda-build pyyaml numpy ipython git compilers make && \
28+
/opt/conda/bin/python -m pip install --upgrade --no-cache-dir pip wheel packaging "setuptools<70.0.0" ninja && \
29+
/opt/conda/bin/conda clean -ya
30+
31+
# Install vllm cu130 nightly (brings its own torch 2.10.0+cu130).
32+
RUN pip install --no-cache-dir uv && \
33+
uv pip install --system -U vllm --torch-backend=auto --extra-index-url https://wheels.vllm.ai/nightly/cu130
34+
35+
# Ensure ray is installed and on PATH.
36+
RUN pip install --no-cache-dir "ray[default]"
37+
38+
# Symlink cuDNN and NCCL headers into conda include path.
39+
RUN CUDNN_DIR=$(dirname $(find /usr -name 'cudnn.h' -print -quit)) && \
40+
ln -sf ${CUDNN_DIR}/cudnn*.h /opt/conda/include/ && \
41+
ln -sf $(find /opt/conda/lib -path '*/nvidia/nccl/include/nccl.h' -print -quit) /opt/conda/include/nccl.h
42+
43+
# Remove nvidia-cublas (pulled as transitive dep) — conflicts with CUDA 13 toolkit's cuBLAS.
44+
RUN pip uninstall -y nvidia-cublas 2>/dev/null; true
45+
46+
# Install curl for router health checks.
47+
RUN conda install -y curl && conda clean -ya
48+
49+
WORKDIR /opt

legacy/serving/README.md

Lines changed: 19 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -281,6 +281,8 @@ python serving/submit_job.py \
281281

282282
<summary>SGLang, vLLM (tested ✅)</summary>
283283

284+
##### SGLang
285+
284286
```bash
285287
python serving/submit_job.py \
286288
--slurm-nodes 2 \
@@ -295,6 +297,22 @@ python serving/submit_job.py \
295297
--tp-size 8"
296298
```
297299

300+
##### vLLM
301+
302+
```bash
303+
python serving/submit_job.py \
304+
--slurm-nodes 2 \
305+
--serving-framework vllm \
306+
--worker-port 8080 \
307+
--slurm-environment $(pwd)/serving/envs/vllm.toml \
308+
--disable-ocf \
309+
--framework-args "--model Qwen/Qwen3-235B-A22B-Instruct-2507 \
310+
--host 0.0.0.0 \
311+
--port 8080 \
312+
--served-model-name Qwen/Qwen3-235B-A22B-Instruct-2507-$(whoami) \
313+
--tensor-parallel-size 8"
314+
```
315+
298316
</details>
299317

300318
#### `Qwen3.5-397B-A17B`
@@ -309,7 +327,7 @@ python serving/submit_job.py \
309327
--serving-framework vllm \
310328
--disable-ocf \
311329
--worker-port 8080 \
312-
--slurm-environment $(pwd)/serving/envs/vllm_qwen35.toml \
330+
--slurm-environment $(pwd)/serving/envs/vllm.toml \
313331
--framework-args "--model Qwen/Qwen3.5-397B-A17B \
314332
--host 0.0.0.0 \
315333
--port 8080 \

0 commit comments

Comments
 (0)