Skip to content

Commit fda0cb2

Browse files
authored
Fix Dockerfile not installing correct version of DeepEP for arm build (sgl-project#11773)
1 parent ebda73d commit fda0cb2

3 files changed

Lines changed: 10 additions & 3 deletions

File tree

.github/workflows/release-docker-dev.yml

Lines changed: 3 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -15,11 +15,13 @@ jobs:
1515
- runner: x64-docker-build-node
1616
platform: linux/amd64
1717
build_type: all
18+
grace_blackwell: 0
1819
tag: dev-x86
1920
version: 12.9.1
2021
- runner: arm-docker-build-node
2122
platform: linux/arm64
2223
build_type: all
24+
grace_blackwell: 1
2325
tag: dev-arm64
2426
version: 12.9.1
2527
steps:
@@ -51,7 +53,7 @@ jobs:
5153

5254
- name: Build and Push Dev Image
5355
run: |
54-
docker buildx build --platform ${{ matrix.platform }} --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.tag }} --no-cache .
56+
docker buildx build --platform ${{ matrix.platform }} --push -f docker/Dockerfile --build-arg CUDA_VERSION=${{ matrix.version }} --build-arg BUILD_TYPE=${{ matrix.build_type }} --build-arg GRACE_BLACKWELL=${{ matrix.grace_blackwell }} --build-arg CMAKE_BUILD_PARALLEL_LEVEL=$(nproc) -t lmsysorg/sglang:${{ matrix.tag }} --no-cache .
5557
5658
create-manifests:
5759
runs-on: ubuntu-22.04

.github/workflows/release-docker.yml

Lines changed: 4 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -16,6 +16,7 @@ jobs:
1616
variant:
1717
- cuda_version: "12.9.1"
1818
build_type: "all"
19+
grace_blackwell: 0
1920
runs-on: x64-docker-build-node
2021
steps:
2122
- name: Delete huge unnecessary tools folder
@@ -55,6 +56,7 @@ jobs:
5556
-f docker/Dockerfile \
5657
--build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
5758
--build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
59+
--build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
5860
-t lmsysorg/sglang:${tag} \
5961
--no-cache \
6062
.
@@ -67,6 +69,7 @@ jobs:
6769
variant:
6870
- cuda_version: "12.9.1"
6971
build_type: "all"
72+
grace_blackwell: 1
7073
runs-on: arm-docker-build-node
7174
steps:
7275
- name: Delete huge unnecessary tools folder
@@ -95,6 +98,7 @@ jobs:
9598
-f docker/Dockerfile \
9699
--build-arg CUDA_VERSION=${{ matrix.variant.cuda_version }} \
97100
--build-arg BUILD_TYPE=${{ matrix.variant.build_type }} \
101+
--build-arg GRACE_BLACKWELL=${{ matrix.variant.grace_blackwell }} \
98102
-t lmsysorg/sglang:${tag} \
99103
--no-cache \
100104
.

docker/Dockerfile

Lines changed: 3 additions & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -2,6 +2,7 @@ ARG CUDA_VERSION=12.9.1
22
FROM nvidia/cuda:${CUDA_VERSION}-cudnn-devel-ubuntu22.04 AS base
33
ARG TARGETARCH
44

5+
ARG GRACE_BLACKWELL=0
56
ARG BUILD_TYPE=all
67
ARG BRANCH_TYPE=remote
78
ARG DEEPEP_COMMIT=9af0e0d0e74f3577af1979c9b9e1ac2cad0104ee
@@ -99,7 +100,7 @@ RUN python3 -m pip install --no-cache-dir --upgrade pip setuptools wheel html5li
99100
# Download NVSHMEM source files and clone the DeepEP sources
100101
# We use Tom's DeepEP fork (fzyzcjy/DeepEP) for GB200 for now
101102
RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/source/nvshmem_src_cuda12-all-all-3.3.9.tar.gz && \
102-
if [ "$BUILD_TYPE" = "blackwell_aarch64" ]; then \
103+
if [ "$GRACE_BLACKWELL" = "1" ]; then \
103104
git clone https://github.com/fzyzcjy/DeepEP.git \
104105
&& cd DeepEP && git checkout 1b14ad661c7640137fcfe93cccb2694ede1220b0 && sed -i 's/#define NUM_CPU_TIMEOUT_SECS 100/#define NUM_CPU_TIMEOUT_SECS 1000/' csrc/kernels/configs.cuh && cd .. ; \
105106
else \
@@ -112,7 +113,7 @@ RUN wget https://developer.download.nvidia.com/compute/redist/nvshmem/3.3.9/sour
112113

113114
# Build and install NVSHMEM
114115
RUN cd /sgl-workspace/nvshmem && \
115-
if [ "$BUILD_TYPE" = "blackwell" ] || [ "$BUILD_TYPE" = "blackwell_aarch" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
116+
if [ "$GRACE_BLACKWELL" = "1" ]; then CUDA_ARCH="90;100;120"; else CUDA_ARCH="90"; fi && \
116117
NVSHMEM_SHMEM_SUPPORT=0 \
117118
NVSHMEM_UCX_SUPPORT=0 \
118119
NVSHMEM_USE_NCCL=0 \

0 commit comments

Comments
 (0)