Skip to content

Commit d1bf1f0

Browse files
committed
[release] bump to v0.1.0
1 parent 0b96b53 commit d1bf1f0

File tree

9 files changed

+163
-168
lines changed

9 files changed

+163
-168
lines changed

.github/workflows/conda-ci.yml

Lines changed: 79 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,79 @@
name: conda CI

on:
  push:
    branches: [main]
  pull_request:
    branches: [main]

jobs:
  build-conda:
    # NOTE(review): on `push` events the `github.event.pull_request` context is
    # empty, so the original bare `contains(...)` was always false and the job
    # was silently skipped for every push. The explicit event_name guard below
    # keeps the exact same runtime behavior (PR-title gated) while making that
    # intent visible; if release pushes should also build, extend this condition.
    if: github.event_name == 'pull_request' && contains(github.event.pull_request.title, '[release]')
    runs-on: self-hosted
    container:
      image: lmsysorg/sglang:v0.5.0rc0-cu126
      options: --gpus all --ipc=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 --memory-swap=0 -v /data/models:/root/models -v /data/datasets:/root/datasets

    defaults:
      run:
        working-directory: ${{ github.workspace }}

    steps:
      - name: Checkout repository
        uses: actions/checkout@v4

      - name: Construct Conda
        run: |
          echo "📦 Installing slime..."
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          BASE_DIR=$(pwd) bash build_conda.sh
        shell: bash

      - name: Download model and dataset
        run: |
          echo "🔗 Downloading model and dataset..."

          # Create cache directories if they don't exist (mounted from the host).
          mkdir -p /root/models /root/datasets

          echo "Downloading Qwen3-30B-A3B..."
          hf download Qwen/Qwen3-30B-A3B --local-dir /root/models/Qwen3-30B-A3B
          hf download Qwen/Qwen3-30B-A3B-FP8 --local-dir /root/models/Qwen3-30B-A3B-FP8

          hf download --repo-type dataset zhuzilin/dapo-math-17k --local-dir /root/datasets/dapo-math-17k

          hf download --repo-type dataset zhuzilin/aime-2024 --local-dir /root/datasets/aime-2024
        shell: bash

      - name: Convert checkpoint
        run: |
          echo "🔄 Converting model checkpoint..."
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          # MODEL_ARGS is a bash array defined by the sourced model script.
          # Quote the expansion so multi-word arguments survive word splitting.
          source scripts/models/qwen3-30B-A3B.sh
          PYTHONPATH=/root/Megatron-LM torchrun --nproc-per-node 8 tools/convert_hf_to_torch_dist.py \
            "${MODEL_ARGS[@]}" \
            --hf-checkpoint /root/models/Qwen3-30B-A3B \
            --save /root/Qwen3-30B-A3B_torch_dist
        shell: bash

      - name: Run tests
        run: |
          echo "🧪 Running tests..."
          cd "$GITHUB_WORKSPACE"
          echo "Current directory: $(pwd)"

          bash tests/test_qwen3-30B-A3B.sh
        shell: bash

      - name: Cleanup
        # Run even when an earlier step failed so stray ray/python processes
        # don't poison the next job on this self-hosted runner.
        if: always()
        run: |
          echo "🧹 Cleaning up..."
          pkill -9 ray || true
          ray stop --force || true
          pkill -9 python || true
        shell: bash

.github/workflows/slime-ci.yml

Lines changed: 6 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -3,8 +3,10 @@ name: GPU CI
33
on:
44
push:
55
branches: [main]
6+
paths: "slime/**"
67
pull_request:
78
branches: [main]
9+
paths: "slime/**"
810

911
jobs:
1012
# GPU tests on self-hosted runner
@@ -13,11 +15,11 @@ jobs:
1315
container:
1416
image: zhuzilin/slime:latest
1517
options: --gpus all --ipc=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 --memory-swap=0 -v /data/models:/root/models -v /data/datasets:/root/datasets
16-
18+
1719
defaults:
1820
run:
1921
working-directory: ${{ github.workspace }}
20-
22+
2123
steps:
2224
- name: Checkout repository
2325
uses: actions/checkout@v4
@@ -85,11 +87,11 @@ jobs:
8587
container:
8688
image: zhuzilin/slime:latest
8789
options: --gpus all --ipc=host --shm-size=16g --ulimit memlock=-1 --ulimit stack=67108864 --memory=0 --memory-swap=0 -v /data/models:/root/models -v /data/datasets:/root/datasets
88-
90+
8991
defaults:
9092
run:
9193
working-directory: ${{ github.workspace }}
92-
94+
9395
steps:
9496
- name: Checkout repository
9597
uses: actions/checkout@v4

README.md

Lines changed: 0 additions & 4 deletions
Original file line numberDiff line numberDiff line change
@@ -61,10 +61,6 @@ For complete usage instructions, please refer to the [Usage Documentation](docs/
6161

6262
- For debugging tips, please refer to the [Debugging Guide](docs/en/debug.md)
6363

64-
## Hardware Support
65-
- Nvidia: refer to this repo README
66-
- AMD: refer to the [tutorial](docs/en/amd_tutorial.md)
67-
6864
## FAQ & Acknowledgements
6965

7066
- For frequently asked questions, please see the [Q\&A](docs/en/qa.md)

build_conda.sh

Lines changed: 76 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,76 @@
1+
#!/bin/bash
2+
3+
set -ex
4+
5+
# create conda
6+
yes '' | "${SHELL}" <(curl -L micro.mamba.pm/install.sh)
7+
source ~/.bashrc
8+
9+
# >>> mamba initialize >>>
10+
# !! Contents within this block are managed by 'micromamba shell init' !!
11+
export MAMBA_EXE='/root/.local/bin/micromamba';
12+
export MAMBA_ROOT_PREFIX='/root/micromamba';
13+
__mamba_setup="$("$MAMBA_EXE" shell hook --shell bash --root-prefix "$MAMBA_ROOT_PREFIX" 2> /dev/null)"
14+
eval "$__mamba_setup"
15+
alias micromamba="$MAMBA_EXE" # Fallback on help from micromamba activate
16+
unset __mamba_setup
17+
# <<< mamba initialize <<<
18+
19+
micromamba self-update
20+
21+
micromamba create -n slime python=3.12 pip -c conda-forge -y
22+
micromamba activate slime
23+
export CUDA_HOME="$CONDA_PREFIX"
24+
25+
export BASE_DIR=${BASE_DIR:-"/root"}
26+
cd $BASE_DIR
27+
# install sglang
28+
git clone -b v0.5.0rc0 https://github.com/sgl-project/sglang.git
29+
cd sglang
30+
# Install the python packages
31+
pip install -e "python[all]"
32+
33+
# install cuda 12.8 as it's the default cuda version for torch
34+
micromamba install -n slime cuda cuda-nvtx cuda-nvtx-dev -c nvidia/label/cuda-12.8.0 -y
35+
micromamba install -n slime -c conda-forge cudnn -y
36+
pip install cmake ninja
37+
38+
# reinstall sglang deps
39+
pip install git+https://github.com/fzyzcjy/torch_memory_saver.git --no-cache-dir --force-reinstall --no-build-isolation
40+
41+
# install megatron deps
42+
TORCH_CUDA_ARCH_LIST="9.0;9.0a" \
43+
pip -v install --no-build-isolation \
44+
git+https://github.com/fanshiqing/grouped_gemm@v1.1.4
45+
# apex
46+
TORCH_CUDA_ARCH_LIST="9.0;9.0a" NVCC_APPEND_FLAGS="--threads 4" \
47+
\
48+
pip -v install --disable-pip-version-check --no-cache-dir \
49+
--no-build-isolation \
50+
--config-settings "--build-option=--cpp_ext --cuda_ext --parallel 8" git+https://github.com/NVIDIA/apex.git
51+
# transformer engine
52+
TORCH_CUDA_ARCH_LIST="9.0;9.0a" \
53+
pip -v install transformer_engine[pytorch]
54+
# flash attn
55+
# the newest version megatron supports is v2.7.4.post1
56+
MAX_JOBS=64 pip -v install flash-attn==2.7.4.post1
57+
# megatron
58+
cd $BASE_DIR
59+
git clone https://github.com/NVIDIA/Megatron-LM.git
60+
cd Megatron-LM/
61+
git checkout 48406695c4efcf1026a7ed70bb390793918dd97b
62+
pip install -e .
63+
64+
# mbridge
65+
pip install git+https://github.com/ISEEKYAN/mbridge.git --no-deps
66+
67+
# install slime and apply patches
68+
cd $BASE_DIR
69+
git clone https://github.com/THUDM/slime.git
70+
cd slime/
71+
pip install -e .
72+
# apply patch
73+
cd $BASE_DIR/sglang
74+
git apply $BASE_DIR/slime/docker/patch/v0.5.0rc0-cuda126/sglang.patch
75+
cd $BASE_DIR/Megatron-LM
76+
git apply $BASE_DIR/slime/docker/patch/v0.5.0rc0-cuda126/megatron.patch

docker/Dockerfile

Lines changed: 0 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -13,7 +13,6 @@ RUN pip install sglang-router --force-reinstall
1313
RUN pip install git+https://github.com/fzyzcjy/torch_memory_saver.git --no-cache-dir --force-reinstall
1414
RUN pip install ray[default]
1515
RUN pip install httpx[http2] wandb pylatexenc blobfile accelerate "mcp[cli]"
16-
RUN pip install git+https://github.com/zhuzilin/cumem_allocator.git
1716

1817
# mbridge
1918
RUN pip install git+https://github.com/ISEEKYAN/mbridge.git --no-deps

docs/en/build.md

Lines changed: 0 additions & 77 deletions
This file was deleted.

docs/en/quick_start.md

Lines changed: 1 addition & 2 deletions
Original file line numberDiff line numberDiff line change
@@ -8,7 +8,7 @@ This document will guide you through setting up the environment and getting star
88

99
Since slime may contain temporary patches for sglang/megatron, to avoid potential environment configuration issues, we strongly recommend **users to use our latest Docker image**, which comes pre-configured with all dependencies.
1010

11-
- For scenarios where Docker is not convenient, please refer to [Building Environment from Scratch](./build.md);
11+
- For scenarios where Docker is not convenient, please refer to [build_conda.sh](./../../build_conda.sh);
1212
- For AMD support, please refer to [AMD Usage Tutorial](./amd_tutorial.md).
1313

1414
### Pull and Start Docker Container
@@ -554,5 +554,4 @@ ray job submit --address="http://127.0.0.1:8265" \
554554
slime has been deeply optimized for distributed training of large-scale Mixture of Experts (MoE) models. We provide some end-to-end training cases for reference:
555555

556556
- [Example: 64xH100 Training GLM-4.5](models/glm4.5-355B-A32B.md)
557-
- [Example: 8xH100 Training Qwen3-30B-A3B](models/qwen3-30B-A3B.md)
558557
- [Example: 128xH100 Training DeepSeek-R1](models/deepseek-r1.md)

docs/zh/build.md

Lines changed: 0 additions & 78 deletions
This file was deleted.

0 commit comments

Comments
 (0)