# verl_config.yml
# Verl configuration for CI stage
verl_config:
  # Git repository to fetch verl from, and the revision to use.
  repo_url: "https://github.com/volcengine/verl.git"
  # Kept quoted so the revision is always read as a string.
  repo_tag: "4cda6af"
  # Test directory (presumably relative to the verl checkout - confirm
  # against the consumer of this file).
  test_dir: "tests"

  # Setup commands, one shell command string per list item. Multi-line
  # items use a folded scalar (>-), so their lines are joined with single
  # spaces into one `a && b && c` command line.
  install_commands:
    # Install gdrcopy
    - >-
      git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git &&
      (cd gdrcopy && make prefix=/usr/local lib_install) &&
      rm -rf gdrcopy
    # Install nvshmem
    - "pip install nvidia-nvshmem-cu13==3.3.20"
    # Create nvshmem symlink (needed before DeepEP build).
    # `-f` keeps this step idempotent: a plain `ln -s` fails with
    # "File exists" if the link is already present (e.g. on a re-run).
    - >-
      (cd /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem/lib &&
      ln -sf libnvshmem_host.so.3 libnvshmem_host.so)
    # Install DeepEP: clone the tagged release, apply the DeepEP patch
    # published with Megatron-LM core_v0.15.0, build for the listed CUDA
    # architectures, then remove the checkout.
    - >-
      git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git &&
      (cd DeepEP &&
      wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch &&
      patch -p1 < deepep.patch &&
      TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install) &&
      rm -rf DeepEP
    # Install Python dependencies
    - "pip3 install --no-cache-dir --no-deps trl"
    - "pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools"
    # NOTE(review): mbridge is installed with -U and no pinned ref, unlike
    # the Megatron-LM install below - consider pinning for reproducibility.
    - "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
    - "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
    - "pip3 install pytest-asyncio"
    - "pip3 install --no-cache-dir 'ray[default]==2.54.1'"

  # The environment variables to expose in the container before setting up.
  # NVSHMEM_DIR is listed first because the following entries reference
  # ${NVSHMEM_DIR}; expansion is presumably done by the consumer's shell -
  # confirm before reordering.
  env_vars:
    - "NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem"
    - "LD_LIBRARY_PATH=\"${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH\""
    - "PATH=\"${NVSHMEM_DIR}/bin:$PATH\""
    - "TRTLLM_TEST_MODEL_PATH_ROOT=/tmp/verl-models"

  # Read-only CI model cache (flat layout: /scratch.../ModelName)
  ci_model_cache: "/scratch.trt_llm_data/llm-models"

  # Models needed by verl tests (symlinks created from HF-style to CI cache paths)
  required_models:
    - "Qwen/Qwen2.5-0.5B-Instruct"
    - "Qwen/Qwen2.5-1.5B-Instruct"
    - "Qwen/Qwen2.5-VL-7B-Instruct"