Skip to content

Commit 0507609

Browse files
Superjomn and hchings authored
[TRTLLM-10695][ci] add verl stage in CI (#11306)
Signed-off-by: Chunwei Yan <yanchunwei@outlook.com> Co-authored-by: Erin <14718778+hchings@users.noreply.github.com>
1 parent 2eee701 commit 0507609

File tree

5 files changed

+224
-1
lines changed

5 files changed

+224
-1
lines changed

jenkins/L0_Test.groovy

Lines changed: 7 additions & 1 deletion
Original file line numberDiff line numberDiff line change
@@ -243,6 +243,7 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
243243
echo "Preprocessing testDBList to extract ISOLATION markers..."
244244

245245
def originalTestLines = readFile(file: testDBList).readLines()
246+
246247
def cleanedTestLines = []
247248
def isolationTestLines = []
248249

@@ -2174,8 +2175,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
21742175
// If stageName contains "-AutoDeploy-", add "backend=autodeploy" to makoArgs
21752176
// At this point, only tests with backend=autodeploy or unspecified backend will be run
21762177
makoArgs += ["backend=autodeploy"]
2178+
} else if (stageName.contains("-Verl-")) {
2179+
// If stageName contains "-Verl-", add "backend=verl" to makoArgs
2180+
// At this point, only tests with backend=verl or unspecified backend will be run
2181+
makoArgs += ["backend=verl"]
21772182
} else {
2178-
// If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", or "-AutoDeploy-", do not add any backend
2183+
// If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", "-AutoDeploy-", or "-Verl-", do not add any backend
21792184
// At this point, all tests will be run
21802185
// For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
21812186
}
@@ -3311,6 +3316,7 @@ def launchTestJobs(pipeline, testFilter)
33113316
"DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 2, 4, 1, true],
33123317
"DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["auto:dgx-b200-flex", "l0_dgx_b200", 2, 2, 4, 1, true],
33133318
"DGX_B200-8_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 8, 1, true],
3319+
"DGX_B200-4_GPUs-Verl-Post-Merge-1": ["auto:dgx-b200-flex", "l0_verl", 1, 1, 4, 1, true],
33143320
"B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
33153321
"DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
33163322
"DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
Lines changed: 1 addition & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1 @@
1+
verl_repo/
Lines changed: 145 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,145 @@
1+
# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
2+
# SPDX-License-Identifier: Apache-2.0
3+
#
4+
# Licensed under the Apache License, Version 2.0 (the "License");
5+
# you may not use this file except in compliance with the License.
6+
# You may obtain a copy of the License at
7+
#
8+
# http://www.apache.org/licenses/LICENSE-2.0
9+
#
10+
# Unless required by applicable law or agreed to in writing, software
11+
# distributed under the License is distributed on an "AS IS" BASIS,
12+
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
13+
# See the License for the specific language governing permissions and
14+
# limitations under the License.
15+
"""Self-contained wrapper tests for the verl repo.
16+
17+
All setup (dependency installation, repo cloning, env vars) is handled by
18+
a session-scoped pytest fixture. Configuration is read from verl_config.yml.
19+
"""
20+
21+
import os
22+
import subprocess
23+
import sys
24+
25+
import pytest
26+
import yaml
27+
28+
# Directory that contains this test module; the config file and the verl
# checkout are both located relative to it.
_HERE = os.path.split(os.path.abspath(__file__))[0]
# YAML file read by _load_config().
_CONFIG_PATH = os.path.join(_HERE, "verl_config.yml")
# Location of the verl git checkout used by the tests.
VERL_ROOT = os.path.join(_HERE, "verl_repo")
31+
32+
33+
def _load_config():
    """Parse the YAML file at ``_CONFIG_PATH`` and return its ``verl_config`` section."""
    with open(_CONFIG_PATH) as stream:
        document = yaml.safe_load(stream)
    return document["verl_config"]
36+
37+
38+
def _export_env_vars(config):
    """Export env vars from config into the current process environment.

    Each entry of ``config["env_vars"]`` is a ``KEY=VALUE`` string. Entries
    are applied in list order, so a later value may reference an earlier key
    via ``$KEY`` / ``${KEY}``. References to variables that are not set are
    left in the value unexpanded (``os.path.expandvars`` semantics).

    Args:
        config: Mapping with an optional ``env_vars`` list. A missing key or
            a null/empty value means there is nothing to export.

    Raises:
        ValueError: If an entry has no ``=`` separator or an empty key.
    """
    # `or []` also covers a YAML key that is present but null.
    for entry in config.get("env_vars") or []:
        key, separator, raw_value = entry.partition("=")
        if not separator or not key:
            raise ValueError(
                f"Invalid env_vars entry (expected KEY=VALUE): {entry!r}"
            )
        # Values may be wrapped in double quotes; drop them before expanding
        # so the quotes do not end up inside the environment value.
        os.environ[key] = os.path.expandvars(raw_value.strip('"'))
45+
46+
47+
def _run_install_commands(config):
    """Execute each shell command listed under ``install_commands``.

    Commands run one at a time, in list order, through the shell. A command
    exiting with a non-zero status raises ``subprocess.CalledProcessError``,
    which stops the remaining commands from running.
    """
    commands = config.get("install_commands", [])
    for shell_cmd in commands:
        print(f"[verl setup] Running: {shell_cmd}")
        subprocess.check_call(shell_cmd, shell=True)
52+
53+
54+
def _clone_verl_repo(config):
    """Clone the verl repo, check out the configured revision and install it.

    If ``VERL_ROOT`` already exists as a directory the function returns
    immediately: the existing checkout is used as-is and neither its revision
    nor the editable install is re-validated.

    Otherwise the repo is cloned into ``VERL_ROOT``, ``repo_tag`` is checked
    out, and the package is installed in editable mode with pip. If any of
    those steps fails, the partially created checkout is removed so that a
    later run retries from scratch instead of taking the early return above.

    Args:
        config: Mapping providing ``repo_url`` and ``repo_tag``.

    Raises:
        KeyError: If ``repo_url`` or ``repo_tag`` is missing from ``config``.
        subprocess.CalledProcessError: If git or pip exits with a non-zero
            status.
    """
    import shutil  # local import: only needed to clean up after a failure

    if os.path.isdir(VERL_ROOT):
        print(f"[verl setup] Repo already exists at {VERL_ROOT}, skipping clone")
        return

    repo_url = config["repo_url"]
    repo_tag = config["repo_tag"]
    print(f"[verl setup] Cloning {repo_url} (tag={repo_tag}) into {VERL_ROOT}")
    try:
        # Argument lists (no shell) keep paths containing spaces or shell
        # metacharacters intact.
        subprocess.check_call(["git", "clone", repo_url, VERL_ROOT])
        subprocess.check_call(["git", "checkout", repo_tag], cwd=VERL_ROOT)
        print(f"[verl setup] Installing verl package from {VERL_ROOT}")
        subprocess.check_call(
            [sys.executable, "-m", "pip", "install", "-e", VERL_ROOT],
        )
    except BaseException:
        # Do not leave a half-initialised checkout behind: its mere existence
        # would make the next run skip cloning and installing.
        shutil.rmtree(VERL_ROOT, ignore_errors=True)
        raise
71+
72+
73+
def _setup_model_symlinks(config):
    """Create symlinks from HF-style paths to CI cache paths.

    Verl tests expect models at {model_root}/Qwen/ModelName but the CI cache
    stores them at {ci_cache}/ModelName (flat structure). We create symlinks
    in a writable staging directory that point to the read-only CI cache.

    ``model_root`` is read from the ``TRTLLM_TEST_MODEL_PATH_ROOT`` environment
    variable and ``ci_cache`` from ``config["ci_model_cache"]``; if either is
    empty or unset the function does nothing.

    For each ``namespace/name`` entry in ``config["required_models"]``:
      * entries without a ``/`` are ignored;
      * an existing, resolvable destination is left untouched;
      * a model missing from the CI cache is skipped;
      * a dangling symlink at the destination is replaced.

    Args:
        config: Mapping with optional ``ci_model_cache`` and
            ``required_models`` keys.
    """
    model_root = os.environ.get("TRTLLM_TEST_MODEL_PATH_ROOT", "")
    ci_cache = config.get("ci_model_cache", "")
    if not model_root or not ci_cache:
        return
    # `or []` also covers a YAML key that is present but null.
    for model_id in config.get("required_models") or []:
        if "/" not in model_id:
            continue
        namespace, name = model_id.split("/", 1)
        ns_dir = os.path.join(model_root, namespace)
        src = os.path.join(ci_cache, name)
        dst = os.path.join(ns_dir, name)
        if os.path.exists(dst):
            print(f"[verl setup] Model symlink already exists: {dst}")
            continue
        if not os.path.isdir(src):
            print(f"[verl setup] Model not found in CI cache: {src}, skipping")
            continue
        if os.path.islink(dst):
            # exists() follows symlinks, so a link that reaches this point is
            # dangling; os.symlink() would raise FileExistsError on it.
            print(f"[verl setup] Replacing dangling symlink: {dst}")
            os.remove(dst)
        os.makedirs(ns_dir, exist_ok=True)
        os.symlink(src, dst)
        print(f"[verl setup] Created symlink: {dst} -> {src}")
100+
101+
102+
@pytest.fixture(scope="session", autouse=True)
def verl_setup():
    """Prepare the verl environment once per test session and yield ``VERL_ROOT``.

    The loaded config is passed to each setup step in turn: export env vars,
    run the install commands, clone the verl repo, create the model symlinks.
    """
    cfg = _load_config()
    # Env vars are exported first so the install commands run with them set.
    setup_steps = (
        _export_env_vars,
        _run_install_commands,
        _clone_verl_repo,
        _setup_model_symlinks,
    )
    for step in setup_steps:
        step(cfg)
    yield VERL_ROOT
111+
112+
113+
def _run_verl_test(test_path, extra_args=None, timeout=600):
    """Invoke pytest on a file inside the verl checkout and assert it passes.

    Args:
        test_path: Path of the test file relative to ``VERL_ROOT``.
        extra_args: Optional additional pytest command-line arguments.
        timeout: Value passed as ``timeout`` to ``subprocess.run``.

    Raises:
        AssertionError: If the test file does not exist or the pytest
            subprocess exits with a non-zero return code.
    """
    target = os.path.join(VERL_ROOT, test_path)
    assert os.path.exists(target), f"Verl test not found: {target}"

    pytest_cmd = [sys.executable, "-m", "pytest", target, "-v", "--tb=short"]
    if extra_args:
        pytest_cmd += extra_args

    # The child runs from the checkout root with a copy of the current
    # environment.
    completed = subprocess.run(
        pytest_cmd,
        cwd=VERL_ROOT,
        env=os.environ.copy(),
        timeout=timeout,
    )
    exit_code = completed.returncode
    assert exit_code == 0, f"Verl test failed with return code {exit_code}"
127+
128+
129+
def test_async_server():
    """Run verl's ``test_async_server.py`` from the TRT-LLM rollout test directory."""
    target = "tests/workers/rollout/rollout_trtllm/test_async_server.py"
    _run_verl_test(target)
131+
132+
133+
def test_adapter():
    """Run verl's ``test_adapter.py`` from the TRT-LLM rollout test directory."""
    target = "tests/workers/rollout/rollout_trtllm/test_adapter.py"
    _run_verl_test(target)
135+
136+
137+
def test_rollout_utils():
    """Run verl's ``test_trtllm_rollout_utils.py`` with two tests deselected.

    The ``-k`` expression excludes ``test_unimodal_generate`` and
    ``test_unimodal_batch_generate``; the run uses a 900-second timeout.
    """
    target = "tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py"
    deselect = "not (test_unimodal_generate or test_unimodal_batch_generate)"
    _run_verl_test(target, extra_args=["-k", deselect], timeout=900)
Lines changed: 51 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,51 @@
1+
# Verl configuration for CI stage
2+
3+
verl_config:
4+
repo_url: "https://github.com/volcengine/verl.git"
5+
repo_tag: "4cda6af"
6+
test_dir: "tests"
7+
8+
install_commands:
9+
# Install gdrcopy
10+
- >-
11+
git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git &&
12+
(cd gdrcopy && make prefix=/usr/local lib_install) &&
13+
rm -rf gdrcopy
14+
# Install nvshmem
15+
- "pip install nvidia-nvshmem-cu13==3.3.20"
16+
# Create nvshmem symlink (needed before DeepEP build)
17+
- >-
18+
(cd /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem/lib &&
19+
ln -s libnvshmem_host.so.3 libnvshmem_host.so)
20+
# Install DeepEP
21+
- >-
22+
git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git &&
23+
(cd DeepEP &&
24+
wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch &&
25+
patch -p1 < deepep.patch &&
26+
TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install) &&
27+
rm -rf DeepEP
28+
# Install Python dependencies
29+
- "pip3 install --no-cache-dir --no-deps trl"
30+
- "pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools"
31+
- "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
32+
- "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
33+
- "pip3 install pytest-asyncio"
34+
- "pip3 install --no-cache-dir 'ray[default]'"
35+
36+
37+
# Environment variables exported, in list order, before the install commands run; a later entry may reference an earlier one (e.g. ${NVSHMEM_DIR})
38+
env_vars:
39+
- "NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem"
40+
- "LD_LIBRARY_PATH=\"${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH\""
41+
- "PATH=\"${NVSHMEM_DIR}/bin:$PATH\""
42+
- "TRTLLM_TEST_MODEL_PATH_ROOT=/tmp/verl-models"
43+
44+
# Read-only CI model cache (flat layout: /scratch.../ModelName)
45+
ci_model_cache: "/scratch.trt_llm_data/llm-models"
46+
47+
# Models needed by verl tests (symlinks created from HF-style to CI cache paths)
48+
required_models:
49+
- "Qwen/Qwen2.5-0.5B-Instruct"
50+
- "Qwen/Qwen2.5-1.5B-Instruct"
51+
- "Qwen/Qwen2.5-VL-7B-Instruct"
Lines changed: 20 additions & 0 deletions
Original file line numberDiff line numberDiff line change
@@ -0,0 +1,20 @@
1+
version: 0.0.1
2+
l0_verl:
3+
- condition:
4+
ranges:
5+
system_gpu_count:
6+
gte: 4
7+
lte: 4
8+
wildcards:
9+
gpu:
10+
- '*b200*'
11+
linux_distribution_name: ubuntu*
12+
cpu: x86_64
13+
terms:
14+
stage: post_merge
15+
backend: verl
16+
orchestrator: mpi
17+
tests:
18+
- verl/test_verl_cases.py::test_async_server
19+
- verl/test_verl_cases.py::test_adapter
20+
- verl/test_verl_cases.py::test_rollout_utils

0 commit comments

Comments
 (0)