[TRTLLM-10695][ci] add verl stage in CI (#11306)

Superjomn · hchings · web-flow · commit 0507609b30df · 2026-03-13T08:54:57.000+08:00
Signed-off-by: Chunwei Yan <yanchunwei@outlook.com>
Co-authored-by: Erin <14718778+hchings@users.noreply.github.com>
diff --git a/jenkins/L0_Test.groovy b/jenkins/L0_Test.groovy
@@ -243,6 +243,7 @@ def processShardTestList(llmSrc, testDBList, splitId, splits, perfMode=false) {
     echo "Preprocessing testDBList to extract ISOLATION markers..."
 
     def originalTestLines = readFile(file: testDBList).readLines()
+
     def cleanedTestLines = []
     def isolationTestLines = []
 
@@ -2174,8 +2175,12 @@ def getMakoArgsFromStageName(stageName, parseSysinfo=false) {
         // If stageName contains "-AutoDeploy-", add "backend=autodeploy" to makoArgs
         // At this point, only tests with backend=autodeploy or unspecified backend will be run
         makoArgs += ["backend=autodeploy"]
+    } else if (stageName.contains("-Verl-")) {
+        // If stageName contains "-Verl-", add "backend=verl" to makoArgs
+        // At this point, only tests with backend=verl or unspecified backend will be run
+        makoArgs += ["backend=verl"]
     } else {
-        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", or "-AutoDeploy-", do not add any backend
+        // If stageName does not contain "-PyTorch-", "-TensorRT-", "-CPP-", "-Triton-", "-FMHA-", "-AutoDeploy-", or "-Verl-", do not add any backend
         // At this point, all tests will be run
         // For cases where backend is not specified in makoArgs, we will match all types of backends and tests without specified backend
     }
@@ -3311,6 +3316,7 @@ def launchTestJobs(pipeline, testFilter)
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 2, 4, 1, true],
         "DGX_B200-4_GPUs-PyTorch-Post-Merge-2": ["auto:dgx-b200-flex", "l0_dgx_b200", 2, 2, 4, 1, true],
         "DGX_B200-8_GPUs-PyTorch-1": ["auto:dgx-b200-flex", "l0_dgx_b200", 1, 1, 8, 1, true],
+        "DGX_B200-4_GPUs-Verl-Post-Merge-1": ["auto:dgx-b200-flex", "l0_verl", 1, 1, 4, 1, true],
         "B300-PyTorch-1": ["b300-single", "l0_b300", 1, 1],
         "DGX_B300-4_GPUs-PyTorch-1": ["b300-x4", "l0_dgx_b300", 1, 1, 4],
         "DGX_B300-4_GPUs-PyTorch-Post-Merge-1": ["b300-x4", "l0_dgx_b300", 1, 2, 4],
diff --git a/tests/integration/defs/verl/.gitignore b/tests/integration/defs/verl/.gitignore
@@ -0,0 +1 @@
+verl_repo/
diff --git a/tests/integration/defs/verl/test_verl_cases.py b/tests/integration/defs/verl/test_verl_cases.py
@@ -0,0 +1,145 @@
+# SPDX-FileCopyrightText: Copyright (c) 2025-2026 NVIDIA CORPORATION & AFFILIATES. All rights reserved.
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+"""Self-contained wrapper tests for the verl repo.
+
+All setup (dependency installation, repo cloning, env vars) is handled by
+a session-scoped pytest fixture. Configuration is read from verl_config.yml.
+"""
+
+import os
+import subprocess
+import sys
+
+import pytest
+import yaml
+
+# Directory containing this test module; the config file and the verl
+# checkout are both resolved relative to it.
+_HERE = os.path.dirname(os.path.abspath(__file__))
+_CONFIG_PATH = os.path.join(_HERE, "verl_config.yml")
+# Clone target for the verl sources.
+VERL_ROOT = os.path.join(_HERE, "verl_repo")
+
+
+def _load_config():
+    """Return the ``verl_config`` mapping parsed from ``verl_config.yml``.
+
+    Raises:
+        KeyError: if the parsed mapping has no ``verl_config`` key.
+    """
+    with open(_CONFIG_PATH) as config_file:
+        document = yaml.safe_load(config_file)
+    return document["verl_config"]
+
+
+def _export_env_vars(config):
+    """Export the ``env_vars`` entries from config into ``os.environ``.
+
+    Each entry is a ``KEY=VALUE`` string. Surrounding double quotes are
+    stripped from the value and ``$VAR`` / ``${VAR}`` references are expanded
+    against the current environment. Entries are applied in order, so a later
+    entry can reference a variable set by an earlier one. References to
+    variables that are not set are left in the value unchanged (this is how
+    ``os.path.expandvars`` behaves).
+
+    Args:
+        config: Mapping with an optional ``env_vars`` list. A missing key or
+            an empty (``None``) value exports nothing.
+
+    Raises:
+        ValueError: if an entry does not contain ``=``.
+    """
+    # ``or ()`` also covers a YAML key that is present but empty (parsed as None).
+    for entry in config.get("env_vars") or ():
+        key, sep, raw_value = entry.partition("=")
+        if not sep:
+            raise ValueError(f"Malformed env_vars entry (expected KEY=VALUE): {entry!r}")
+        os.environ[key] = os.path.expandvars(raw_value.strip('"'))
+
+
+def _run_install_commands(config):
+    """Run each ``install_commands`` entry from config through the shell.
+
+    Commands run in order and inherit the current process environment, so
+    variables exported into ``os.environ`` beforehand are visible to them.
+
+    Args:
+        config: Mapping with an optional ``install_commands`` list of shell
+            command strings. A missing key or an empty (``None``) value runs
+            nothing.
+
+    Raises:
+        subprocess.CalledProcessError: if a command exits non-zero; the
+            remaining commands are not run.
+    """
+    # ``or ()`` also covers a YAML key that is present but empty (parsed as None).
+    for cmd in config.get("install_commands") or ():
+        print(f"[verl setup] Running: {cmd}")
+        # shell=True is required: entries are shell snippets (``&&`` chains,
+        # subshells, redirection), not argument vectors.
+        subprocess.check_call(cmd, shell=True)
+
+
+def _clone_verl_repo(config):
+    """Clone the verl repo at the configured revision and install it.
+
+    An existing ``VERL_ROOT`` directory is reused without cloning. The
+    editable install runs in both cases, so a reused checkout is also
+    importable when the Python environment is fresh.
+
+    Args:
+        config: Mapping providing ``repo_url`` and ``repo_tag`` (only read
+            when a clone is needed).
+
+    Raises:
+        KeyError: if a clone is needed and ``repo_url`` or ``repo_tag`` is
+            missing from config.
+        subprocess.CalledProcessError: if git or pip exits non-zero.
+    """
+    import shutil  # local import: only used to clean up a failed clone
+
+    if os.path.isdir(VERL_ROOT):
+        print(f"[verl setup] Repo already exists at {VERL_ROOT}, skipping clone")
+    else:
+        repo_url = config["repo_url"]
+        repo_tag = config["repo_tag"]
+        print(f"[verl setup] Cloning {repo_url} (tag={repo_tag}) into {VERL_ROOT}")
+        try:
+            # Argument lists (no shell) keep paths containing spaces intact.
+            subprocess.check_call(["git", "clone", repo_url, VERL_ROOT])
+            subprocess.check_call(["git", "-C", VERL_ROOT, "checkout", repo_tag])
+        except BaseException:
+            # Do not leave a partial clone or a wrong-revision checkout behind
+            # (also on Ctrl-C): the next session would take the "already
+            # exists" branch above and silently reuse it.
+            shutil.rmtree(VERL_ROOT, ignore_errors=True)
+            raise
+    print(f"[verl setup] Installing verl package from {VERL_ROOT}")
+    subprocess.check_call(
+        [sys.executable, "-m", "pip", "install", "-e", VERL_ROOT],
+    )
+
+
+def _setup_model_symlinks(config):
+    """Create symlinks from HF-style paths to CI cache paths.
+
+    Verl tests expect models at {model_root}/Qwen/ModelName but the CI cache
+    stores them at {ci_cache}/ModelName (flat structure). We create symlinks
+    in a writable staging directory that point to the read-only CI cache.
+
+    ``model_root`` is read from the ``TRTLLM_TEST_MODEL_PATH_ROOT`` environment
+    variable and ``ci_cache`` from ``config["ci_model_cache"]``; if either is
+    empty nothing is done. Model ids without a ``/`` are ignored, models
+    missing from the cache are skipped with a message, and a dangling symlink
+    found at the destination is replaced.
+
+    Args:
+        config: Mapping with optional ``ci_model_cache`` and
+            ``required_models`` entries.
+    """
+    model_root = os.environ.get("TRTLLM_TEST_MODEL_PATH_ROOT", "")
+    ci_cache = config.get("ci_model_cache", "")
+    if not model_root or not ci_cache:
+        return
+    # ``or ()`` also covers a YAML key that is present but empty (parsed as None).
+    for model_id in config.get("required_models") or ():
+        if "/" not in model_id:
+            continue
+        namespace, name = model_id.split("/", 1)
+        ns_dir = os.path.join(model_root, namespace)
+        src = os.path.join(ci_cache, name)
+        dst = os.path.join(ns_dir, name)
+        if os.path.exists(dst):
+            print(f"[verl setup] Model symlink already exists: {dst}")
+            continue
+        if not os.path.isdir(src):
+            print(f"[verl setup] Model not found in CI cache: {src}, skipping")
+            continue
+        os.makedirs(ns_dir, exist_ok=True)
+        if os.path.islink(dst):
+            # exists() follows links, so a link that reaches this point is
+            # dangling; os.symlink would raise FileExistsError on it.
+            os.unlink(dst)
+        os.symlink(src, dst)
+        print(f"[verl setup] Created symlink: {dst} -> {src}")
+
+
+@pytest.fixture(scope="session", autouse=True)
+def verl_setup():
+    """Prepare the verl environment once per test session.
+
+    Loads the config, then runs the setup steps in a fixed order: export env
+    vars, run install commands, clone the verl repo, create model symlinks.
+    Yields ``VERL_ROOT``.
+    """
+    config = _load_config()
+    # Order matters: the env vars must be in place before the install commands run.
+    setup_steps = (
+        _export_env_vars,
+        _run_install_commands,
+        _clone_verl_repo,
+        _setup_model_symlinks,
+    )
+    for step in setup_steps:
+        step(config)
+    yield VERL_ROOT
+
+
+def _run_verl_test(test_path, extra_args=None, timeout=600):
+    """Run one verl test file in a child pytest process and assert it passes.
+
+    Args:
+        test_path: Test file path relative to ``VERL_ROOT``.
+        extra_args: Optional extra pytest command-line arguments.
+        timeout: Time limit in seconds, passed to ``subprocess.run``.
+    """
+    target = os.path.join(VERL_ROOT, test_path)
+    assert os.path.exists(target), f"Verl test not found: {target}"
+    pytest_cmd = [sys.executable, "-m", "pytest", target, "-v", "--tb=short", *(extra_args or [])]
+    completed = subprocess.run(pytest_cmd, cwd=VERL_ROOT, env=os.environ.copy(), timeout=timeout)
+    exit_code = completed.returncode
+    assert exit_code == 0, f"Verl test failed with return code {exit_code}"
+
+
+def test_async_server():
+    """Run verl's ``rollout_trtllm/test_async_server.py`` through the wrapper."""
+    verl_test_file = "tests/workers/rollout/rollout_trtllm/test_async_server.py"
+    _run_verl_test(verl_test_file)
+
+
+def test_adapter():
+    """Run verl's ``rollout_trtllm/test_adapter.py`` through the wrapper."""
+    verl_test_file = "tests/workers/rollout/rollout_trtllm/test_adapter.py"
+    _run_verl_test(verl_test_file)
+
+
+def test_rollout_utils():
+    """Run verl's ``test_trtllm_rollout_utils.py`` with two tests deselected.
+
+    The ``-k`` expression excludes ``test_unimodal_generate`` and
+    ``test_unimodal_batch_generate``; the time limit is raised to 900 seconds.
+    """
+    verl_test_file = "tests/workers/rollout/rollout_trtllm/test_trtllm_rollout_utils.py"
+    deselect = ["-k", "not (test_unimodal_generate or test_unimodal_batch_generate)"]
+    _run_verl_test(verl_test_file, extra_args=deselect, timeout=900)
diff --git a/tests/integration/defs/verl/verl_config.yml b/tests/integration/defs/verl/verl_config.yml
@@ -0,0 +1,51 @@
+# Verl configuration for CI stage
+
+verl_config:
+  repo_url: "https://github.com/volcengine/verl.git"
+  repo_tag: "4cda6af"
+  test_dir: "tests"
+
+  # Shell commands run in order, once per test session; a non-zero exit aborts setup.
+  install_commands:
+    # Install gdrcopy
+    - >-
+      git clone -b v2.5.1 https://github.com/NVIDIA/gdrcopy.git &&
+      (cd gdrcopy && make prefix=/usr/local lib_install) &&
+      rm -rf gdrcopy
+    # Install nvshmem
+    - "pip install nvidia-nvshmem-cu13==3.3.20"
+    # Create nvshmem symlink (needed before DeepEP build).
+    # -f keeps this step idempotent: a plain `ln -s` fails when the link
+    # already exists, e.g. on a second session in the same container.
+    - >-
+      (cd /usr/local/lib/python3.12/dist-packages/nvidia/nvshmem/lib &&
+      ln -sf libnvshmem_host.so.3 libnvshmem_host.so)
+    # Install DeepEP
+    - >-
+      git clone -b v1.2.1 https://github.com/deepseek-ai/DeepEP.git &&
+      (cd DeepEP &&
+      wget https://raw.githubusercontent.com/NVIDIA/Megatron-LM/refs/tags/core_v0.15.0/docker/patches/deepep.patch &&
+      patch -p1 < deepep.patch &&
+      TORCH_CUDA_ARCH_LIST="9.0 10.0 12.0" python setup.py install) &&
+      rm -rf DeepEP
+    # Install Python dependencies
+    - "pip3 install --no-cache-dir --no-deps trl"
+    - "pip3 install --no-cache-dir nvtx matplotlib liger_kernel cachetools"
+    - "pip install --no-cache-dir -U git+https://github.com/ISEEKYAN/mbridge.git"
+    - "pip install --no-deps --no-cache-dir git+https://github.com/NVIDIA/Megatron-LM.git@core_v0.15.0"
+    - "pip3 install pytest-asyncio"
+    - "pip3 install --no-cache-dir 'ray[default]'"
+
+
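+  # Environment variables exported into the test process before the install
+  # commands run. Entries are applied in order, so later entries may reference
+  # earlier ones (e.g. ${NVSHMEM_DIR}).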
+  env_vars:
+    - "NVSHMEM_DIR=/usr/local/lib/python3.12/dist-packages/nvidia/nvshmem"
+    - "LD_LIBRARY_PATH=\"${NVSHMEM_DIR}/lib:$LD_LIBRARY_PATH\""
+    - "PATH=\"${NVSHMEM_DIR}/bin:$PATH\""
+    - "TRTLLM_TEST_MODEL_PATH_ROOT=/tmp/verl-models"
+
+  # Read-only CI model cache (flat layout: /scratch.../ModelName)
+  ci_model_cache: "/scratch.trt_llm_data/llm-models"
+
+  # Models needed by verl tests (symlinks created from HF-style to CI cache paths)
+  required_models:
+    - "Qwen/Qwen2.5-0.5B-Instruct"
+    - "Qwen/Qwen2.5-1.5B-Instruct"
+    - "Qwen/Qwen2.5-VL-7B-Instruct"
diff --git a/tests/integration/test_lists/test-db/l0_verl.yml b/tests/integration/test_lists/test-db/l0_verl.yml
@@ -0,0 +1,20 @@
+version: 0.0.1
+l0_verl:
+- condition:
+    ranges:
+      system_gpu_count:
+        gte: 4
+        lte: 4
+    wildcards:
+      gpu:
+      - '*b200*'
+      linux_distribution_name: ubuntu*
+      cpu: x86_64
+    terms:
+      stage: post_merge
+      backend: verl
+      orchestrator: mpi
+  tests:
+  - verl/test_verl_cases.py::test_async_server
+  - verl/test_verl_cases.py::test_adapter
+  - verl/test_verl_cases.py::test_rollout_utils